diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py index e302c2b97a0..40a667c9d0d 100644 --- a/invokeai/app/invocations/ip_adapter.py +++ b/invokeai/app/invocations/ip_adapter.py @@ -1,21 +1,22 @@ from builtins import float -from typing import List, Union +from typing import List, Literal, Union from pydantic import BaseModel, Field, field_validator, model_validator from typing_extensions import Self -from invokeai.app.invocations.baseinvocation import ( - BaseInvocation, - BaseInvocationOutput, - invocation, - invocation_output, -) +from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, OutputField, UIType from invokeai.app.invocations.model import ModelIdentifierField from invokeai.app.invocations.primitives import ImageField from invokeai.app.invocations.util import validate_begin_end_step, validate_weights from invokeai.app.services.shared.invocation_context import InvocationContext -from invokeai.backend.model_manager.config import AnyModelConfig, BaseModelType, IPAdapterConfig, ModelType +from invokeai.backend.model_manager.config import ( + AnyModelConfig, + BaseModelType, + IPAdapterCheckpointConfig, + IPAdapterInvokeAIConfig, + ModelType, +) class IPAdapterField(BaseModel): @@ -48,12 +49,15 @@ class IPAdapterOutput(BaseInvocationOutput): ip_adapter: IPAdapterField = OutputField(description=FieldDescriptions.ip_adapter, title="IP-Adapter") +CLIP_VISION_MODEL_MAP = {"ViT-H": "ip_adapter_sd_image_encoder", "ViT-G": "ip_adapter_sdxl_image_encoder"} + + @invocation("ip_adapter", title="IP-Adapter", tags=["ip_adapter", "control"], category="ip_adapter", version="1.2.2") class IPAdapterInvocation(BaseInvocation): """Collects IP-Adapter info to pass to other nodes.""" # Inputs - image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).") + image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).", ui_order=1) ip_adapter_model: ModelIdentifierField = InputField( description="The IP-Adapter model.", title="IP-Adapter Model", @@ -61,7 +65,11 @@ class IPAdapterInvocation(BaseInvocation): ui_order=-1, ui_type=UIType.IPAdapterModel, ) - + clip_vision_model: Literal["auto", "ViT-H", "ViT-G"] = InputField( + description="CLIP Vision model to use. Overrides model settings. Mandatory for checkpoint models.", + default="auto", + ui_order=2, + ) weight: Union[float, List[float]] = InputField( default=1, description="The weight given to the IP-Adapter", title="Weight" ) @@ -86,10 +94,21 @@ def validate_begin_end_step_percent(self) -> Self: def invoke(self, context: InvocationContext) -> IPAdapterOutput: # Lookup the CLIP Vision encoder that is intended to be used with the IP-Adapter model. 
ip_adapter_info = context.models.get_config(self.ip_adapter_model.key) - assert isinstance(ip_adapter_info, IPAdapterConfig) - image_encoder_model_id = ip_adapter_info.image_encoder_model_id - image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() + assert isinstance(ip_adapter_info, (IPAdapterInvokeAIConfig, IPAdapterCheckpointConfig)) + + if self.clip_vision_model == "auto": + if isinstance(ip_adapter_info, IPAdapterInvokeAIConfig): + image_encoder_model_id = ip_adapter_info.image_encoder_model_id + image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() + else: + raise RuntimeError( + "You need to set the appropriate CLIP Vision model for checkpoint IP Adapter models." + ) + else: + image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + image_encoder_model = self._get_image_encoder(context, image_encoder_model_name) + return IPAdapterOutput( ip_adapter=IPAdapterField( image=self.image, @@ -102,19 +121,25 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: ) def _get_image_encoder(self, context: InvocationContext, image_encoder_model_name: str) -> AnyModelConfig: - found = False - while not found: + image_encoder_models = context.models.search_by_attrs( + name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision + ) + + if not len(image_encoder_models) > 0: + context.logger.warning( + f"The image encoder required by this IP Adapter ({image_encoder_model_name}) is not installed. \ + Downloading and installing now. This may take a while." + ) + + installer = context._services.model_manager.install + job = installer.heuristic_import(f"InvokeAI/{image_encoder_model_name}") + installer.wait_for_job(job, timeout=600) # Wait for up to 10 minutes image_encoder_models = context.models.search_by_attrs( name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision ) - found = len(image_encoder_models) > 0 - if not found: - context.logger.warning( - f"The image encoder required by this IP Adapter ({image_encoder_model_name}) is not installed." - ) - context.logger.warning("Downloading and installing now. 
This may take a while.") - installer = context._services.model_manager.install - job = installer.heuristic_import(f"InvokeAI/{image_encoder_model_name}") - installer.wait_for_job(job, timeout=600) # wait up to 10 minutes - then raise a TimeoutException - assert len(image_encoder_models) == 1 + + if len(image_encoder_models) == 0: + context.logger.error("Error while fetching CLIP Vision Image Encoder") + assert len(image_encoder_models) == 1 + return image_encoder_models[0] diff --git a/invokeai/app/invocations/latent.py b/invokeai/app/invocations/latent.py index bc79efdeba4..3c66b7014f5 100644 --- a/invokeai/app/invocations/latent.py +++ b/invokeai/app/invocations/latent.py @@ -43,11 +43,7 @@ WithMetadata, ) from invokeai.app.invocations.ip_adapter import IPAdapterField -from invokeai.app.invocations.primitives import ( - DenoiseMaskOutput, - ImageOutput, - LatentsOutput, -) +from invokeai.app.invocations.primitives import DenoiseMaskOutput, ImageOutput, LatentsOutput from invokeai.app.invocations.t2i_adapter import T2IAdapterField from invokeai.app.services.shared.invocation_context import InvocationContext from invokeai.app.util.controlnet_utils import prepare_control_image @@ -68,12 +64,7 @@ ) from ...backend.stable_diffusion.schedulers import SCHEDULER_MAP from ...backend.util.devices import choose_precision, choose_torch_device -from .baseinvocation import ( - BaseInvocation, - BaseInvocationOutput, - invocation, - invocation_output, -) +from .baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output from .controlnet_image_processors import ControlField from .model import ModelIdentifierField, UNetField, VAEField diff --git a/invokeai/app/invocations/metadata.py b/invokeai/app/invocations/metadata.py index 6fc72a1c3fc..2da482c8331 100644 --- a/invokeai/app/invocations/metadata.py +++ b/invokeai/app/invocations/metadata.py @@ -2,16 +2,8 @@ from pydantic import BaseModel, ConfigDict, Field -from invokeai.app.invocations.baseinvocation import ( - BaseInvocation, - BaseInvocationOutput, - invocation, - invocation_output, -) -from invokeai.app.invocations.controlnet_image_processors import ( - CONTROLNET_MODE_VALUES, - CONTROLNET_RESIZE_VALUES, -) +from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output +from invokeai.app.invocations.controlnet_image_processors import CONTROLNET_MODE_VALUES, CONTROLNET_RESIZE_VALUES from invokeai.app.invocations.fields import ( FieldDescriptions, ImageField, @@ -43,6 +35,7 @@ class IPAdapterMetadataField(BaseModel): image: ImageField = Field(description="The IP-Adapter image prompt.") ip_adapter_model: ModelIdentifierField = Field(description="The IP-Adapter model.") + clip_vision_model: Literal["ViT-H", "ViT-G"] = Field(description="The CLIP Vision model") weight: Union[float, list[float]] = Field(description="The weight given to the IP-Adapter") begin_step_percent: float = Field(description="When the IP-Adapter is first applied (% of total steps)") end_step_percent: float = Field(description="When the IP-Adapter is last applied (% of total steps)") diff --git a/invokeai/backend/ip_adapter/ip_adapter.py b/invokeai/backend/ip_adapter/ip_adapter.py index e51966c779c..f3be0421469 100644 --- a/invokeai/backend/ip_adapter/ip_adapter.py +++ b/invokeai/backend/ip_adapter/ip_adapter.py @@ -1,8 +1,11 @@ # copied from https://github.com/tencent-ailab/IP-Adapter (Apache License 2.0) # and modified as needed -from typing import Optional, Union +import pathlib +from typing 
import List, Optional, TypedDict, Union +import safetensors +import safetensors.torch import torch from PIL import Image from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection @@ -13,10 +16,17 @@ from .resampler import Resampler +class IPAdapterStateDict(TypedDict): + ip_adapter: dict[str, torch.Tensor] + image_proj: dict[str, torch.Tensor] + + class ImageProjModel(torch.nn.Module): """Image Projection Model""" - def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4): + def __init__( + self, cross_attention_dim: int = 1024, clip_embeddings_dim: int = 1024, clip_extra_context_tokens: int = 4 + ): super().__init__() self.cross_attention_dim = cross_attention_dim @@ -25,7 +35,7 @@ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extr self.norm = torch.nn.LayerNorm(cross_attention_dim) @classmethod - def from_state_dict(cls, state_dict: dict[torch.Tensor], clip_extra_context_tokens=4): + def from_state_dict(cls, state_dict: dict[str, torch.Tensor], clip_extra_context_tokens: int = 4): """Initialize an ImageProjModel from a state_dict. The cross_attention_dim and clip_embeddings_dim are inferred from the shape of the tensors in the state_dict. @@ -45,7 +55,7 @@ def from_state_dict(cls, state_dict: dict[torch.Tensor], clip_extra_context_toke model.load_state_dict(state_dict) return model - def forward(self, image_embeds): + def forward(self, image_embeds: torch.Tensor): embeds = image_embeds clip_extra_context_tokens = self.proj(embeds).reshape( -1, self.clip_extra_context_tokens, self.cross_attention_dim @@ -57,7 +67,7 @@ def forward(self, image_embeds): class MLPProjModel(torch.nn.Module): """SD model with image prompt""" - def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024): + def __init__(self, cross_attention_dim: int = 1024, clip_embeddings_dim: int = 1024): super().__init__() self.proj = torch.nn.Sequential( @@ -68,7 +78,7 @@ def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024): ) @classmethod - def from_state_dict(cls, state_dict: dict[torch.Tensor]): + def from_state_dict(cls, state_dict: dict[str, torch.Tensor]): """Initialize an MLPProjModel from a state_dict. The cross_attention_dim and clip_embeddings_dim are inferred from the shape of the tensors in the state_dict. 
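A minimal sketch of the IPAdapterStateDict layout these from_state_dict constructors consume, assuming a flat checkpoint whose keys carry 'image_proj.' / 'ip_adapter.' prefixes; the key-splitting mirrors the load_ip_adapter_tensors helper added later in this patch, and the tensor shapes (an SD1-style ImageProjModel with cross-attention dim 768 and ViT-H image-embedding dim 1024) are illustrative only.

from typing import TypedDict

import torch


class IPAdapterStateDict(TypedDict):
    ip_adapter: dict[str, torch.Tensor]
    image_proj: dict[str, torch.Tensor]


def split_flat_state_dict(flat: dict[str, torch.Tensor]) -> IPAdapterStateDict:
    """Group a flat checkpoint's tensors under 'image_proj' / 'ip_adapter'."""
    state_dict: IPAdapterStateDict = {"ip_adapter": {}, "image_proj": {}}
    for key, tensor in flat.items():
        if key.startswith("image_proj."):
            state_dict["image_proj"][key.replace("image_proj.", "")] = tensor
        elif key.startswith("ip_adapter."):
            state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = tensor
        else:
            raise RuntimeError(f"Encountered unexpected IP Adapter state dict key: '{key}'.")
    return state_dict


# Illustrative tensors only (hypothetical shapes for an SD1 ImageProjModel checkpoint).
flat = {
    "image_proj.proj.weight": torch.zeros(4 * 768, 1024),  # (tokens * cross_attention_dim, clip_embeddings_dim)
    "image_proj.proj.bias": torch.zeros(4 * 768),
    "image_proj.norm.weight": torch.zeros(768),
    "image_proj.norm.bias": torch.zeros(768),
    "ip_adapter.1.to_k_ip.weight": torch.zeros(320, 768),  # last dim = UNet cross-attention dim (768 for SD1)
}
nested = split_flat_state_dict(flat)
print(sorted(nested["image_proj"]))  # ['norm.bias', 'norm.weight', 'proj.bias', 'proj.weight']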
@@ -87,7 +97,7 @@ def from_state_dict(cls, state_dict: dict[torch.Tensor]): model.load_state_dict(state_dict) return model - def forward(self, image_embeds): + def forward(self, image_embeds: torch.Tensor): clip_extra_context_tokens = self.proj(image_embeds) return clip_extra_context_tokens @@ -97,7 +107,7 @@ class IPAdapter(RawModel): def __init__( self, - state_dict: dict[str, torch.Tensor], + state_dict: IPAdapterStateDict, device: torch.device, dtype: torch.dtype = torch.float16, num_tokens: int = 4, @@ -129,24 +139,27 @@ def calc_size(self): return calc_model_size_by_data(self._image_proj_model) + calc_model_size_by_data(self.attn_weights) - def _init_image_proj_model(self, state_dict): + def _init_image_proj_model( + self, state_dict: dict[str, torch.Tensor] + ) -> Union[ImageProjModel, Resampler, MLPProjModel]: return ImageProjModel.from_state_dict(state_dict, self._num_tokens).to(self.device, dtype=self.dtype) @torch.inference_mode() - def get_image_embeds(self, pil_image, image_encoder: CLIPVisionModelWithProjection): - if isinstance(pil_image, Image.Image): - pil_image = [pil_image] + def get_image_embeds(self, pil_image: List[Image.Image], image_encoder: CLIPVisionModelWithProjection): clip_image = self._clip_image_processor(images=pil_image, return_tensors="pt").pixel_values clip_image_embeds = image_encoder(clip_image.to(self.device, dtype=self.dtype)).image_embeds - image_prompt_embeds = self._image_proj_model(clip_image_embeds) - uncond_image_prompt_embeds = self._image_proj_model(torch.zeros_like(clip_image_embeds)) - return image_prompt_embeds, uncond_image_prompt_embeds + try: + image_prompt_embeds = self._image_proj_model(clip_image_embeds) + uncond_image_prompt_embeds = self._image_proj_model(torch.zeros_like(clip_image_embeds)) + return image_prompt_embeds, uncond_image_prompt_embeds + except RuntimeError as e: + raise RuntimeError("Selected CLIP Vision Model is incompatible with the current IP Adapter") from e class IPAdapterPlus(IPAdapter): """IP-Adapter with fine-grained features""" - def _init_image_proj_model(self, state_dict): + def _init_image_proj_model(self, state_dict: dict[str, torch.Tensor]) -> Union[Resampler, MLPProjModel]: return Resampler.from_state_dict( state_dict=state_dict, depth=4, @@ -157,31 +170,32 @@ def _init_image_proj_model(self, state_dict): ).to(self.device, dtype=self.dtype) @torch.inference_mode() - def get_image_embeds(self, pil_image, image_encoder: CLIPVisionModelWithProjection): - if isinstance(pil_image, Image.Image): - pil_image = [pil_image] + def get_image_embeds(self, pil_image: List[Image.Image], image_encoder: CLIPVisionModelWithProjection): clip_image = self._clip_image_processor(images=pil_image, return_tensors="pt").pixel_values clip_image = clip_image.to(self.device, dtype=self.dtype) clip_image_embeds = image_encoder(clip_image, output_hidden_states=True).hidden_states[-2] - image_prompt_embeds = self._image_proj_model(clip_image_embeds) uncond_clip_image_embeds = image_encoder(torch.zeros_like(clip_image), output_hidden_states=True).hidden_states[ -2 ] - uncond_image_prompt_embeds = self._image_proj_model(uncond_clip_image_embeds) - return image_prompt_embeds, uncond_image_prompt_embeds + try: + image_prompt_embeds = self._image_proj_model(clip_image_embeds) + uncond_image_prompt_embeds = self._image_proj_model(uncond_clip_image_embeds) + return image_prompt_embeds, uncond_image_prompt_embeds + except RuntimeError as e: + raise RuntimeError("Selected CLIP Vision Model is incompatible with the current IP Adapter") from e 
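Because the single-image coercion was removed from both get_image_embeds overrides above, callers are now responsible for passing a list of PIL images. A minimal sketch of the calling convention, assuming ip_adapter and image_encoder are objects already built elsewhere (their construction is outside this hunk):

from typing import List, Tuple, Union

import torch
from PIL import Image
from transformers import CLIPVisionModelWithProjection


def embed_image_prompts(
    ip_adapter,  # an already-built IPAdapter / IPAdapterPlus instance (assumed available)
    image_encoder: CLIPVisionModelWithProjection,
    images: Union[Image.Image, List[Image.Image]],
) -> Tuple[torch.Tensor, torch.Tensor]:
    # get_image_embeds() now always expects List[Image.Image], so wrap a bare image here.
    pil_images = [images] if isinstance(images, Image.Image) else images
    # Returns (image_prompt_embeds, uncond_image_prompt_embeds); per the hunk above, a
    # RuntimeError is raised when the selected CLIP Vision encoder does not match the
    # IP-Adapter's projection model.
    return ip_adapter.get_image_embeds(pil_images, image_encoder)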
class IPAdapterFull(IPAdapterPlus): """IP-Adapter Plus with full features.""" - def _init_image_proj_model(self, state_dict: dict[torch.Tensor]): + def _init_image_proj_model(self, state_dict: dict[str, torch.Tensor]): return MLPProjModel.from_state_dict(state_dict).to(self.device, dtype=self.dtype) class IPAdapterPlusXL(IPAdapterPlus): """IP-Adapter Plus for SDXL.""" - def _init_image_proj_model(self, state_dict): + def _init_image_proj_model(self, state_dict: dict[str, torch.Tensor]): return Resampler.from_state_dict( state_dict=state_dict, depth=4, @@ -192,24 +206,48 @@ def _init_image_proj_model(self, state_dict): ).to(self.device, dtype=self.dtype) +def load_ip_adapter_tensors(ip_adapter_ckpt_path: pathlib.Path, device: str) -> IPAdapterStateDict: + state_dict: IPAdapterStateDict = {"ip_adapter": {}, "image_proj": {}} + + if ip_adapter_ckpt_path.suffix == ".safetensors": + model = safetensors.torch.load_file(ip_adapter_ckpt_path, device=device) + for key in model.keys(): + if key.startswith("image_proj."): + state_dict["image_proj"][key.replace("image_proj.", "")] = model[key] + elif key.startswith("ip_adapter."): + state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = model[key] + else: + raise RuntimeError(f"Encountered unexpected IP Adapter state dict key: '{key}'.") + else: + ip_adapter_diffusers_checkpoint_path = ip_adapter_ckpt_path / "ip_adapter.bin" + state_dict = torch.load(ip_adapter_diffusers_checkpoint_path, map_location="cpu") + + return state_dict + + def build_ip_adapter( - ip_adapter_ckpt_path: str, device: torch.device, dtype: torch.dtype = torch.float16 -) -> Union[IPAdapter, IPAdapterPlus]: - state_dict = torch.load(ip_adapter_ckpt_path, map_location="cpu") + ip_adapter_ckpt_path: pathlib.Path, device: torch.device, dtype: torch.dtype = torch.float16 +) -> Union[IPAdapter, IPAdapterPlus, IPAdapterPlusXL, IPAdapterFull]: + state_dict = load_ip_adapter_tensors(ip_adapter_ckpt_path, device.type) - if "proj.weight" in state_dict["image_proj"]: # IPAdapter (with ImageProjModel). + # IPAdapter (with ImageProjModel) + if "proj.weight" in state_dict["image_proj"]: return IPAdapter(state_dict, device=device, dtype=dtype) - elif "proj_in.weight" in state_dict["image_proj"]: # IPAdaterPlus or IPAdapterPlusXL (with Resampler). + + # IPAdapterPlus or IPAdapterPlusXL (with Resampler) + elif "proj_in.weight" in state_dict["image_proj"]: cross_attention_dim = state_dict["ip_adapter"]["1.to_k_ip.weight"].shape[-1] if cross_attention_dim == 768: - # SD1 IP-Adapter Plus - return IPAdapterPlus(state_dict, device=device, dtype=dtype) + return IPAdapterPlus(state_dict, device=device, dtype=dtype) # SD1 IP-Adapter Plus elif cross_attention_dim == 2048: - # SDXL IP-Adapter Plus - return IPAdapterPlusXL(state_dict, device=device, dtype=dtype) + return IPAdapterPlusXL(state_dict, device=device, dtype=dtype) # SDXL IP-Adapter Plus else: raise Exception(f"Unsupported IP-Adapter Plus cross-attention dimension: {cross_attention_dim}.") - elif "proj.0.weight" in state_dict["image_proj"]: # IPAdapterFull (with MLPProjModel). 
+ + # IPAdapterFull (with MLPProjModel) + elif "proj.0.weight" in state_dict["image_proj"]: return IPAdapterFull(state_dict, device=device, dtype=dtype) + + # Unrecognized IP Adapter Architectures else: raise ValueError(f"'{ip_adapter_ckpt_path}' has an unrecognized IP-Adapter model architecture.") diff --git a/invokeai/backend/ip_adapter/resampler.py b/invokeai/backend/ip_adapter/resampler.py index a8db22c0fd9..a32eeacfdc2 100644 --- a/invokeai/backend/ip_adapter/resampler.py +++ b/invokeai/backend/ip_adapter/resampler.py @@ -9,8 +9,8 @@ # FFN -def FeedForward(dim, mult=4): - inner_dim = int(dim * mult) +def FeedForward(dim: int, mult: int = 4): + inner_dim = dim * mult return nn.Sequential( nn.LayerNorm(dim), nn.Linear(dim, inner_dim, bias=False), @@ -19,8 +19,8 @@ def FeedForward(dim, mult=4): ) -def reshape_tensor(x, heads): - bs, length, width = x.shape +def reshape_tensor(x: torch.Tensor, heads: int): + bs, length, _ = x.shape # (bs, length, width) --> (bs, length, n_heads, dim_per_head) x = x.view(bs, length, heads, -1) # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head) @@ -31,7 +31,7 @@ def reshape_tensor(x, heads): class PerceiverAttention(nn.Module): - def __init__(self, *, dim, dim_head=64, heads=8): + def __init__(self, *, dim: int, dim_head: int = 64, heads: int = 8): super().__init__() self.scale = dim_head**-0.5 self.dim_head = dim_head @@ -45,7 +45,7 @@ def __init__(self, *, dim, dim_head=64, heads=8): self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False) self.to_out = nn.Linear(inner_dim, dim, bias=False) - def forward(self, x, latents): + def forward(self, x: torch.Tensor, latents: torch.Tensor): """ Args: x (torch.Tensor): image features @@ -80,14 +80,14 @@ def forward(self, x, latents): class Resampler(nn.Module): def __init__( self, - dim=1024, - depth=8, - dim_head=64, - heads=16, - num_queries=8, - embedding_dim=768, - output_dim=1024, - ff_mult=4, + dim: int = 1024, + depth: int = 8, + dim_head: int = 64, + heads: int = 16, + num_queries: int = 8, + embedding_dim: int = 768, + output_dim: int = 1024, + ff_mult: int = 4, ): super().__init__() @@ -110,7 +110,15 @@ def __init__( ) @classmethod - def from_state_dict(cls, state_dict: dict[torch.Tensor], depth=8, dim_head=64, heads=16, num_queries=8, ff_mult=4): + def from_state_dict( + cls, + state_dict: dict[str, torch.Tensor], + depth: int = 8, + dim_head: int = 64, + heads: int = 16, + num_queries: int = 8, + ff_mult: int = 4, + ): """A convenience function that initializes a Resampler from a state_dict. Some of the shape parameters are inferred from the state_dict (e.g. dim, embedding_dim, etc.). 
At the time of @@ -145,7 +153,7 @@ def from_state_dict(cls, state_dict: dict[torch.Tensor], depth=8, dim_head=64, h model.load_state_dict(state_dict) return model - def forward(self, x): + def forward(self, x: torch.Tensor): latents = self.latents.repeat(x.size(0), 1, 1) x = self.proj_in(x) diff --git a/invokeai/backend/model_manager/config.py b/invokeai/backend/model_manager/config.py index 524e39b2a19..82f88c0e817 100644 --- a/invokeai/backend/model_manager/config.py +++ b/invokeai/backend/model_manager/config.py @@ -323,10 +323,13 @@ def get_tag() -> Tag: return Tag(f"{ModelType.Main.value}.{ModelFormat.Diffusers.value}") -class IPAdapterConfig(ModelConfigBase): - """Model config for IP Adaptor format models.""" - +class IPAdapterBaseConfig(ModelConfigBase): type: Literal[ModelType.IPAdapter] = ModelType.IPAdapter + + +class IPAdapterInvokeAIConfig(IPAdapterBaseConfig): + """Model config for IP Adapter diffusers format models.""" + image_encoder_model_id: str format: Literal[ModelFormat.InvokeAI] @@ -335,6 +338,16 @@ def get_tag() -> Tag: return Tag(f"{ModelType.IPAdapter.value}.{ModelFormat.InvokeAI.value}") +class IPAdapterCheckpointConfig(IPAdapterBaseConfig): + """Model config for IP Adapter checkpoint format models.""" + + format: Literal[ModelFormat.Checkpoint] + + @staticmethod + def get_tag() -> Tag: + return Tag(f"{ModelType.IPAdapter.value}.{ModelFormat.Checkpoint.value}") + + class CLIPVisionDiffusersConfig(DiffusersConfigBase): """Model config for CLIPVision.""" @@ -390,7 +403,8 @@ def get_model_discriminator_value(v: Any) -> str: Annotated[LoRADiffusersConfig, LoRADiffusersConfig.get_tag()], Annotated[TextualInversionFileConfig, TextualInversionFileConfig.get_tag()], Annotated[TextualInversionFolderConfig, TextualInversionFolderConfig.get_tag()], - Annotated[IPAdapterConfig, IPAdapterConfig.get_tag()], + Annotated[IPAdapterInvokeAIConfig, IPAdapterInvokeAIConfig.get_tag()], + Annotated[IPAdapterCheckpointConfig, IPAdapterCheckpointConfig.get_tag()], Annotated[T2IAdapterConfig, T2IAdapterConfig.get_tag()], Annotated[CLIPVisionDiffusersConfig, CLIPVisionDiffusersConfig.get_tag()], ], diff --git a/invokeai/backend/model_manager/load/model_loaders/ip_adapter.py b/invokeai/backend/model_manager/load/model_loaders/ip_adapter.py index 89dd46e929b..55eed81fcd5 100644 --- a/invokeai/backend/model_manager/load/model_loaders/ip_adapter.py +++ b/invokeai/backend/model_manager/load/model_loaders/ip_adapter.py @@ -7,19 +7,13 @@ import torch from invokeai.backend.ip_adapter.ip_adapter import build_ip_adapter -from invokeai.backend.model_manager import ( - AnyModel, - AnyModelConfig, - BaseModelType, - ModelFormat, - ModelType, - SubModelType, -) +from invokeai.backend.model_manager import AnyModel, AnyModelConfig, BaseModelType, ModelFormat, ModelType, SubModelType from invokeai.backend.model_manager.load import ModelLoader, ModelLoaderRegistry from invokeai.backend.raw_model import RawModel @ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.IPAdapter, format=ModelFormat.InvokeAI) +@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.IPAdapter, format=ModelFormat.Checkpoint) class IPAdapterInvokeAILoader(ModelLoader): """Class to load IP Adapter diffusers models.""" @@ -32,7 +26,7 @@ def _load_model( raise ValueError("There are no submodels in an IP-Adapter model.") model_path = Path(config.path) model: RawModel = build_ip_adapter( - ip_adapter_ckpt_path=str(model_path / "ip_adapter.bin"), + ip_adapter_ckpt_path=model_path, device=torch.device("cpu"), 
dtype=self._torch_dtype, ) diff --git a/invokeai/backend/model_manager/probe.py b/invokeai/backend/model_manager/probe.py index ddd9e99eda9..7aa650afaa4 100644 --- a/invokeai/backend/model_manager/probe.py +++ b/invokeai/backend/model_manager/probe.py @@ -230,9 +230,10 @@ def get_model_type_from_checkpoint(cls, model_path: Path, checkpoint: Optional[C return ModelType.LoRA elif any(key.startswith(v) for v in {"controlnet", "control_model", "input_blocks"}): return ModelType.ControlNet + elif any(key.startswith(v) for v in {"image_proj.", "ip_adapter."}): + return ModelType.IPAdapter elif key in {"emb_params", "string_to_param"}: return ModelType.TextualInversion - else: # diffusers-ti if len(ckpt) < 10 and all(isinstance(v, torch.Tensor) for v in ckpt.values()): @@ -527,8 +528,25 @@ def get_base_type(self) -> BaseModelType: class IPAdapterCheckpointProbe(CheckpointProbeBase): + """Class for probing IP Adapters""" + def get_base_type(self) -> BaseModelType: - raise NotImplementedError() + checkpoint = self.checkpoint + for key in checkpoint.keys(): + if not key.startswith(("image_proj.", "ip_adapter.")): + continue + cross_attention_dim = checkpoint["ip_adapter.1.to_k_ip.weight"].shape[-1] + if cross_attention_dim == 768: + return BaseModelType.StableDiffusion1 + elif cross_attention_dim == 1024: + return BaseModelType.StableDiffusion2 + elif cross_attention_dim == 2048: + return BaseModelType.StableDiffusionXL + else: + raise InvalidModelConfigException( + f"IP-Adapter had unexpected cross-attention dimension: {cross_attention_dim}." + ) + raise InvalidModelConfigException(f"{self.model_path}: Unable to determine base type") class CLIPVisionCheckpointProbe(CheckpointProbeBase): @@ -768,7 +786,7 @@ def get_base_type(self) -> BaseModelType: ) -############## register probe classes ###### +# Register probe classes ModelProbe.register_probe("diffusers", ModelType.Main, PipelineFolderProbe) ModelProbe.register_probe("diffusers", ModelType.VAE, VaeFolderProbe) ModelProbe.register_probe("diffusers", ModelType.LoRA, LoRAFolderProbe) diff --git a/invokeai/frontend/web/public/locales/en.json b/invokeai/frontend/web/public/locales/en.json index 1601169e03c..5872d22dfed 100644 --- a/invokeai/frontend/web/public/locales/en.json +++ b/invokeai/frontend/web/public/locales/en.json @@ -217,6 +217,7 @@ "saveControlImage": "Save Control Image", "scribble": "scribble", "selectModel": "Select a model", + "selectCLIPVisionModel": "Select a CLIP Vision model", "setControlImageDimensions": "Set Control Image Dimensions To W/H", "showAdvanced": "Show Advanced", "small": "Small", @@ -655,6 +656,7 @@ "install": "Install", "installAll": "Install All", "installRepo": "Install Repo", + "ipAdapters": "IP Adapters", "load": "Load", "localOnly": "local only", "manual": "Manual", diff --git a/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx b/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx index 25d327e54e3..91f88223523 100644 --- a/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx +++ b/invokeai/frontend/web/src/features/controlAdapters/components/parameters/ParamControlAdapterModel.tsx @@ -1,12 +1,18 @@ -import { Combobox, FormControl, Tooltip } from '@invoke-ai/ui-library'; +import type { ComboboxOnChange, ComboboxOption } from '@invoke-ai/ui-library'; +import { Combobox, Flex, FormControl, Tooltip } from '@invoke-ai/ui-library'; import { createMemoizedSelector } 
from 'app/store/createMemoizedSelector'; import { useAppDispatch, useAppSelector } from 'app/store/storeHooks'; import { useGroupedModelCombobox } from 'common/hooks/useGroupedModelCombobox'; +import { useControlAdapterCLIPVisionModel } from 'features/controlAdapters/hooks/useControlAdapterCLIPVisionModel'; import { useControlAdapterIsEnabled } from 'features/controlAdapters/hooks/useControlAdapterIsEnabled'; import { useControlAdapterModel } from 'features/controlAdapters/hooks/useControlAdapterModel'; import { useControlAdapterModels } from 'features/controlAdapters/hooks/useControlAdapterModels'; import { useControlAdapterType } from 'features/controlAdapters/hooks/useControlAdapterType'; -import { controlAdapterModelChanged } from 'features/controlAdapters/store/controlAdaptersSlice'; +import { + controlAdapterCLIPVisionModelChanged, + controlAdapterModelChanged, +} from 'features/controlAdapters/store/controlAdaptersSlice'; +import type { CLIPVisionModel } from 'features/controlAdapters/store/types'; import { selectGenerationSlice } from 'features/parameters/store/generationSlice'; import { memo, useCallback, useMemo } from 'react'; import { useTranslation } from 'react-i18next'; @@ -29,6 +35,7 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => { const { modelConfig } = useControlAdapterModel(id); const dispatch = useAppDispatch(); const currentBaseModel = useAppSelector((s) => s.generation.model?.base); + const currentCLIPVisionModel = useControlAdapterCLIPVisionModel(id); const mainModel = useAppSelector(selectMainModel); const { t } = useTranslation(); @@ -49,6 +56,16 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => { [dispatch, id] ); + const onCLIPVisionModelChange = useCallback( + (v) => { + if (!v?.value) { + return; + } + dispatch(controlAdapterCLIPVisionModelChanged({ id, clipVisionModel: v.value as CLIPVisionModel })); + }, + [dispatch, id] + ); + const selectedModel = useMemo( () => (modelConfig && controlAdapterType ? 
{ ...modelConfig, model_type: controlAdapterType } : null), [controlAdapterType, modelConfig] @@ -71,18 +88,51 @@ const ParamControlAdapterModel = ({ id }: ParamControlAdapterModelProps) => { isLoading, }); + const clipVisionOptions = useMemo( + () => [ + { label: 'ViT-H', value: 'ViT-H' }, + { label: 'ViT-G', value: 'ViT-G' }, + ], + [] + ); + + const clipVisionModel = useMemo( + () => clipVisionOptions.find((o) => o.value === currentCLIPVisionModel), + [clipVisionOptions, currentCLIPVisionModel] + ); + return ( - - - - - + + + + + + + {modelConfig?.type === 'ip_adapter' && modelConfig.format === 'checkpoint' && ( + + + + )} + ); }; diff --git a/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts b/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts new file mode 100644 index 00000000000..249d2022fe8 --- /dev/null +++ b/invokeai/frontend/web/src/features/controlAdapters/hooks/useControlAdapterCLIPVisionModel.ts @@ -0,0 +1,24 @@ +import { createMemoizedSelector } from 'app/store/createMemoizedSelector'; +import { useAppSelector } from 'app/store/storeHooks'; +import { + selectControlAdapterById, + selectControlAdaptersSlice, +} from 'features/controlAdapters/store/controlAdaptersSlice'; +import { useMemo } from 'react'; + +export const useControlAdapterCLIPVisionModel = (id: string) => { + const selector = useMemo( + () => + createMemoizedSelector(selectControlAdaptersSlice, (controlAdapters) => { + const cn = selectControlAdapterById(controlAdapters, id); + if (cn && cn?.type === 'ip_adapter') { + return cn.clipVisionModel; + } + }), + [id] + ); + + const clipVisionModel = useAppSelector(selector); + + return clipVisionModel; +}; diff --git a/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts b/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts index f4edca41bb5..100bb3f6ad6 100644 --- a/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts +++ b/invokeai/frontend/web/src/features/controlAdapters/store/controlAdaptersSlice.ts @@ -14,6 +14,7 @@ import { v4 as uuidv4 } from 'uuid'; import { controlAdapterImageProcessed } from './actions'; import { CONTROLNET_PROCESSORS } from './constants'; import type { + CLIPVisionModel, ControlAdapterConfig, ControlAdapterProcessorType, ControlAdaptersState, @@ -244,6 +245,13 @@ export const controlAdaptersSlice = createSlice({ } caAdapter.updateOne(state, { id, changes: { controlMode } }); }, + controlAdapterCLIPVisionModelChanged: ( + state, + action: PayloadAction<{ id: string; clipVisionModel: CLIPVisionModel }> + ) => { + const { id, clipVisionModel } = action.payload; + caAdapter.updateOne(state, { id, changes: { clipVisionModel } }); + }, controlAdapterResizeModeChanged: ( state, action: PayloadAction<{ @@ -381,6 +389,7 @@ export const { controlAdapterProcessedImageChanged, controlAdapterIsEnabledChanged, controlAdapterModelChanged, + controlAdapterCLIPVisionModelChanged, controlAdapterWeightChanged, controlAdapterBeginStepPctChanged, controlAdapterEndStepPctChanged, diff --git a/invokeai/frontend/web/src/features/controlAdapters/store/types.ts b/invokeai/frontend/web/src/features/controlAdapters/store/types.ts index 93d4915cdf1..329c3187598 100644 --- a/invokeai/frontend/web/src/features/controlAdapters/store/types.ts +++ b/invokeai/frontend/web/src/features/controlAdapters/store/types.ts @@ -243,12 +243,15 @@ export type T2IAdapterConfig = { shouldAutoConfig: boolean; }; +export 
type CLIPVisionModel = 'ViT-H' | 'ViT-G'; + export type IPAdapterConfig = { type: 'ip_adapter'; id: string; isEnabled: boolean; controlImage: string | null; model: ParameterIPAdapterModel | null; + clipVisionModel: CLIPVisionModel; weight: number; beginStepPct: number; endStepPct: number; diff --git a/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts b/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts index d4796572d47..dc893ceb1c5 100644 --- a/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts +++ b/invokeai/frontend/web/src/features/controlAdapters/util/buildControlAdapter.ts @@ -46,6 +46,7 @@ export const initialIPAdapter: Omit = { isEnabled: true, controlImage: null, model: null, + clipVisionModel: 'ViT-H', weight: 1, beginStepPct: 0, endStepPct: 1, diff --git a/invokeai/frontend/web/src/features/metadata/util/parsers.ts b/invokeai/frontend/web/src/features/metadata/util/parsers.ts index c7c9616bd02..635a63a8dec 100644 --- a/invokeai/frontend/web/src/features/metadata/util/parsers.ts +++ b/invokeai/frontend/web/src/features/metadata/util/parsers.ts @@ -372,6 +372,7 @@ const parseIPAdapter: MetadataParseFunc = async (metada type: 'ip_adapter', isEnabled: true, model: zModelIdentifierField.parse(ipAdapterModel), + clipVisionModel: 'ViT-H', controlImage: image?.image_name ?? null, weight: weight ?? initialIPAdapter.weight, beginStepPct: begin_step_percent ?? initialIPAdapter.beginStepPct, diff --git a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx index adb123f24db..0618af5dd04 100644 --- a/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx +++ b/invokeai/frontend/web/src/features/modelManagerV2/subpanels/ModelPanel/ModelView.tsx @@ -53,7 +53,7 @@ export const ModelView = () => { )} - {data.type === 'ip_adapter' && ( + {data.type === 'ip_adapter' && data.format === 'invokeai' && ( diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts index 2298e84d43e..ad563de4684 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/addIPAdapterToLinearGraph.ts @@ -48,7 +48,7 @@ export const addIPAdapterToLinearGraph = async ( if (!ipAdapter.model) { return; } - const { id, weight, model, beginStepPct, endStepPct, controlImage } = ipAdapter; + const { id, weight, model, clipVisionModel, beginStepPct, endStepPct, controlImage } = ipAdapter; assert(controlImage, 'IP Adapter image is required'); @@ -58,6 +58,7 @@ export const addIPAdapterToLinearGraph = async ( is_intermediate: true, weight: weight, ip_adapter_model: model, + clip_vision_model: clipVisionModel, begin_step_percent: beginStepPct, end_step_percent: endStepPct, image: { @@ -83,7 +84,7 @@ export const addIPAdapterToLinearGraph = async ( }; const buildIPAdapterMetadata = (ipAdapter: IPAdapterConfig): S['IPAdapterMetadataField'] => { - const { controlImage, beginStepPct, endStepPct, model, weight } = ipAdapter; + const { controlImage, beginStepPct, endStepPct, model, clipVisionModel, weight } = ipAdapter; assert(model, 'IP Adapter model is required'); @@ -99,6 +100,7 @@ const buildIPAdapterMetadata = (ipAdapter: IPAdapterConfig): S['IPAdapterMetadat return { ip_adapter_model: model, + 
clip_vision_model: clipVisionModel, weight, begin_step_percent: beginStepPct, end_step_percent: endStepPct, diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 2de0f89d138..6e3ddf678b5 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -4112,7 +4112,7 @@ export type components = { * @description The nodes in this graph */ nodes: { - [key: string]: components["schemas"]["ControlNetInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["StringReplaceInvocation"] | 
components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["AddInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["VAELoaderInvocation"] | 
components["schemas"]["DepthAnythingImageProcessorInvocation"]; + [key: string]: components["schemas"]["StringInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["AddInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["IterateInvocation"] | 
components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["LoRALoaderInvocation"]; }; /** * Edges @@ -4149,7 +4149,7 @@ export type components = { * @description The results of node executions */ results: { - [key: string]: components["schemas"]["FaceMaskOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["IPAdapterOutput"] | 
components["schemas"]["VAEOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["CalculateImageTilesOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["CLIPOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["CollectInvocationOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["String2Output"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["IntegerOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["LatentsCollectionOutput"]; + [key: string]: components["schemas"]["CLIPOutput"] | components["schemas"]["StringPosNegOutput"] | components["schemas"]["IdealSizeOutput"] | components["schemas"]["SDXLLoRALoaderOutput"] | components["schemas"]["LatentsOutput"] | components["schemas"]["BooleanOutput"] | components["schemas"]["SeamlessModeOutput"] | components["schemas"]["DenoiseMaskOutput"] | components["schemas"]["SchedulerOutput"] | components["schemas"]["VAEOutput"] | components["schemas"]["IntegerCollectionOutput"] | components["schemas"]["ColorCollectionOutput"] | components["schemas"]["ConditioningOutput"] | components["schemas"]["NoiseOutput"] | components["schemas"]["TileToPropertiesOutput"] | components["schemas"]["String2Output"] | components["schemas"]["IntegerOutput"] | components["schemas"]["GradientMaskOutput"] | components["schemas"]["FloatOutput"] | components["schemas"]["FloatCollectionOutput"] | components["schemas"]["LatentsCollectionOutput"] | components["schemas"]["ControlOutput"] | components["schemas"]["FaceOffOutput"] | components["schemas"]["MetadataOutput"] | components["schemas"]["StringOutput"] | components["schemas"]["IterateInvocationOutput"] | components["schemas"]["LoRALoaderOutput"] | components["schemas"]["StringCollectionOutput"] | components["schemas"]["IPAdapterOutput"] | components["schemas"]["ConditioningCollectionOutput"] | components["schemas"]["PairTileImageOutput"] | components["schemas"]["MetadataItemOutput"] | components["schemas"]["CLIPSkipInvocationOutput"] | components["schemas"]["SDXLRefinerModelLoaderOutput"] | components["schemas"]["BooleanCollectionOutput"] | components["schemas"]["UNetOutput"] | components["schemas"]["T2IAdapterOutput"] | components["schemas"]["CollectInvocationOutput"] | 
components["schemas"]["SDXLModelLoaderOutput"] | components["schemas"]["FaceMaskOutput"] | components["schemas"]["ImageCollectionOutput"] | components["schemas"]["ModelLoaderOutput"] | components["schemas"]["ImageOutput"] | components["schemas"]["ColorOutput"] | components["schemas"]["CalculateImageTilesOutput"]; }; /** * Errors @@ -4310,10 +4310,10 @@ export type components = { is_diffusers: boolean; }; /** - * IPAdapterConfig - * @description Model config for IP Adaptor format models. + * IPAdapterCheckpointConfig + * @description Model config for IP Adapter checkpoint format models. */ - IPAdapterConfig: { + IPAdapterCheckpointConfig: { /** * Key * @description A unique key for this model. @@ -4364,13 +4364,11 @@ export type components = { * @constant */ type: "ip_adapter"; - /** Image Encoder Model Id */ - image_encoder_model_id: string; /** * Format * @constant */ - format: "invokeai"; + format: "checkpoint"; }; /** IPAdapterField */ IPAdapterField: { @@ -4434,6 +4432,13 @@ export type components = { * @description The IP-Adapter model. */ ip_adapter_model: components["schemas"]["ModelIdentifierField"]; + /** + * Clip Vision Model + * @description CLIP Vision model to use. Overrides model settings. Mandatory for checkpoint models. + * @default auto + * @enum {string} + */ + clip_vision_model?: "auto" | "ViT-H" | "ViT-G"; /** * Weight * @description The weight given to the IP-Adapter @@ -4459,6 +4464,69 @@ export type components = { */ type: "ip_adapter"; }; + /** + * IPAdapterInvokeAIConfig + * @description Model config for IP Adapter diffusers format models. + */ + IPAdapterInvokeAIConfig: { + /** + * Key + * @description A unique key for this model. + */ + key: string; + /** + * Hash + * @description The hash of the model file(s). + */ + hash: string; + /** + * Path + * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory. + */ + path: string; + /** + * Name + * @description Name of the model. + */ + name: string; + /** @description The base model. */ + base: components["schemas"]["BaseModelType"]; + /** + * Description + * @description Model description + */ + description?: string | null; + /** + * Source + * @description The original source of the model (path, URL or repo_id). + */ + source: string; + /** @description The type of source */ + source_type: components["schemas"]["ModelSourceType"]; + /** + * Source Api Response + * @description The original API response from the source, as stringified JSON. + */ + source_api_response?: string | null; + /** + * Cover Image + * @description Url for image to preview model + */ + cover_image?: string | null; + /** + * Type + * @default ip_adapter + * @constant + */ + type: "ip_adapter"; + /** Image Encoder Model Id */ + image_encoder_model_id: string; + /** + * Format + * @constant + */ + format: "invokeai"; + }; /** * IPAdapterMetadataField * @description IP Adapter Field, minus the CLIP Vision Encoder model @@ -4468,6 +4536,12 @@ export type components = { image: components["schemas"]["ImageField"]; /** @description The IP-Adapter model. */ ip_adapter_model: components["schemas"]["ModelIdentifierField"]; + /** + * Clip Vision Model + * @description The CLIP Vision model + * @enum {string} + */ + clip_vision_model: "ViT-H" | "ViT-G"; /** * Weight * @description The weight given to the IP-Adapter @@ -7471,7 +7545,7 @@ export type components = { * Config Out * @description After successful installation, this will hold the configuration object. 
*/ - config_out?: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]) | null; + config_out?: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]) | null; /** * Inplace * @description Leave model in its current location; otherwise install under models directory @@ -7626,7 +7700,7 @@ export type components = { */ ModelsList: { /** Models */ - models: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"])[]; + models: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"])[]; }; /** * Multiply Integers @@ -11155,7 +11229,7 @@ export type operations = { /** @description Successful Response */ 200: { content: { - "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | 
components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; + "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; }; }; /** @description Validation Error */ @@ -11181,7 +11255,7 @@ export type operations = { /** @description The model configuration was retrieved successfully */ 200: { content: { - "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; + "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; }; }; /** @description Bad request */ @@ -11263,7 +11337,7 @@ export type operations = { /** @description The model was updated successfully */ 200: { content: { - "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; + "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] 
| components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; }; }; /** @description Bad request */ @@ -11637,7 +11711,7 @@ export type operations = { /** @description Model converted successfully */ 200: { content: { - "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; + "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]; }; }; /** @description Bad request */ diff --git a/invokeai/frontend/web/src/services/api/types.ts b/invokeai/frontend/web/src/services/api/types.ts index 0bc141960d4..39d5890e51a 100644 --- a/invokeai/frontend/web/src/services/api/types.ts +++ b/invokeai/frontend/web/src/services/api/types.ts @@ -46,7 +46,7 @@ export type LoRAModelConfig = S['LoRADiffusersConfig'] | S['LoRALyCORISConfig']; // TODO(MM2): Can we rename this from Vae -> VAE export type VAEModelConfig = S['VAECheckpointConfig'] | S['VAEDiffusersConfig']; export type ControlNetModelConfig = S['ControlNetDiffusersConfig'] | S['ControlNetCheckpointConfig']; -export type IPAdapterModelConfig = S['IPAdapterConfig']; +export type IPAdapterModelConfig = S['IPAdapterInvokeAIConfig'] | S['IPAdapterCheckpointConfig']; export type T2IAdapterModelConfig = S['T2IAdapterConfig']; type TextualInversionModelConfig = S['TextualInversionFileConfig'] | S['TextualInversionFolderConfig']; type DiffusersModelConfig = S['MainDiffusersConfig'];