From 496e8a00a6ec7d932f06e06502ae4137347353b5 Mon Sep 17 00:00:00 2001 From: yangjianfeng01 Date: Fri, 12 Jan 2024 20:58:09 +0800 Subject: [PATCH 1/2] =?UTF-8?q?paddle=E6=8E=A8=E7=90=86pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppdiffusers/deploy/sdxl/export_model.py | 46 +- ppdiffusers/deploy/sdxl/infer_paddle.py | 475 ++++++ .../paddle_stable_diffusion_xl_housing.py | 90 ++ ppdiffusers/ppdiffusers/__init__.py | 2 + ppdiffusers/ppdiffusers/pipelines/__init__.py | 2 + .../pipelines/paddle_infer_utils.py | 1316 +++++++++++++++++ .../ppdiffusers/pipelines/pipeline_utils.py | 11 +- .../pipelines/stable_diffusion_xl/__init__.py | 5 +- .../paddleinferxl_utils.py | 1308 ++++++++++++++++ ...ipeline_paddleinfer_stable_diffusion_xl.py | 484 ++++++ ppdiffusers/ppdiffusers/utils/__init__.py | 2 + ppdiffusers/ppdiffusers/utils/constants.py | 2 + 12 files changed, 3731 insertions(+), 12 deletions(-) create mode 100644 ppdiffusers/deploy/sdxl/infer_paddle.py create mode 100644 ppdiffusers/deploy/sdxl/paddle_stable_diffusion_xl_housing.py create mode 100644 ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py create mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/paddleinferxl_utils.py create mode 100644 ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl.py diff --git a/ppdiffusers/deploy/sdxl/export_model.py b/ppdiffusers/deploy/sdxl/export_model.py index 05672e5a1..9a794590d 100644 --- a/ppdiffusers/deploy/sdxl/export_model.py +++ b/ppdiffusers/deploy/sdxl/export_model.py @@ -21,12 +21,17 @@ from fd_stable_diffusion_xl_housing import ( FastDeploySFastDeployStableDiffusionXLPipelineHousing, ) +from paddle_stable_diffusion_xl_housing import ( + PaddleInferStableDiffusionXLPipelineHousing +) from text_encoder_2_housing import CLIPTextModelWithProjectionHousing from text_encoder_housing import CLIPTextModelHousing from unet_2d_condition_housing import UNet2DConditionModelSDXLHousing from ppdiffusers import FastDeployRuntimeModel, StableDiffusionXLPipeline +from ppdiffusers import PaddleInferModel + def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( model_path: str, @@ -163,25 +168,46 @@ def forward_vae_decoder(self, z): print(f"Save vae_decoder model in {save_path} successfully.") del pipeline.vae - fd_pipe_cls = FastDeploySFastDeployStableDiffusionXLPipelineHousing + paddle_infer_pipe_cls = PaddleInferStableDiffusionXLPipelineHousing print("mark 1") - text_encoder = (FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),) + text_encoder = (PaddleInferModel.from_pretrained(output_path / "text_encoder"),) # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), print("mark 2") - fastdeploy_pipeline = fd_pipe_cls( - vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), - text_encoder_2=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder_2"), + paddle_infer_pipeline = paddle_infer_pipe_cls( + vae_encoder=PaddleInferModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=PaddleInferModel.from_pretrained(output_path / "vae_decoder"), + unet=PaddleInferModel.from_pretrained(output_path / "unet"), + text_encoder=PaddleInferModel.from_pretrained(output_path / 
"text_encoder"), + text_encoder_2=PaddleInferModel.from_pretrained(output_path / "text_encoder_2"), tokenizer=pipeline.tokenizer, tokenizer_2=pipeline.tokenizer_2, scheduler=pipeline.scheduler, ) print("start saving") - fastdeploy_pipeline.save_pretrained(output_path) - print("FastDeploy pipeline saved to", output_path) + paddle_infer_pipeline.save_pretrained(output_path) + print("PaddleInfer pipeline saved to", output_path) + + + # fd_pipe_cls = FastDeploySFastDeployStableDiffusionXLPipelineHousing + # print("mark 1") + # text_encoder = (FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),) + # # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + # print("mark 2") + + # fastdeploy_pipeline = fd_pipe_cls( + # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + # vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + # unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), + # text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), + # text_encoder_2=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder_2"), + # tokenizer=pipeline.tokenizer, + # tokenizer_2=pipeline.tokenizer_2, + # scheduler=pipeline.scheduler, + # ) + # print("start saving") + # fastdeploy_pipeline.save_pretrained(output_path) + # print("FastDeploy pipeline saved to", output_path) if __name__ == "__main__": diff --git a/ppdiffusers/deploy/sdxl/infer_paddle.py b/ppdiffusers/deploy/sdxl/infer_paddle.py new file mode 100644 index 000000000..0dbaf2b9c --- /dev/null +++ b/ppdiffusers/deploy/sdxl/infer_paddle.py @@ -0,0 +1,475 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import time +import paddle +import random + +# isort: split +import paddle.inference as paddle_infer +import numpy as np +from paddlenlp.trainer.argparser import strtobool +from tqdm.auto import trange + +from ppdiffusers import ( # noqa + DiffusionPipeline, + PaddleInferStableDiffusionXLPipeline, +) +from ppdiffusers.utils import load_image + + +def parse_arguments(): + + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_dir", + default="runwayml/stable-diffusion-v1-5@fastdeploy", + help="The model directory of diffusion_model.", + ) + parser.add_argument( + "--inference_steps", + type=int, + default=50, + help="The number of unet inference steps.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=1, + help="The number of performance benchmark steps.", + ) + parser.add_argument( + "--backend", + type=str, + default="paddle_tensorrt", + # Note(zhoushunjie): Will support 'tensorrt' soon. + choices=["onnx_runtime", "paddle", "paddlelite", "paddle_tensorrt"], + help="The inference runtime backend of unet model and text encoder model.", + ) + parser.add_argument( + "--device", + type=str, + default="gpu", + # Note(shentanyue): Will support more devices. 
+        choices=[
+            "cpu",
+            "gpu",
+            "huawei_ascend_npu",
+            "kunlunxin_xpu",
+        ],
+        help="The inference runtime device of models.",
+    )
+    parser.add_argument(
+        "--task_name",
+        type=str,
+        default="text2img",
+        choices=[
+            "text2img",
+            "img2img",
+            "inpaint",
+            "all",
+        ],
+        help="The task can be one of [text2img, img2img, inpaint, all]. ",
+    )
+    parser.add_argument(
+        "--parse_prompt_type",
+        type=str,
+        default="lpw",
+        choices=[
+            "raw",
+            "lpw",
+        ],
+        help="The parse_prompt_type can be one of [raw, lpw]. ",
+    )
+    parser.add_argument("--use_fp16", type=strtobool, default=True, help="Whether to use FP16 mode")
+    parser.add_argument("--use_bf16", type=strtobool, default=False, help="Whether to use BF16 mode")
+    parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+    parser.add_argument(
+        "--scheduler",
+        type=str,
+        default="preconfig-euler-ancestral",
+        choices=[
+            "pndm",
+            "lms",
+            "euler",
+            "euler-ancestral",
+            "preconfig-euler-ancestral",
+            "dpm-multi",
+            "dpm-single",
+            "unipc-multi",
+            "ddim",
+            "ddpm",
+            "deis-multi",
+            "heun",
+            "kdpm2-ancestral",
+            "kdpm2",
+        ],
+        help="The scheduler type of stable diffusion.",
+    )
+    parser.add_argument(
+        "--infer_op",
+        type=str,
+        default="zero_copy_infer",
+        choices=[
+            "zero_copy_infer",
+            "raw",
+            "all",
+        ],
+        help="The type of infer op.",
+    )
+    parser.add_argument("--height", type=int, default=1024, help="Height of input image")
+    parser.add_argument("--width", type=int, default=1024, help="Width of input image")
+
+    return parser.parse_args()
+
+
+def create_paddle_inference_runtime(
+    model_dir="",
+    model_name="",
+    use_trt=False,
+    dynamic_shape=None,
+    precision_mode=paddle_infer.PrecisionType.Half,
+    device_id=0,
+    disable_paddle_trt_ops=[],
+    disable_paddle_pass=[],
+    workspace=24 * 1024 * 1024 * 1024,
+    tune=False,
+):
+    config = paddle_infer.Config()
+    config.enable_new_executor()
+    config.enable_memory_optim()
+    shape_file = f"{model_dir}/{model_name}/shape_range_info.pbtxt"
+    if tune:
+        config.collect_shape_range_info(shape_file)
+    if device_id != -1:
+        config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode)
+    for pass_name in disable_paddle_pass:
+        config.delete_pass(pass_name)
+    if use_trt:
+        config.enable_tensorrt_engine(
+            workspace_size=workspace,
+            precision_mode=precision_mode,
+            max_batch_size=1,
+            min_subgraph_size=3,
+            use_static=True,
+        )
+        config.enable_tensorrt_memory_optim()
+        config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
+        cache_file = os.path.join(model_dir, model_name, "_opt_cache/")
+        config.set_optim_cache_dir(cache_file)
+        if precision_mode != paddle_infer.PrecisionType.Half:
+            only_fp16_passes = [
+                "trt_cross_multihead_matmul_fuse_pass",
+                "trt_flash_multihead_matmul_fuse_pass",
+                "preln_elementwise_groupnorm_act_pass",
+                "elementwise_groupnorm_act_pass",
+            ]
+            for curr_pass in only_fp16_passes:
+                config.delete_pass(curr_pass)
+    return config
+
+
+def main(args):
+    if args.device_id == -1:
+        paddle.set_device("cpu")
+        paddle_stream = None
+    else:
+        paddle.set_device(f"gpu:{args.device_id}")
+    seed = 1024
+    vae_in_channels = 4
+    text_encoder_max_length = 77
+    unet_max_length = text_encoder_max_length * 3  # lpw supports a max_length of 77x3
+    min_image_size = 1024
+    max_image_size = 1024
+    max_image_size = max(min_image_size, max_image_size)
+    hidden_states = 2048
+    unet_in_channels = 4
+    bs = 2
+
+    text_encoder_dynamic_shape = {
+        "input_ids": {
+            "min_shape": [1, text_encoder_max_length],
+ "max_shape": [1, text_encoder_max_length], + "opt_shape": [1, text_encoder_max_length], + } + } + + text_encoder_2_dynamic_shape = { + "input_ids": { + "min_shape": [1, text_encoder_max_length], + "max_shape": [1, text_encoder_max_length], + "opt_shape": [1, text_encoder_max_length], + } + } + + vae_encoder_dynamic_shape = { + "sample": { + "min_shape": [1, 3, min_image_size, min_image_size], + "max_shape": [1, 3, max_image_size, max_image_size], + "opt_shape": [1, 3, min_image_size, min_image_size], + } + } + + vae_decoder_dynamic_shape = { + "latent_sample": { + "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], + "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + } + } + + unet_dynamic_shape = { + "sample": { + "min_shape": [ + 1, + unet_in_channels, + min_image_size // 8, + min_image_size // 8, + ], + "max_shape": [ + bs, + unet_in_channels, + max_image_size // 8, + max_image_size // 8, + ], + "opt_shape": [ + 2, + unet_in_channels, + min_image_size // 8, + min_image_size // 8, + ], + }, + "timestep": { + "min_shape": [1], + "max_shape": [1], + "opt_shape": [1], + }, + "encoder_hidden_states": { + "min_shape": [1, text_encoder_max_length, hidden_states], + "max_shape": [bs, unet_max_length, hidden_states], + "opt_shape": [2, text_encoder_max_length, hidden_states], + }, + "text_embeds": { + "min_shape": [1, 1280], + "max_shape": [bs, 1280], + "opt_shape": [2, 1280], + }, + "time_ids": { + "min_shape": [1, 6], + "max_shape": [bs, 6], + "opt_shape": [2, 6], + }, + } + # 4. Init runtime + disable_paddle_pass=['auto_mixed_precision_pass'] + infer_configs = dict( + text_encoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + use_trt=False, + model_name="text_encoder", + dynamic_shape=text_encoder_dynamic_shape, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=7, + disable_paddle_trt_ops=["range", "lookup_table_v2"], + tune=False), + text_encoder_2=create_paddle_inference_runtime( + model_dir=args.model_dir, + use_trt=False, + model_name="text_encoder_2", + dynamic_shape=text_encoder_dynamic_shape, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=7, + disable_paddle_trt_ops=["range", "lookup_table_v2"], + tune=False + ), + vae_encoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="vae_encoder", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=7, + tune=False + ), + vae_decoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="vae_decoder", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=7, + disable_paddle_pass=disable_paddle_pass, + tune=False + ), + unet=create_paddle_inference_runtime( + model_dir=args.model_dir, + model_name="unet", + use_trt=False, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=7, + tune=False + ), + ) + pipe = PaddleInferStableDiffusionXLPipeline.from_pretrained( + args.model_dir, + infer_configs=infer_configs, + ) + pipe.set_progress_bar_config(disable=True) + # pipe.change_scheduler(args.scheduler) + parse_prompt_type = args.parse_prompt_type + width = args.width + height = args.height + + if args.infer_op == "all": + infer_op_list = ["zero_copy_infer", "raw"] + else: + infer_op_list = [args.infer_op] + if args.device == "kunlunxin_xpu" or args.backend == "paddle": + print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.") + 
infer_op_list = ["raw"] + + for infer_op in infer_op_list: + infer_op_dict = { + "vae_encoder": infer_op, + "vae_decoder": infer_op, + "text_encoder": infer_op, + "unet": infer_op, + } + folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + # text2img + prompt = "beautiful scenery nature glass bottle landscape, purple galaxy bottle" + time_costs = [] + negative_prompt = "text, watermark" + # warmup + # pipe( + # prompt, + # num_inference_steps=20, + # height=height, + # width=width, + # # parse_prompt_type=parse_prompt_type, + # # infer_op_dict=infer_op_dict, + # negative_prompt=negative_prompt + + # ) + print("==> Test text2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe( + prompt, + output_type="pil", + num_inference_steps=args.inference_steps, + height=height, + width=width, + # parse_prompt_type=parse_prompt_type, + # infer_op_dict=infer_op_dict, + negative_prompt=negative_prompt + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + images[0].save(f"{folder}/text2img___1.png") + + if args.task_name in ["img2img", "all"]: + # img2img + img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png" + init_image = load_image(img_url) + prompt = "A fantasy landscape, trending on artstation" + time_costs = [] + # warmup + pipe.img2img( + prompt, + image=init_image, + num_inference_steps=20, + height=height, + width=width, + # parse_prompt_type=parse_prompt_type, + infer_op_dict=infer_op_dict, + ) + print("==> Test img2img performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe.img2img( + prompt, + image=init_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + parse_prompt_type=parse_prompt_type, + infer_op_dict=infer_op_dict, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." 
+ ) + images[0].save(f"{folder}/img2img.png") + + if args.task_name in ["inpaint", "all"]: + img_url = ( + "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png" + ) + mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png" + init_image = load_image(img_url) + mask_image = load_image(mask_url) + prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + time_costs = [] + pipe.inpaint( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=20, + height=height, + width=width, + parse_prompt_type=parse_prompt_type, + infer_op_dict=infer_op_dict, + ) + print("==> Test inpaint performance.") + for step in trange(args.benchmark_steps): + start = time.time() + paddle.seed(seed) + images = pipe.inpaint( + prompt, + image=init_image, + mask_image=mask_image, + num_inference_steps=args.inference_steps, + height=height, + width=width, + parse_prompt_type=parse_prompt_type, + infer_op_dict=infer_op_dict, + ).images + latency = time.time() - start + time_costs += [latency] + # print(f"No {step:3d} time cost: {latency:2f} s") + print( + f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, " + f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s." + ) + + images[0].save(f"{folder}/inpaint.png") + + +if __name__ == "__main__": + seed=2024 + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + args = parse_arguments() + main(args) diff --git a/ppdiffusers/deploy/sdxl/paddle_stable_diffusion_xl_housing.py b/ppdiffusers/deploy/sdxl/paddle_stable_diffusion_xl_housing.py new file mode 100644 index 000000000..fa3806941 --- /dev/null +++ b/ppdiffusers/deploy/sdxl/paddle_stable_diffusion_xl_housing.py @@ -0,0 +1,90 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddlenlp.transformers import CLIPTokenizer + +from ppdiffusers.pipelines.pipeline_utils import DiffusionPipeline +from ppdiffusers.pipelines.stable_diffusion_xl.paddleinferxl_utils import ( + PaddleInferDiffusionXLPipelineMixin, + PaddleInferModel +) +from ppdiffusers.schedulers import KarrasDiffusionSchedulers +from ppdiffusers.utils import logging + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class PaddleInferStableDiffusionXLPipelineHousing(DiffusionPipeline, PaddleInferDiffusionXLPipelineMixin): + r""" + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving etc.) + + Args: + vae_encoder ([`FastDeployRuntimeModel`]): + Variational Auto-Encoder (VAE) Model to encode images to latent representations. 
+ vae_decoder ([`FastDeployRuntimeModel`]): + Variational Auto-Encoder (VAE) Model to decode images from latent representations. + text_encoder ([`FastDeployRuntimeModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` FastDeployRuntimeModel`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`FastDeployRuntimeModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + _optional_components = [ + "vae_encoder", + ] + + def __init__( + self, + vae_encoder: PaddleInferModel, + vae_decoder: PaddleInferModel, + text_encoder: PaddleInferModel, + text_encoder_2: PaddleInferModel, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: PaddleInferModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.post_init() diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py index 9e8cff73e..2c8c6d3a2 100644 --- a/ppdiffusers/ppdiffusers/__init__.py +++ b/ppdiffusers/ppdiffusers/__init__.py @@ -55,6 +55,7 @@ except OptionalDependencyNotAvailable: from .utils.dummy_paddle_objects import * # noqa F403 else: + from .pipelines import PaddleInferModel from .models import ( AsymmetricAutoencoderKL, AutoencoderKL, @@ -274,6 +275,7 @@ FastDeployStableDiffusionXLImg2ImgPipeline, FastDeployStableDiffusionXLInpaintPipeline, FastDeployStableDiffusionXLMegaPipeline, + PaddleInferStableDiffusionXLPipeline, ) try: diff --git a/ppdiffusers/ppdiffusers/pipelines/__init__.py b/ppdiffusers/ppdiffusers/pipelines/__init__.py index f9adf7de4..6f0eb264b 100644 --- a/ppdiffusers/ppdiffusers/pipelines/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/__init__.py @@ -174,6 +174,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_paddle_and_paddlenlp_and_fastdeploy_objects import * # noqa F403 else: + from .paddle_infer_utils import PaddleInferModel from .controlnet import FastDeployStableDiffusionControlNetPipeline from .stable_diffusion import ( FastDeployCycleDiffusionPipeline, @@ -190,6 +191,7 @@ FastDeployStableDiffusionXLImg2ImgPipeline, FastDeployStableDiffusionXLInpaintPipeline, 
FastDeployStableDiffusionXLMegaPipeline, + PaddleInferStableDiffusionXLPipeline, ) try: diff --git a/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py b/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py new file mode 100644 index 000000000..fc58b759b --- /dev/null +++ b/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py @@ -0,0 +1,1316 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +import re +import shutil +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np + +from ..image_processor import VaeImageProcessor +from ..schedulers import ( + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + UniPCMultistepScheduler, +) +from ..utils import ( + DIFFUSERS_CACHE, + PADDLE_INFER_MODEL_NAME, + PADDLE_INFER_WEIGHTS_NAME, + FROM_HF_HUB, + HF_HUB_OFFLINE, + PPDIFFUSERS_CACHE, + _add_variant, + _get_model_file, + is_paddle_available, + logging, + randn_tensor, +) +from ..version import VERSION as __version__ + +__all__ = ["PaddleInferModel", "PaddleInferModelDiffusionPipelineMixin"] + +if is_paddle_available(): + import paddle + import paddle.inference as paddle_infer + + +logger = logging.get_logger(__name__) + +re_attention = re.compile( + r""" +\\\(| +\\\)| +\\\[| +\\]| +\\\\| +\\| +\(| +\[| +:([+-]?[.\d]+)\)| +\)| +]| +[^\\()\[\]:]+| +: +""", + re.X, +) + + + + + + + +def parse_prompt_attention(text): + r""" + Parses a string with attention tokens and returns a list of pairs: text and its associated weight. 
+ Accepted tokens are: + (abc) - increases attention to abc by a multiplier of 1.1 + (abc:3.12) - increases attention to abc by a multiplier of 3.12 + [abc] - decreases attention to abc by a multiplier of 1.1 + \( - literal character '(' + \[ - literal character '[' + \) - literal character ')' + \] - literal character ']' + \\ - literal character '\' + anything else - just text + >>> parse_prompt_attention('normal text') + [['normal text', 1.0]] + >>> parse_prompt_attention('an (important) word') + [['an ', 1.0], ['important', 1.1], [' word', 1.0]] + >>> parse_prompt_attention('(unbalanced') + [['unbalanced', 1.1]] + >>> parse_prompt_attention('\(literal\]') + [['(literal]', 1.0]] + >>> parse_prompt_attention('(unnecessary)(parens)') + [['unnecessaryparens', 1.1]] + >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') + [['a ', 1.0], + ['house', 1.5730000000000004], + [' ', 1.1], + ['on', 1.0], + [' a ', 1.1], + ['hill', 0.55], + [', sun, ', 1.1], + ['sky', 1.4641000000000006], + ['.', 1.1]] + """ + + res = [] + round_brackets = [] + square_brackets = [] + + round_bracket_multiplier = 1.1 + square_bracket_multiplier = 1 / 1.1 + + def multiply_range(start_position, multiplier): + for p in range(start_position, len(res)): + res[p][1] *= multiplier + + for m in re_attention.finditer(text): + text = m.group(0) + weight = m.group(1) + + if text.startswith("\\"): + res.append([text[1:], 1.0]) + elif text == "(": + round_brackets.append(len(res)) + elif text == "[": + square_brackets.append(len(res)) + elif weight is not None and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), float(weight)) + elif text == ")" and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), round_bracket_multiplier) + elif text == "]" and len(square_brackets) > 0: + multiply_range(square_brackets.pop(), square_bracket_multiplier) + else: + res.append([text, 1.0]) + + for pos in round_brackets: + multiply_range(pos, round_bracket_multiplier) + + for pos in square_brackets: + multiply_range(pos, square_bracket_multiplier) + + if len(res) == 0: + res = [["", 1.0]] + + # merge runs of identical weights + i = 0 + while i + 1 < len(res): + if res[i][1] == res[i + 1][1]: + res[i][0] += res[i + 1][0] + res.pop(i + 1) + else: + i += 1 + + return res + + +def get_prompts_with_weights(pipe, prompt: List[str], max_length: int): + r""" + Tokenize a list of prompts and return its tokens with weights of each token. + No padding, starting or ending token is included. + """ + tokens = [] + weights = [] + truncated = False + for text in prompt: + texts_and_weights = parse_prompt_attention(text) + text_token = [] + text_weight = [] + for word, weight in texts_and_weights: + # tokenize and discard the starting and the ending token + token = pipe.tokenizer(word).input_ids[1:-1] + text_token += token + # copy the weight by length of token + text_weight += [weight] * len(token) + # stop if the text is too long (longer than truncation limit) + if len(text_token) > max_length: + truncated = True + break + # truncate + if len(text_token) > max_length: + truncated = True + text_token = text_token[:max_length] + text_weight = text_weight[:max_length] + tokens.append(text_token) + weights.append(text_weight) + if truncated: + logger.warning("Prompt was truncated. 
Try to shorten the prompt or increase max_embeddings_multiples") + return tokens, weights + + +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): + r""" + Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. + """ + max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length + for i in range(len(tokens)): + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) + if no_boseos_middle: + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) + else: + w = [] + if len(weights[i]) == 0: + w = [1.0] * weights_length + else: + for j in range(max_embeddings_multiples): + w.append(1.0) # weight for starting token in this chunk + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] + w.append(1.0) # weight for ending token in this chunk + w += [1.0] * (weights_length - len(w)) + weights[i] = w[:] + # we must to tensor first! + return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32") + + +def get_unweighted_text_embeddings( + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, + infer_op=None, +): + """ + When the length of tokens is a multiple of the capacity of the text encoder, + it should be split into chunks and sent to the text encoder individually. + """ + max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) + + if max_embeddings_multiples > 1: + text_embeddings = [] + for i in range(max_embeddings_multiples): + # extract the i-th chunk + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() + + # cover the head and the tail by the starting and the ending tokens + text_input_chunk[:, 0] = text_input[0, 0] + text_input_chunk[:, -1] = text_input[0, -1] + text_embedding = pipe.text_encoder( + input_ids=text_input_chunk, + )[0] + if no_boseos_middle: + if i == 0: + # discard the ending token + text_embedding = text_embedding[:, :-1] + elif i == max_embeddings_multiples - 1: + # discard the starting token + text_embedding = text_embedding[:, 1:] + else: + # discard both starting and ending tokens + text_embedding = text_embedding[:, 1:-1] + + text_embeddings.append(text_embedding) + text_embeddings = paddle.concat(text_embeddings, axis=1) + else: + text_embeddings = pipe.text_encoder( + input_ids=text_input, + )[0] + return text_embeddings + + +def get_weighted_text_embeddings( + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + infer_op=None, + **kwargs, +): + r""" + Prompts can be assigned with local weights using brackets. For example, + prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', + and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. + Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. + Args: + pipe (`DiffusionPipeline`): + Pipe to provide access to the tokenizer and the text encoder. + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. 
+ uncond_prompt (`str` or `List[str]`): + The unconditional prompt or prompts for guide the image generation. If unconditional prompt + is provided, the embeddings of prompt and uncond_prompt are concatenated. + max_embeddings_multiples (`int`, *optional*, defaults to `1`): + The max multiple length of prompt embeddings compared to the max output length of text encoder. + no_boseos_middle (`bool`, *optional*, defaults to `False`): + If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and + ending token in each of the chunk in the middle. + skip_parsing (`bool`, *optional*, defaults to `False`): + Skip the parsing of brackets. + skip_weighting (`bool`, *optional*, defaults to `False`): + Skip the weighting. When the parsing is skipped, it is forced True. + """ + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 + if isinstance(prompt, str): + prompt = [prompt] + + if not skip_parsing: + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) + if uncond_prompt is not None: + if isinstance(uncond_prompt, str): + uncond_prompt = [uncond_prompt] + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) + else: + prompt_tokens = [ + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids + ] + prompt_weights = [[1.0] * len(token) for token in prompt_tokens] + if uncond_prompt is not None: + if isinstance(uncond_prompt, str): + uncond_prompt = [uncond_prompt] + uncond_tokens = [ + token[1:-1] + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids + ] + uncond_weights = [[1.0] * len(token) for token in uncond_tokens] + + # round up the longest length of tokens to a multiple of (model_max_length - 2) + max_length = max([len(token) for token in prompt_tokens]) + if uncond_prompt is not None: + max_length = max(max_length, max([len(token) for token in uncond_tokens])) + + max_embeddings_multiples = min( + max_embeddings_multiples, + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) + max_embeddings_multiples = max(1, max_embeddings_multiples) + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 + + # pad the length of tokens and weights + # support bert tokenizer + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id + pad = pipe.tokenizer.pad_token_id + + prompt_tokens, prompt_weights = pad_tokens_and_weights( + prompt_tokens, + prompt_weights, + max_length, + bos, + eos, + pad, + no_boseos_middle=no_boseos_middle, + chunk_length=pipe.tokenizer.model_max_length, + ) + if uncond_prompt is not None: + uncond_tokens, uncond_weights = pad_tokens_and_weights( + uncond_tokens, + uncond_weights, + max_length, + bos, + eos, + pad, + no_boseos_middle=no_boseos_middle, + chunk_length=pipe.tokenizer.model_max_length, + ) + # get the embeddings + text_embeddings = get_unweighted_text_embeddings( + pipe, + prompt_tokens, + pipe.tokenizer.model_max_length, + no_boseos_middle=no_boseos_middle, + infer_op=infer_op, + ) + if uncond_prompt is not None: + uncond_embeddings = get_unweighted_text_embeddings( + pipe, + uncond_tokens, + pipe.tokenizer.model_max_length, + no_boseos_middle=no_boseos_middle, + infer_op=infer_op, + ) + # assign weights to the prompts and normalize 
in the sense of mean + # TODO: should we normalize by chunk or in a whole (current implementation)? + if (not skip_parsing) and (not skip_weighting): + previous_mean = text_embeddings.mean(axis=[-2, -1]) + text_embeddings *= prompt_weights.unsqueeze(-1) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) + if uncond_prompt is not None: + previous_mean = uncond_embeddings.mean(axis=[-2, -1]) + uncond_embeddings *= uncond_weights.unsqueeze(-1) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) + + if uncond_prompt is not None: + return text_embeddings, uncond_embeddings + return text_embeddings, None + + + +class PaddleInferDiffusionPipelineMixin: + + def post_init(self, vae_scaling_factor=0.18215, vae_scale_factor=8, dtype="float32"): + self.vae_scaling_factor = vae_scaling_factor + self.vae_scale_factor = vae_scale_factor + + self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.dtype = dtype + self.supported_scheduler = [ + "pndm", + "lms", + "preconfig-lms", + "euler", + "euler-ancestral", + "preconfig-euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ] + self.orginal_scheduler_config = self.scheduler.config + + @property + def vae_encoder_num_channels(self): + if self.vae_encoder is None: + return 3 + return self.vae_encoder.model.get_input_info(0).shape[1] + + @property + def vae_decoder_num_latent_channels(self): + if self.vae_decoder is None: + return 4 + return self.vae_decoder.model.get_input_info(0).shape[1] + + @property + def unet_num_latent_channels(self): + return self.unet.model.get_input_info(0).shape[1] + + @property + def unet_hidden_states_dim(self): + return self.unet.model.get_input_info(2).shape[2] + + @property + def text_encoder_hidden_states_dim(self): + if not hasattr(self, "text_encoder") or self.text_encoder is None: + return 768 + return self.text_encoder.model.get_output_info(0).shape[2] + + def change_scheduler(self, scheduler_type="ddim"): + scheduler_type = scheduler_type.lower() + if scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "preconfig-lms": + scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "preconfig-euler-ancestral": + scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = 
KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError( + f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" + ) + self.scheduler = scheduler + + def get_timesteps(self, num_inference_steps, strength=1.0): + if strength >= 1: + return self.scheduler.timesteps, num_inference_steps + + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + if hasattr(self.scheduler, "step_index_offset"): + self.scheduler.step_index_offset = t_start * self.scheduler.order + + num_inference_steps = num_inference_steps - t_start + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + + return timesteps, num_inference_steps + + def prepare_controlnet_cond( + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance=False, + ): + control_image = self.control_image_processor.preprocess( + controlnet_cond, + height=height, + width=width, + ) + if isinstance(controlnet_conditioning_scale, (float, int)): + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype) + elif isinstance(controlnet_conditioning_scale, (list, tuple)): + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype) + else: + raise ValueError( + f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" + ) + assert controlnet_conditioning_scale.shape[0] == 13 + image_batch_size = control_image.shape[0] + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + control_image = control_image.repeat_interleave(repeat_by, axis=0) + if do_classifier_free_guidance: + control_image = paddle.concat([control_image] * 2) + return control_image, controlnet_conditioning_scale + + def check_inputs( + self, + prompt, + height=512, + width=512, + callback_steps=1, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + strength=1.0, + ): + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: + raise ValueError( + f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." 
+ ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + def prepare_latents( + self, + batch_size, + height, + width, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + infer_op=None, + ): + shape = [ + batch_size, + self.vae_decoder_num_latent_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ] + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.cast(dtype=self.dtype) + image_latents = self._encode_vae_image(image, infer_op) + + if latents is None: + noise = randn_tensor(shape, generator=generator, dtype=self.dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents + if str(noise.dtype).replace("paddle.", "") != self.dtype: + noise = noise.cast(self.dtype) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + if len(outputs) == 1: + outputs = latents + return outputs + + def prepare_mask_latents( + self, + mask, + masked_image, + batch_size, + height, + width, + do_classifier_free_guidance, + return_masked_image_latents=True, + infer_op=None, + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = paddle.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.cast(dtype=self.dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) + + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + if not return_masked_image_latents: + return mask + + masked_image = masked_image.cast(dtype=self.dtype) + masked_image_latents = self._encode_vae_image(masked_image, infer_op) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) + + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.cast(dtype=self.dtype) + return mask, masked_image_latents + + def is_scheduler_support_step_index(self): + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) + return "kwargs" in kwargs_keys or "step_index" in kwargs_keys + + def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): + image_shape = image.shape + image_latents = self.vae_encoder( + sample=image, + )[0] + + return self.vae_scaling_factor * image_latents + + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): + latents_shape = latents.shape + images_vae = self.vae_decoder( + latent_sample=latents, + )[0] + + return images_vae + + def _encode_prompt( + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): + if parse_prompt_type == "lpw": + return self._encode_prompt_lpw( + prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_embeddings_multiples=max_embeddings_multiples, + infer_op=infer_op, + **kwargs, + ) + elif parse_prompt_type == "raw": + return self._encode_prompt_raw( + prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + infer_op=infer_op, + ) + elif parse_prompt_type == "webui": + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") + + def _encode_prompt_lpw( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `list(int)`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + max_embeddings_multiples (`int`, *optional*, defaults to `3`): + The max multiple length of prompt embeddings compared to the max output length of text encoder. 
+ """ + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None and negative_prompt_embeds is None: + uncond_tokens: List[str] = None + if do_classifier_free_guidance: + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( + pipe=self, + prompt=prompt, + uncond_prompt=uncond_tokens, + max_embeddings_multiples=max_embeddings_multiples, + infer_op="raw", # NOTE: we can't use zero copy! + **kwargs, + ) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + return prompt_embeds + + def _encode_prompt_raw( + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. 
If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder( + input_ids=text_input_ids, + )[0] + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + if negative_prompt is None: + uncond_tokens = [""] + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pd", + ) + negative_prompt_embeds = self.text_encoder( + input_ids=uncond_input.input_ids, + )[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + + # For classifier free guidance, we need to do two forward passes. 
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if paddle.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np")
+            image, has_nsfw_concept = self.safety_checker(
+                images=image.numpy(),
+                clip_input=safety_checker_input.pixel_values.astype(self.dtype),
+                infer_op="raw",
+            )
+            image = paddle.to_tensor(image, dtype=self.dtype)
+        return image, has_nsfw_concept
+
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+
+class PaddleInferModel:
+    def __init__(self, model=None, **kwargs):
+        logger.info("ppdiffusers.PaddleInferModel")
+        self.model = model
+        self.model_save_dir = kwargs.get("model_save_dir", None)
+        self.latest_model_name = kwargs.get("latest_model_name", None)
+        self.latest_params_name = kwargs.get("latest_params_name", None)
+        if self.latest_model_name is None:
+            self.latest_model_name = PADDLE_INFER_MODEL_NAME
+        if self.latest_params_name is None:
+            self.latest_params_name = PADDLE_INFER_WEIGHTS_NAME
+
+    def __call__(self, **kwargs):
+        inputs = {}
+        for k, v in kwargs.items():
+            if k == "timestep":
+                if v.ndim == 0:
+                    # fix 0D tensor error
+                    v = v.reshape((1,))
+                # fix dtype error
+                v = v.astype("float32")
+            inputs[k] = v
+        input_names = self.model.get_input_names()
+        for i, name in enumerate(input_names):
+            input_tensor = self.model.get_input_handle(name)
+            if name not in inputs:
+                raise ValueError(f"Model input `{name}` was not provided.")
+            input_tensor.reshape(inputs[name].shape)
+            input_tensor.copy_from_cpu(inputs[name].numpy())
+        # do the inference
+        self.model.run()
+        results = []
+        # get out data from output tensor
+        output_names = self.model.get_output_names()
+        for i, name in enumerate(output_names):
+            output_tensor = self.model.get_output_handle(name)
+            output_data = output_tensor.copy_to_cpu()
+            results.append(paddle.to_tensor(output_data))
+        return results
+
+    @staticmethod
+    def load_model(
+        model_path: Union[str, Path],
+        params_path: Union[str, Path] = None,
+        infer_config: Optional["paddle_infer.Config"] = None,
+    ):
+        """
+        Loads a Paddle Inference model and creates a predictor from it.
+
+        Arguments:
+            model_path (`str` or `Path`):
+                Model file path from which to load.
+            params_path (`str` or `Path`):
+                Params file path from which to load.
+            infer_config (`paddle_infer.Config`, *optional*):
+                The Paddle Inference config used to create the predictor. Defaults to a plain
+                config that runs with the Paddle Inference backend on CPU.
+        """
+        if infer_config is None:
+            infer_config = paddle_infer.Config()
+        infer_config.set_prog_file(model_path)
+        infer_config.set_params_file(params_path)
+        return paddle_infer.create_predictor(infer_config)
+
+    def _save_pretrained(
+        self,
+        save_directory: Union[str, Path],
+        model_file_name: Optional[str] = None,
+        params_file_name: Optional[str] = None,
+        **kwargs
+    ):
+        """
+        Save a model and its configuration file to a directory, so that it can be re-loaded using the
+        [`~PaddleInferModel.from_pretrained`] class method. It will always save the
+        latest_model_name.
+
+        Arguments:
+            save_directory (`str` or `Path`):
+                Directory where to save the model file.
+            model_file_name(`str`, *optional*):
+                Overwrites the default model file name from `"inference.pdmodel"` to `model_file_name`. This allows you to save the
+                model with a different name.
+            params_file_name(`str`, *optional*):
+                Overwrites the default params file name from `"inference.pdiparams"` to `params_file_name`. This allows you to save the
+                params with a different name.
+        """
+        model_file_name = model_file_name if model_file_name is not None else PADDLE_INFER_MODEL_NAME
+        params_file_name = params_file_name if params_file_name is not None else PADDLE_INFER_WEIGHTS_NAME
+
+        src_model_path = self.model_save_dir.joinpath(self.latest_model_name)
+        dst_model_path = Path(save_directory).joinpath(model_file_name)
+
+        try:
+            shutil.copyfile(src_model_path, dst_model_path)
+        except shutil.SameFileError:
+            pass
+
+        # also copy the params file, otherwise the saved model cannot be re-loaded
+        src_params_path = self.model_save_dir.joinpath(self.latest_params_name)
+        dst_params_path = Path(save_directory).joinpath(params_file_name)
+
+        try:
+            shutil.copyfile(src_params_path, dst_params_path)
+        except shutil.SameFileError:
+            pass
+
+    def save_pretrained(
+        self,
+        save_directory: Union[str, os.PathLike],
+        **kwargs,
+    ):
+        """
+        Save a model to a directory, so that it can be re-loaded using the [`~PaddleInferModel.from_pretrained`] class
+        method.
+
+        Arguments:
+            save_directory (`str` or `os.PathLike`):
+                Directory to which to save. Will be created if it doesn't exist.
+        """
+        if os.path.isfile(save_directory):
+            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+            return
+
+        os.makedirs(save_directory, exist_ok=True)
+
+        # saving model weights/files
+        self._save_pretrained(save_directory, **kwargs)
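+    # Illustrative usage sketch (kept as a comment, not executed): loading a predictor that
+    # was exported by deploy/sdxl/export_model.py and tuning it for GPU. The directory name,
+    # the GPU settings and the input names are placeholders; the actual input names depend on
+    # the exported static graph.
+    #
+    #     import paddle.inference as paddle_infer
+    #
+    #     config = paddle_infer.Config()
+    #     config.enable_use_gpu(2000, 0)   # workspace size in MB, GPU id
+    #     config.enable_memory_optim()
+    #     unet = PaddleInferModel.from_pretrained("static_model/unet", infer_configs=config)
+    #     noise_pred = unet(sample=latents, timestep=t, encoder_hidden_states=prompt_embeds)[0]
+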
+    @classmethod
+    def _from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, Path],
+        model_file_name: Optional[str] = None,
+        params_file_name: Optional[str] = None,
+        use_auth_token: Optional[Union[bool, str, None]] = None,
+        revision: Optional[str] = None,
+        subfolder: Optional[str] = None,
+        force_download: bool = False,
+        cache_dir: Optional[str] = None,
+        infer_config: Optional["paddle_infer.Config"] = None,
+        from_hf_hub: Optional[bool] = False,
+        proxies: Optional[Dict] = None,
+        resume_download: bool = False,
+        local_files_only: bool = False,
+        user_agent: Union[Dict, str, None] = None,
+        is_onnx_model: bool = False,
+        **kwargs,
+    ):
+        """
+        Load a model from a directory or the HF Hub.
+
+        Arguments:
+            pretrained_model_name_or_path (`str` or `Path`):
+                Directory from which to load.
+            model_file_name (`str`):
+                Overwrites the default model file name from `"inference.pdmodel"` to `file_name`. This allows you to load
+                different model files from the same repository or directory.
+            params_file_name (`str`):
+                Overwrites the default params file name from `"inference.pdiparams"` to `file_name`. This allows you to load
+                different model files from the same repository or directory.
+            use_auth_token (`str` or `bool`):
+                Required to load models from a private or gated repository.
+            revision (`str`):
+                Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id.
+            cache_dir (`Union[str, Path]`, *optional*):
+                Path to a directory in which a downloaded pretrained model configuration should be cached if the
+                standard cache should not be used.
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            infer_config (`paddle_infer.Config`, *optional*):
+                The Paddle Inference config used to create the predictor.
+            subfolder (`str`, *optional*, defaults to `""`):
+                In case the relevant files are located inside a subfolder of the model repo (either remote in
+                huggingface.co or downloaded locally), you can specify the folder name here.
+            kwargs (`Dict`, *optional*):
+                kwargs will be passed to the model during initialization.
+        """
+
+        model_file_name = model_file_name if model_file_name is not None else PADDLE_INFER_MODEL_NAME
+        params_file_name = params_file_name if params_file_name is not None else PADDLE_INFER_WEIGHTS_NAME
+        # load model from local directory
+        if os.path.isdir(pretrained_model_name_or_path):
+            model_path = os.path.join(pretrained_model_name_or_path, model_file_name)
+            params_path = os.path.join(pretrained_model_name_or_path, params_file_name)
+            model = PaddleInferModel.load_model(
+                model_path,
+                params_path,
+                infer_config=infer_config,
+            )
+            kwargs["model_save_dir"] = Path(pretrained_model_name_or_path)
+        # load model from hub or paddle bos
+        else:
+            model_cache_path = _get_model_file(
+                pretrained_model_name_or_path=pretrained_model_name_or_path,
+                weights_name=model_file_name,
+                subfolder=subfolder,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                revision=revision,
+                from_hf_hub=from_hf_hub,
+                proxies=proxies,
+                resume_download=resume_download,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                user_agent=user_agent,
+            )
+
+            params_cache_path = _get_model_file(
+                pretrained_model_name_or_path=pretrained_model_name_or_path,
+                weights_name=params_file_name,
+                subfolder=subfolder,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                revision=revision,
+                from_hf_hub=from_hf_hub,
+                proxies=proxies,
+                resume_download=resume_download,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                user_agent=user_agent,
+            )
+            kwargs["latest_params_name"] = Path(params_cache_path).name
+            kwargs["model_save_dir"] = Path(model_cache_path).parent
+            kwargs["latest_model_name"] = Path(model_cache_path).name
+
+            model = PaddleInferModel.load_model(
+                model_cache_path,
+                params_cache_path,
+                infer_config=infer_config,
+            )
+        return cls(model=model, **kwargs)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, Path],
+        model_file_name: Optional[str] = None,
+        params_file_name: Optional[str] = None,
+        infer_configs: Optional["paddle_infer.Config"] = None,
+        **kwargs,
+    ):
+        from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
+        cache_dir = (
+            kwargs.pop("cache_dir", DIFFUSERS_CACHE) if from_hf_hub else kwargs.pop("cache_dir", PPDIFFUSERS_CACHE)
+        )
+        force_download = kwargs.pop("force_download", False)
+        resume_download = kwargs.pop("resume_download", False)
+        proxies = kwargs.pop("proxies", None)
+        local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE)
+        use_auth_token = kwargs.pop("use_auth_token", None)
revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + variant = kwargs.pop("variant", None) + + user_agent = { + "ppdiffusers": __version__, + "file_type": "model", + "framework": "paddleinfer", + } + + return cls._from_pretrained( + pretrained_model_name_or_path=pretrained_model_name_or_path, + model_file_name=_add_variant(model_file_name, variant), + params_file_name=_add_variant(params_file_name, variant), + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + force_download=force_download, + cache_dir=cache_dir, + infer_config=infer_configs, + from_hf_hub=from_hf_hub, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + user_agent=user_agent, + **kwargs, + ) + + + + + diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py index a92daacee..e055dd817 100644 --- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py +++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py @@ -81,6 +81,7 @@ from paddlenlp.transformers import PretrainedModel from .fastdeploy_utils import FastDeployRuntimeModel +from .paddle_infer_utils import PaddleInferModel TRANSFORMERS_SAFE_WEIGHTS_NAME = "model.safetensors" TRANSFORMERS_WEIGHTS_NAME = "pytorch_model.bin" @@ -102,6 +103,7 @@ "SchedulerMixin": ["save_pretrained", "from_pretrained"], "DiffusionPipeline": ["save_pretrained", "from_pretrained"], "FastDeployRuntimeModel": ["save_pretrained", "from_pretrained"], + "PaddleInferModel": ["save_pretrained", "from_pretrained"], }, "paddlenlp.transformers": { "PretrainedTokenizer": ["save_pretrained", "from_pretrained"], @@ -371,6 +373,7 @@ def load_sub_model( pipeline_class: Any, paddle_dtype: paddle.dtype, runtime_options: Any, + infer_configs: Any, model_variants: Dict[str, str], name: str, from_diffusers: bool, @@ -431,6 +434,10 @@ def load_sub_model( ) loading_kwargs["is_onnx_model"] = is_onnx_model + if issubclass(class_obj, PaddleInferModel): + loading_kwargs["infer_configs"] = infer_configs.get(name, None) if isinstance(infer_configs, dict) else infer_configs + + from ppdiffusers import ModelMixin # PaddleNLP or PPDiffusers Model @@ -444,6 +451,7 @@ def load_sub_model( try: # check if the module is in a subdirectory if os.path.isdir(os.path.join(cached_folder, name)): + # import pdb; pdb.set_trace() loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) else: # else load from the root directory @@ -462,7 +470,6 @@ def load_sub_model( ) if loaded_sub_model is None: raise ValueError(f"We cant load '{name}' from {pretrained_model_name_or_path} or {cached_folder}! 
\n {e} ") - return loaded_sub_model @@ -930,6 +937,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P custom_pipeline = kwargs.pop("custom_pipeline", None) custom_revision = kwargs.pop("custom_revision", None) runtime_options = kwargs.pop("runtime_options", None) + infer_configs = kwargs.pop("infer_configs", None) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT) use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) variant = kwargs.pop("variant", None) @@ -1109,6 +1117,7 @@ def load_module(name, value): pipeline_class=pipeline_class, paddle_dtype=paddle_dtype, runtime_options=runtime_options, + infer_configs=infer_configs, model_variants=model_variants, name=name, from_diffusers=from_diffusers, diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py index 9bef32127..0c30d8022 100644 --- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/__init__.py @@ -55,4 +55,7 @@ class StableDiffusionXLPipelineOutput(BaseOutput): from .pipeline_fastdeploy_stable_diffusion_xl import FastDeployStableDiffusionXLPipeline from .pipeline_fastdeploy_stable_diffusion_xl_img2img import FastDeployStableDiffusionXLImg2ImgPipeline from .pipeline_fastdeploy_stable_diffusion_xl_inpaint import FastDeployStableDiffusionXLInpaintPipeline - from .pipeline_fastdeploy_stable_diffusion_xl_mega import FastDeployStableDiffusionXLMegaPipeline \ No newline at end of file + from .pipeline_fastdeploy_stable_diffusion_xl_mega import FastDeployStableDiffusionXLMegaPipeline + + + from .pipeline_paddleinfer_stable_diffusion_xl import PaddleInferStableDiffusionXLPipeline diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/paddleinferxl_utils.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/paddleinferxl_utils.py new file mode 100644 index 000000000..8a849f05c --- /dev/null +++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/paddleinferxl_utils.py @@ -0,0 +1,1308 @@ +# coding=utf-8 +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +import re +from typing import List, Optional, Union + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...schedulers import ( + DDIMScheduler, + DDPMScheduler, + DEISMultistepScheduler, + DPMSolverMultistepScheduler, + DPMSolverSinglestepScheduler, + EulerAncestralDiscreteScheduler, + EulerDiscreteScheduler, + HeunDiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + KDPM2DiscreteScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + PreconfigEulerAncestralDiscreteScheduler, + PreconfigLMSDiscreteScheduler, + UniPCMultistepScheduler, +) +from ...utils import is_paddle_available, logging, randn_tensor +from ..paddle_infer_utils import PaddleInferModel + +__all__ = ["PaddleInferDiffusionXLPipelineMixin"] + +if is_paddle_available(): + import paddle + + +logger = logging.get_logger(__name__) + +re_attention = re.compile( + r""" +\\\(| +\\\)| +\\\[| +\\]| +\\\\| +\\| +\(| +\[| +:([+-]?[.\d]+)\)| +\)| +]| +[^\\()\[\]:]+| +: +""", + re.X, +) + + +def parse_prompt_attention(text): + r""" + Parses a string with attention tokens and returns a list of pairs: text and its associated weight. + Accepted tokens are: + (abc) - increases attention to abc by a multiplier of 1.1 + (abc:3.12) - increases attention to abc by a multiplier of 3.12 + [abc] - decreases attention to abc by a multiplier of 1.1 + \( - literal character '(' + \[ - literal character '[' + \) - literal character ')' + \] - literal character ']' + \\ - literal character '\' + anything else - just text + >>> parse_prompt_attention('normal text') + [['normal text', 1.0]] + >>> parse_prompt_attention('an (important) word') + [['an ', 1.0], ['important', 1.1], [' word', 1.0]] + >>> parse_prompt_attention('(unbalanced') + [['unbalanced', 1.1]] + >>> parse_prompt_attention('\(literal\]') + [['(literal]', 1.0]] + >>> parse_prompt_attention('(unnecessary)(parens)') + [['unnecessaryparens', 1.1]] + >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).') + [['a ', 1.0], + ['house', 1.5730000000000004], + [' ', 1.1], + ['on', 1.0], + [' a ', 1.1], + ['hill', 0.55], + [', sun, ', 1.1], + ['sky', 1.4641000000000006], + ['.', 1.1]] + """ + + res = [] + round_brackets = [] + square_brackets = [] + + round_bracket_multiplier = 1.1 + square_bracket_multiplier = 1 / 1.1 + + def multiply_range(start_position, multiplier): + for p in range(start_position, len(res)): + res[p][1] *= multiplier + + for m in re_attention.finditer(text): + text = m.group(0) + weight = m.group(1) + + if text.startswith("\\"): + res.append([text[1:], 1.0]) + elif text == "(": + round_brackets.append(len(res)) + elif text == "[": + square_brackets.append(len(res)) + elif weight is not None and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), float(weight)) + elif text == ")" and len(round_brackets) > 0: + multiply_range(round_brackets.pop(), round_bracket_multiplier) + elif text == "]" and len(square_brackets) > 0: + multiply_range(square_brackets.pop(), square_bracket_multiplier) + else: + res.append([text, 1.0]) + + for pos in round_brackets: + multiply_range(pos, round_bracket_multiplier) + + for pos in square_brackets: + multiply_range(pos, square_bracket_multiplier) + + if len(res) == 0: + res = [["", 1.0]] + + # merge runs of identical weights + i = 0 + while i + 1 < len(res): + if res[i][1] == res[i + 1][1]: + res[i][0] += res[i + 1][0] + res.pop(i + 1) + else: + i += 1 + + return res + + +def get_prompts_with_weights(pipe, prompt: 
List[str], max_length: int): + r""" + Tokenize a list of prompts and return its tokens with weights of each token. + No padding, starting or ending token is included. + """ + tokens = [] + weights = [] + truncated = False + for text in prompt: + texts_and_weights = parse_prompt_attention(text) + text_token = [] + text_weight = [] + for word, weight in texts_and_weights: + # tokenize and discard the starting and the ending token + token = pipe.tokenizer(word).input_ids[1:-1] + text_token += token + # copy the weight by length of token + text_weight += [weight] * len(token) + # stop if the text is too long (longer than truncation limit) + if len(text_token) > max_length: + truncated = True + break + # truncate + if len(text_token) > max_length: + truncated = True + text_token = text_token[:max_length] + text_weight = text_weight[:max_length] + tokens.append(text_token) + weights.append(text_weight) + if truncated: + logger.warning("Prompt was truncated. Try to shorten the prompt or increase max_embeddings_multiples") + return tokens, weights + + +def pad_tokens_and_weights(tokens, weights, max_length, bos, eos, pad, no_boseos_middle=True, chunk_length=77): + r""" + Pad the tokens (with starting and ending tokens) and weights (with 1.0) to max_length. + """ + max_embeddings_multiples = (max_length - 2) // (chunk_length - 2) + weights_length = max_length if no_boseos_middle else max_embeddings_multiples * chunk_length + for i in range(len(tokens)): + tokens[i] = [bos] + tokens[i] + [eos] + [pad] * (max_length - 2 - len(tokens[i])) + if no_boseos_middle: + weights[i] = [1.0] + weights[i] + [1.0] * (max_length - 1 - len(weights[i])) + else: + w = [] + if len(weights[i]) == 0: + w = [1.0] * weights_length + else: + for j in range(max_embeddings_multiples): + w.append(1.0) # weight for starting token in this chunk + w += weights[i][j * (chunk_length - 2) : min(len(weights[i]), (j + 1) * (chunk_length - 2))] + w.append(1.0) # weight for ending token in this chunk + w += [1.0] * (weights_length - len(w)) + weights[i] = w[:] + # we must to tensor first! + return paddle.to_tensor(tokens, dtype="int64"), paddle.to_tensor(weights, dtype="float32") + + +def get_unweighted_text_embeddings( + pipe, + text_input: paddle.Tensor, + chunk_length: int, + no_boseos_middle: Optional[bool] = True, + infer_op=None, +): + """ + When the length of tokens is a multiple of the capacity of the text encoder, + it should be split into chunks and sent to the text encoder individually. 
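+    Each chunk is framed with the prompt's starting and ending tokens before encoding; with
+    `no_boseos_middle=True` the duplicated boundary tokens of the middle chunks are dropped
+    again before the chunk embeddings are concatenated.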
+ """ + max_embeddings_multiples = (text_input.shape[1] - 2) // (chunk_length - 2) + + if max_embeddings_multiples > 1: + text_embeddings = [] + for i in range(max_embeddings_multiples): + # extract the i-th chunk + text_input_chunk = text_input[:, i * (chunk_length - 2) : (i + 1) * (chunk_length - 2) + 2].clone() + + # cover the head and the tail by the starting and the ending tokens + text_input_chunk[:, 0] = text_input[0, 0] + text_input_chunk[:, -1] = text_input[0, -1] + text_embedding = pipe.text_encoder( + input_ids=text_input_chunk, + )[0] + if no_boseos_middle: + if i == 0: + # discard the ending token + text_embedding = text_embedding[:, :-1] + elif i == max_embeddings_multiples - 1: + # discard the starting token + text_embedding = text_embedding[:, 1:] + else: + # discard both starting and ending tokens + text_embedding = text_embedding[:, 1:-1] + + text_embeddings.append(text_embedding) + text_embeddings = paddle.concat(text_embeddings, axis=1) + else: + text_embeddings = pipe.text_encoder( + input_ids=text_input, + )[0] + return text_embeddings + + +def get_weighted_text_embeddings( + pipe, + prompt: Union[str, List[str]], + uncond_prompt: Optional[Union[str, List[str]]] = None, + max_embeddings_multiples: Optional[int] = 1, + no_boseos_middle: Optional[bool] = False, + skip_parsing: Optional[bool] = False, + skip_weighting: Optional[bool] = False, + infer_op=None, + **kwargs, +): + r""" + Prompts can be assigned with local weights using brackets. For example, + prompt 'A (very beautiful) masterpiece' highlights the words 'very beautiful', + and the embedding tokens corresponding to the words get multiplied by a constant, 1.1. + Also, to regularize of the embedding, the weighted embedding would be scaled to preserve the original mean. + Args: + pipe (`DiffusionPipeline`): + Pipe to provide access to the tokenizer and the text encoder. + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + uncond_prompt (`str` or `List[str]`): + The unconditional prompt or prompts for guide the image generation. If unconditional prompt + is provided, the embeddings of prompt and uncond_prompt are concatenated. + max_embeddings_multiples (`int`, *optional*, defaults to `1`): + The max multiple length of prompt embeddings compared to the max output length of text encoder. + no_boseos_middle (`bool`, *optional*, defaults to `False`): + If the length of text token is multiples of the capacity of text encoder, whether reserve the starting and + ending token in each of the chunk in the middle. + skip_parsing (`bool`, *optional*, defaults to `False`): + Skip the parsing of brackets. + skip_weighting (`bool`, *optional*, defaults to `False`): + Skip the weighting. When the parsing is skipped, it is forced True. 
+ """ + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 + if isinstance(prompt, str): + prompt = [prompt] + + if not skip_parsing: + prompt_tokens, prompt_weights = get_prompts_with_weights(pipe, prompt, max_length - 2) + if uncond_prompt is not None: + if isinstance(uncond_prompt, str): + uncond_prompt = [uncond_prompt] + uncond_tokens, uncond_weights = get_prompts_with_weights(pipe, uncond_prompt, max_length - 2) + else: + prompt_tokens = [ + token[1:-1] for token in pipe.tokenizer(prompt, max_length=max_length, truncation=True).input_ids + ] + prompt_weights = [[1.0] * len(token) for token in prompt_tokens] + if uncond_prompt is not None: + if isinstance(uncond_prompt, str): + uncond_prompt = [uncond_prompt] + uncond_tokens = [ + token[1:-1] + for token in pipe.tokenizer(uncond_prompt, max_length=max_length, truncation=True).input_ids + ] + uncond_weights = [[1.0] * len(token) for token in uncond_tokens] + + # round up the longest length of tokens to a multiple of (model_max_length - 2) + max_length = max([len(token) for token in prompt_tokens]) + if uncond_prompt is not None: + max_length = max(max_length, max([len(token) for token in uncond_tokens])) + + max_embeddings_multiples = min( + max_embeddings_multiples, + (max_length - 1) // (pipe.tokenizer.model_max_length - 2) + 1, + ) + max_embeddings_multiples = max(1, max_embeddings_multiples) + max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2 + + # pad the length of tokens and weights + # support bert tokenizer + bos = pipe.tokenizer.bos_token_id if pipe.tokenizer.bos_token_id is not None else pipe.tokenizer.cls_token_id + eos = pipe.tokenizer.eos_token_id if pipe.tokenizer.eos_token_id is not None else pipe.tokenizer.sep_token_id + pad = pipe.tokenizer.pad_token_id + + prompt_tokens, prompt_weights = pad_tokens_and_weights( + prompt_tokens, + prompt_weights, + max_length, + bos, + eos, + pad, + no_boseos_middle=no_boseos_middle, + chunk_length=pipe.tokenizer.model_max_length, + ) + if uncond_prompt is not None: + uncond_tokens, uncond_weights = pad_tokens_and_weights( + uncond_tokens, + uncond_weights, + max_length, + bos, + eos, + pad, + no_boseos_middle=no_boseos_middle, + chunk_length=pipe.tokenizer.model_max_length, + ) + # get the embeddings + text_embeddings = get_unweighted_text_embeddings( + pipe, + prompt_tokens, + pipe.tokenizer.model_max_length, + no_boseos_middle=no_boseos_middle, + infer_op=infer_op, + ) + if uncond_prompt is not None: + uncond_embeddings = get_unweighted_text_embeddings( + pipe, + uncond_tokens, + pipe.tokenizer.model_max_length, + no_boseos_middle=no_boseos_middle, + infer_op=infer_op, + ) + # assign weights to the prompts and normalize in the sense of mean + # TODO: should we normalize by chunk or in a whole (current implementation)? 
+ if (not skip_parsing) and (not skip_weighting): + previous_mean = text_embeddings.mean(axis=[-2, -1]) + text_embeddings *= prompt_weights.unsqueeze(-1) + text_embeddings *= (previous_mean / text_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) + if uncond_prompt is not None: + previous_mean = uncond_embeddings.mean(axis=[-2, -1]) + uncond_embeddings *= uncond_weights.unsqueeze(-1) + uncond_embeddings *= (previous_mean / uncond_embeddings.mean(axis=[-2, -1])).unsqueeze(-1).unsqueeze(-1) + + if uncond_prompt is not None: + return text_embeddings, uncond_embeddings + return text_embeddings, None + + +class PaddleInferDiffusionXLPipelineMixin: + def prepare_infer_op_dict(self, infer_op_dict=None, **kwargs): + if infer_op_dict is None: + infer_op_dict = {} + new_infer_op_dict = {} + for name in dir(self): + if name.startswith("_"): + continue + module = getattr(self, name) + if isinstance(module, PaddleInferModel): + infer_op = infer_op_dict.get(name, "zero_copy_infer") if module.is_spport_zero_copy() else "raw" + # if parse_prompt_type in ["lpw", "webui"] and name in ["text_encoder"]: + # if infer_op != "raw": + # logger.warning( + # f"When parse_prompt_type is `{parse_prompt_type}` and module is `{name}`, we will set infer_op to `raw` instead of `{infer_op}`!" + # ) + # infer_op = "raw" + new_infer_op_dict[name] = infer_op + return new_infer_op_dict + + def post_init(self, vae_scaling_factor=0.13025, vae_scale_factor=8, dtype="float32"): + self.vae_scaling_factor = vae_scaling_factor + self.vae_scale_factor = vae_scale_factor + + self.image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) + self.dtype = dtype + self.supported_scheduler = [ + "pndm", + "lms", + "preconfig-lms", + "euler", + "euler-ancestral", + "preconfig-euler-ancestral", + "dpm-multi", + "dpm-single", + "unipc-multi", + "ddim", + "ddpm", + "deis-multi", + "heun", + "kdpm2-ancestral", + "kdpm2", + ] + self.orginal_scheduler_config = self.scheduler.config + + @property + def vae_encoder_num_channels(self): + if self.vae_encoder is None: + return 3 + return self.vae_encoder.model.get_input_info(0).shape[1] + + @property + def vae_decoder_num_latent_channels(self): + if self.vae_decoder is None: + return 4 + return self.vae_decoder.model.get_input_info(0).shape[1] + + @property + def unet_num_latent_channels(self): + return self.unet.model.get_input_info(0).shape[1] + + @property + def unet_hidden_states_dim(self): + return self.unet.model.get_input_info(2).shape[2] + + @property + def text_encoder_hidden_states_dim(self): + if not hasattr(self, "text_encoder") or self.text_encoder is None: + return 768 + return self.text_encoder.model.get_output_info(0).shape[2] + + def change_scheduler(self, scheduler_type="ddim"): + scheduler_type = scheduler_type.lower() + if scheduler_type == "pndm": + scheduler = PNDMScheduler.from_config(self.orginal_scheduler_config, skip_prk_steps=True) + elif scheduler_type == "lms": + scheduler = LMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "preconfig-lms": + scheduler = PreconfigLMSDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "heun": + scheduler = HeunDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "euler": + scheduler = EulerDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif 
scheduler_type == "euler-ancestral": + scheduler = EulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "preconfig-euler-ancestral": + scheduler = PreconfigEulerAncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-multi": + scheduler = DPMSolverMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "dpm-single": + scheduler = DPMSolverSinglestepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2-ancestral": + scheduler = KDPM2AncestralDiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "kdpm2": + scheduler = KDPM2DiscreteScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "unipc-multi": + scheduler = UniPCMultistepScheduler.from_config(self.orginal_scheduler_config) + elif scheduler_type == "ddim": + scheduler = DDIMScheduler.from_config( + self.orginal_scheduler_config, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + ) + elif scheduler_type == "ddpm": + scheduler = DDPMScheduler.from_config( + self.orginal_scheduler_config, + ) + elif scheduler_type == "deis-multi": + scheduler = DEISMultistepScheduler.from_config( + self.orginal_scheduler_config, + ) + else: + raise ValueError( + f"Scheduler of type {scheduler_type} doesn't exist! Please choose in {self.supported_scheduler}!" + ) + self.scheduler = scheduler + + def get_timesteps(self, num_inference_steps, strength=1.0): + if strength >= 1: + return self.scheduler.timesteps.cast(self.dtype), num_inference_steps + + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].cast(self.dtype) + + if hasattr(self.scheduler, "step_index_offset"): + self.scheduler.step_index_offset = t_start * self.scheduler.order + + num_inference_steps = num_inference_steps - t_start + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." 
+ ) + + return timesteps, num_inference_steps + + def prepare_controlnet_cond( + self, + controlnet_cond, + controlnet_conditioning_scale, + width, + height, + batch_size, + num_images_per_prompt, + do_classifier_free_guidance=False, + ): + control_image = self.control_image_processor.preprocess( + controlnet_cond, + height=height, + width=width, + ) + if isinstance(controlnet_conditioning_scale, (float, int)): + controlnet_conditioning_scale = paddle.to_tensor([controlnet_conditioning_scale] * 13, dtype=self.dtype) + elif isinstance(controlnet_conditioning_scale, (list, tuple)): + controlnet_conditioning_scale = paddle.to_tensor(controlnet_conditioning_scale, dtype=self.dtype) + else: + raise ValueError( + f"`controlnet_conditioning_scale` has to be of type `float` or `int` or `list` or `tuple` but is {type(controlnet_conditioning_scale)}" + ) + assert controlnet_conditioning_scale.shape[0] == 13 + image_batch_size = control_image.shape[0] + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + control_image = control_image.repeat_interleave(repeat_by, axis=0) + if do_classifier_free_guidance: + control_image = paddle.concat([control_image] * 2) + return control_image, controlnet_conditioning_scale + + def prepare_latents( + self, + batch_size, + height, + width, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + infer_op=None, + ): + shape = [ + batch_size, + self.vae_decoder_num_latent_channels, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ] + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.cast(dtype=self.dtype) + image_latents = self._encode_vae_image(image, infer_op) + + if latents is None: + noise = randn_tensor(shape, generator=generator, dtype=self.dtype) + # if strength is 1. 
then initialise the latents to noise, else initial to image + noise + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents + else: + noise = latents + if str(noise.dtype).replace("paddle.", "") != self.dtype: + noise = noise.cast(self.dtype) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + if len(outputs) == 1: + outputs = latents + return outputs + + def prepare_mask_latents( + self, + mask, + masked_image, + batch_size, + height, + width, + do_classifier_free_guidance, + return_masked_image_latents=True, + infer_op=None, + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = paddle.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.cast(dtype=self.dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: + raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.tile([batch_size // mask.shape[0], 1, 1, 1]) + + mask = paddle.concat([mask] * 2) if do_classifier_free_guidance else mask + if not return_masked_image_latents: + return mask + + masked_image = masked_image.cast(dtype=self.dtype) + masked_image_latents = self._encode_vae_image(masked_image, infer_op) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + masked_image_latents = masked_image_latents.tile([batch_size // masked_image_latents.shape[0], 1, 1, 1]) + + masked_image_latents = ( + paddle.concat([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.cast(dtype=self.dtype) + return mask, masked_image_latents + + def is_scheduler_support_step_index(self): + kwargs_keys = set(inspect.signature(self.scheduler.step).parameters.keys()) + return "kwargs" in kwargs_keys or "step_index" in kwargs_keys + + def _encode_vae_image(self, image: paddle.Tensor, infer_op=None, **kwargs): + image_shape = image.shape + image_latents = self.vae_encoder( + sample=image, + )[0] + + return self.vae_scaling_factor * image_latents + + def _decode_vae_latents(self, latents: paddle.Tensor, infer_op=None, **kwargs): + latents_shape = latents.shape + + images_vae = self.vae_decoder( + latent_sample=latents, + )[0] + + return images_vae + + def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + passed_add_embed_dim = 256 * len(add_time_ids) + 1280 + expected_add_embed_dim = 2816 + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + add_time_ids = paddle.to_tensor(data=[add_time_ids], dtype=dtype) + return add_time_ids + + def _get_add_time_ids_2( + self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) + passed_add_embed_dim = 256 * len(add_time_ids) + 1280 + expected_add_embed_dim = 2816 + if expected_add_embed_dim > passed_add_embed_dim and expected_add_embed_dim - passed_add_embed_dim == 256: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + ) + elif expected_add_embed_dim < passed_add_embed_dim and passed_add_embed_dim - expected_add_embed_dim == 256: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." 
+ ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + add_time_ids = paddle.to_tensor(data=[add_time_ids], dtype=dtype) + add_neg_time_ids = paddle.to_tensor(data=[add_neg_time_ids], dtype=dtype) + return add_time_ids, add_neg_time_ids + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + strength, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." + ) + + def get_timesteps(self, num_inference_steps, strength, denoising_start=None): # noqa + # get the original timestep using init_timestep + if denoising_start is None: + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + else: + t_start = 0 + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + # Strength is irrelevant if we directly request a timestep to start at; + # that is, strength is determined by the denoising_start instead. + if denoising_start is not None: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - denoising_start * self.scheduler.config.num_train_timesteps + ) + ) + timesteps = list(filter(lambda ts: ts < discrete_timestep_cutoff, timesteps)) + return paddle.to_tensor(data=timesteps), len(timesteps) + return timesteps, num_inference_steps - t_start + + def encode_prompt( + self, + prompt: str, + prompt_2: Optional[str] = None, + num_images_per_prompt: int = 1, + do_classifier_free_guidance: bool = True, + negative_prompt: Optional[str] = None, + negative_prompt_2: Optional[str] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + pooled_prompt_embeds: Optional[paddle.Tensor] = None, + negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None, + lora_scale: Optional[float] = None, + ): + """ + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is + used in both text-encoders + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + negative_prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and + `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders + prompt_embeds (`paddle.Tensoroptional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`paddle.Tensoroptional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. 
+ negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizer = self.tokenizer + tokenizer_2 = self.tokenizer_2 + text_encoder = self.text_encoder + text_encoder_2 = self.text_encoder_2 + if prompt_embeds is None: + prompt_2 = prompt_2 or prompt + # textual inversion: procecss multi-vector tokens if necessary + prompt_embeds_list = [] + # for prompt, tokenizer, text_encoder in zip(prompts, tokenizers, text_encoders): + + # 1. text encoder 1 + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids + if ( + untruncated_ids.shape[-1] >= text_input_ids.shape[-1] + and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item() + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + f"The following part of your input was truncated because CLIP can only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}" + ) + # prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True) + prompt_embeds = self.text_encoder( + input_ids=text_input_ids, + ) + prompt_embeds_list.append(prompt_embeds[0]) + + # 2. 
text encoder 2 + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt_2, tokenizer_2) + + text_inputs = tokenizer_2( + prompt_2, + padding="max_length", + max_length=tokenizer_2.model_max_length, + truncation=True, + return_tensors="pd", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer_2(prompt_2, padding="longest", return_tensors="pd").input_ids + untruncated_ids = tokenizer_2(prompt_2, padding="longest", return_tensors="pd").input_ids + if ( + untruncated_ids.shape[-1] >= text_input_ids.shape[-1] + and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item() + ): + removed_text = tokenizer_2.batch_decode(untruncated_ids[:, tokenizer_2.model_max_length - 1 : -1]) + logger.warning( + f"The following part of your input was truncated because CLIP can only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}" + ) + # prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True) + prompt_embeds = self.text_encoder_2( + input_ids=text_input_ids, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[1] + # prompt_embeds = prompt_embeds.hidden_states[-2] + # todo: check if this is correct + prompt_embeds = prompt_embeds[0] + prompt_embeds_list.append(prompt_embeds) + + # 3. Concatenate prompt embeddings + prompt_embeds = paddle.concat(x=prompt_embeds_list, axis=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = paddle.zeros_like(x=prompt_embeds) + negative_pooled_prompt_embeds = paddle.zeros_like(x=pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt_2 = negative_prompt_2 or negative_prompt + negative_prompt_embeds_list = [] + # 1. text_enocder 1 + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt, tokenizer) + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + negative_prompt, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd" + ) + # negative_prompt_embeds = text_encoder(uncond_input.input_ids, output_hidden_states=True) + negative_prompt_embeds = text_encoder( + input_ids=uncond_input.input_ids, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + + # negative_pooled_prompt_embeds = negative_prompt_embeds[0] + # negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + # todo: check if this is correct + negative_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds_list.append(negative_prompt_embeds) + + # 2. 
text_enocder 2 + if isinstance(self, TextualInversionLoaderMixin): + negative_prompt = self.maybe_convert_prompt(negative_prompt_2, tokenizer_2) + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer_2( + negative_prompt_2, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd" + ) + # negative_prompt_embeds = text_encoder(uncond_input.input_ids, output_hidden_states=True) + negative_prompt_embeds = text_encoder_2( + input_ids=uncond_input.input_ids, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + + negative_pooled_prompt_embeds = negative_prompt_embeds[1] + # negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + # todo: check if this is correct + negative_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds_list.append(negative_prompt_embeds) + + # 3. conbine the two embeddings + negative_prompt_embeds = paddle.concat(x=negative_prompt_embeds_list, axis=-1) + prompt_embeds = prompt_embeds.cast(dtype=self.dtype) + # duplicate text embeddings for each generation per prompt, using mps friendly method + bs_embed, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.cast(dtype=self.dtype) + negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + pooled_prompt_embeds = pooled_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt]).reshape( + [bs_embed * num_images_per_prompt, -1] + ) + if do_classifier_free_guidance: + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.tile( + repeat_times=[1, num_images_per_prompt] + ).reshape([bs_embed * num_images_per_prompt, -1]) + + return (prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds) + + def _encode_prompt_old( + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + parse_prompt_type: Optional[str] = "lpw", + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): + if parse_prompt_type == "lpw": + return self._encode_prompt_lpw( + prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_embeddings_multiples=max_embeddings_multiples, + infer_op=infer_op, + **kwargs, + ) + elif parse_prompt_type == "raw": + return self._encode_prompt_raw( + prompt, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + infer_op=infer_op, + ) + elif parse_prompt_type == "webui": + raise NotImplementedError("`parse_prompt_type=webui` is not implemented yet.") + + def _encode_prompt_lpw( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int, + 
do_classifier_free_guidance: bool, + negative_prompt: Union[str, List[str]], + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + max_embeddings_multiples: Optional[int] = 3, + **kwargs, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `list(int)`): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + max_embeddings_multiples (`int`, *optional*, defaults to `3`): + The max multiple length of prompt embeddings compared to the max output length of text encoder. + """ + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None and negative_prompt_embeds is None: + uncond_tokens: List[str] = None + if do_classifier_free_guidance: + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + prompt_embeds, negative_prompt_embeds = get_weighted_text_embeddings( + pipe=self, + prompt=prompt, + uncond_prompt=uncond_tokens, + max_embeddings_multiples=max_embeddings_multiples, + infer_op="raw", # NOTE: we can't use zero copy! + **kwargs, + ) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1]) + negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1]) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds]) + return prompt_embeds + + def _encode_prompt_raw( + self, + prompt, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + infer_op=None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pd", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pd").input_ids # check + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not paddle.equal_all( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder( + input_ids=text_input_ids, + )[0] + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.tile([1, num_images_per_prompt, 1]) + prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1]) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + if negative_prompt is None: + uncond_tokens = [""] + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+                )
+            else:
+                uncond_tokens = negative_prompt
+
+            max_length = prompt_embeds.shape[1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pd",
+            )
+            negative_prompt_embeds = self.text_encoder(
+                input_ids=uncond_input.input_ids,
+            )[0]
+
+        if do_classifier_free_guidance:
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = negative_prompt_embeds.shape[1]
+            negative_prompt_embeds = negative_prompt_embeds.tile([1, num_images_per_prompt, 1])
+            negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+
+        return prompt_embeds
+
+    def run_safety_checker(self, image):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if paddle.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="np")
+            image, has_nsfw_concept = self.safety_checker(
+                images=image.numpy(),
+                clip_input=safety_checker_input.pixel_values.astype(self.dtype),
+                infer_op="raw",
+            )
+            image = paddle.to_tensor(image, dtype=self.dtype)
+        return image, has_nsfw_concept
+
+    def prepare_extra_step_kwargs(self, generator, eta):  # noqa
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl.py
new file mode 100644
index 000000000..d48a53f8c
--- /dev/null
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_paddleinfer_stable_diffusion_xl.py
@@ -0,0 +1,484 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import paddle + +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import logging, randn_tensor, replace_example_docstring +from ..paddle_infer_utils import PaddleInferModel +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionXLPipelineOutput +from .paddleinferxl_utils import PaddleInferDiffusionXLPipelineMixin + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name +import paddlenlp + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import paddle + >>> from ppdiffusers import StableDiffusionXLPipeline + + >>> pipe = StableDiffusionXLPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16 + ... ) + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(axis=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(axis=list(range(1, noise_cfg.ndim)), keepdim=True) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class PaddleInferStableDiffusionXLPipeline(DiffusionPipeline, PaddleInferDiffusionXLPipelineMixin): + """ + Pipeline for text-to-image generation using Stable Diffusion XL. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`StableDiffusionXLPipeline.load_lora_weights`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] + + as well as the following saving methods: + - *LoRA*: [`loaders.StableDiffusionXLPipeline.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion XL uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + text_encoder_2 ([` CLIPTextModelWithProjection`]): + Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection), + specifically the + [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k) + variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
+ tokenizer_2 (`CLIPTokenizer`): + Second Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + def __init__( + self, + vae_encoder: PaddleInferModel, + vae_decoder: PaddleInferModel, + text_encoder: PaddleInferModel, + text_encoder_2: PaddleInferModel, + tokenizer: paddlenlp.transformers.CLIPTokenizer, + tokenizer_2: paddlenlp.transformers.CLIPTokenizer, + unet: PaddleInferModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + self.register_modules( + vae_encoder=vae_encoder, + vae_decoder=vae_decoder, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + + self.post_init(vae_scaling_factor=0.13025) + + def check_inputs( + self, + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + negative_pooled_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + if ( + callback_steps is None + or callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}." + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + f"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds` {negative_prompt_embeds.shape}." + ) + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." + ) + if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: + raise ValueError( + "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." + ) + + # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + if latents is None: + latents = randn_tensor(shape, generator=generator, dtype=dtype) + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @paddle.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[paddle.Generator, List[paddle.Generator]]] = None, + latents: Optional[paddle.Tensor] = None, + prompt_embeds: Optional[paddle.Tensor] = None, + negative_prompt_embeds: Optional[paddle.Tensor] = None, + pooled_prompt_embeds: Optional[paddle.Tensor] = None, + negative_pooled_prompt_embeds: Optional[paddle.Tensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, paddle.Tensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + prompt_2 (`str` or `List[str]`, *optional*): + The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. 
If not defined, `prompt` is
+                used in both text-encoders
+            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            denoising_end (`float`, *optional*):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, *optional*, defaults to 5.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            negative_prompt_2 (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
+                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`paddle.Generator` or `List[paddle.Generator]`, *optional*):
+                One or a list of paddle generator(s) to make generation deterministic.
+            latents (`paddle.Tensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            pooled_prompt_embeds (`paddle.Tensor`, *optional*):
+                Pre-generated pooled text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. + If not provided, pooled text embeddings will be generated from `prompt` input argument. + negative_pooled_prompt_embeds (`paddle.Tensor`, *optional*): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` + input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: paddle.Tensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in ppdiffusers.cross_attention. + guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + For most cases, `target_size` should be set to the desired height and width of the generated image. If + not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in + section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + + Examples: + + Returns: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + # 0. 
Default height and width to unet + height = height or 1024 + width = width or 1024 + original_size = original_size or (height, width) + target_size = target_size or (height, width) + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + prompt_2, + height, + width, + callback_steps, + negative_prompt, + negative_prompt_2, + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + # 5. Prepare latent variables + num_channels_latents = 4 + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids = self._get_add_time_ids( + original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype + ) + if do_classifier_free_guidance: + prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0) + add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0) + add_time_ids = paddle.concat(x=[add_time_ids, add_time_ids], axis=0) + + add_time_ids = add_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1]) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + # 7.1 Apply denoising_end + if denoising_end is not None and type(denoising_end) == float and denoising_end > 0 and denoising_end < 1: + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - denoising_end * self.scheduler.config.num_train_timesteps + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = paddle.concat(x=[latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + text_embeds=add_text_embeds, + time_ids=add_time_ids, + output_shape=latent_model_input.shape, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(chunks=2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0: + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if not output_type == "latent": + image = self._decode_vae_latents( + latents / self.vae_scaling_factor, + ) + + else: + image = latents + return StableDiffusionXLPipelineOutput(images=image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) + + diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py index ec3dda5e2..707c368b1 100644 --- a/ppdiffusers/ppdiffusers/utils/__init__.py +++ b/ppdiffusers/ppdiffusers/utils/__init__.py @@ -26,6 +26,8 @@ DOWNLOAD_SERVER, FASTDEPLOY_MODEL_NAME, FASTDEPLOY_WEIGHTS_NAME, + PADDLE_INFER_MODEL_NAME, + PADDLE_INFER_WEIGHTS_NAME, FLAX_WEIGHTS_NAME, FROM_DIFFUSERS, FROM_HF_HUB, diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py index 5ce92bfc8..ad3e9b123 100644 --- a/ppdiffusers/ppdiffusers/utils/constants.py +++ b/ppdiffusers/ppdiffusers/utils/constants.py @@ -55,6 +55,8 @@ def str2bool(v): PADDLE_WEIGHTS_NAME = "model_state.pdparams" FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams" FASTDEPLOY_MODEL_NAME = "inference.pdmodel" +PADDLE_INFER_WEIGHTS_NAME = "inference.pdiparams" +PADDLE_INFER_MODEL_NAME = "inference.pdmodel" WEIGHTS_NAME = PADDLE_WEIGHTS_NAME TEST_DOWNLOAD_SERVER = "https://paddlenlp.bj.bcebos.com/models/community/ppdiffusers/tests" From b3a4774e72aab2d0a8aea4e444c45f0f4c112bad Mon Sep 17 00:00:00 2001 From: yangjianfeng01 Date: Fri, 26 Jan 2024 18:49:12 +0800 Subject: [PATCH 2/2] 
=?UTF-8?q?=E6=94=AF=E6=8C=81zero=5Fcopy,=E4=BF=AE?= =?UTF-8?q?=E5=A4=8Dtrt=E5=AD=90=E5=9B=BE=E5=88=87=E5=88=86=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ppdiffusers/deploy/sdxl/export_model.py | 48 +- .../deploy/sdxl/export_model_by_paddle.py | 209 +++++++++ ppdiffusers/deploy/sdxl/infer_paddle.py | 434 +++++++----------- .../ppdiffusers/models/unet_2d_condition.py | 2 +- .../pipelines/paddle_infer_utils.py | 17 +- .../ppdiffusers/pipelines/pipeline_utils.py | 6 +- 6 files changed, 411 insertions(+), 305 deletions(-) create mode 100644 ppdiffusers/deploy/sdxl/export_model_by_paddle.py diff --git a/ppdiffusers/deploy/sdxl/export_model.py b/ppdiffusers/deploy/sdxl/export_model.py index 9a794590d..c83390b57 100644 --- a/ppdiffusers/deploy/sdxl/export_model.py +++ b/ppdiffusers/deploy/sdxl/export_model.py @@ -21,17 +21,12 @@ from fd_stable_diffusion_xl_housing import ( FastDeploySFastDeployStableDiffusionXLPipelineHousing, ) -from paddle_stable_diffusion_xl_housing import ( - PaddleInferStableDiffusionXLPipelineHousing -) from text_encoder_2_housing import CLIPTextModelWithProjectionHousing from text_encoder_housing import CLIPTextModelHousing from unet_2d_condition_housing import UNet2DConditionModelSDXLHousing from ppdiffusers import FastDeployRuntimeModel, StableDiffusionXLPipeline -from ppdiffusers import PaddleInferModel - def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( model_path: str, @@ -168,46 +163,25 @@ def forward_vae_decoder(self, z): print(f"Save vae_decoder model in {save_path} successfully.") del pipeline.vae - paddle_infer_pipe_cls = PaddleInferStableDiffusionXLPipelineHousing + fd_pipe_cls = FastDeploySFastDeployStableDiffusionXLPipelineHousing print("mark 1") - text_encoder = (PaddleInferModel.from_pretrained(output_path / "text_encoder"),) + text_encoder = (FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),) # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), print("mark 2") - paddle_infer_pipeline = paddle_infer_pipe_cls( - vae_encoder=PaddleInferModel.from_pretrained(output_path / "vae_encoder"), - vae_decoder=PaddleInferModel.from_pretrained(output_path / "vae_decoder"), - unet=PaddleInferModel.from_pretrained(output_path / "unet"), - text_encoder=PaddleInferModel.from_pretrained(output_path / "text_encoder"), - text_encoder_2=PaddleInferModel.from_pretrained(output_path / "text_encoder_2"), + fastdeploy_pipeline = fd_pipe_cls( + vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), + unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), + text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), + text_encoder_2=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder_2"), tokenizer=pipeline.tokenizer, tokenizer_2=pipeline.tokenizer_2, scheduler=pipeline.scheduler, ) print("start saving") - paddle_infer_pipeline.save_pretrained(output_path) - print("PaddleInfer pipeline saved to", output_path) - - - # fd_pipe_cls = FastDeploySFastDeployStableDiffusionXLPipelineHousing - # print("mark 1") - # text_encoder = (FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"),) - # # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), - # print("mark 2") - - # fastdeploy_pipeline = fd_pipe_cls( - # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / 
"vae_encoder"), - # vae_decoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_decoder"), - # unet=FastDeployRuntimeModel.from_pretrained(output_path / "unet"), - # text_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder"), - # text_encoder_2=FastDeployRuntimeModel.from_pretrained(output_path / "text_encoder_2"), - # tokenizer=pipeline.tokenizer, - # tokenizer_2=pipeline.tokenizer_2, - # scheduler=pipeline.scheduler, - # ) - # print("start saving") - # fastdeploy_pipeline.save_pretrained(output_path) - # print("FastDeploy pipeline saved to", output_path) + fastdeploy_pipeline.save_pretrained(output_path) + print("FastDeploy pipeline saved to", output_path) if __name__ == "__main__": @@ -229,4 +203,4 @@ def forward_vae_decoder(self, z): convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( args.pretrained_model_name_or_path, args.output_path, args.sample, args.height, args.width - ) + ) \ No newline at end of file diff --git a/ppdiffusers/deploy/sdxl/export_model_by_paddle.py b/ppdiffusers/deploy/sdxl/export_model_by_paddle.py new file mode 100644 index 000000000..3f1f3b277 --- /dev/null +++ b/ppdiffusers/deploy/sdxl/export_model_by_paddle.py @@ -0,0 +1,209 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from pathlib import Path +from types import MethodType + +import paddle + +from paddle_stable_diffusion_xl_housing import ( + PaddleInferStableDiffusionXLPipelineHousing +) +from text_encoder_2_housing import CLIPTextModelWithProjectionHousing +from text_encoder_housing import CLIPTextModelHousing +from unet_2d_condition_housing import UNet2DConditionModelSDXLHousing + +from ppdiffusers import FastDeployRuntimeModel, StableDiffusionXLPipeline + +from ppdiffusers import PaddleInferModel + + +def convert_ppdiffusers_pipeline_to_fastdeploy_pipeline( + model_path: str, + output_path: str, + sample: bool = False, + height: int = None, + width: int = None, +): + # specify unet model with unet pre_temb_act opt enabled. 
+ unet_model = UNet2DConditionModelSDXLHousing.from_pretrained( + model_path, resnet_pre_temb_non_linearity=False, subfolder="unet" + ) + text_encoder_model = CLIPTextModelHousing.from_pretrained(model_path, subfolder="text_encoder") + text_encoder_2_model = CLIPTextModelWithProjectionHousing.from_pretrained(model_path, subfolder="text_encoder_2") + pipeline = StableDiffusionXLPipeline.from_pretrained( + model_path, + unet=unet_model, + text_encoder=text_encoder_model, + text_encoder_2=text_encoder_2_model, + safety_checker=None, + feature_extractor=None, + ).to(paddle_dtype="float32") + + # make sure we disable xformers + pipeline.unet.set_default_attn_processor() + pipeline.vae.set_default_attn_processor() + output_path = Path(output_path) + # calculate latent's H and W + latent_height = height // 8 if height is not None else None + latent_width = width // 8 if width is not None else None + # get arguments + cross_attention_dim = pipeline.unet.config.cross_attention_dim # 768 or 1024 or 1280 + unet_channels = pipeline.unet.config.in_channels # 4 or 9 + vae_in_channels = pipeline.vae.config.in_channels # 3 + vae_latent_channels = pipeline.vae.config.latent_channels # 4 + print( + f"cross_attention_dim: {cross_attention_dim}\n", + f"unet_in_channels: {unet_channels}\n", + f"vae_encoder_in_channels: {vae_in_channels}\n", + f"vae_decoder_latent_channels: {vae_latent_channels}", + ) + + # 1. Convert text_encoder + text_encoder = pipeline.text_encoder + # text_encoder.forward = MethodType(forward_text_encoder, text_encoder) + text_encoder = paddle.jit.to_static( + text_encoder, + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids + ) + save_path = os.path.join(args.output_path, "text_encoder", "inference") + paddle.jit.save(text_encoder, save_path) + print(f"Save text_encoder model in {save_path} successfully.") + del pipeline.text_encoder + + text_encoder_2 = pipeline.text_encoder_2 + # text_encoder_2.forward = MethodType(forward_text_encoder_2, text_encoder_2) + text_encoder_2 = paddle.jit.to_static( + text_encoder_2, + input_spec=[paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids")], # input_ids + ) + save_path = os.path.join(args.output_path, "text_encoder_2", "inference") + paddle.jit.save(text_encoder_2, save_path) + print(f"Save text_encoder_2 model in {save_path} successfully.") + del pipeline.text_encoder_2 + + # 2. Convert unet + unet = paddle.jit.to_static( + pipeline.unet, + input_spec=[ + paddle.static.InputSpec( + shape=[None, unet_channels, latent_height, latent_width], dtype="float32", name="sample" + ), # sample + paddle.static.InputSpec(shape=[1], dtype="float32", name="timestep"), # timestep + paddle.static.InputSpec( + shape=[None, None, cross_attention_dim], dtype="float32", name="encoder_hidden_states" + ), # encoder_hidden_states + paddle.static.InputSpec( + shape=[None, 1280], dtype="float32", name="text_embeds" + ), # added_cond_kwargs_text_embeds + paddle.static.InputSpec(shape=[None, 6], dtype="float32", name="time_ids"), # added_cond_kwargs_time_ids + ], + ) + save_path = os.path.join(args.output_path, "unet", "inference") + paddle.jit.save(unet, save_path) + print(f"Save unet model in {save_path} successfully.") + del pipeline.unet + + def forward_vae_encoder_mode(self, z): + return self.encode(z, True).latent_dist.mode() + + def forward_vae_encoder_sample(self, z): + return self.encode(z, True).latent_dist.sample() + + # 3. 
Convert vae encoder + vae_encoder = pipeline.vae + if sample: + vae_encoder.forward = MethodType(forward_vae_encoder_sample, vae_encoder) + else: + vae_encoder.forward = MethodType(forward_vae_encoder_mode, vae_encoder) + + vae_encoder = paddle.jit.to_static( + vae_encoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_in_channels, height, width], + dtype="float32", + name="sample", # N, C, H, W + ), # latent + ], + ) + # Save vae_encoder in static graph model. + save_path = os.path.join(args.output_path, "vae_encoder", "inference") + paddle.jit.save(vae_encoder, save_path) + print(f"Save vae_encoder model in {save_path} successfully.") + + # 4. Convert vae encoder + vae_decoder = pipeline.vae + + def forward_vae_decoder(self, z): + return self.decode(z, True).sample + + vae_decoder.forward = MethodType(forward_vae_decoder, vae_decoder) + vae_decoder = paddle.jit.to_static( + vae_decoder, + input_spec=[ + paddle.static.InputSpec( + shape=[None, vae_latent_channels, latent_height, latent_width], dtype="float32", name="latent_sample" + ), # latent_sample + ], + ) + # Save vae_decoder in static graph model. + save_path = os.path.join(args.output_path, "vae_decoder", "inference") + paddle.jit.save(vae_decoder, save_path) + print(f"Save vae_decoder model in {save_path} successfully.") + del pipeline.vae + + paddle_infer_pipe_cls = PaddleInferStableDiffusionXLPipelineHousing + print("mark 1") + text_encoder = (PaddleInferModel.from_pretrained(output_path / "text_encoder"),) + # vae_encoder=FastDeployRuntimeModel.from_pretrained(output_path / "vae_encoder"), + print("mark 2") + + paddle_infer_pipeline = paddle_infer_pipe_cls( + vae_encoder=PaddleInferModel.from_pretrained(output_path / "vae_encoder"), + vae_decoder=PaddleInferModel.from_pretrained(output_path / "vae_decoder"), + unet=PaddleInferModel.from_pretrained(output_path / "unet"), + text_encoder=PaddleInferModel.from_pretrained(output_path / "text_encoder"), + text_encoder_2=PaddleInferModel.from_pretrained(output_path / "text_encoder_2"), + tokenizer=pipeline.tokenizer, + tokenizer_2=pipeline.tokenizer_2, + scheduler=pipeline.scheduler, + ) + print("start saving") + paddle_infer_pipeline.save_pretrained(output_path) + print("PaddleInfer pipeline saved to", output_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + required=True, + help="Path to the `ppdiffusers` checkpoint to convert (either a local directory or on the bos).", + ) + parser.add_argument("--output_path", type=str, required=True, help="Path to the output model.") + parser.add_argument( + "--sample", action="store_true", default=False, help="Export the vae encoder in mode or sample" + ) + parser.add_argument("--height", type=int, default= None, help="The height of output images. Default: None") + parser.add_argument("--width", type=int, default= None, help="The width of output images. 
Default: None")
+    args = parser.parse_args()
+
+    convert_ppdiffusers_pipeline_to_fastdeploy_pipeline(
+        args.pretrained_model_name_or_path, args.output_path, args.sample, args.height, args.width
+    )
diff --git a/ppdiffusers/deploy/sdxl/infer_paddle.py b/ppdiffusers/deploy/sdxl/infer_paddle.py
index 0dbaf2b9c..6dc8c12d5 100644
--- a/ppdiffusers/deploy/sdxl/infer_paddle.py
+++ b/ppdiffusers/deploy/sdxl/infer_paddle.py
@@ -18,6 +18,8 @@
 import paddle
 import random
+
+
 # isort: split
 import paddle.inference as paddle_infer
 import numpy as np
@@ -36,19 +38,19 @@ def parse_arguments():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model_dir",
-        default="runwayml/stable-diffusion-v1-5@fastdeploy",
+        default="static_model/stable-diffusion-xl-base-1.0",
         help="The model directory of diffusion_model.",
     )
     parser.add_argument(
         "--inference_steps",
         type=int,
-        default=50,
+        default=30,
         help="The number of unet inference steps.",
     )
     parser.add_argument(
         "--benchmark_steps",
         type=int,
-        default=1,
+        default=5,
         help="The number of performance benchmark steps.",
     )
     parser.add_argument(
@@ -78,25 +80,12 @@ def parse_arguments():
         default="text2img",
         choices=[
             "text2img",
-            "img2img",
-            "inpaint",
-            "all",
-        ],
-        help="The task can be one of [text2img, img2img, inpaint, pix2pix, all]. ",
-    )
-    parser.add_argument(
-        "--parse_prompt_type",
-        type=str,
-        default="lpw",
-        choices=[
-            "raw",
-            "lpw",
         ],
-        help="The parse_prompt_type can be one of [raw, lpw]. ",
+        help="Only [text2img] is supported. ",
    )
     parser.add_argument("--use_fp16", type=strtobool, default=True, help="Wheter to use FP16 mode")
     parser.add_argument("--use_bf16", type=strtobool, default=False, help="Wheter to use BF16 mode")
-    parser.add_argument("--device_id", type=int, default=0, help="The selected gpu id. -1 means use cpu")
+    parser.add_argument("--device_id", type=int, default=7, help="The selected gpu id. -1 means use cpu")
     parser.add_argument(
         "--scheduler",
         type=str,
@@ -135,6 +124,21 @@ def parse_arguments():
     return parser.parse_args()
+
+
+'''
+Using TRT dynamic shapes:
+To use TRT dynamic shapes, paddle_inference provides three ways to set the shape information:
+1. Call set_trt_dynamic_shape_info directly with the input shape ranges and let the shapes of downstream ops be
+inferred automatically. This approach hides many unknown bugs and is strongly discouraged.
+2. Offline collection: first run in static-graph mode (tune==True in the interface below) and use collect_shape_range_info
+to record the shape ranges of every op into shape_range_info.pbtxt under the static model directory, then enable TRT and
+load the collected info with enable_tuned_tensorrt_dynamic_shape(path_shape_file, True).
+This way is strongly recommended.
+3. Online collection: enable TRT and call the enable_tuned_tensorrt_dynamic_shape() interface with no arguments; the input shape
+information is collected automatically, but this is slower than offline collection.
+
+
+'''
+
+
 def create_paddle_inference_runtime(
     model_dir="",
     model_name="",
@@ -142,11 +146,10 @@ def create_paddle_inference_runtime(
     dynamic_shape=None,
     precision_mode=paddle_infer.PrecisionType.Half,
     device_id=0,
-    disable_paddle_trt_ops=[],
-    disable_paddle_pass=[],
     workspace=24*1024*1024*1024,
-    tune=False,
-):
+    tune=False,  # offline collection of shape info
+    auto_tune=False,  # online collection of shape info
+):
     config = paddle_infer.Config()
     config.enable_new_executor()
     config.enable_memory_optim()
@@ -156,27 +159,26 @@ def create_paddle_inference_runtime(
     if device_id != -1:
         config.use_gpu()
         config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=device_id, precision_mode=precision_mode)
-    for pass_name in disable_paddle_pass:
-        config.delete_pass(pass_name)
     if use_trt:
         config.enable_tensorrt_engine(workspace_size=workspace,
                                       precision_mode=precision_mode,
                                       max_batch_size=1,
                                       min_subgraph_size=3,
-                                      uuse_static=True)
+                                      use_static=True,)
         config.enable_tensorrt_memory_optim()
-        config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
-        cache_file = os.path.join(model_dir, model_name, "_opt_cache/")
-        config.set_optim_cache_dir(cache_file)
-    if precision_mode != paddle_infer.PrecisionType.Half:
-        only_fp16_passes = [
-            "trt_cross_multihead_matmul_fuse_pass",
-            "trt_flash_multihead_matmul_fuse_pass",
-            "preln_elementwise_groupnorm_act_pass",
-            "elementwise_groupnorm_act_pass",
-        ]
-        for curr_pass in only_fp16_passes:
-            config.delete_pass(curr_pass)
+        if dynamic_shape is None:
+            if auto_tune:
+                config.enable_tuned_tensorrt_dynamic_shape()
+            else:
+                if not os.path.exists(shape_file):
+                    raise ValueError(f"shape_range_info.pbtxt not found in {model_dir}/{model_name}, you should set dynamic_shape or collect shape_range_info by auto_tune first.")
+                config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
+        else:
+            if dynamic_shape is None:
+                raise ValueError("dynamic_shape should be set when using trt without a shape file.")
+            config.set_trt_dynamic_shape_info(dynamic_shape[0], dynamic_shape[1], dynamic_shape[2])
+        cache_path = f"{model_dir}/{model_name}/_opt_cache"
+        config.set_optim_cache_dir(cache_path)
     return config
 
 def main(args):
@@ -196,108 +198,120 @@ def main(args):
     unet_in_channels = 4
     bs = 2
-    text_encoder_dynamic_shape = {
-        "input_ids": {
-            "min_shape": [1, text_encoder_max_length],
-            "max_shape": [1, text_encoder_max_length],
-            "opt_shape": [1, text_encoder_max_length],
-        }
+    text_encoder_dynamic_min_shape = {
+        "input_ids": [1, text_encoder_max_length],
     }
-
-    text_encoder_2_dynamic_shape = {
-        "input_ids": {
-            "min_shape": [1, text_encoder_max_length],
-            "max_shape": [1, text_encoder_max_length],
-            "opt_shape": [1, text_encoder_max_length],
-        }
+    text_encoder_dynamic_max_shape = {
+        "input_ids": [1, text_encoder_max_length],
+    }
+    text_encoder_dynamic_opt_shape = {
+        "input_ids": [1, text_encoder_max_length],
     }
-    vae_encoder_dynamic_shape = {
-        "sample": {
-            "min_shape": [1, 3, min_image_size, min_image_size],
-            "max_shape": [1, 3, max_image_size, max_image_size],
-            "opt_shape": [1, 3, min_image_size, min_image_size],
-        }
+    text_encoder_dynamic_shape = [text_encoder_dynamic_min_shape, text_encoder_dynamic_max_shape, text_encoder_dynamic_opt_shape]
+
+    text_encoder_2_dynamic_min_shape = {
+        "input_ids": [1, text_encoder_max_length],
     }
+    text_encoder_2_dynamic_max_shape = {
+        "input_ids": [1, text_encoder_max_length],
+    }
+    text_encoder_2_dynamic_opt_shape = {
+        "input_ids": [1, text_encoder_max_length],
+    }
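Note: the "offline collection" workflow described in the comment above boils down to two passes over the same exported static model. The sketch below is illustrative only and is not part of this patch; the inference.pdmodel / inference.pdiparams file names follow PADDLE_INFER_MODEL_NAME / PADDLE_INFER_WEIGHTS_NAME defined in constants.py, while the model_dir layout and warm-up inputs are assumptions.

    import os
    import paddle.inference as paddle_infer

    model_dir = "static_model/stable-diffusion-xl-base-1.0"   # assumed export layout
    model_name = "unet"
    prog = os.path.join(model_dir, model_name, "inference.pdmodel")
    params = os.path.join(model_dir, model_name, "inference.pdiparams")
    shape_file = os.path.join(model_dir, model_name, "shape_range_info.pbtxt")

    # Pass 1 (tune): run once without TRT and record every op's shape range to shape_file.
    config = paddle_infer.Config(prog, params)
    config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=0)
    config.collect_shape_range_info(shape_file)
    predictor = paddle_infer.create_predictor(config)
    # ... feed a few representative inputs covering the min/opt/max shapes ...

    # Pass 2: enable TRT and reuse the collected ranges as tuned dynamic shapes.
    config = paddle_infer.Config(prog, params)
    config.enable_use_gpu(memory_pool_init_size_mb=2000, device_id=0)
    config.enable_tensorrt_engine(workspace_size=1 << 30,
                                  precision_mode=paddle_infer.PrecisionType.Half,
                                  max_batch_size=1,
                                  min_subgraph_size=3,
                                  use_static=True)
    config.enable_tuned_tensorrt_dynamic_shape(shape_file, True)
    predictor = paddle_infer.create_predictor(config)

In this patch the same two-pass flow is driven through create_paddle_inference_runtime via its tune/auto_tune arguments, so the snippet is only meant to make the control flow explicit.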
+ + text_encoder_2_dynamic_shape = [text_encoder_2_dynamic_min_shape, text_encoder_2_dynamic_max_shape, text_encoder_2_dynamic_opt_shape] - vae_decoder_dynamic_shape = { - "latent_sample": { - "min_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - "max_shape": [1, vae_in_channels, max_image_size // 8, max_image_size // 8], - "opt_shape": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], - } + vae_encoder_dynamic_min_shape = { + "sample": [1, 3, min_image_size // 8, min_image_size // 8], } + vae_encoder_dynamic_max_shape = { + "sample": [1, 3, max_image_size // 8, max_image_size // 8], + } + vae_encoder_dynamic_opt_shape = { + "sample": [1, 3, min_image_size // 8, min_image_size // 8], + } + vae_encoder_dynamic_shape = [vae_encoder_dynamic_min_shape, vae_encoder_dynamic_max_shape, vae_encoder_dynamic_opt_shape] + - unet_dynamic_shape = { - "sample": { - "min_shape": [ - 1, - unet_in_channels, - min_image_size // 8, - min_image_size // 8, - ], - "max_shape": [ - bs, - unet_in_channels, - max_image_size // 8, - max_image_size // 8, - ], - "opt_shape": [ - 2, - unet_in_channels, - min_image_size // 8, - min_image_size // 8, - ], - }, - "timestep": { - "min_shape": [1], - "max_shape": [1], - "opt_shape": [1], - }, - "encoder_hidden_states": { - "min_shape": [1, text_encoder_max_length, hidden_states], - "max_shape": [bs, unet_max_length, hidden_states], - "opt_shape": [2, text_encoder_max_length, hidden_states], - }, - "text_embeds": { - "min_shape": [1, 1280], - "max_shape": [bs, 1280], - "opt_shape": [2, 1280], - }, - "time_ids": { - "min_shape": [1, 6], - "max_shape": [bs, 6], - "opt_shape": [2, 6], - }, + vae_decoder_dynamic_min_shape = { + "latent_sample": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], } + vae_decoder_dynamic_max_shape = { + "latent_sample": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + } + vae_decoder_dynamic_opt_shape = { + "latent_sample": [1, vae_in_channels, min_image_size // 8, min_image_size // 8], + } + vae_decoder_dynamic_shape = [vae_decoder_dynamic_min_shape, vae_decoder_dynamic_max_shape, vae_decoder_dynamic_opt_shape] + + unet_min_input_shape ={ + "sample": [1, unet_in_channels, min_image_size // 8, min_image_size // 8], + "timestep": [1], + "encoder_hidden_states": [1, text_encoder_max_length, hidden_states], + "text_embeds": [1, 1280], + "time_ids": [1, 6], + } + unet_max_input_shape ={ + "sample": [bs, unet_in_channels, max_image_size // 8, max_image_size // 8], + "timestep": [1], + "encoder_hidden_states": [bs, unet_max_length, hidden_states], + "text_embeds": [bs, 1280], + "time_ids": [bs, 6], + } + unet_opt_input_shape ={ + "sample": [2, unet_in_channels, min_image_size // 8, min_image_size // 8], + "timestep": [1], + "encoder_hidden_states": [2, text_encoder_max_length, hidden_states], + "text_embeds": [2, 1280], + "time_ids": [2, 6], + } + unet_input_shape=[unet_min_input_shape, unet_max_input_shape, unet_opt_input_shape] # 4. 
Init runtime - disable_paddle_pass=['auto_mixed_precision_pass'] + only_fp16_passes = [ + "trt_cross_multihead_matmul_fuse_pass", + "trt_flash_multihead_matmul_fuse_pass", + "preln_elementwise_groupnorm_act_pass", + "elementwise_groupnorm_act_pass", + + ] + no_need_passes = [ + 'trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass', + 'add_support_int8_pass', + 'auto_mixed_precision_pass', + ] + paddle_delete_passes = dict( + text_encoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + text_encoder_2=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + vae_encoder=only_fp16_passes + [] if args.use_fp16 else [], + vae_decoder=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + unet=only_fp16_passes + no_need_passes if not args.use_fp16 else no_need_passes, + ) infer_configs = dict( - text_encoder=create_paddle_inference_runtime( - model_dir=args.model_dir, - use_trt=False, - model_name="text_encoder", - dynamic_shape=text_encoder_dynamic_shape, - precision_mode=paddle_infer.PrecisionType.Half, - device_id=7, - disable_paddle_trt_ops=["range", "lookup_table_v2"], - tune=False), + text_encoder=create_paddle_inference_runtime( + model_dir=args.model_dir, + use_trt=False, + model_name="text_encoder", + dynamic_shape=None, + precision_mode=paddle_infer.PrecisionType.Half, + device_id=args.device_id, + tune=False, + ), text_encoder_2=create_paddle_inference_runtime( model_dir=args.model_dir, use_trt=False, model_name="text_encoder_2", - dynamic_shape=text_encoder_dynamic_shape, + dynamic_shape=None, precision_mode=paddle_infer.PrecisionType.Half, - device_id=7, - disable_paddle_trt_ops=["range", "lookup_table_v2"], - tune=False + device_id=args.device_id, + tune=False, ), vae_encoder=create_paddle_inference_runtime( model_dir=args.model_dir, model_name="vae_encoder", use_trt=False, precision_mode=paddle_infer.PrecisionType.Half, - device_id=7, + device_id=args.device_id, tune=False ), vae_decoder=create_paddle_inference_runtime( @@ -305,171 +319,69 @@ def main(args): model_name="vae_decoder", use_trt=False, precision_mode=paddle_infer.PrecisionType.Half, - device_id=7, - disable_paddle_pass=disable_paddle_pass, + device_id=args.device_id, tune=False ), unet=create_paddle_inference_runtime( model_dir=args.model_dir, model_name="unet", - use_trt=False, + use_trt=True, precision_mode=paddle_infer.PrecisionType.Half, - device_id=7, - tune=False + device_id=args.device_id, + dynamic_shape=None, + tune=False, ), ) pipe = PaddleInferStableDiffusionXLPipeline.from_pretrained( args.model_dir, infer_configs=infer_configs, + paddle_delete_passes=paddle_delete_passes, ) pipe.set_progress_bar_config(disable=True) # pipe.change_scheduler(args.scheduler) - parse_prompt_type = args.parse_prompt_type width = args.width height = args.height + folder = f"infer_fp16" if args.use_fp16 else f"infer_fp32" + os.makedirs(folder, exist_ok=True) + if args.task_name in ["text2img", "all"]: + # text2img + prompt = "beautiful scenery nature glass bottle landscape, purple galaxy bottle" + time_costs = [] + negative_prompt = "text, watermark" + # warmup + pipe( + prompt, + num_inference_steps=20, + height=height, + width=width, + # parse_prompt_type=parse_prompt_type, + # infer_op_dict=infer_op_dict, + negative_prompt=negative_prompt - if args.infer_op == "all": - infer_op_list = ["zero_copy_infer", "raw"] - else: - infer_op_list = [args.infer_op] - if args.device == "kunlunxin_xpu" or args.backend == "paddle": - print("When device is 
-        print("When device is kunlunxin_xpu or backend is paddle, we will use `raw` infer op.")
-        infer_op_list = ["raw"]
-
-    for infer_op in infer_op_list:
-        infer_op_dict = {
-            "vae_encoder": infer_op,
-            "vae_decoder": infer_op,
-            "text_encoder": infer_op,
-            "unet": infer_op,
-        }
-        folder = f"infer_op_{infer_op}_fp16" if args.use_fp16 else f"infer_op_{infer_op}_fp32"
-        os.makedirs(folder, exist_ok=True)
-        if args.task_name in ["text2img", "all"]:
-            # text2img
-            prompt = "beautiful scenery nature glass bottle landscape, purple galaxy bottle"
-            time_costs = []
-            negative_prompt = "text, watermark"
-            # warmup
-            # pipe(
-            #     prompt,
-            #     num_inference_steps=20,
-            #     height=height,
-            #     width=width,
-            #     # parse_prompt_type=parse_prompt_type,
-            #     # infer_op_dict=infer_op_dict,
-            #     negative_prompt=negative_prompt
-
-            # )
-            print("==> Test text2img performance.")
-            for step in trange(args.benchmark_steps):
-                start = time.time()
-                paddle.seed(seed)
-                images = pipe(
-                    prompt,
-                    output_type="pil",
-                    num_inference_steps=args.inference_steps,
-                    height=height,
-                    width=width,
-                    # parse_prompt_type=parse_prompt_type,
-                    # infer_op_dict=infer_op_dict,
-                    negative_prompt=negative_prompt
-                ).images
-                latency = time.time() - start
-                time_costs += [latency]
-                # print(f"No {step:3d} time cost: {latency:2f} s")
-            print(
-                f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
-                f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
-            )
-            images[0].save(f"{folder}/text2img___1.png")
-
-        if args.task_name in ["img2img", "all"]:
-            # img2img
-            img_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/sketch-mountains-input.png"
-            init_image = load_image(img_url)
-            prompt = "A fantasy landscape, trending on artstation"
-            time_costs = []
-            # warmup
-            pipe.img2img(
+        )
+        print("==> Test text2img performance.")
+        for step in trange(args.benchmark_steps):
+            start = time.time()
+            paddle.seed(seed)
+            images = pipe(
                 prompt,
-                image=init_image,
-                num_inference_steps=20,
+                output_type="pil",
+                num_inference_steps=args.inference_steps,
                 height=height,
                 width=width,
                 # parse_prompt_type=parse_prompt_type,
-                infer_op_dict=infer_op_dict,
-            )
-            print("==> Test img2img performance.")
-            for step in trange(args.benchmark_steps):
-                start = time.time()
-                paddle.seed(seed)
-                images = pipe.img2img(
-                    prompt,
-                    image=init_image,
-                    num_inference_steps=args.inference_steps,
-                    height=height,
-                    width=width,
-                    parse_prompt_type=parse_prompt_type,
-                    infer_op_dict=infer_op_dict,
-                ).images
-                latency = time.time() - start
-                time_costs += [latency]
-                # print(f"No {step:3d} time cost: {latency:2f} s")
-            print(
-                f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
-                f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
-            )
-            images[0].save(f"{folder}/img2img.png")
-
-        if args.task_name in ["inpaint", "all"]:
-            img_url = (
-                "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations.png"
-            )
-            mask_url = "https://paddlenlp.bj.bcebos.com/models/community/CompVis/stable-diffusion-v1-4/overture-creations-mask.png"
-            init_image = load_image(img_url)
-            mask_image = load_image(mask_url)
-            prompt = "Face of a yellow cat, high resolution, sitting on a park bench"
-            time_costs = []
-            pipe.inpaint(
-                prompt,
-                image=init_image,
-                mask_image=mask_image,
-                num_inference_steps=20,
-                height=height,
-                width=width,
-                parse_prompt_type=parse_prompt_type,
-                infer_op_dict=infer_op_dict,
-            )
-            print("==> Test inpaint performance.")
-            for step in trange(args.benchmark_steps):
-                start = time.time()
-                paddle.seed(seed)
-                images = pipe.inpaint(
-                    prompt,
-                    image=init_image,
-                    mask_image=mask_image,
-                    num_inference_steps=args.inference_steps,
-                    height=height,
-                    width=width,
-                    parse_prompt_type=parse_prompt_type,
-                    infer_op_dict=infer_op_dict,
-                ).images
-                latency = time.time() - start
-                time_costs += [latency]
-                # print(f"No {step:3d} time cost: {latency:2f} s")
-            print(
-                f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
-                f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
-            )
-
-            images[0].save(f"{folder}/inpaint.png")
-
+                # infer_op_dict=infer_op_dict,
+                negative_prompt=negative_prompt
+            ).images
+            latency = time.time() - start
+            time_costs += [latency]
+            # print(f"No {step:3d} time cost: {latency:2f} s")
+        print(
+            f"Mean latency: {np.mean(time_costs):2f} s, p50 latency: {np.percentile(time_costs, 50):2f} s, "
+            f"p90 latency: {np.percentile(time_costs, 90):2f} s, p95 latency: {np.percentile(time_costs, 95):2f} s."
+        )
+        images[0].save(f"{folder}/text2img_30step.png")


 if __name__ == "__main__":
-    seed=2024
-    paddle.seed(seed)
-    np.random.seed(seed)
-    random.seed(seed)
     args = parse_arguments()
     main(args)
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
index 23904b256..0a8fe136f 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
@@ -763,7 +763,7 @@ def forward(
         forward_upsample_size = False
         upsample_size = None

-        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+        if any((s != -1 and s % default_overall_up_factor != 0) for s in sample.shape[-2:]):
             logger.info("Forward upsample size to force interpolation output size.")
             forward_upsample_size = True

diff --git a/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py b/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py
index fc58b759b..6ef4fc98f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py
+++ b/ppdiffusers/ppdiffusers/pipelines/paddle_infer_utils.py
@@ -1051,16 +1051,17 @@ def __call__(self, **kwargs):
                 v = v.reshape((1,))
                 # fix dtype error
                 v = v.astype("float32")
+            if isinstance(v, np.ndarray):
+                v = paddle.to_tensor(v)
             inputs[k] = v
+        input_list = []
         input_names = self.model.get_input_names()
         for i, name in enumerate(input_names):
-            input_tensor = self.model.get_input_handle(name)
             if name not in inputs:
                 raise ValueError(f"Input {name} is not in the model.")
-            input_tensor.reshape(inputs[name].shape)
-            input_tensor.copy_from_cpu(inputs[name].numpy())
+            input_list.append(inputs[name])
         # do the inference
-        self.model.run()
+        self.model.run(input_list)
         results = []
         # get out data from output tensor
         output_names = self.model.get_output_names()
@@ -1075,6 +1076,7 @@ def load_model(
         model_path: Union[str, Path],
         params_path: Union[str, Path] = None,
         infer_config: Optional["paddle_infer.Congig"] = None,
+        paddle_delete_pass: Optional[List] = None,
     ):
         """
         Loads an FastDeploy Inference Model with fastdeploy.RuntimeOption
@@ -1092,6 +1094,9 @@
             infer_config = paddle_infer.Config()
             infer_config.set_prog_file(model_path)
             infer_config.set_params_file(params_path)
+        if paddle_delete_pass is not None:
+            for pass_name in paddle_delete_pass:
+                infer_config.delete_pass(pass_name)
         return paddle_infer.create_predictor(infer_config)

     def _save_pretrained(
@@ -1167,6 +1172,7 @@ def _from_pretrained(
         force_download: bool = False,
         cache_dir: Optional[str] = None,
         infer_config: Optional['paddle_infer.Config'] = None,
+        paddle_delete_pass: Optional[List] = None,
         from_hf_hub: Optional[bool] = False,
         proxies: Optional[Dict] = None,
         resume_download: bool = False,
@@ -1220,6 +1226,7 @@
                 model_path,
                 params_path,
                 infer_config=infer_config,
+                paddle_delete_pass=paddle_delete_pass,
             )
             kwargs["model_save_dir"] = Path(pretrained_model_name_or_path)
         # load model from hub or paddle bos
@@ -1271,6 +1278,7 @@ def from_pretrained(
         model_file_name: Optional[str] = None,
         params_file_name: Optional[str] = None,
         infer_configs: Optional['paddle_infer.Config'] = None,
+        paddle_delete_pass: Optional[List] = None,
         **kwargs,
     ):
         from_hf_hub = kwargs.pop("from_hf_hub", FROM_HF_HUB)
@@ -1302,6 +1310,7 @@
             force_download=force_download,
             cache_dir=cache_dir,
             infer_config=infer_configs,
+            paddle_delete_pass=paddle_delete_pass,
             from_hf_hub=from_hf_hub,
             proxies=proxies,
             resume_download=resume_download,
diff --git a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
index e055dd817..73333340f 100644
--- a/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
+++ b/ppdiffusers/ppdiffusers/pipelines/pipeline_utils.py
@@ -374,6 +374,7 @@ def load_sub_model(
     paddle_dtype: paddle.dtype,
     runtime_options: Any,
     infer_configs: Any,
+    paddle_delete_passes: Any,
     model_variants: Dict[str, str],
     name: str,
     from_diffusers: bool,
@@ -436,7 +437,7 @@ def load_sub_model(
     if issubclass(class_obj, PaddleInferModel):
         loading_kwargs["infer_configs"] = infer_configs.get(name, None) if isinstance(infer_configs, dict) else infer_configs
-
+        loading_kwargs["paddle_delete_pass"] = paddle_delete_passes.get(name, None) if isinstance(paddle_delete_passes, dict) else paddle_delete_passes

     from ppdiffusers import ModelMixin
@@ -451,7 +452,6 @@ def load_sub_model(
     try:
         # check if the module is in a subdirectory
         if os.path.isdir(os.path.join(cached_folder, name)):
-            # import pdb; pdb.set_trace()
             loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
         else:
             # else load from the root directory
@@ -938,6 +938,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
         custom_revision = kwargs.pop("custom_revision", None)
         runtime_options = kwargs.pop("runtime_options", None)
         infer_configs = kwargs.pop("infer_configs", None)
+        paddle_delete_passes = kwargs.pop("paddle_delete_passes", None)
         low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", LOW_CPU_MEM_USAGE_DEFAULT)
         use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False)
         variant = kwargs.pop("variant", None)
@@ -1118,6 +1119,7 @@ def load_module(name, value):
                 paddle_dtype=paddle_dtype,
                 runtime_options=runtime_options,
                 infer_configs=infer_configs,
+                paddle_delete_passes=paddle_delete_passes,
                 model_variants=model_variants,
                 name=name,
                 from_diffusers=from_diffusers,
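The pipeline_utils.py hunks above complete the plumbing: from_pretrained pops a per-submodel paddle_delete_passes mapping, load_sub_model picks the entry for each submodel name, and PaddleInferModel.load_model deletes the named IR passes from the predictor config before create_predictor is called. A minimal usage sketch follows; the model directories are hypothetical, the pass names are reused from infer_paddle.py above, and it assumes both classes are re-exported from the ppdiffusers top-level package as the __init__.py entries in this patch suggest.

# Usage sketch (hypothetical paths): per-submodel pass deletion for a static SDXL pipeline.
from ppdiffusers import PaddleInferModel, PaddleInferStableDiffusionXLPipeline

# Pipeline level: a dict keyed by submodel name, consumed by from_pretrained as
# "paddle_delete_passes" and forwarded by load_sub_model to each PaddleInferModel.
pipe = PaddleInferStableDiffusionXLPipeline.from_pretrained(
    "static_model/stable-diffusion-xl",   # hypothetical exported model directory
    infer_configs=None,                   # or the per-submodel runtime dict built as in main() above
    paddle_delete_passes={"unet": ["auto_mixed_precision_pass"]},
)

# Single-model level: the same list is accepted directly as "paddle_delete_pass",
# and load_model() calls infer_config.delete_pass(name) for each entry.
unet = PaddleInferModel.from_pretrained(
    "static_model/stable-diffusion-xl/unet",   # hypothetical submodel directory
    paddle_delete_pass=["auto_mixed_precision_pass", "add_support_int8_pass"],
)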