import torch

import ldm_patched.modules.ops as ops

from ldm_patched.modules.model_patcher import ModelPatcher
from ldm_patched.modules import model_management
from modules_forge.ops import use_patched_ops
from transformers import modeling_utils

class DiffusersModelPatcher:
    def __init__(self, pipeline_class, dtype=torch.float16, *args, **kwargs):
        load_device = model_management.get_torch_device()
        offload_device = torch.device("cpu")

        # Fall back to fp32 on devices where fp16 is unsupported or slow.
        if not model_management.should_use_fp16(device=load_device):
            dtype = torch.float32

        self.dtype = dtype

        # Load with Forge's manual-cast ops patched in, and skip the usual
        # random weight initialization: from_pretrained overwrites every
        # tensor with checkpoint weights anyway, so initializing them first
        # would only waste time.
        with use_patched_ops(ops.manual_cast):
            with modeling_utils.no_init_weights():
                self.pipeline = pipeline_class.from_pretrained(*args, **kwargs)

        # Switch the UNet to PyTorch 2.0 scaled-dot-product attention when
        # the pipeline exposes one that supports processor swapping.
        if hasattr(self.pipeline, 'unet'):
            if hasattr(self.pipeline.unet, 'set_attn_processor'):
                from diffusers.models.attention_processor import AttnProcessor2_0
                self.pipeline.unet.set_attn_processor(AttnProcessor2_0())
                print('Attention optimization applied to DiffusersModelPatcher')

        # Park the pipeline on the offload device; the ModelPatcher built
        # below moves it to the load device only while it is actually in use.
        self.pipeline = self.pipeline.to(device=offload_device)

        if self.dtype == torch.float16:
            self.pipeline = self.pipeline.half()

        self.pipeline.eval()

        self.patcher = ModelPatcher(
            model=self.pipeline,
            load_device=load_device,
            offload_device=offload_device)

    def prepare_memory_before_sampling(self, batchsize, latent_width, latent_height):
        # Rough estimate of inference memory in bytes: latent area per step
        # (the factor 2 presumably covers the cond/uncond pair of
        # classifier-free guidance), scaled by an empirical 0.6 / 0.9 factor,
        # padded with 1024 MB of headroom, then converted from MB to bytes.
        area = 2 * batchsize * latent_width * latent_height
        inference_memory = (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
        model_management.load_models_gpu(
            models=[self.patcher],
            memory_required=inference_memory
        )
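        # Worked example under this heuristic (illustrative numbers): one
        # 512x512 image has a 64x64 latent, so area = 2 * 1 * 64 * 64 = 8192
        # and the reservation is (8192 * 0.6 / 0.9 + 1024) MB, about 6.3 GB.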

    def move_tensor_to_current_device(self, x):
        # Match both the device the patcher currently keeps the model on and
        # the dtype chosen at construction time.
        return x.to(device=self.patcher.current_device, dtype=self.dtype)
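
# A minimal usage sketch, assuming a hypothetical pipeline class `MyPipeline`
# that provides from_pretrained()/to()/half()/eval() as this wrapper expects,
# and a hypothetical checkpoint path; every name below is illustrative only.
#
#     patcher = DiffusersModelPatcher(
#         MyPipeline, pretrained_model_name_or_path='/path/to/checkpoint')
#     patcher.prepare_memory_before_sampling(
#         batchsize=1, latent_width=64, latent_height=64)
#     x = patcher.move_tensor_to_current_device(torch.randn(1, 4, 64, 64))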