113 lines
4.2 KiB
Python
113 lines
4.2 KiB
Python
# Consistent with Kohya/A1111 to reduce differences between model training and inference.
|
|
|
|
import os
|
|
import torch
|
|
import ldm_patched.controlnet.cldm
|
|
import ldm_patched.k_diffusion.sampling
|
|
import ldm_patched.ldm.modules.attention
|
|
import ldm_patched.ldm.modules.diffusionmodules.model
|
|
import ldm_patched.ldm.modules.diffusionmodules.openaimodel
|
|
import ldm_patched.ldm.modules.diffusionmodules.openaimodel
|
|
import ldm_patched.modules.args_parser
|
|
import ldm_patched.modules.model_base
|
|
import ldm_patched.modules.model_management
|
|
import ldm_patched.modules.model_patcher
|
|
import ldm_patched.modules.samplers
|
|
import ldm_patched.modules.sd
|
|
import ldm_patched.modules.sd1_clip
|
|
import ldm_patched.modules.clip_vision
|
|
import ldm_patched.modules.ops as ops
|
|
|
|
from modules_forge.ops import use_patched_ops
|
|
from transformers import CLIPTextModel, CLIPTextConfig, modeling_utils
|
|
|
|
|
|
def patched_SDClipModel__init__(self, max_length=77, freeze=True, layer="last", layer_idx=None,
                                textmodel_json_config=None, dtype=None, special_tokens=None,
                                layer_norm_hidden_state=True, **kwargs):
    """Replacement ``__init__`` for ``SDClipModel`` (installed by ``patch_all_clip``).

    Builds a ``CLIPTextModel`` from a JSON config without initialising its
    weights, then stores the settings that the patched ``forward`` reads.

    Args:
        max_length: Stored on the instance as ``self.max_length``.
        freeze: When truthy, ``self.freeze()`` is called right after the
            transformer is built.
        layer: Output selector; must be a member of ``self.LAYERS``.
        layer_idx: Hidden-state index. Only consulted when ``layer == "hidden"``,
            in which case it must not be ``None`` and its absolute value must be
            smaller than the number of hidden layers in the config.
        textmodel_json_config: Path to the CLIP text config JSON. Defaults to
            ``sd1_clip_config.json`` next to the ``ldm_patched.modules.sd1_clip``
            module file.
        dtype: If given, the transformer is converted to this dtype.
        special_tokens: Mapping with ``"start"``, ``"end"`` and ``"pad"`` token
            ids. Defaults to 49406 / 49407 / 49407.
        layer_norm_hidden_state: Stored as ``self.layer_norm_hidden_state``.
        **kwargs: Accepted for signature compatibility; not used here.
    """
    torch.nn.Module.__init__(self)
    assert layer in self.LAYERS

    special_tokens = (
        {"start": 49406, "end": 49407, "pad": 49407}
        if special_tokens is None
        else special_tokens
    )

    if textmodel_json_config is None:
        # Fall back to the config file that ships alongside the sd1_clip module.
        sd1_clip_dir = os.path.dirname(os.path.realpath(ldm_patched.modules.sd1_clip.__file__))
        textmodel_json_config = os.path.join(sd1_clip_dir, "sd1_clip_config.json")

    config = CLIPTextConfig.from_json_file(textmodel_json_config)
    self.num_layers = config.num_hidden_layers

    # Construct the text model under the patched (manual-cast) ops and with
    # weight initialisation disabled.
    with use_patched_ops(ops.manual_cast), modeling_utils.no_init_weights():
        self.transformer = CLIPTextModel(config)

    if dtype is not None:
        self.transformer.to(dtype)

    # The embedding sub-module is always moved to float32, whatever `dtype` is.
    self.transformer.text_model.embeddings.to(torch.float32)

    # NOTE(review): freeze() runs before text_projection / logit_scale exist,
    # so whatever it does to existing parameters presumably does not reach
    # them. Keep this ordering.
    if freeze:
        self.freeze()

    self.max_length = max_length
    self.layer = layer
    # Starts as None; only self.clip_layer() below may change it.
    self.layer_idx = None
    self.special_tokens = special_tokens

    # Identity projection whose side equals the token-embedding width.
    embedding_width = self.transformer.get_input_embeddings().weight.shape[1]
    self.text_projection = torch.nn.Parameter(torch.eye(embedding_width))
    self.logit_scale = torch.nn.Parameter(torch.tensor(4.6055))
    self.enable_attention_masks = False

    self.layer_norm_hidden_state = layer_norm_hidden_state
    if layer == "hidden":
        assert layer_idx is not None
        assert abs(layer_idx) < self.num_layers
        self.clip_layer(layer_idx)

    # Snapshot of the (layer, layer_idx) pair as configured at construction.
    self.layer_default = (self.layer, self.layer_idx)
|
|
|
|
|
|
def patched_SDClipModel_forward(self, tokens):
    """Replacement ``forward`` for ``SDClipModel`` (installed by ``patch_all_clip``).

    Runs the wrapped text transformer on ``tokens`` and returns the selected
    hidden representation together with the projected pooled output, both
    cast to float32.

    Args:
        tokens: Batch of token sequences in whatever form
            ``self.set_up_textual_embeddings`` accepts; its result is converted
            with ``torch.LongTensor`` and must therefore be a rectangular
            batch of integer ids.

    Returns:
        A tuple ``(z, pooled_output)``. ``z`` is chosen by ``self.layer``
        (``"last"``, ``"pooled"``, otherwise ``hidden_states[self.layer_idx]``
        with the final layer norm applied when ``self.layer_norm_hidden_state``
        is set). ``pooled_output`` is the pooler output multiplied by
        ``self.text_projection`` when one is set, or ``None`` when the
        transformer provides no pooler output.
    """
    backup_embeds = self.transformer.get_input_embeddings()
    device = backup_embeds.weight.device

    try:
        # NOTE(review): set_up_textual_embeddings presumably installs a
        # temporary embedding table on the transformer; confirm against its
        # definition. The ``finally`` below puts the original table back even
        # when conversion or the transformer call raises, so a failed call
        # cannot leave the model with a stale table.
        tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
        tokens = torch.LongTensor(tokens).to(device)

        attention_mask = None
        if self.enable_attention_masks:
            attention_mask = torch.zeros_like(tokens)
            # Highest id in the embedding table that is currently installed.
            max_token = self.transformer.get_input_embeddings().weight.shape[0] - 1
            # Each row is attended up to and including its first `max_token`,
            # or entirely when that id is absent. Scanning a plain list copy
            # avoids one tensor read and write per element.
            for row_idx, row in enumerate(tokens.tolist()):
                visible = row.index(max_token) + 1 if max_token in row else len(row)
                attention_mask[row_idx, :visible] = 1

        outputs = self.transformer(input_ids=tokens, attention_mask=attention_mask,
                                   output_hidden_states=self.layer == "hidden")
    finally:
        self.transformer.set_input_embeddings(backup_embeds)

    if self.layer == "last":
        z = outputs.last_hidden_state
    elif self.layer == "pooled":
        z = outputs.pooler_output[:, None, :]
    else:
        z = outputs.hidden_states[self.layer_idx]
        if self.layer_norm_hidden_state:
            z = self.transformer.text_model.final_layer_norm(z)

    pooled_output = getattr(outputs, "pooler_output", None)
    if pooled_output is not None:
        pooled_output = pooled_output.float()
        if self.text_projection is not None:
            projection = self.text_projection
            pooled_output = pooled_output.to(projection.device) @ projection.float()

    return z.float(), pooled_output
|
|
|
|
|
|
def patch_all_clip():
    """Install the patched constructor and forward pass on ``SDClipModel``.

    Rebinds ``__init__`` and ``forward`` on the
    ``ldm_patched.modules.sd1_clip.SDClipModel`` class itself, so the change
    is process-wide. Returns ``None``.
    """
    clip_model_cls = ldm_patched.modules.sd1_clip.SDClipModel
    clip_model_cls.__init__ = patched_SDClipModel__init__
    clip_model_cls.forward = patched_SDClipModel_forward
|