add two optimizations

--pin-shared-memory and --cuda-malloc

See also the updates in the README for more details.
lllyasviel 2024-02-23 18:39:32 -08:00 committed by GitHub
parent 54c89503eb
commit 88f395091b
5 changed files with 114 additions and 6 deletions

View File

@@ -80,7 +80,13 @@ Forge backend removes all WebUI's codes related to resource management and rewor
 Without any cmd flag, Forge can run SDXL with 4GB vram and SD1.5 with 2GB vram.
 
-**The only one flag that you may still need** is `--always-offload-from-vram` (This flag will make things **slower**). This option will let Forge always unload models from VRAM. This can be useful if you use multiple software together and want Forge to use less VRAM and give some vram to other software, or when you are using some old extensions that will compete vram with Forge, or (very rarely) when you get OOM.
+**Some flags that you may still pay attention to:**
+
+1. `--always-offload-from-vram` (This flag will make things **slower** but less risky.) This option makes Forge always unload models from VRAM. It can be useful if you run multiple pieces of software together and want Forge to use less VRAM and leave some VRAM for other software, if you use old extensions that compete with Forge for VRAM, or (very rarely) when you get an OOM.
+
+2. `--pin-shared-memory` (This flag will make things **faster** but more risky.) It offloads models to Shared GPU Memory instead of system RAM when offloading models. On some 30XX/40XX devices with small VRAM (e.g., RTX 4050 6GB, RTX 3060 Laptop 6GB), I can observe a significant (at least 20%) speed-up for SDXL. However, this cannot be the default because an OOM in Shared GPU Memory is a much more severe problem than a common GPU-memory OOM: PyTorch provides no robust way to unload or even detect Shared GPU Memory, so once it OOMs the entire program crashes (observed with SDXL on GTX 1060/1050/1066), and there is no dynamic way to prevent or recover from the crash. Users enable this cmd flag at their own risk.
+
+3. `--cuda-malloc` (This flag will make things **faster** but more risky.) It asks PyTorch to use *cudaMallocAsync* for tensor malloc. In some profilers I can observe performance gains at the millisecond level, but the real speed-up on most of my devices is often unnoticeable (about or less than 0.1 second per image). This cannot be the default because many users have reported that the async malloc crashes the program. Users enable this cmd flag at their own risk.
+
 If you really want to play with cmd flags, you can additionally control the GPU with:
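
Not part of the commit, but a rough illustration of where the `--pin-shared-memory` speed-up comes from: pinned (page-locked) host memory can be copied between RAM and VRAM noticeably faster than ordinary pageable memory. A minimal sketch, assuming a CUDA-capable machine and a recent PyTorch; the helper name `avg_h2d_seconds` and the tensor size are illustrative only:

```python
import time
import torch

def avg_h2d_seconds(host_tensor, iters=20):
    # Average time for a host-to-device copy of the given CPU tensor.
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        host_tensor.to('cuda', non_blocking=True)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

if torch.cuda.is_available():
    pageable = torch.randn(64, 1024, 1024)    # ordinary (pageable) host memory, ~256 MB
    pinned = pageable.clone().pin_memory()    # page-locked ("pinned") host memory
    print('pageable H2D:', avg_h2d_seconds(pageable))
    print('pinned   H2D:', avg_h2d_seconds(pinned))
```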

View File

@@ -49,9 +49,6 @@ parser.add_argument("--cache-path", type=str, default=None)
 parser.add_argument("--in-browser", action="store_true")
 parser.add_argument("--disable-in-browser", action="store_true")
 parser.add_argument("--gpu-device-id", type=int, default=None, metavar="DEVICE_ID")
-cm_group = parser.add_mutually_exclusive_group()
-cm_group.add_argument("--async-cuda-allocation", action="store_true")
-cm_group.add_argument("--disable-async-cuda-allocation", action="store_true")
 parser.add_argument("--disable-attention-upcast", action="store_true")

@@ -118,6 +115,9 @@ parser.add_argument("--disable-server-info", action="store_true")
 parser.add_argument("--multi-user", action="store_true")
 
+parser.add_argument("--cuda-malloc", action="store_true")
+parser.add_argument("--pin-shared-memory", action="store_true")
+
 if ldm_patched.modules.options.args_parsing:
     args = parser.parse_args([])
 else:
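
Not part of the commit: both new options are plain opt-in booleans (replacing the old mutually exclusive `--async-cuda-allocation` pair), so downstream code only sees `True` when the user explicitly passes the flag. A tiny sketch of the same pattern in isolation:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda-malloc", action="store_true")
parser.add_argument("--pin-shared-memory", action="store_true")

args = parser.parse_args(["--cuda-malloc"])
print(args.cuda_malloc)        # True  (flag was passed)
print(args.pin_shared_memory)  # False (defaults to off)
```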

View File

@@ -244,6 +244,12 @@ ALWAYS_VRAM_OFFLOAD = args.always_offload_from_vram
 if ALWAYS_VRAM_OFFLOAD:
     print("Always offload VRAM")
 
+PIN_SHARED_MEMORY = args.pin_shared_memory
+
+if PIN_SHARED_MEMORY:
+    print("Always pin shared GPU memory")
+
 def get_torch_device_name(device):
     if hasattr(device, 'type'):
         if device.type == "cuda":

@@ -328,8 +334,8 @@ class LoadedModel:
                 else:
                     real_async_memory += module_mem
                     m.to(self.model.offload_device)
-                    # if is_device_cpu(self.model.offload_device):
-                    #     m._apply(lambda x: x.pin_memory())
+                    if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
+                        m._apply(lambda x: x.pin_memory())
             elif hasattr(m, "weight"):
                 m.to(self.device)
                 mem_counter += module_size(m)
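
Not part of the commit: a minimal sketch of the pin-on-offload pattern used above. It pins a CPU-resident module's tensors via the (private) `torch.nn.Module._apply` helper, the same call the new code makes, so the later copy back to VRAM can run asynchronously. The names `pin_module_memory` and the `nn.Linear` stand-in are illustrative only:

```python
import torch
import torch.nn as nn

def pin_module_memory(module: nn.Module) -> nn.Module:
    # Rebuild every parameter/buffer tensor as a pinned (page-locked) copy,
    # leaving any tensor that is not on the CPU untouched.
    return module._apply(lambda t: t.pin_memory() if t.device.type == 'cpu' else t)

block = pin_module_memory(nn.Linear(4096, 4096))   # stand-in for an offloaded model block
if torch.cuda.is_available():
    block.to('cuda', non_blocking=True)            # async copy is possible because the source is pinned
```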

View File

@@ -0,0 +1,92 @@
import os
import importlib.util

# https://github.com/comfyanonymous/ComfyUI/blob/master/cuda_malloc.py


def get_gpu_names():
    if os.name == 'nt':
        import ctypes

        # Define necessary C structures and types
        class DISPLAY_DEVICEA(ctypes.Structure):
            _fields_ = [
                ('cb', ctypes.c_ulong),
                ('DeviceName', ctypes.c_char * 32),
                ('DeviceString', ctypes.c_char * 128),
                ('StateFlags', ctypes.c_ulong),
                ('DeviceID', ctypes.c_char * 128),
                ('DeviceKey', ctypes.c_char * 128)
            ]

        # Load user32.dll
        user32 = ctypes.windll.user32

        # Call EnumDisplayDevicesA
        def enum_display_devices():
            device_info = DISPLAY_DEVICEA()
            device_info.cb = ctypes.sizeof(device_info)
            device_index = 0
            gpu_names = set()
            while user32.EnumDisplayDevicesA(None, device_index, ctypes.byref(device_info), 0):
                device_index += 1
                gpu_names.add(device_info.DeviceString.decode('utf-8'))
            return gpu_names

        return enum_display_devices()
    else:
        return set()


blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M",
             "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620",
             "Quadro K1200", "Quadro K2200", "Quadro M500", "Quadro M520", "Quadro M600", "Quadro M620", "Quadro M1000",
             "Quadro M1200", "Quadro M2000", "Quadro M2200", "Quadro M3000", "Quadro M4000", "Quadro M5000", "Quadro M5500", "Quadro M6000",
             "GeForce MX110", "GeForce MX130", "GeForce 830M", "GeForce 840M", "GeForce GTX 850M", "GeForce GTX 860M",
             "GeForce GTX 1650", "GeForce GTX 1630"
             }


def cuda_malloc_supported():
    try:
        names = get_gpu_names()
    except:
        names = set()
    for x in names:
        if "NVIDIA" in x:
            for b in blacklist:
                if b in x:
                    return False
    return True


def try_cuda_malloc():
    do_cuda_malloc = False

    try:
        version = ""
        torch_spec = importlib.util.find_spec("torch")
        for folder in torch_spec.submodule_search_locations:
            ver_file = os.path.join(folder, "version.py")
            if os.path.isfile(ver_file):
                spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                version = module.__version__
        if int(version[0]) >= 2:
            do_cuda_malloc = cuda_malloc_supported()
    except:
        pass

    if do_cuda_malloc:
        env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
        if env_var is None:
            env_var = "backend:cudaMallocAsync"
        else:
            env_var += ",backend:cudaMallocAsync"
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
        print('Using cudaMallocAsync backend.')
    else:
        print('Failed to use cudaMallocAsync backend.')

    return
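
Not part of the commit: a quick way to exercise the helpers above from inside a Forge checkout (module path `modules_forge.cuda_malloc`, as wired up in the file below):

```python
from modules_forge.cuda_malloc import get_gpu_names, cuda_malloc_supported

print(get_gpu_names())          # GPU name strings on Windows, empty set elsewhere
print(cuda_malloc_supported())  # False when a blacklisted GPU name is detected
```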

View File

@@ -43,6 +43,10 @@ def initialize_forge():
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args_parser.args.gpu_device_id)
         print("Set device to:", args_parser.args.gpu_device_id)
 
+    if args_parser.args.cuda_malloc:
+        from modules_forge.cuda_malloc import try_cuda_malloc
+        try_cuda_malloc()
+
     import ldm_patched.modules.model_management as model_management
     import torch
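
Not part of the commit: the ordering above matters, since `PYTORCH_CUDA_ALLOC_CONF` is only honored if it is set before PyTorch initializes its CUDA allocator, which is why `try_cuda_malloc()` runs before the `import torch`. A hedged sketch of how one might check that the backend actually switched; `torch.cuda.get_allocator_backend()` should be available in recent PyTorch 2.x releases:

```python
from modules_forge.cuda_malloc import try_cuda_malloc

try_cuda_malloc()   # sets PYTORCH_CUDA_ALLOC_CONF when the GPU and torch version support it

import os
print(os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))  # e.g. "backend:cudaMallocAsync"

import torch
if torch.cuda.is_available():
    print(torch.cuda.get_allocator_backend())     # 'cudaMallocAsync' or 'native'
```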