add two optimizations

--pin-shared-memory and --cuda-malloc

See also the updates in the README for more details.
lllyasviel 2024-02-23 18:39:32 -08:00 committed by GitHub
parent 54c89503eb
commit 88f395091b
5 changed files with 114 additions and 6 deletions

View File

@@ -80,7 +80,13 @@ Forge backend removes all WebUI's codes related to resource management and rewor
 Without any cmd flag, Forge can run SDXL with 4GB vram and SD1.5 with 2GB vram.
 
-**The only one flag that you may still need** is `--always-offload-from-vram` (This flag will make things **slower**). This option will let Forge always unload models from VRAM. This can be useful if you use multiple software together and want Forge to use less VRAM and give some vram to other software, or when you are using some old extensions that will compete vram with Forge, or (very rarely) when you get OOM.
+**Some flags that you may still pay attention to:**
+
+1. `--always-offload-from-vram` (This flag will make things **slower** but less risky.) This option makes Forge always unload models from VRAM. It can be useful if you run multiple pieces of software together and want Forge to use less VRAM and leave some VRAM for other software, if you use old extensions that compete with Forge for VRAM, or (very rarely) when you get an OOM.
+
+2. `--pin-shared-memory` (This flag will make things **faster** but more risky.) It offloads models to Shared GPU Memory instead of system RAM when offloading models. On some 30XX/40XX devices with small VRAM (e.g., RTX 4050 6GB, RTX 3060 Laptop 6GB), I can observe a significant (at least 20%) speed-up for SDXL. However, this cannot be the default because an OOM in Shared GPU Memory is a much more severe problem than a common GPU-memory OOM: PyTorch provides no robust way to unload or even detect Shared GPU Memory, so once it OOMs the entire program crashes (observed with SDXL on GTX 1060/1050/1066), and there is no dynamic way to prevent or recover from the crash. Users enable this cmd flag at their own risk.
+
+3. `--cuda-malloc` (This flag will make things **faster** but more risky.) It asks PyTorch to use *cudaMallocAsync* for tensor malloc. In some profilers I can observe performance gains at the millisecond level, but the real speed-up on most of my devices is often unnoticeable (about or less than 0.1 second per image). This cannot be the default because many users have reported that the async malloc crashes the program. Users enable this cmd flag at their own risk.
+
 If you really want to play with cmd flags, you can additionally control the GPU with:
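
Not part of the commit, but a rough illustration of where the `--pin-shared-memory` speed-up comes from: pinned (page-locked) host memory can be copied between RAM and VRAM noticeably faster than ordinary pageable memory. A minimal sketch, assuming a CUDA-capable machine and a recent PyTorch; the helper name `avg_h2d_seconds` and the tensor size are illustrative only:

```python
import time
import torch

def avg_h2d_seconds(host_tensor, iters=20):
    # Average time for a host-to-device copy of the given CPU tensor.
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        host_tensor.to('cuda', non_blocking=True)
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

if torch.cuda.is_available():
    pageable = torch.randn(64, 1024, 1024)    # ordinary (pageable) host memory, ~256 MB
    pinned = pageable.clone().pin_memory()    # page-locked ("pinned") host memory
    print('pageable H2D:', avg_h2d_seconds(pageable))
    print('pinned   H2D:', avg_h2d_seconds(pinned))
```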

View File

@@ -49,9 +49,6 @@ parser.add_argument("--cache-path", type=str, default=None)
 parser.add_argument("--in-browser", action="store_true")
 parser.add_argument("--disable-in-browser", action="store_true")
 parser.add_argument("--gpu-device-id", type=int, default=None, metavar="DEVICE_ID")
-cm_group = parser.add_mutually_exclusive_group()
-cm_group.add_argument("--async-cuda-allocation", action="store_true")
-cm_group.add_argument("--disable-async-cuda-allocation", action="store_true")
 parser.add_argument("--disable-attention-upcast", action="store_true")

@@ -118,6 +115,9 @@ parser.add_argument("--disable-server-info", action="store_true")
 parser.add_argument("--multi-user", action="store_true")
 
+parser.add_argument("--cuda-malloc", action="store_true")
+parser.add_argument("--pin-shared-memory", action="store_true")
+
 if ldm_patched.modules.options.args_parsing:
     args = parser.parse_args([])
 else:
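
Not part of the commit: both new options are plain opt-in booleans (replacing the old mutually exclusive `--async-cuda-allocation` pair), so downstream code only sees `True` when the user explicitly passes the flag. A tiny sketch of the same pattern in isolation:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda-malloc", action="store_true")
parser.add_argument("--pin-shared-memory", action="store_true")

args = parser.parse_args(["--cuda-malloc"])
print(args.cuda_malloc)        # True  (flag was passed)
print(args.pin_shared_memory)  # False (defaults to off)
```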

View File

@@ -244,6 +244,12 @@ ALWAYS_VRAM_OFFLOAD = args.always_offload_from_vram
 if ALWAYS_VRAM_OFFLOAD:
     print("Always offload VRAM")
 
+PIN_SHARED_MEMORY = args.pin_shared_memory
+
+if PIN_SHARED_MEMORY:
+    print("Always pin shared GPU memory")
+
 def get_torch_device_name(device):
     if hasattr(device, 'type'):
         if device.type == "cuda":

@@ -328,8 +334,8 @@ class LoadedModel:
                 else:
                     real_async_memory += module_mem
                     m.to(self.model.offload_device)
-                    # if is_device_cpu(self.model.offload_device):
-                    #     m._apply(lambda x: x.pin_memory())
+                    if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
+                        m._apply(lambda x: x.pin_memory())
             elif hasattr(m, "weight"):
                 m.to(self.device)
                 mem_counter += module_size(m)
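
Not part of the commit: a minimal sketch of the pin-on-offload pattern used above. It pins a CPU-resident module's tensors via the (private) `torch.nn.Module._apply` helper, the same call the new code makes, so the later copy back to VRAM can run asynchronously. The names `pin_module_memory` and the `nn.Linear` stand-in are illustrative only:

```python
import torch
import torch.nn as nn

def pin_module_memory(module: nn.Module) -> nn.Module:
    # Rebuild every parameter/buffer tensor as a pinned (page-locked) copy,
    # leaving any tensor that is not on the CPU untouched.
    return module._apply(lambda t: t.pin_memory() if t.device.type == 'cpu' else t)

block = pin_module_memory(nn.Linear(4096, 4096))   # stand-in for an offloaded model block
if torch.cuda.is_available():
    block.to('cuda', non_blocking=True)            # async copy is possible because the source is pinned
```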

View File

@@ -0,0 +1,92 @@
import os
import importlib.util

# https://github.com/comfyanonymous/ComfyUI/blob/master/cuda_malloc.py


def get_gpu_names():
    if os.name == 'nt':
        import ctypes

        # Define necessary C structures and types
        class DISPLAY_DEVICEA(ctypes.Structure):
            _fields_ = [
                ('cb', ctypes.c_ulong),
                ('DeviceName', ctypes.c_char * 32),
                ('DeviceString', ctypes.c_char * 128),
                ('StateFlags', ctypes.c_ulong),
                ('DeviceID', ctypes.c_char * 128),
                ('DeviceKey', ctypes.c_char * 128)
            ]

        # Load user32.dll
        user32 = ctypes.windll.user32

        # Call EnumDisplayDevicesA
        def enum_display_devices():
            device_info = DISPLAY_DEVICEA()
            device_info.cb = ctypes.sizeof(device_info)
            device_index = 0
            gpu_names = set()
            while user32.EnumDisplayDevicesA(None, device_index, ctypes.byref(device_info), 0):
                device_index += 1
                gpu_names.add(device_info.DeviceString.decode('utf-8'))
            return gpu_names

        return enum_display_devices()
    else:
        return set()


blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M",
             "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620",
             "Quadro K1200", "Quadro K2200", "Quadro M500", "Quadro M520", "Quadro M600", "Quadro M620", "Quadro M1000",
             "Quadro M1200", "Quadro M2000", "Quadro M2200", "Quadro M3000", "Quadro M4000", "Quadro M5000", "Quadro M5500", "Quadro M6000",
             "GeForce MX110", "GeForce MX130", "GeForce 830M", "GeForce 840M", "GeForce GTX 850M", "GeForce GTX 860M",
             "GeForce GTX 1650", "GeForce GTX 1630"
             }


def cuda_malloc_supported():
    try:
        names = get_gpu_names()
    except:
        names = set()
    for x in names:
        if "NVIDIA" in x:
            for b in blacklist:
                if b in x:
                    return False
    return True


def try_cuda_malloc():
    do_cuda_malloc = False

    try:
        version = ""
        torch_spec = importlib.util.find_spec("torch")
        for folder in torch_spec.submodule_search_locations:
            ver_file = os.path.join(folder, "version.py")
            if os.path.isfile(ver_file):
                spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                version = module.__version__
        if int(version[0]) >= 2:
            do_cuda_malloc = cuda_malloc_supported()
    except:
        pass

    if do_cuda_malloc:
        env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
        if env_var is None:
            env_var = "backend:cudaMallocAsync"
        else:
            env_var += ",backend:cudaMallocAsync"
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
        print('Using cudaMallocAsync backend.')
    else:
        print('Failed to use cudaMallocAsync backend.')

    return
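
Not part of the commit: a quick way to exercise the helpers above from inside a Forge checkout (module path `modules_forge.cuda_malloc`, as wired up in the file below):

```python
from modules_forge.cuda_malloc import get_gpu_names, cuda_malloc_supported

print(get_gpu_names())          # GPU name strings on Windows, empty set elsewhere
print(cuda_malloc_supported())  # False when a blacklisted GPU name is detected
```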

View File

@@ -43,6 +43,10 @@ def initialize_forge():
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args_parser.args.gpu_device_id)
         print("Set device to:", args_parser.args.gpu_device_id)
 
+    if args_parser.args.cuda_malloc:
+        from modules_forge.cuda_malloc import try_cuda_malloc
+        try_cuda_malloc()
+
     import ldm_patched.modules.model_management as model_management
     import torch
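
Not part of the commit: the ordering above matters, since `PYTORCH_CUDA_ALLOC_CONF` is only honored if it is set before PyTorch initializes its CUDA allocator, which is why `try_cuda_malloc()` runs before the `import torch`. A hedged sketch of how one might check that the backend actually switched; `torch.cuda.get_allocator_backend()` should be available in recent PyTorch 2.x releases:

```python
from modules_forge.cuda_malloc import try_cuda_malloc

try_cuda_malloc()   # sets PYTORCH_CUDA_ALLOC_CONF when the GPU and torch version support it

import os
print(os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))  # e.g. "backend:cudaMallocAsync"

import torch
if torch.cuda.is_available():
    print(torch.cuda.get_allocator_backend())     # 'cudaMallocAsync' or 'native'
```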