From 88f395091b24236697a15fc006131e28e2fccace Mon Sep 17 00:00:00 2001
From: lllyasviel <19834515+lllyasviel@users.noreply.github.com>
Date: Fri, 23 Feb 2024 18:39:32 -0800
Subject: [PATCH] add two optimizations --pin-shared-memory and --cuda-malloc

See also the updates in Readme for more details
---
 README.md                               |  8 ++-
 ldm_patched/modules/args_parser.py      |  6 +-
 ldm_patched/modules/model_management.py | 10 ++-
 modules_forge/cuda_malloc.py            | 92 +++++++++++++++++++++++++
 modules_forge/initialization.py         |  4 ++
 5 files changed, 114 insertions(+), 6 deletions(-)
 create mode 100644 modules_forge/cuda_malloc.py

diff --git a/README.md b/README.md
index 99c11553..77a04f11 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,13 @@ Forge backend removes all WebUI's codes related to resource management and rewor
 
 Without any cmd flag, Forge can run SDXL with 4GB vram and SD1.5 with 2GB vram.
 
-**The only one flag that you may still need** is `--always-offload-from-vram` (This flag will make things **slower**). This option will let Forge always unload models from VRAM. This can be useful if you use multiple software together and want Forge to use less VRAM and give some vram to other software, or when you are using some old extensions that will compete vram with Forge, or (very rarely) when you get OOM.
+**Some flags that you may still want to pay attention to:**
+
+1. `--always-offload-from-vram` (This flag will make things **slower** but less risky). This option makes Forge always unload models from VRAM. It can be useful if you run other software alongside Forge and want Forge to use less VRAM and leave some VRAM for that software, if you use old extensions that compete with Forge for VRAM, or (very rarely) when you get an OOM.
+
+2. `--pin-shared-memory` (This flag will make things **faster** but more risky). This offloads models to Shared GPU Memory instead of system RAM. On some 30XX/40XX devices with small VRAM (e.g., RTX 4050 6GB, RTX 3060 Laptop 6GB), I can observe a significant (at least 20%) speed-up for SDXL. However, this unfortunately cannot be the default, because an OOM in Shared GPU Memory is a much more severe problem than a common GPU memory OOM: PyTorch provides no robust way to unload or even detect Shared GPU Memory, so once Shared GPU Memory runs out the entire program crashes (observed with SDXL on GTX 1060/1050/1066), and there is no dynamic way to prevent or recover from the crash. Users need to enable this cmd flag at their own risk.
+
+3. `--cuda-malloc` (This flag will make things **faster** but more risky). This asks PyTorch to use *cudaMallocAsync* for tensor allocation. In some profilers I can observe millisecond-level performance gains, but the real speed-up on most of my devices is often unnoticeable (about or less than 0.1 second per image). This cannot be the default because many users have reported that the async malloc crashes the program. Users need to enable this cmd flag at their own risk.
 
 If you really want to play with cmd flags, you can additionally control the GPU with:
 
diff --git a/ldm_patched/modules/args_parser.py b/ldm_patched/modules/args_parser.py
index 766887a3..e4aac7bc 100644
--- a/ldm_patched/modules/args_parser.py
+++ b/ldm_patched/modules/args_parser.py
@@ -49,9 +49,6 @@ parser.add_argument("--cache-path", type=str, default=None)
 parser.add_argument("--in-browser", action="store_true")
 parser.add_argument("--disable-in-browser", action="store_true")
 parser.add_argument("--gpu-device-id", type=int, default=None, metavar="DEVICE_ID")
-cm_group = parser.add_mutually_exclusive_group()
-cm_group.add_argument("--async-cuda-allocation", action="store_true")
-cm_group.add_argument("--disable-async-cuda-allocation", action="store_true")
 
 parser.add_argument("--disable-attention-upcast", action="store_true")
 
@@ -118,6 +115,9 @@ parser.add_argument("--disable-server-info", action="store_true")
 
 parser.add_argument("--multi-user", action="store_true")
 
+parser.add_argument("--cuda-malloc", action="store_true")
+parser.add_argument("--pin-shared-memory", action="store_true")
+
 if ldm_patched.modules.options.args_parsing:
     args = parser.parse_args([])
 else:
diff --git a/ldm_patched/modules/model_management.py b/ldm_patched/modules/model_management.py
index b4025b32..b5ffd219 100644
--- a/ldm_patched/modules/model_management.py
+++ b/ldm_patched/modules/model_management.py
@@ -244,6 +244,12 @@ ALWAYS_VRAM_OFFLOAD = args.always_offload_from_vram
 if ALWAYS_VRAM_OFFLOAD:
     print("Always offload VRAM")
 
+PIN_SHARED_MEMORY = args.pin_shared_memory
+
+if PIN_SHARED_MEMORY:
+    print("Always pin shared GPU memory")
+
+
 def get_torch_device_name(device):
     if hasattr(device, 'type'):
         if device.type == "cuda":
@@ -328,8 +334,8 @@ class LoadedModel:
                     else:
                         real_async_memory += module_mem
                         m.to(self.model.offload_device)
-                        # if is_device_cpu(self.model.offload_device):
-                        #     m._apply(lambda x: x.pin_memory())
+                        if PIN_SHARED_MEMORY and is_device_cpu(self.model.offload_device):
+                            m._apply(lambda x: x.pin_memory())
                 elif hasattr(m, "weight"):
                     m.to(self.device)
                     mem_counter += module_size(m)
diff --git a/modules_forge/cuda_malloc.py b/modules_forge/cuda_malloc.py
new file mode 100644
index 00000000..8179a60f
--- /dev/null
+++ b/modules_forge/cuda_malloc.py
@@ -0,0 +1,92 @@
+import os
+import importlib.util
+
+
+# https://github.com/comfyanonymous/ComfyUI/blob/master/cuda_malloc.py
+def get_gpu_names():
+    if os.name == 'nt':
+        import ctypes
+
+        # Define necessary C structures and types
+        class DISPLAY_DEVICEA(ctypes.Structure):
+            _fields_ = [
+                ('cb', ctypes.c_ulong),
+                ('DeviceName', ctypes.c_char * 32),
+                ('DeviceString', ctypes.c_char * 128),
+                ('StateFlags', ctypes.c_ulong),
+                ('DeviceID', ctypes.c_char * 128),
+                ('DeviceKey', ctypes.c_char * 128)
+            ]
+
+        # Load user32.dll
+        user32 = ctypes.windll.user32
+
+        # Call EnumDisplayDevicesA
+        def enum_display_devices():
+            device_info = DISPLAY_DEVICEA()
+            device_info.cb = ctypes.sizeof(device_info)
+            device_index = 0
+            gpu_names = set()
+
+            while user32.EnumDisplayDevicesA(None, device_index, ctypes.byref(device_info), 0):
+                device_index += 1
+                gpu_names.add(device_info.DeviceString.decode('utf-8'))
+            return gpu_names
+        return enum_display_devices()
+    else:
+        return set()
+
+
+blacklist = {"GeForce GTX TITAN X", "GeForce GTX 980", "GeForce GTX 970", "GeForce GTX 960", "GeForce GTX 950", "GeForce 945M",
+             "GeForce 940M", "GeForce 930M", "GeForce 920M", "GeForce 910M", "GeForce GTX 750", "GeForce GTX 745", "Quadro K620",
+             "Quadro K1200", "Quadro K2200", "Quadro M500", "Quadro M520", "Quadro M600", "Quadro M620", "Quadro M1000",
+             "Quadro M1200", "Quadro M2000", "Quadro M2200", "Quadro M3000", "Quadro M4000", "Quadro M5000", "Quadro M5500", "Quadro M6000",
+             "GeForce MX110", "GeForce MX130", "GeForce 830M", "GeForce 840M", "GeForce GTX 850M", "GeForce GTX 860M",
+             "GeForce GTX 1650", "GeForce GTX 1630"
+             }
+
+
+def cuda_malloc_supported():
+    try:
+        names = get_gpu_names()
+    except:
+        names = set()
+    for x in names:
+        if "NVIDIA" in x:
+            for b in blacklist:
+                if b in x:
+                    return False
+    return True
+
+
+def try_cuda_malloc():
+    do_cuda_malloc = False
+
+    try:
+        version = ""
+        torch_spec = importlib.util.find_spec("torch")
+        for folder in torch_spec.submodule_search_locations:
+            ver_file = os.path.join(folder, "version.py")
+            if os.path.isfile(ver_file):
+                spec = importlib.util.spec_from_file_location("torch_version_import", ver_file)
+                module = importlib.util.module_from_spec(spec)
+                spec.loader.exec_module(module)
+                version = module.__version__
+        if int(version[0]) >= 2:
+            do_cuda_malloc = cuda_malloc_supported()
+    except:
+        pass
+
+    if do_cuda_malloc:
+        env_var = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', None)
+        if env_var is None:
+            env_var = "backend:cudaMallocAsync"
+        else:
+            env_var += ",backend:cudaMallocAsync"
+
+        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = env_var
+
+        print('Using cudaMallocAsync backend.')
+    else:
+        print('Failed to use cudaMallocAsync backend.')
+    return
diff --git a/modules_forge/initialization.py b/modules_forge/initialization.py
index 98bd6d2e..605b9096 100644
--- a/modules_forge/initialization.py
+++ b/modules_forge/initialization.py
@@ -43,6 +43,10 @@ def initialize_forge():
         os.environ['CUDA_VISIBLE_DEVICES'] = str(args_parser.args.gpu_device_id)
         print("Set device to:", args_parser.args.gpu_device_id)
 
+    if args_parser.args.cuda_malloc:
+        from modules_forge.cuda_malloc import try_cuda_malloc
+        try_cuda_malloc()
+
     import ldm_patched.modules.model_management as model_management
 
     import torch