Release gpu discovery library after use

Leaving the cudart library loaded kept ~30MB of GPU memory pinned by the main process. This change releases the GPU discovery library after each use, so we don't hold GPU resources when idle.
2 years ago · 526d4eb204
5 changed files with 31 additions and 10 deletions
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@ -35,7 +35,6 @@ const (
 )

 var gpuMutex sync.Mutex
-var gpuHandles *handles = nil

 // With our current CUDA compile flags, older than 5.0 will not work properly
 var CudaComputeMin = [2]C.int{5, 0}
@ -85,11 +84,11 @@ var CudartWindowsGlobs = []string{
 var CudaTegra string = os.Getenv("JETSON_JETPACK")

 // Note: gpuMutex must already be held
-func initGPUHandles() {
+func initGPUHandles() *handles {

 	// TODO - if the ollama build is CPU only, don't do these checks as they're irrelevant and confusing

-	gpuHandles = &handles{nil, nil}
+	gpuHandles := &handles{nil, nil}
 	var nvmlMgmtName string
 	var nvmlMgmtPatterns []string
 	var cudartMgmtName string
@ -116,7 +115,7 @@ func initGPUHandles() {
 		}
 		cudartMgmtPatterns = append(cudartMgmtPatterns, CudartLinuxGlobs...)
 	default:
-		return
+		return gpuHandles
 	}

 	slog.Info("Detecting GPU type")
@ -126,7 +125,7 @@ func initGPUHandles() {
 		if cudart != nil {
 			slog.Info("Nvidia GPU detected via cudart")
 			gpuHandles.cudart = cudart
-			return
+			return gpuHandles
 		}
 	}

@ -137,10 +136,10 @@ func initGPUHandles() {
 		if nvml != nil {
 			slog.Info("Nvidia GPU detected via nvidia-ml")
 			gpuHandles.nvml = nvml
-			return
+			return gpuHandles
 		}
 	}
-
+	return gpuHandles
 }

 func GetGPUInfo() GpuInfo {
@ -148,9 +147,16 @@ func GetGPUInfo() GpuInfo {
 	// GPUs so we can report warnings if we see Nvidia/AMD but fail to load the libraries
 	gpuMutex.Lock()
 	defer gpuMutex.Unlock()
-	if gpuHandles == nil {
-		initGPUHandles()
-	}
+
+	gpuHandles := initGPUHandles()
+	defer func() {
+		if gpuHandles.nvml != nil {
+			C.nvml_release(*gpuHandles.nvml)
+		}
+		if gpuHandles.cudart != nil {
+			C.cudart_release(*gpuHandles.cudart)
+		}
+	}()

 	// All our GPU builds on x86 have AVX enabled, so fallback to CPU if we don't detect at least AVX
 	cpuVariant := GetCPUVariant()
--- a/gpu/gpu_info_cudart.c
+++ b/gpu/gpu_info_cudart.c
@ -191,4 +191,10 @@ void cudart_compute_capability(cudart_handle_t h, cudart_compute_capability_t *r
  }
 }

+void cudart_release(cudart_handle_t h) {  // Unloads the cudart library referenced by h.handle; h is received by value.
+  LOG(h.verbose, "releasing cudart library\n");
+  UNLOAD_LIBRARY(h.handle);  // UNLOAD_LIBRARY is defined elsewhere (presumably dlclose/FreeLibrary - confirm).
+  h.handle = NULL;  // NOTE(review): clears only this local copy of h; the caller's struct is not modified.
+}
+
 #endif  // __APPLE__
--- a/gpu/gpu_info_cudart.h
+++ b/gpu/gpu_info_cudart.h
@ -55,6 +55,7 @@ typedef struct cudart_compute_capability {
 void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp);
 void cudart_check_vram(cudart_handle_t ch, mem_info_t *resp);
 void cudart_compute_capability(cudart_handle_t th, cudart_compute_capability_t *cc);
+void cudart_release(cudart_handle_t ch);

 #endif  // __GPU_INFO_CUDART_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_nvml.c
+++ b/gpu/gpu_info_nvml.c
@ -211,4 +211,11 @@ void nvml_compute_capability(nvml_handle_t h, nvml_compute_capability_t *resp) {
    }
  }
 }
+
+void nvml_release(nvml_handle_t h) {  // Unloads the nvml library referenced by h.handle; h is received by value.
+  LOG(h.verbose, "releasing nvml library\n");
+  UNLOAD_LIBRARY(h.handle);  // UNLOAD_LIBRARY is defined elsewhere (presumably dlclose/FreeLibrary - confirm).
+  h.handle = NULL;  // NOTE(review): clears only this local copy of h; the caller's struct is not modified.
+}
+
 #endif  // __APPLE__
--- a/gpu/gpu_info_nvml.h
+++ b/gpu/gpu_info_nvml.h
@ -51,6 +51,7 @@ typedef struct nvml_compute_capability {
 void nvml_init(char *nvml_lib_path, nvml_init_resp_t *resp);
 void nvml_check_vram(nvml_handle_t ch, mem_info_t *resp);
 void nvml_compute_capability(nvml_handle_t ch, nvml_compute_capability_t *cc);
+void nvml_release(nvml_handle_t ch);

 #endif  // __GPU_INFO_NVML_H__
 #endif  // __APPLE__