Browse Source
discovery: fix cudart driver version (#11614)
We prefer the nvcuda library, which reports driver versions. When we
dropped CUDA v11, we added a safety check for too-old drivers. What
we missed was that the cudart fallback discovery logic didn't have the
driver version wired up. This fixes cudart discovery to expose the driver
version as well, so we no longer reject all GPUs if nvcuda didn't work.
main
Daniel Hiltgen
8 months ago
committed by
GitHub
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with
7 additions and
11 deletions
discover/gpu.go
discover/gpu_info_cudart.c
discover/gpu_info_cudart.h
@ -263,6 +263,8 @@ func GetGPUInfo() GpuInfoList {
var driverMinor int
if cHandles . cudart != nil {
C . cudart_bootstrap ( * cHandles . cudart , C . int ( i ) , & memInfo )
driverMajor = int ( cHandles . cudart . driver_major )
driverMinor = int ( cHandles . cudart . driver_minor )
} else {
C . nvcuda_bootstrap ( * cHandles . nvcuda , C . int ( i ) , & memInfo )
driverMajor = int ( cHandles . nvcuda . driver_major )
@ -69,18 +69,15 @@ void cudart_init(char *cudart_lib_path, cudart_init_resp_t *resp) {
}
int version = 0 ;
cudartDriverVersion_t driverVersion ;
driverVersion . major = 0 ;
driverVersion . minor = 0 ;
// Report driver version if we're in verbose mode, ignore errors
ret = ( * resp - > ch . cudaDriverGetVersion ) ( & version ) ;
if ( ret ! = CUDART_SUCCESS ) {
LOG ( resp - > ch . verbose , " cudaDriverGetVersion failed: %d \n " , ret ) ;
} else {
driverVersion . major = version / 1000 ;
driverVersion . minor = ( version - ( driverVersion . major * 1000 ) ) / 10 ;
LOG ( resp - > ch . verbose , " CUDA driver version: %d-%d \n " , driverVersion . major , driverVersion . minor ) ;
resp - > ch . driver_ major = version / 1000 ;
resp - > ch . driver_ minor = ( version - ( resp - > ch . driver_ major * 1000 ) ) / 10 ;
LOG ( resp - > ch . verbose , " CUDA driver version: %d-%d \n " , resp - > ch . driver_major , resp - > ch . driver_ minor) ;
}
ret = ( * resp - > ch . cudaGetDeviceCount ) ( & resp - > num_devices ) ;
@ -29,11 +29,6 @@ typedef struct cudartMemory_st {
size_t used ;
} cudartMemory_t ;
// Driver version held as separate major and minor numbers.
// cudart_init derives both from the single integer reported by
// cudaDriverGetVersion: major = version / 1000, minor = (version - major * 1000) / 10.
typedef struct cudartDriverVersion {
int major ;
int minor ;
} cudartDriverVersion_t ;
// Fixed-size 16-byte identifier stored as raw bytes.
// NOTE(review): presumably the device UUID reported through cudaDeviceProp — confirm against its use.
typedef struct cudaUUID {
unsigned char bytes [ 16 ] ;
} cudaUUID_t ;
@ -123,6 +118,8 @@ typedef struct cudaDeviceProp {
typedef struct cudart_handle {
void * handle ;
uint16_t verbose ;
int driver_major ;
int driver_minor ;
cudartReturn_t ( * cudaSetDevice ) ( int device ) ;
cudartReturn_t ( * cudaDeviceSynchronize ) ( void ) ;
cudartReturn_t ( * cudaDeviceReset ) ( void ) ;