From 152251e37145e5c7c3c86da1ea1b8424f3ad7d96 Mon Sep 17 00:00:00 2001 From: yuri carrara <13474656+yuri-carrara@users.noreply.github.com> Date: Fri, 28 Nov 2025 22:29:18 +0100 Subject: [PATCH 1/2] gpu counter filter --- general/gpu.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/general/gpu.py b/general/gpu.py index 64f1ea7..71cce10 100644 --- a/general/gpu.py +++ b/general/gpu.py @@ -94,6 +94,11 @@ def __init__(self): logger.info(f"GPU/s:") for deviceIndex in range(self.cudaDevicesFound): + cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") + if cuda_visible_devices: + visible_list = [int(x) for x in cuda_visible_devices.split(',') if x.strip() != ""] + if deviceIndex not in visible_list: + continue deviceHandle = self.deviceGetHandleByIndex(deviceIndex) gpuName = self.deviceGetName(deviceHandle, deviceIndex) From 3a2c3464aecab033e3b118e7a9fda73f59d0105d Mon Sep 17 00:00:00 2001 From: yuri carrara <13474656+yuri-carrara@users.noreply.github.com> Date: Wed, 3 Dec 2025 23:54:27 +0100 Subject: [PATCH 2/2] Enhance GPU visibility and error handling Refactor GPU device handling to improve visibility management and error handling. --- general/gpu.py | 58 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/general/gpu.py b/general/gpu.py index 71cce10..3821862 100644 --- a/general/gpu.py +++ b/general/gpu.py @@ -47,6 +47,7 @@ class CGPUInfo: gpusUtilization = [] gpusVRAM = [] gpusTemperature = [] + visibleDeviceIndices = [] # Store indices of visible devices def __init__(self): if IS_JETSON: @@ -93,12 +94,19 @@ def __init__(self): logger.info(f"GPU/s:") - for deviceIndex in range(self.cudaDevicesFound): - cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") - if cuda_visible_devices: - visible_list = [int(x) for x in cuda_visible_devices.split(',') if x.strip() != ""] - if deviceIndex not in visible_list: - continue + # Determine visible devices + if cuda_visible_devices := os.getenv("CUDA_VISIBLE_DEVICES", None): + logger.info(f"CUDA_VISIBLE_DEVICES: {cuda_visible_devices}") + visible_list = [int(x) for x in cuda_visible_devices.split(',') if x.strip() != ""] + else: + # If not set, all devices are visible + visible_list = list(range(self.cudaDevicesFound)) + + for deviceIndex in visible_list: + if deviceIndex >= self.cudaDevicesFound: + logger.warning(f"Device index {deviceIndex} in CUDA_VISIBLE_DEVICES exceeds available device count {self.cudaDevicesFound}") + continue + deviceHandle = self.deviceGetHandleByIndex(deviceIndex) gpuName = self.deviceGetName(deviceHandle, deviceIndex) @@ -110,6 +118,9 @@ def __init__(self): 'name': gpuName, }) + # Store the physical device index for later use + self.visibleDeviceIndices.append(deviceIndex) + # Same index as gpus, with default values self.gpusUtilization.append(True) self.gpusVRAM.append(True) @@ -155,7 +166,8 @@ def getStatus(self): gpuType = self.cudaDevice if self.anygpuLoaded and self.cuda and self.cudaAvailable: - for deviceIndex in range(self.cudaDevicesFound): + # Iterate over visible devices using their stored indices + for listIndex, deviceIndex in enumerate(self.visibleDeviceIndices): deviceHandle = self.deviceGetHandleByIndex(deviceIndex) gpuUtilization = -1 @@ -165,16 +177,16 @@ def getStatus(self): gpuTemperature = -1 # GPU Utilization - if self.switchGPU and self.gpusUtilization[deviceIndex]: - try: + try: + if self.switchGPU and self.gpusUtilization[listIndex]: gpuUtilization = self.deviceGetUtilizationRates(deviceHandle) - except Exception as e: - logger.error('Could not get GPU utilization. ' + str(e)) - logger.error('Monitor of GPU is turning off.') - self.switchGPU = False + except Exception as e: + logger.error('Could not get GPU utilization. ' + str(e)) + logger.error('Monitor of GPU is turning off.') + self.switchGPU = False - if self.switchVRAM and self.gpusVRAM[deviceIndex]: - try: + try: + if self.switchVRAM and self.gpusVRAM[listIndex]: memory = self.deviceGetMemoryInfo(deviceHandle) vramUsed = memory['used'] vramTotal = memory['total'] @@ -182,17 +194,17 @@ def getStatus(self): # Check if vramTotal is not zero or None if vramTotal and vramTotal != 0: vramPercent = vramUsed / vramTotal * 100 - except Exception as e: - logger.error('Could not get GPU memory info. ' + str(e)) - self.switchVRAM = False + except Exception as e: + logger.error('Could not get GPU memory info. ' + str(e)) + self.switchVRAM = False # Temperature - if self.switchTemperature and self.gpusTemperature[deviceIndex]: - try: + try: + if self.switchTemperature and self.gpusTemperature[listIndex]: gpuTemperature = self.deviceGetTemperature(deviceHandle) - except Exception as e: - logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e)) - self.switchTemperature = False + except Exception as e: + logger.error('Could not get GPU temperature. Turning off this feature. ' + str(e)) + self.switchTemperature = False gpus.append({ 'gpu_utilization': gpuUtilization,