11"""The ``tracker`` module contains the ``Tracker`` class which can alternatively be imported directly from the ``gpu_tracker`` package."""
22from __future__ import annotations
3+ import abc
34import json
45import dataclasses as dclass
56import platform
1718import pandas as pd
1819
1920
+class _GPUQuerier(abc.ABC):
+    @classmethod
+    def _query_gpu(cls, *args) -> pd.DataFrame:
+        output = subp.check_output((cls.command,) + args, stderr=subp.STDOUT).decode()
+        gpu_info = pd.read_csv(io.StringIO(output))
+        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
+
+    @classmethod
+    def is_available(cls) -> bool | None:
+        try:
+            subp.check_output(cls.command)
+            return True
+        except subp.CalledProcessError:
+            return False
+        except FileNotFoundError:
+            return None
+
+    @classmethod
+    @abc.abstractmethod
+    def static_info(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+    @classmethod
+    @abc.abstractmethod
+    def process_ram(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+    @classmethod
+    @abc.abstractmethod
+    def ram_and_utilization(cls) -> pd.DataFrame:
+        pass  # pragma: nocover
+
+class _NvidiaQuerier(_GPUQuerier):
+    command = 'nvidia-smi'
+
+    @classmethod
+    def _query_gpu(cls, *args: str, ram_column: str | None = None) -> pd.DataFrame:
+        gpu_info = super()._query_gpu(*args, '--format=csv')
+        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
+        gpu_info[ram_column] = gpu_info[ram_column].apply(lambda ram: int(ram.replace('MiB', '').strip()))
+        return gpu_info.rename(columns={ram_column: 'ram'})
+
+    @classmethod
+    def static_info(cls) -> pd.DataFrame:
+        return cls._query_gpu('--query-gpu=uuid,memory.total', ram_column='memory.total')
+
+    @classmethod
+    def process_ram(cls) -> pd.DataFrame:
+        return cls._query_gpu('--query-compute-apps=pid,used_gpu_memory', ram_column='used_gpu_memory')
+
+    @classmethod
+    def ram_and_utilization(cls) -> pd.DataFrame:
+        gpu_info = cls._query_gpu('--query-gpu=uuid,memory.used,utilization.gpu', ram_column='memory.used')
+        gpu_info = gpu_info.rename(columns={'utilization.gpu': 'utilization_percent'})
+        gpu_info.utilization_percent = [float(percentage.replace('%', '').strip()) for percentage in gpu_info.utilization_percent]
+        return gpu_info
+
+class _AMDQuerier(_GPUQuerier):
+    command = 'amd-smi'
+
 class _TrackingProcess(mproc.Process):
     _CPU_PERCENT_INTERVAL = 0.1
     _ram_unit2coefficient = {
@@ -43,7 +104,7 @@ class _TrackingProcess(mproc.Process):
     def __init__(
             self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str,
             n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int,
-            resource_usage_file: str, extraneous_process_ids: set[int]):
+            resource_usage_file: str, extraneous_process_ids: set[int], gpu_brand: str | None):
         super().__init__()
         self._stop_event = stop_event
         if sleep_time < _TrackingProcess._CPU_PERCENT_INTERVAL:
@@ -63,24 +124,45 @@ def __init__(
         self._hardware_percent_sums = {key: 0. for key in percent_keys}
         self._tracking_iteration = 1
         self._is_linux = platform.system().lower() == 'linux'
-        self._nvidia_available = True
-        try:
-            subp.check_output('nvidia-smi')
-        except FileNotFoundError:
-            self._nvidia_available = False
-            self._log_warning(
-                'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. '
-                'Otherwise the Max GPU RAM values will remain 0.0')
+        cannot_connect_warning = ('The {} command is installed but cannot connect to a GPU. '
+                                  'The GPU RAM and GPU utilization values will remain 0.0.')
+        if gpu_brand is None:
+            nvidia_available = _NvidiaQuerier.is_available()
+            nvidia_installed = nvidia_available is not None
+            nvidia_available = bool(nvidia_available)
+            amd_available = _AMDQuerier.is_available()
+            amd_installed = amd_available is not None
+            amd_available = bool(amd_available)
+            if nvidia_available:
+                gpu_brand = 'nvidia'
+            elif amd_available:
+                gpu_brand = 'amd'
+            elif nvidia_installed:
+                self._log_warning(cannot_connect_warning.format('nvidia-smi'))
+            elif amd_installed:
+                self._log_warning(cannot_connect_warning.format('amd-smi'))
+            else:
+                self._log_warning(
+                    'Neither the nvidia-smi command nor the amd-smi command is installed. Install one of these to profile the GPU. '
+                    'Otherwise the GPU RAM and GPU utilization values will remain 0.0.')
+        if gpu_brand == 'nvidia':
+            self._gpu_querier = _NvidiaQuerier
+        elif gpu_brand == 'amd':
+            self._gpu_querier = _AMDQuerier
+        elif gpu_brand is None:
+            self._gpu_querier = None
+        else:
+            raise ValueError(f'"{gpu_brand}" is not a valid GPU brand. Supported values are "nvidia" and "amd".')
         max_ram = MaxRAM(unit=ram_unit, system_capacity=psutil.virtual_memory().total * self._ram_coefficient)
         system_core_count = psutil.cpu_count()
         cpu_utilization = CPUUtilization(
             system_core_count=system_core_count,
             n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count)
-        if self._nvidia_available:
-            gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.total')
-            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info, column='memory.total')
+        if self._gpu_querier:
+            gpu_info = self._gpu_querier.static_info()
+            gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info)
             max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity)
-            all_uuids = set(gpu_info['uuid'])
+            all_uuids = set(gpu_info.uuid)
         if gpu_uuids is None:
             self._gpu_uuids = all_uuids
         else:
@@ -143,25 +225,23 @@ def run(self):
             self._resource_usage.max_ram.system = max(
                 self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient)
             # Get the maximum GPU RAM usage if available.
-            if self._nvidia_available:  # pragma: nocover
-                gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory')
+            if self._gpu_querier:  # pragma: nocover
+                gpu_info = self._gpu_querier.process_ram()
                 if len(gpu_info):
                     process_ids = {self._main_process_id}
                     self._update_gpu_ram(attr='main', process_ids=process_ids, gpu_info=gpu_info)
                     process_ids = set(self._map_processes(processes=descendant_processes, map_func=lambda process: process.pid))
                     self._update_gpu_ram(attr='descendants', process_ids=process_ids, gpu_info=gpu_info)
                     process_ids.add(self._main_process_id)
                     self._update_gpu_ram(attr='combined', process_ids=process_ids, gpu_info=gpu_info)
-                gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.used,utilization.gpu')
-                system_gpu_ram = self._get_gpu_ram(gpu_info, column='memory.used')
+                gpu_info = self._gpu_querier.ram_and_utilization()
+                system_gpu_ram = self._get_gpu_ram(gpu_info)
                 self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram)
-                gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)]
-                gpu_percentages = [float(percentage.replace('%', '').strip()) for percentage in gpu_info['utilization.gpu']]
+                gpu_info = gpu_info.loc[[uuid in self._gpu_uuids for uuid in gpu_info.uuid]]
                 self._update_processing_unit_utilization(
-                    current_percentages=gpu_percentages,
+                    current_percentages=list(gpu_info.utilization_percent),
                     processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu',
                     n_hardware_units=self._resource_usage.gpu_utilization.n_expected_gpus)
-
             # Get the mean and maximum CPU usages.
             main_n_threads = self._map_processes([main_process], map_func=get_n_threads)
             descendant_n_threads = self._map_processes(descendant_processes, map_func=get_n_threads)
@@ -230,23 +310,13 @@ def _update_ram(self, rss_values: RSSValues, memory_maps_list: list[list] | None
         rss_values.total_rss = max(rss_values.total_rss, total_rss)
 
     def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame):
-        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info['pid']]]
-        gpu_ram = self._get_gpu_ram(gpu_info, column='used_gpu_memory')
+        gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info.pid]]
+        gpu_ram = self._get_gpu_ram(gpu_info)
         max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr)
         setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, gpu_ram))
 
-    @staticmethod
-    def _query_gpu(nvidia_command: str) -> pd.DataFrame:
-        command = f'nvidia-smi {nvidia_command} --format=csv'
-        output = subp.check_output(command.split(), stderr=subp.STDOUT).decode()
-        gpu_info = pd.read_csv(io.StringIO(output))
-        gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns]
-        return gpu_info.map(lambda value: value.strip() if type(value) is str else value)
-
-    def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float:
-        gpu_rams = gpu_info[column]
-        gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip()))
-        return sum(gpu_rams) * self._gpu_ram_coefficient
+    def _get_gpu_ram(self, gpu_info: pd.DataFrame) -> float:
+        return sum(gpu_info.ram) * self._gpu_ram_coefficient
 
     def _update_processing_unit_utilization(
             self, current_percentages: list[float], processing_unit_percentages: ProcessingUnitPercentages,
@@ -297,7 +367,8 @@ class State(enum.Enum):
     def __init__(
             self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours',
             n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None,
-            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0):
+            resource_usage_file: str | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0,
+            gpu_brand: str | None = None):
         """
         :param sleep_time: The number of seconds to sleep in between usage-collection iterations.
         :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.
@@ -310,6 +381,7 @@ def __init__(
         :param resource_usage_file: The file path to the pickle file containing the ``resource_usage`` attribute. This file is automatically deleted and the ``resource_usage`` attribute is set in memory if the tracking successfully completes. But if the tracking is interrupted, the tracking information will be saved in this file as a backup. Defaults to a randomly generated file name in the current working directory of the format ``.gpu-tracker_<random UUID>.pkl``.
         :param n_join_attempts: The number of times the tracker attempts to join its underlying sub-process.
         :param join_timeout: The amount of time the tracker waits for its underlying sub-process to join.
+        :param gpu_brand: The brand of GPU to profile. Valid values are "nvidia" and "amd". Defaults to the brand of GPU detected in the system, checking Nvidia first.
         :raises ValueError: Raised if invalid units are provided.
         """
         current_process_id = os.getpid()
@@ -323,7 +395,7 @@ def __init__(
         self._resource_usage_file = f'.gpu-tracker_{uuid.uuid1()}.pkl' if resource_usage_file is None else resource_usage_file
         self._tracking_process = _TrackingProcess(
             self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs,
             process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids, gpu_brand)
         self.resource_usage = None
         self.n_join_attempts = n_join_attempts
         self.join_timeout = join_timeout
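
Note: in the hunks above, `_AMDQuerier` only pins its `command` attribute; the overrides it needs mirror `_NvidiaQuerier`. Below is a minimal sketch of that shape. The amd-smi subcommands ('static', 'process', 'monitor'), the '--csv' flag, and the CSV column names are assumptions to verify against `amd-smi --help` for the installed ROCm version. What the tracker actually relies on is the DataFrame contract: `static_info` yields 'uuid' and 'ram' columns, `process_ram` yields 'pid' and 'ram', and `ram_and_utilization` yields 'uuid', 'ram', and 'utilization_percent', with 'ram' as integer MiB values.

class _AMDQuerier(_GPUQuerier):
    command = 'amd-smi'

    @classmethod
    def _query_gpu(cls, *args: str) -> pd.DataFrame:
        # Assumption: amd-smi accepts a --csv flag so the base class can parse the output.
        return super()._query_gpu(*args, '--csv')

    @classmethod
    def static_info(cls) -> pd.DataFrame:
        # Assumption: 'static' reports per-GPU totals; 'vram_size' is a hypothetical column name.
        return cls._query_gpu('static').rename(columns={'vram_size': 'ram'})

    @classmethod
    def process_ram(cls) -> pd.DataFrame:
        # Assumption: 'process' reports per-process VRAM; 'vram_mem' is a hypothetical column name.
        return cls._query_gpu('process').rename(columns={'vram_mem': 'ram'})

    @classmethod
    def ram_and_utilization(cls) -> pd.DataFrame:
        # Assumption: 'monitor' reports current VRAM and graphics utilization; both column names are hypothetical.
        return cls._query_gpu('monitor').rename(columns={'vram_used': 'ram', 'gfx': 'utilization_percent'})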
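
With this change, callers can either let the tracker auto-detect the GPU brand (Nvidia checked first, then AMD) or pin it via the new `gpu_brand` parameter. A minimal usage sketch, assuming the package is imported as in its README and with `example_workload` as a hypothetical stand-in for the code being profiled:

import gpu_tracker as gput

# gpu_brand may be omitted to auto-detect; an unrecognized value raises ValueError.
with gput.Tracker(gpu_brand='nvidia', gpu_ram_unit='megabytes') as tracker:
    example_workload()
print(tracker.resource_usage.max_gpu_ram)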