Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 62 additions & 9 deletions devito/arch/archinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,24 +493,77 @@
return None


def _resolve_uuids_to_indices(uuids):
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be changed to return the uuid_to_index mapper -- so to be renamed and also memoized. Then, at the current call site, we apply the uuids...

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed

"""
Map GPU UUID/unique-ID strings to integer device indices.
"""
# (command, pattern) where group(1)=index, group(2)=uuid
# nvidia-smi -L output: "GPU 0: <name> (UUID: GPU-xxxx-...)"
# rocm-smi --showuniqueid output: "GPU[0] : Unique ID: 0x<hex>"
queries = [
(['nvidia-smi', '-L'], r'GPU\s+(\d+):.*\(UUID:\s*([\w-]+)\)'),

Check failure on line 504 in devito/arch/archinfo.py

View workflow job for this annotation

GitHub Actions / Lint the codebase

ruff (E241)

devito/arch/archinfo.py:504:31: E241 Multiple spaces after comma help: Replace with single space
(['rocm-smi', '--showuniqueid'], r'GPU\[(\d+)\].*Unique ID:\s*([\w]+)'),
]
for cmd, pattern in queries:
try:
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
raw = proc.stdout.read().decode()
except OSError:
# Command not available
continue

uuid_to_index = {m.group(2): int(m.group(1))
for line in raw.splitlines()
if (m := re.match(pattern, line))}
if not uuid_to_index:
continue

try:
return tuple(uuid_to_index[u] for u in uuids)
except KeyError:
continue

return None


def get_visible_devices():
    """
    Determine which GPU devices are exposed through the environment.

    The variables ``CUDA_VISIBLE_DEVICES``, ``ROCR_VISIBLE_DEVICES`` and
    ``HIP_VISIBLE_DEVICES`` are inspected in this order; the first one that
    is set decides the outcome.

    Returns
    -------
    tuple
        ``(name, ids)`` where ``name`` is the environment variable that was
        used and ``ids`` is a tuple of integer device indices, or
        ``(None, None)`` if none of the variables is set.

    Raises
    ------
    RuntimeError
        If the variable exposes no devices (empty value, ``NoDevFiles``, or a
        single negative index), or if its entries are neither integers nor
        identifiers that can be resolved to device indices.
    """
    device_vars = (
        'CUDA_VISIBLE_DEVICES',
        'ROCR_VISIBLE_DEVICES',
        'HIP_VISIBLE_DEVICES'
    )
    for v in device_vars:
        raw = os.environ.get(v)
        if raw is None:
            # Environment variable not set
            continue

        val = raw.strip()

        errmsg = f"{v}={raw!r} exposes no GPU devices."

        # Empty string or known "no devices" sentinels
        if not val or val.upper() in ('NODEVFILES',):
            raise RuntimeError(errmsg)

        entries = [e.strip() for e in val.split(',')]

        # Try integer parsing first
        try:
            ids = tuple(int(i) for i in entries)
        except ValueError:
            ids = None

        if ids is not None:
            # Negative sentinel (e.g. -1) means no devices exposed
            if len(ids) == 1 and ids[0] < 0:
                raise RuntimeError(errmsg)

            return v, ids

        # Try UUID -> device index resolution
        ids = _resolve_uuids_to_indices(entries)
        if ids is not None:
            return v, ids

        raise RuntimeError(
            f"Cannot resolve device specifiers in {v}={raw!r}."
        )

    return None, None


Expand Down
45 changes: 45 additions & 0 deletions tests/test_gpu_common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import re
from subprocess import DEVNULL, PIPE, Popen

import cloudpickle as pickle
import numpy as np
import pytest
Expand Down Expand Up @@ -107,6 +110,48 @@ def test_visible_devices(self, env_variables):
# Default physical deviceid expected to be 0
assert argmap2._physical_deviceid == 0

@pytest.mark.parametrize('env_variables', [
    {"CUDA_VISIBLE_DEVICES": "-1"},
    {"CUDA_VISIBLE_DEVICES": ""},
    {"CUDA_VISIBLE_DEVICES": "NoDevFiles"},
    {"ROCR_VISIBLE_DEVICES": "-1"},
])
def test_no_visible_devices(self, env_variables):
    """
    Reading ``_physical_deviceid`` must raise a RuntimeError when the
    environment exposes no devices.
    """
    u = Function(name='u', grid=Grid(shape=(10, 10)))

    with switchenv(env_variables):
        # Building the operator and its arguments is expected to succeed;
        # only the device lookup itself should fail
        arguments = Operator(Eq(u, u+1)).arguments()
        with pytest.raises(RuntimeError):
            _ = arguments._physical_deviceid

def test_visible_devices_uuid(self):
    """
    Exposing GPU 0 through its UUID must resolve to physical device id 0.

    The test is skipped if `nvidia-smi` is unavailable or reports no UUID
    for GPU 0.
    """
    # Query GPU 0's UUID independently of _resolve_uuids_to_indices
    try:
        # The context manager closes the pipe and reaps the child process
        with Popen(['nvidia-smi', '-L'], stdout=PIPE, stderr=DEVNULL) as proc:
            output = proc.communicate()[0].decode()
    except OSError:
        pytest.skip("nvidia-smi not available")

    pattern = re.compile(r'GPU\s+0:.*\(UUID:\s*([\w-]+)\)')
    matches = (pattern.match(line) for line in output.splitlines())
    uuid = next((m.group(1) for m in matches if m), None)

    if uuid is None:
        pytest.skip("No GPU 0 UUID found in nvidia-smi output")

    grid = Grid(shape=(10, 10))
    u = Function(name='u', grid=grid)
    with switchenv({'CUDA_VISIBLE_DEVICES': uuid}):
        op = Operator(Eq(u, u+1))
        argmap = op.arguments()
        assert argmap._physical_deviceid == 0

@pytest.mark.parallel(mode=2)
@pytest.mark.parametrize('visible_devices', [
"1,2", "1,0", "0,2,3",
Expand Down
Loading