Skip to content
Merged
4 changes: 4 additions & 0 deletions tools/render-test/options.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,10 @@ static rhi::DeviceType _toRenderType(Slang::RenderApiType apiType)
{
outOptions.showAdapterInfo = true;
}
else if (argValue == "-cache-rhi-device")
{
outOptions.cacheRhiDevice = true;
}
else
{
// Lookup
Expand Down
3 changes: 3 additions & 0 deletions tools/render-test/options.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ struct Options

bool skipSPIRVValidation = false;

// Whether to enable RHI device caching (default: false in render-test)
bool cacheRhiDevice = false;

Slang::List<Slang::String> capabilities;

Options() { downstreamArgs.addName("slang"); }
Expand Down
45 changes: 35 additions & 10 deletions tools/render-test/render-test-main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "shader-input-layout.h"
#include "shader-renderer-util.h"
#include "slang-support.h"
#include "slang-test-device-cache.h"
#include "window.h"

#if defined(_WIN32)
Expand Down Expand Up @@ -1440,7 +1441,7 @@ static SlangResult _innerMain(
}
}

renderer_test::CoreToRHIDebugBridge debugCallback;
static renderer_test::CoreToRHIDebugBridge debugCallback;
debugCallback.setCoreCallback(stdWriters->getDebugCallback());

// Use the profile name set on options if set
Expand Down Expand Up @@ -1495,7 +1496,7 @@ static SlangResult _innerMain(
return SLANG_E_NOT_AVAILABLE;
}

Slang::ComPtr<IDevice> device;
CachedDeviceWrapper deviceWrapper;
{
DeviceDesc desc = {};
desc.deviceType = options.deviceType;
Expand Down Expand Up @@ -1558,8 +1559,27 @@ static SlangResult _innerMain(
{
getRHI()->enableDebugLayers();
}
SlangResult res = getRHI()->createDevice(desc, device.writeRef());
if (SLANG_FAILED(res))
Slang::ComPtr<rhi::IDevice> rhiDevice;
SlangResult res;
if (options.cacheRhiDevice)
{
res = DeviceCache::acquireDevice(desc, rhiDevice.writeRef());
if (SLANG_FAILED(res))
{
rhiDevice = nullptr;
}
}
else
{
res = rhi::getRHI()->createDevice(desc, rhiDevice.writeRef());
if (SLANG_FAILED(res))
{
rhiDevice = nullptr;
}
}

// Check result for both cached and non-cached paths
if (SLANG_FAILED(res) || !rhiDevice)
{
// We need to be careful here about SLANG_E_NOT_AVAILABLE. This return value means
// that the renderer couldn't be created because it required *features* that were
Expand All @@ -1575,21 +1595,20 @@ static SlangResult _innerMain(
{
return res;
}

if (!options.onlyStartup)
{
fprintf(stderr, "Unable to create renderer %s\n", rendererName.getBuffer());
}

return res;
}
SLANG_ASSERT(device);
SLANG_ASSERT(rhiDevice);
deviceWrapper = CachedDeviceWrapper(rhiDevice);
}

for (const auto& feature : requiredFeatureList)
{
// If doesn't have required feature... we have to give up
if (!device->hasFeature(feature))
if (!deviceWrapper->hasFeature(feature))
{
return SLANG_E_NOT_AVAILABLE;
}
Expand All @@ -1599,7 +1618,7 @@ static SlangResult _innerMain(
// Print adapter info after device creation but before any other operations
if (options.showAdapterInfo)
{
auto info = device->getInfo();
auto info = deviceWrapper->getInfo();
auto out = stdWriters->getOut();
out.print("Using graphics adapter: %s\n", info.adapterName);
}
Expand All @@ -1613,14 +1632,20 @@ static SlangResult _innerMain(
{
RenderTestApp app;
renderDocBeginFrame();
SLANG_RETURN_ON_FAIL(app.initialize(session, device, options, input));
SLANG_RETURN_ON_FAIL(app.initialize(session, deviceWrapper.get(), options, input));
app.update();
renderDocEndFrame();
app.finalize();
}

return SLANG_OK;
}

// Test-tool API entry point that empties the RHI device cache.
// Simply forwards to DeviceCache::cleanCache(); takes no arguments and returns nothing.
SLANG_TEST_TOOL_API void cleanDeviceCache()
{
DeviceCache::cleanCache();
}

SLANG_TEST_TOOL_API SlangResult innerMain(
Slang::StdWriters* stdWriters,
SlangSession* sharedSession,
Expand Down
160 changes: 160 additions & 0 deletions tools/render-test/slang-test-device-cache.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
#include "slang-test-device-cache.h"

#include <algorithm>

// Static member accessor functions (Meyers singleton pattern).
// Function-local statics are constructed on first use and destroyed in reverse
// order of their construction, which avoids the static initialization/destruction order fiasco.
// Returns the single mutex instance shared by every call; it is created the
// first time this function runs.
std::mutex& DeviceCache::getMutex()
{
    static std::mutex s_cacheMutex;
    return s_cacheMutex;
}

std::unordered_map<
DeviceCache::DeviceCacheKey,
DeviceCache::CachedDevice,
DeviceCache::DeviceCacheKeyHash>&
DeviceCache::getDeviceCache()
{
static std::unordered_map<DeviceCacheKey, CachedDevice, DeviceCacheKeyHash> instance;
return instance;
}

// Returns a mutable reference to the shared creation-order counter, which
// starts at zero.
uint64_t& DeviceCache::getNextCreationOrder()
{
    static uint64_t s_nextOrder = 0;
    return s_nextOrder;
}
Comment on lines +24 to +28
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A function-scoped static makes sense for "getMutex".
But when the type is just a primitive, it seems unnecessary.
Feel free to ignore this comment, but I would go with simpler, more traditional code such as:

static uint64_t s_nextCreationOrder = 0;


// Two keys are equal only when every field matches. Fields are compared in
// declaration order, stopping at the first mismatch.
bool DeviceCache::DeviceCacheKey::operator==(const DeviceCacheKey& other) const
{
    if (deviceType != other.deviceType)
        return false;
    if (enableValidation != other.enableValidation)
        return false;
    if (enableRayTracingValidation != other.enableRayTracingValidation)
        return false;
    if (!(profileName == other.profileName))
        return false;
    return requiredFeatures == other.requiredFeatures;
}

// Hashes a cache key by hashing each field on its own and XOR-ing the results
// together, each shifted left by its field index (0..4).
std::size_t DeviceCache::DeviceCacheKeyHash::operator()(const DeviceCacheKey& key) const
{
    // Fold every feature name into one value. The element order matters here,
    // so equal keys must list their features in the same order.
    std::size_t featuresHash = 0;
    for (const std::string& featureName : key.requiredFeatures)
    {
        const std::size_t nameHash = std::hash<std::string>{}(featureName);
        featuresHash ^= nameHash + 0x9e3779b9 + (featuresHash << 6) + (featuresHash >> 2);
    }

    std::size_t combined = std::hash<int>{}(static_cast<int>(key.deviceType));
    combined ^= std::hash<bool>{}(key.enableValidation) << 1;
    combined ^= std::hash<bool>{}(key.enableRayTracingValidation) << 2;
    combined ^= std::hash<std::string>{}(key.profileName) << 3;
    combined ^= featuresHash << 4;
    return combined;
}
Comment on lines +37 to +51
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI, ChatGPT suggested the following as a cleaner alternative:

    return std::hash<
        std::tuple<
            int, bool, bool, std::string, std::vector<std::string>
        >
    >{}(
        std::tuple{
            static_cast<int>(key.deviceType),
            key.enableValidation,
            key.enableRayTracingValidation,
            key.profileName,
            key.requiredFeatures
        }
    );

I haven't tried it.


DeviceCache::CachedDevice::CachedDevice()
: creationOrder(0)
{
}

void DeviceCache::evictOldestDeviceIfNeeded()
{
auto& deviceCache = getDeviceCache();
if (deviceCache.size() < MAX_CACHED_DEVICES)
return;

// Find the oldest device to evict
auto oldestIt = deviceCache.end();
uint64_t oldestCreationOrder = UINT64_MAX;

for (auto it = deviceCache.begin(); it != deviceCache.end(); ++it)
{
if (it->second.creationOrder < oldestCreationOrder)
{
oldestCreationOrder = it->second.creationOrder;
oldestIt = it;
}
}

// Remove the oldest device - ComPtr will handle the actual device release
if (oldestIt != deviceCache.end())
{
deviceCache.erase(oldestIt);
}
}

// Hands out a device for `desc`, reusing a previously created one when its key matches.
//
// Parameters:
//   desc      - device description; also the source of the cache key fields.
//   outDevice - receives the device with one added reference; set to nullptr
//               first, so it is null on every failure path.
// Returns:
//   SLANG_E_INVALID_ARG when outDevice is null, the createDevice() failure code
//   when creation fails, SLANG_OK otherwise.
//
// The cache key is built from deviceType, enableValidation,
// enableRayTracingValidation, slang.targetProfile and the sorted required
// feature names. No other field of `desc` takes part in the lookup.
// NOTE(review): descs that differ only in other fields therefore share one
// cached device - confirm this is acceptable for every caller.
SlangResult DeviceCache::acquireDevice(const rhi::DeviceDesc& desc, rhi::IDevice** outDevice)
{
if (!outDevice)
return SLANG_E_INVALID_ARG;

*outDevice = nullptr;

// Skip caching for CUDA devices due to crashes: per the review thread below,
// destroying a CUDA device invokes the debug callback, whose lifetime is
// per-test and does not outlive a cached device. CUDA devices are created
// directly, before the mutex is taken, and are never stored in the cache.
if (desc.deviceType == rhi::DeviceType::CUDA)
Comment on lines +91 to +92
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are not caching for CUDA?
What is the problem with this?
If this is a temporary WAR, we may need a new github issue for this.

Copy link
Contributor Author

@gtong-nv gtong-nv Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

During the CUDA device destroying, it calls the debugCall func pointer.
Our debug callback lifetime is per-test, and doesn't out live the device.

I tried a few approaches and but seems there is no easy fix. That requires a careful design to be threadsafe. I will create another issue.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why can't we make debug callback also have the same lifetime as the device?

{
return rhi::getRHI()->createDevice(desc, outDevice);
}

// Everything below reads or mutates the shared cache state, so hold the mutex
// until the function returns.
std::lock_guard<std::mutex> lock(getMutex());
auto& deviceCache = getDeviceCache();
auto& nextCreationOrder = getNextCreationOrder();

// Create cache key
DeviceCacheKey key;
key.deviceType = desc.deviceType;
key.enableValidation = desc.enableValidation;
key.enableRayTracingValidation = desc.enableRayTracingValidation;
// A null target profile is keyed under the placeholder name "Unknown".
key.profileName = desc.slang.targetProfile ? desc.slang.targetProfile : "Unknown";

// Add required features to key
for (int i = 0; i < desc.requiredFeatureCount; ++i)
{
key.requiredFeatures.push_back(desc.requiredFeatures[i]);
}
// Sort so that the same feature set yields the same key regardless of the
// order in which the caller listed the features.
std::sort(key.requiredFeatures.begin(), key.requiredFeatures.end());
Comment on lines +109 to +113
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it make sense to use std::set if the list has to be always sorted?


// Evict oldest device if we've reached the limit
// NOTE(review): this runs before the lookup below, so when the cache is full
// even a request whose key is already cached evicts the oldest entry. That
// entry may be the very one being requested, which then gets re-created.
// Consider evicting only on the miss path, just before inserting.
evictOldestDeviceIfNeeded();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we may want to print some message when the verbose mode is enabled for a debugging purpose.
It will be useful if we can track when certain devices were created and when certain devices were evicted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be good, but I haven't seen verbose option in render-test-tool. That would requires us adding another option.
I will do that in a follow up PR


// Check if we have a cached device
auto it = deviceCache.find(key);
if (it != deviceCache.end())
{
// Return the cached device - COM reference counting handles the references
*outDevice = it->second.device.get();
if (*outDevice)
{
(*outDevice)->addRef();
return SLANG_OK;
}
// An entry holding a null device falls through and is overwritten below.
}

// Create new device
Slang::ComPtr<rhi::IDevice> device;
auto result = rhi::getRHI()->createDevice(desc, device.writeRef());
if (SLANG_FAILED(result))
{
return result;
}

// Cache the device
// operator[] inserts a default entry for a new key, or reuses the existing one.
CachedDevice& cached = deviceCache[key];
cached.device = device;
cached.creationOrder = nextCreationOrder++;

// Return the device with proper reference counting
*outDevice = device.get();
if (*outDevice)
{
(*outDevice)->addRef();
}

return SLANG_OK;
}


void DeviceCache::cleanCache()
{
std::lock_guard<std::mutex> lock(getMutex());
auto& deviceCache = getDeviceCache();
deviceCache.clear();
}
97 changes: 97 additions & 0 deletions tools/render-test/slang-test-device-cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#pragma once

#include <mutex>
#include <slang-rhi.h>
#include <string>
#include <unordered_map>
#include <vector>

// Device Cache for preventing NVIDIA Tegra driver state corruption
// This cache reuses Vulkan instances and devices to avoid the VK_ERROR_INCOMPATIBLE_DRIVER
// issue that occurs after ~19 device creation/destruction cycles on Tegra platforms.
// Uses ComPtr for automatic device lifecycle management - devices are released when removed from
// cache.
// Static-only cache of RHI devices, keyed by a subset of the device description.
// All state lives behind the private accessor functions; acquireDevice() and
// cleanCache() are the only public operations.
class DeviceCache
{
public:
// Lookup key: the device-description fields that distinguish cached devices.
struct DeviceCacheKey
{
rhi::DeviceType deviceType;
bool enableValidation;
bool enableRayTracingValidation;
// Target profile name; acquireDevice() stores "Unknown" when the desc has none.
std::string profileName;
// Required feature names; acquireDevice() sorts them before lookup.
std::vector<std::string> requiredFeatures;

// Field-by-field equality over all five members.
bool operator==(const DeviceCacheKey& other) const;
};

// Hash functor that lets DeviceCacheKey be used as an unordered_map key.
struct DeviceCacheKeyHash
{
std::size_t operator()(const DeviceCacheKey& key) const;
};

// One cache entry: the device plus the counter value it was inserted with.
struct CachedDevice
{
Slang::ComPtr<rhi::IDevice> device;
// Taken from getNextCreationOrder() on insertion; the smallest value is evicted first.
uint64_t creationOrder;

CachedDevice();
};

private:
// Size at which evictOldestDeviceIfNeeded() starts removing an entry.
static constexpr int MAX_CACHED_DEVICES = 10;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to ignore this comment, but we may want to set the value from the command-line argument for a debugging purpose.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do that in a follow up PR, as this will require we adding another option to both render-test-tool and cmdline arg to slang-test.


// Use function-local statics to control destruction order (Meyers singleton pattern)
static std::mutex& getMutex();
static std::unordered_map<DeviceCacheKey, CachedDevice, DeviceCacheKeyHash>& getDeviceCache();
static uint64_t& getNextCreationOrder();

// Removes the entry with the smallest creationOrder once the cache has
// reached MAX_CACHED_DEVICES entries. Takes no lock itself.
static void evictOldestDeviceIfNeeded();

public:
// Returns a device for `desc` through `outDevice`, reusing a cached device
// when the key matches and creating (and caching) one otherwise.
static SlangResult acquireDevice(const rhi::DeviceDesc& desc, rhi::IDevice** outDevice);
// Removes all entries from the cache.
static void cleanCache();
};

// RAII wrapper for cached devices to ensure proper cleanup
class CachedDeviceWrapper
{
private:
Slang::ComPtr<rhi::IDevice> m_device;

public:
CachedDeviceWrapper() = default;

CachedDeviceWrapper(Slang::ComPtr<rhi::IDevice> device)
: m_device(device)
{
}

~CachedDeviceWrapper() {}

// Move constructor
CachedDeviceWrapper(CachedDeviceWrapper&& other) noexcept
: m_device(std::move(other.m_device))
{
}

// Move assignment
CachedDeviceWrapper& operator=(CachedDeviceWrapper&& other) noexcept
{
if (this != &other)
{
m_device = std::move(other.m_device);
}
return *this;
}

// Delete copy constructor and assignment
CachedDeviceWrapper(const CachedDeviceWrapper&) = delete;
CachedDeviceWrapper& operator=(const CachedDeviceWrapper&) = delete;

rhi::IDevice* get() const { return m_device.get(); }
rhi::IDevice* operator->() const { return m_device.get(); }
operator bool() const { return m_device != nullptr; }

Slang::ComPtr<rhi::IDevice>& getComPtr() { return m_device; }
};
Loading