diff --git a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp index c214a17c3..22fa32b01 100644 --- a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp +++ b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp @@ -52,7 +52,7 @@ void releaseAllResourcesImpl() { called = true; releaseAllGenerator(); releaseAllDeviceMem(); - releaseAllEvent(); + event_pool_clear(); devproxy::finalizeVendor(); } diff --git a/dipu/torch_dipu/csrc_dipu/base/basedef.h b/dipu/torch_dipu/csrc_dipu/base/basedef.h index 8f416e8ef..8e76117f7 100644 --- a/dipu/torch_dipu/csrc_dipu/base/basedef.h +++ b/dipu/torch_dipu/csrc_dipu/base/basedef.h @@ -35,3 +35,7 @@ const auto DIPU_BACKEND_SPARSE_TYPE = const auto DICL_BACKEND_NAME = "dicl"; } // namespace dipu + +namespace dipu { +auto inline constexpr kMaxDeviceNumber = 16; +} diff --git a/dipu/torch_dipu/csrc_dipu/base/utility.h b/dipu/torch_dipu/csrc_dipu/base/utility.h new file mode 100644 index 000000000..0b0db74ca --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/base/utility.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +namespace dipu { + +template +struct make_static_value_array {}; + +template +struct make_static_value_array> { + using value_type = std::decay_t)>; + using type = std::array; + type static inline constexpr value{T::template value...}; +}; + +// Create an array of static values. Each type T should provide a static +// 'value()' function to return a reference of the underlying static variable. +// +// In this way, value can be lazily initialized. +template +// Note: in C++17 "inline constexpr" (without "static") is necessary to make +// sure there is only one entity. +auto inline constexpr make_static_value_array_v = + make_static_value_array>::value; + +// A CRTP helper to simplify access method of static value holder. +// +// Usage: +// +// ``` +// struct integer : static_value_array { +// template auto static value() -> int & { +// auto static instance = 0; +// return instance; +// } +// }; +// +// Then we can use `integer::get(1)` to access `integer::value<1>()`. +template +struct static_value_array { + auto static array() -> decltype(auto) { + auto static constexpr instance = make_static_value_array_v; + return instance; + } + template + auto static get(std::size_t index, A&&... args) -> decltype(auto) { + return array()[index](std::forward(args)...); + } +}; + +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index d3ad103fa..5ff3d11b9 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -1,102 +1,77 @@ #include "DIPUEventPool.h" #include -#include -#include #include -namespace dipu { +#include "csrc_dipu/base/basedef.h" +#include "csrc_dipu/base/utility.h" +#include "csrc_dipu/runtime/device/deviceapis.h" -template -class EventPool final { - protected: - std::deque event_pool_; - unsigned int allocate_num_ = 0; +namespace { - std::function allocator_; - std::function deleter_; - using mutex_t = std::recursive_mutex; - mutex_t event_mutex_; +class EventPool { + std::deque pool; + std::recursive_mutex mutex; public: - EventPool(const std::function& allocator, - const std::function& deleter) - : allocator_(allocator), deleter_(deleter) {} - - EventPool(const EventPool&) = delete; - EventPool(EventPool&&) = delete; - EventPool& operator=(const EventPool&) = delete; - EventPool& operator=(EventPool&&) = delete; - - ~EventPool() = default; - - void release() { - std::lock_guard _(event_mutex_); - for (auto& event : event_pool_) { - deleter_(event); - allocate_num_--; - } - event_pool_.clear(); - } - - void get(T& event) { - bool need_allocator = false; + void acquire(dipu::deviceEvent_t& event) { { - std::lock_guard _(event_mutex_); - if (event_pool_.empty()) { - need_allocator = true; - } else { - event = event_pool_.back(); - event_pool_.pop_back(); + std::scoped_lock _(mutex); + if (!pool.empty()) { + event = pool.back(); + pool.pop_back(); + return; } } - if (need_allocator) { - allocator_(event); - } + + dipu::devapis::createEvent(&event); + } + + void release(dipu::deviceEvent_t& event) { + std::scoped_lock _(mutex); + pool.emplace_back(event); } - void restore(T& event) { - std::lock_guard _(event_mutex_); - event_pool_.emplace_back(event); + void clear() { // should it called inside destructor? + std::scoped_lock _(mutex); + for (auto& event : pool) { + dipu::devapis::destroyEvent(event); + } + pool.clear(); } }; -EventPool* getEventPool() { - const int index = devproxy::current_device(); -// GlobalEventPool for different cards , construct when really needed -#define dispatch_event_pool(device_id) \ - if (index == (device_id)) { \ - static EventPool gDIPUEventPool( \ - [](deviceEvent_t& event) { devapis::createEvent(&event); }, \ - [](deviceEvent_t& event) { devapis::destroyEvent(event); }); \ - return &gDIPUEventPool; \ +struct StaticEventPoolArray + : dipu::static_value_array { + template + inline static auto& value() { + auto static instance = EventPool(); + return instance; } +}; - dispatch_event_pool(0); - dispatch_event_pool(1); - dispatch_event_pool(2); - dispatch_event_pool(3); - dispatch_event_pool(4); - dispatch_event_pool(5); - dispatch_event_pool(6); - dispatch_event_pool(7); - dispatch_event_pool(8); - dispatch_event_pool(9); - dispatch_event_pool(10); - dispatch_event_pool(11); - dispatch_event_pool(12); - dispatch_event_pool(13); - dispatch_event_pool(14); - dispatch_event_pool(15); - TORCH_CHECK(false, "support up to 16 cards"); +auto event_pool(int index) -> EventPool& { + TORCH_CHECK(0 <= index and index < dipu::kMaxDeviceNumber, + "device index out of range"); + return StaticEventPoolArray::get(index); } -void getEventFromPool(deviceEvent_t& event) { getEventPool()->get(event); } +} // namespace + +namespace dipu { + +void event_pool_acquire(int index, deviceEvent_t& event) { + event_pool(index).acquire(event); +} -void restoreEventToPool(deviceEvent_t& event) { - getEventPool()->restore(event); +void event_pool_release(int index, deviceEvent_t& event) { + event_pool(index).release(event); } -void releaseAllEvent() { getEventPool()->release(); } +void event_pool_clear() { + for (auto& getter : StaticEventPoolArray::array()) { + getter().clear(); + } +} } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h index 1dd105ac1..e8aceaac2 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h @@ -6,10 +6,8 @@ namespace dipu { -void getEventFromPool(deviceEvent_t& event); - -void restoreEventToPool(deviceEvent_t& event); - -void releaseAllEvent(); +void event_pool_acquire(int index, deviceEvent_t& event); +void event_pool_release(int index, deviceEvent_t& event); +void event_pool_clear(); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index 6d557ba41..e1b9d309b 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -5,6 +5,8 @@ #include #include +#include "csrc_dipu/base/basedef.h" +#include "csrc_dipu/base/utility.h" #include "csrc_dipu/runtime/core/DIPUEvent.h" #include "DIPUAsyncResourcePool.h" @@ -213,48 +215,33 @@ struct RawAllocator { using type = DIPURawHostAllocator; }; -template -c10::Allocator* get_allocator_impl(c10::Allocator* raw_allocator) { - // Construct when really needed - // async_mem_pool is used when cache_allocator being destructed so it should - // be destructed after cache_allocator - static AsyncMemPoolImpl async_mem_pool; - static AllocatorImpl cache_allocator; - static int n = [&]() { - cache_allocator.set_raw_allocator(raw_allocator); - cache_allocator.set_async_mem_pool(&async_mem_pool); - return 0; - }(); - return &cache_allocator; -} - -template -c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) { -#define DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(id) \ - if (device_id == (id)) { \ - return get_allocator_impl( \ - raw_allocator); \ +template +struct StaticAllocatorArray + : dipu::static_value_array< + StaticAllocatorArray, + dipu::kMaxDeviceNumber> { + template + inline static c10::Allocator* value(c10::Allocator* raw_allocator) { + // Construct when really needed + // async_mem_pool is used when cache_allocator being destructed so it should + // be destructed after cache_allocator + static AsyncMemPoolImpl async_mem_pool; + static AllocatorImpl cache_allocator; + static int n = [&]() { + cache_allocator.set_raw_allocator(raw_allocator); + cache_allocator.set_async_mem_pool(&async_mem_pool); + return 0; + }(); + return &cache_allocator; } +}; - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(0); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(1); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(2); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(3); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(4); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(5); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(6); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(7); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(8); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(9); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(10); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(11); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(12); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(13); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(14); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(15); - TORCH_CHECK(false, "support up to 16 cards"); +template +c10::Allocator* get_allocator(int index, c10::Allocator* raw_allocator) { + TORCH_CHECK(0 <= index and index < dipu::kMaxDeviceNumber, + "device index out of range"); + return StaticAllocatorArray::get(index, raw_allocator); } -#undef DIPU_ALLOCATOR_DISPATCH_DEVICE_ID #define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, \ algorithm, priority) \ diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp index e2a460156..e8ff561d8 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp @@ -172,9 +172,15 @@ bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -void createEvent(deviceEvent_t* event) { return getEventFromPool(*event); } +void createEvent(deviceEvent_t* event) { + auto index = current_device(); + return event_pool_acquire(index, *event); +} -void destroyEvent(deviceEvent_t event) { return restoreEventToPool(event); } +void destroyEvent(deviceEvent_t event) { + auto index = current_device(); + return event_pool_release(index, event); +} void waitEvent(deviceEvent_t event) { return devapis::waitEvent(event); }