From 1a33acf2468586499b73583e56def46e9a8d2cbf Mon Sep 17 00:00:00 2001 From: wiryls <7984500+wiryls@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:30:52 +0800 Subject: [PATCH 1/3] refactor: add make_static_function_array and simplify code --- .../torch_dipu/csrc_dipu/base/DIPUGlobals.cpp | 2 +- dipu/torch_dipu/csrc_dipu/base/utility.h | 29 ++++ .../csrc_dipu/runtime/core/DIPUEventPool.cpp | 127 ++++++++---------- .../csrc_dipu/runtime/core/DIPUEventPool.h | 8 +- .../core/allocator/DIPUCachingAllocator.h | 63 ++++----- .../runtime/devproxy/deviceproxy.cpp | 10 +- 6 files changed, 118 insertions(+), 121 deletions(-) create mode 100644 dipu/torch_dipu/csrc_dipu/base/utility.h diff --git a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp index c214a17c3..22fa32b01 100644 --- a/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp +++ b/dipu/torch_dipu/csrc_dipu/base/DIPUGlobals.cpp @@ -52,7 +52,7 @@ void releaseAllResourcesImpl() { called = true; releaseAllGenerator(); releaseAllDeviceMem(); - releaseAllEvent(); + event_pool_clear(); devproxy::finalizeVendor(); } diff --git a/dipu/torch_dipu/csrc_dipu/base/utility.h b/dipu/torch_dipu/csrc_dipu/base/utility.h new file mode 100644 index 000000000..6f4b79a65 --- /dev/null +++ b/dipu/torch_dipu/csrc_dipu/base/utility.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +namespace dipu { + +// Create an array of functor. Each functor should provide a 'value()' function +// to return a reference of a static variable. Thus the value could be lazily +// initialized. +template +struct make_static_function_array { + template > + struct slot {}; + template + struct slot> { + // deduction friendly + using type = std::decay_t)>; + // the actual array + auto inline static constexpr value = + std::array{T::template value...}; + }; +}; + +template +using static_function_array = + typename make_static_function_array::template slot<>; + +} // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index d3ad103fa..217a43034 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -1,102 +1,81 @@ #include "DIPUEventPool.h" +#include #include #include #include #include -namespace dipu { +#include "csrc_dipu/base/utility.h" +#include "csrc_dipu/runtime/device/deviceapis.h" -template -class EventPool final { - protected: - std::deque event_pool_; - unsigned int allocate_num_ = 0; +namespace { - std::function allocator_; - std::function deleter_; - using mutex_t = std::recursive_mutex; - mutex_t event_mutex_; +class EventPool { + std::deque pool; + std::recursive_mutex mutex; public: - EventPool(const std::function& allocator, - const std::function& deleter) - : allocator_(allocator), deleter_(deleter) {} - - EventPool(const EventPool&) = delete; - EventPool(EventPool&&) = delete; - EventPool& operator=(const EventPool&) = delete; - EventPool& operator=(EventPool&&) = delete; - - ~EventPool() = default; - - void release() { - std::lock_guard _(event_mutex_); - for (auto& event : event_pool_) { - deleter_(event); - allocate_num_--; - } - event_pool_.clear(); - } - - void get(T& event) { - bool need_allocator = false; + void acquire(dipu::deviceEvent_t& event) { { - std::lock_guard _(event_mutex_); - if (event_pool_.empty()) { - need_allocator = true; - } else { - event = event_pool_.back(); - event_pool_.pop_back(); + std::scoped_lock _(mutex); + if (!pool.empty()) { + event = pool.back(); + pool.pop_back(); + return; } } - if (need_allocator) { - allocator_(event); - } + + dipu::devapis::createEvent(&event); + } + + void release(dipu::deviceEvent_t& event) { + std::scoped_lock _(mutex); + pool.emplace_back(event); } - void restore(T& event) { - std::lock_guard _(event_mutex_); - event_pool_.emplace_back(event); + void clear() { // should it called inside destructor? + std::scoped_lock _(mutex); + for (auto& event : pool) { + dipu::devapis::destroyEvent(event); + } + pool.clear(); } }; -EventPool* getEventPool() { - const int index = devproxy::current_device(); -// GlobalEventPool for different cards , construct when really needed -#define dispatch_event_pool(device_id) \ - if (index == (device_id)) { \ - static EventPool gDIPUEventPool( \ - [](deviceEvent_t& event) { devapis::createEvent(&event); }, \ - [](deviceEvent_t& event) { devapis::destroyEvent(event); }); \ - return &gDIPUEventPool; \ +struct EventPoolHolder { + template + inline static auto& value() { + auto static instance = EventPool(); + return instance; } +}; + +auto constexpr max_card_number = 16; +using EventPoolHolderArray = + dipu::static_function_array; - dispatch_event_pool(0); - dispatch_event_pool(1); - dispatch_event_pool(2); - dispatch_event_pool(3); - dispatch_event_pool(4); - dispatch_event_pool(5); - dispatch_event_pool(6); - dispatch_event_pool(7); - dispatch_event_pool(8); - dispatch_event_pool(9); - dispatch_event_pool(10); - dispatch_event_pool(11); - dispatch_event_pool(12); - dispatch_event_pool(13); - dispatch_event_pool(14); - dispatch_event_pool(15); - TORCH_CHECK(false, "support up to 16 cards"); +auto event_pool(int index) -> EventPool& { + TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); + return EventPoolHolderArray::value[index](); } -void getEventFromPool(deviceEvent_t& event) { getEventPool()->get(event); } +} // namespace + +namespace dipu { + +void event_pool_acquire(int index, deviceEvent_t& event) { + event_pool(index).acquire(event); +} -void restoreEventToPool(deviceEvent_t& event) { - getEventPool()->restore(event); +void event_pool_release(int index, deviceEvent_t& event) { + event_pool(index).release(event); } -void releaseAllEvent() { getEventPool()->release(); } +void event_pool_clear() { + for (auto& pool : EventPoolHolderArray::value) { + pool().clear(); + } +} } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h index 1dd105ac1..e8aceaac2 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.h @@ -6,10 +6,8 @@ namespace dipu { -void getEventFromPool(deviceEvent_t& event); - -void restoreEventToPool(deviceEvent_t& event); - -void releaseAllEvent(); +void event_pool_acquire(int index, deviceEvent_t& event); +void event_pool_release(int index, deviceEvent_t& event); +void event_pool_clear(); } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index 6d557ba41..eb01f68dd 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -5,6 +5,7 @@ #include #include +#include "csrc_dipu/base/utility.h" #include "csrc_dipu/runtime/core/DIPUEvent.h" #include "DIPUAsyncResourcePool.h" @@ -213,48 +214,32 @@ struct RawAllocator { using type = DIPURawHostAllocator; }; -template -c10::Allocator* get_allocator_impl(c10::Allocator* raw_allocator) { - // Construct when really needed - // async_mem_pool is used when cache_allocator being destructed so it should - // be destructed after cache_allocator - static AsyncMemPoolImpl async_mem_pool; - static AllocatorImpl cache_allocator; - static int n = [&]() { - cache_allocator.set_raw_allocator(raw_allocator); - cache_allocator.set_async_mem_pool(&async_mem_pool); - return 0; - }(); - return &cache_allocator; -} - -template -c10::Allocator* get_allocator(int device_id, c10::Allocator* raw_allocator) { -#define DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(id) \ - if (device_id == (id)) { \ - return get_allocator_impl( \ - raw_allocator); \ +template +struct AllocatorHolder { + template + inline static c10::Allocator* value(c10::Allocator* raw_allocator) { + // Construct when really needed + // async_mem_pool is used when cache_allocator being destructed so it should + // be destructed after cache_allocator + static AsyncMemPoolImpl async_mem_pool; + static AllocatorImpl cache_allocator; + static int n = [&]() { + cache_allocator.set_raw_allocator(raw_allocator); + cache_allocator.set_async_mem_pool(&async_mem_pool); + return 0; + }(); + return &cache_allocator; } +}; - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(0); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(1); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(2); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(3); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(4); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(5); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(6); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(7); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(8); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(9); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(10); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(11); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(12); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(13); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(14); - DIPU_ALLOCATOR_DISPATCH_DEVICE_ID(15); - TORCH_CHECK(false, "support up to 16 cards"); +template +c10::Allocator* get_allocator(int index, c10::Allocator* raw_allocator) { + auto constexpr max_card_number = 16; + TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); + return dipu::static_function_array< + AllocatorHolder, + max_card_number>::value[index](raw_allocator); } -#undef DIPU_ALLOCATOR_DISPATCH_DEVICE_ID #define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, \ algorithm, priority) \ diff --git a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp index e2a460156..e8ff561d8 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/devproxy/deviceproxy.cpp @@ -172,9 +172,15 @@ bool isStreamEmpty(deviceStream_t stream) { // device event related // ===================== -void createEvent(deviceEvent_t* event) { return getEventFromPool(*event); } +void createEvent(deviceEvent_t* event) { + auto index = current_device(); + return event_pool_acquire(index, *event); +} -void destroyEvent(deviceEvent_t event) { return restoreEventToPool(event); } +void destroyEvent(deviceEvent_t event) { + auto index = current_device(); + return event_pool_release(index, event); +} void waitEvent(deviceEvent_t event) { return devapis::waitEvent(event); } From 2a4af4500200b869675fc141f939c552c5ffe320 Mon Sep 17 00:00:00 2001 From: wiryls <7984500+wiryls@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:52:13 +0800 Subject: [PATCH 2/3] refactor: simplify --- dipu/torch_dipu/csrc_dipu/base/utility.h | 44 ++++++++++++------- .../csrc_dipu/runtime/core/DIPUEventPool.cpp | 8 ++-- .../core/allocator/DIPUCachingAllocator.h | 6 +-- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/dipu/torch_dipu/csrc_dipu/base/utility.h b/dipu/torch_dipu/csrc_dipu/base/utility.h index 6f4b79a65..38de9d055 100644 --- a/dipu/torch_dipu/csrc_dipu/base/utility.h +++ b/dipu/torch_dipu/csrc_dipu/base/utility.h @@ -5,25 +5,35 @@ namespace dipu { -// Create an array of functor. Each functor should provide a 'value()' function -// to return a reference of a static variable. Thus the value could be lazily -// initialized. -template -struct make_static_function_array { - template > - struct slot {}; - template - struct slot> { - // deduction friendly - using type = std::decay_t)>; - // the actual array - auto inline static constexpr value = - std::array{T::template value...}; - }; +template > +struct make_static_value_array {}; + +template +struct make_static_value_array> { + // deduction friendly. + using value_type = std::decay_t)>; + auto inline constexpr static value = + std::array{T::template value...}; }; +// Create an array of static values. Each type T should provide a static +// 'value()' function to return a reference of the underlying static variable. +// +// e.g. T is something like: +// +// ``` +// struct foo { +// template auto static value() -> int & { +// auto static instance = 0; +// return instance; +// } +// }; +// ``` +// +// In this way, value can be lazily initialized. template -using static_function_array = - typename make_static_function_array::template slot<>; +// Note: in C++17 "inline constexpr" is necessary to make sure there is only one +// entity. +auto inline constexpr static_value_array = make_static_value_array::value; } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index 217a43034..29c25a0d5 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -52,12 +52,10 @@ struct EventPoolHolder { }; auto constexpr max_card_number = 16; -using EventPoolHolderArray = - dipu::static_function_array; auto event_pool(int index) -> EventPool& { TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); - return EventPoolHolderArray::value[index](); + return dipu::static_value_array[index](); } } // namespace @@ -73,8 +71,8 @@ void event_pool_release(int index, deviceEvent_t& event) { } void event_pool_clear() { - for (auto& pool : EventPoolHolderArray::value) { - pool().clear(); + for (auto& h : dipu::static_value_array) { + h().clear(); } } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index eb01f68dd..f36a2cd91 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -236,9 +236,9 @@ template c10::Allocator* get_allocator(int index, c10::Allocator* raw_allocator) { auto constexpr max_card_number = 16; TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); - return dipu::static_function_array< - AllocatorHolder, - max_card_number>::value[index](raw_allocator); + return dipu::static_value_array< + AllocatorHolder, max_card_number>[index]( + raw_allocator); } #define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, \ From dab6a5926657ab0dbfbe2bd33037f6ce15e41fac Mon Sep 17 00:00:00 2001 From: wiryls <7984500+wiryls@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:03:12 +0800 Subject: [PATCH 3/3] refactor: provide a helper struct to build static array --- dipu/torch_dipu/csrc_dipu/base/basedef.h | 4 ++ dipu/torch_dipu/csrc_dipu/base/utility.h | 40 +++++++++++++------ .../csrc_dipu/runtime/core/DIPUEventPool.cpp | 18 ++++----- .../core/allocator/DIPUCachingAllocator.h | 16 ++++---- 4 files changed, 48 insertions(+), 30 deletions(-) diff --git a/dipu/torch_dipu/csrc_dipu/base/basedef.h b/dipu/torch_dipu/csrc_dipu/base/basedef.h index 8f416e8ef..8e76117f7 100644 --- a/dipu/torch_dipu/csrc_dipu/base/basedef.h +++ b/dipu/torch_dipu/csrc_dipu/base/basedef.h @@ -35,3 +35,7 @@ const auto DIPU_BACKEND_SPARSE_TYPE = const auto DICL_BACKEND_NAME = "dicl"; } // namespace dipu + +namespace dipu { +auto inline constexpr kMaxDeviceNumber = 16; +} diff --git a/dipu/torch_dipu/csrc_dipu/base/utility.h b/dipu/torch_dipu/csrc_dipu/base/utility.h index 38de9d055..0b0db74ca 100644 --- a/dipu/torch_dipu/csrc_dipu/base/utility.h +++ b/dipu/torch_dipu/csrc_dipu/base/utility.h @@ -5,35 +5,49 @@ namespace dipu { -template > +template struct make_static_value_array {}; -template -struct make_static_value_array> { - // deduction friendly. +template +struct make_static_value_array> { using value_type = std::decay_t)>; - auto inline constexpr static value = - std::array{T::template value...}; + using type = std::array; + type static inline constexpr value{T::template value...}; }; // Create an array of static values. Each type T should provide a static // 'value()' function to return a reference of the underlying static variable. // -// e.g. T is something like: +// In this way, value can be lazily initialized. +template +// Note: in C++17 "inline constexpr" (without "static") is necessary to make +// sure there is only one entity. +auto inline constexpr make_static_value_array_v = + make_static_value_array>::value; + +// A CRTP helper to simplify access method of static value holder. +// +// Usage: // // ``` -// struct foo { +// struct integer : static_value_array { // template auto static value() -> int & { // auto static instance = 0; // return instance; // } // }; -// ``` // -// In this way, value can be lazily initialized. +// Then we can use `integer::get(1)` to access `integer::value<1>()`. template -// Note: in C++17 "inline constexpr" is necessary to make sure there is only one -// entity. -auto inline constexpr static_value_array = make_static_value_array::value; +struct static_value_array { + auto static array() -> decltype(auto) { + auto static constexpr instance = make_static_value_array_v; + return instance; + } + template + auto static get(std::size_t index, A&&... args) -> decltype(auto) { + return array()[index](std::forward(args)...); + } +}; } // namespace dipu diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp index 29c25a0d5..5ff3d11b9 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/DIPUEventPool.cpp @@ -1,11 +1,9 @@ #include "DIPUEventPool.h" -#include #include -#include -#include #include +#include "csrc_dipu/base/basedef.h" #include "csrc_dipu/base/utility.h" #include "csrc_dipu/runtime/device/deviceapis.h" @@ -43,7 +41,8 @@ class EventPool { } }; -struct EventPoolHolder { +struct StaticEventPoolArray + : dipu::static_value_array { template inline static auto& value() { auto static instance = EventPool(); @@ -51,11 +50,10 @@ struct EventPoolHolder { } }; -auto constexpr max_card_number = 16; - auto event_pool(int index) -> EventPool& { - TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); - return dipu::static_value_array[index](); + TORCH_CHECK(0 <= index and index < dipu::kMaxDeviceNumber, + "device index out of range"); + return StaticEventPoolArray::get(index); } } // namespace @@ -71,8 +69,8 @@ void event_pool_release(int index, deviceEvent_t& event) { } void event_pool_clear() { - for (auto& h : dipu::static_value_array) { - h().clear(); + for (auto& getter : StaticEventPoolArray::array()) { + getter().clear(); } } diff --git a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h index f36a2cd91..e1b9d309b 100644 --- a/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h +++ b/dipu/torch_dipu/csrc_dipu/runtime/core/allocator/DIPUCachingAllocator.h @@ -5,6 +5,7 @@ #include #include +#include "csrc_dipu/base/basedef.h" #include "csrc_dipu/base/utility.h" #include "csrc_dipu/runtime/core/DIPUEvent.h" @@ -215,7 +216,10 @@ struct RawAllocator { }; template -struct AllocatorHolder { +struct StaticAllocatorArray + : dipu::static_value_array< + StaticAllocatorArray, + dipu::kMaxDeviceNumber> { template inline static c10::Allocator* value(c10::Allocator* raw_allocator) { // Construct when really needed @@ -232,13 +236,11 @@ struct AllocatorHolder { } }; -template +template c10::Allocator* get_allocator(int index, c10::Allocator* raw_allocator) { - auto constexpr max_card_number = 16; - TORCH_CHECK(0 <= index and index < max_card_number, "support up to 16 cards"); - return dipu::static_value_array< - AllocatorHolder, max_card_number>[index]( - raw_allocator); + TORCH_CHECK(0 <= index and index < dipu::kMaxDeviceNumber, + "device index out of range"); + return StaticAllocatorArray::get(index, raw_allocator); } #define DIPU_REGISTER_ALLOCATOR(name, device_type, CachingAllocator, \