diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a1ad81f625cd6..a9414eb324d09 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -140,6 +140,7 @@ endif()
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
+set(MLIR_ENABLE_LEVELZERO_RUNNER 0 CACHE BOOL "Enable building the MLIR LevelZero runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the MLIR SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the MLIR Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
similarity index 66%
rename from mlir/cmake/modules/FindLevelZero.cmake
rename to mlir/cmake/modules/FindLevelZeroRuntime.cmake
index 012187f0afc0b..2a8fb3a16d16f 100644
--- a/mlir/cmake/modules/FindLevelZero.cmake
+++ b/mlir/cmake/modules/FindLevelZeroRuntime.cmake
@@ -20,7 +20,6 @@ include(FindPackageHandleStandardArgs)
 # Search path priority
 # 1. CMake Variable LEVEL_ZERO_DIR
 # 2. Environment Variable LEVEL_ZERO_DIR
-
 if(NOT LEVEL_ZERO_DIR)
   if(DEFINED ENV{LEVEL_ZERO_DIR})
     set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
@@ -28,32 +27,32 @@ if(NOT LEVEL_ZERO_DIR)
 endif()

 if(LEVEL_ZERO_DIR)
-  find_path(LevelZero_INCLUDE_DIR
+  find_path(LevelZeroRuntime_INCLUDE_DIR
     NAMES level_zero/ze_api.h
     PATHS ${LEVEL_ZERO_DIR}/include
     NO_DEFAULT_PATH
   )

   if(LINUX)
-    find_library(LevelZero_LIBRARY
+    find_library(LevelZeroRuntime_LIBRARY
       NAMES ze_loader
       PATHS ${LEVEL_ZERO_DIR}/lib
-            ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+      ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
       NO_DEFAULT_PATH
     )
   else()
-    find_library(LevelZero_LIBRARY
+    find_library(LevelZeroRuntime_LIBRARY
       NAMES ze_loader
       PATHS ${LEVEL_ZERO_DIR}/lib
       NO_DEFAULT_PATH
     )
   endif()
 else()
-  find_path(LevelZero_INCLUDE_DIR
+  find_path(LevelZeroRuntime_INCLUDE_DIR
     NAMES level_zero/ze_api.h
   )
-  find_library(LevelZero_LIBRARY
+  find_library(LevelZeroRuntime_LIBRARY
     NAMES ze_loader
   )
 endif()
@@ -64,12 +63,14 @@ endif()
 # lists of equal lengths, with the shorter string getting zero-padded.
 function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
   # Convert the strings to list
-  string(REPLACE "." ";" VL1 ${VERSION_STR1})
-  string(REPLACE "." ";" VL2 ${VERSION_STR2})
+  string(REPLACE "." ";" VL1 ${VERSION_STR1})
+  string(REPLACE "." ";" VL2 ${VERSION_STR2})
+
   # get lengths of both lists
   list(LENGTH VL1 VL1_LEN)
   list(LENGTH VL2 VL2_LEN)
   set(LEN ${VL1_LEN})
+
   # If they differ in size pad the shorter list with 0s
   if(VL1_LEN GREATER VL2_LEN)
     math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
@@ -98,12 +99,10 @@ function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
       set(${OUTPUT} TRUE PARENT_SCOPE)
     endif()
   endforeach()
-
-endfunction(compare_versions)
+endfunction(compare_versions)

 # Creates a small function to run and extract the LevelZero loader version.
 function(get_l0_loader_version)
-
   set(L0_VERSIONEER_SRC
   [====[
   #include
@@ -142,19 +141,20 @@ function(get_l0_loader_version)
   # We need both the directories in the include path as ze_loader.h
   # includes "ze_api.h" and not "level_zero/ze_api.h".
-  list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
-  list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+  list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+  list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR}/level_zero)
   list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
   try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
-    "${CMAKE_BINARY_DIR}"
-    "${L0_VERSIONEER_FILE}"
-    LINK_LIBRARIES ${LevelZero_LIBRARY}
-    CMAKE_FLAGS
-      "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
-    RUN_OUTPUT_VARIABLE L0_VERSION
+    "${CMAKE_BINARY_DIR}"
+    "${L0_VERSIONEER_FILE}"
+    LINK_LIBRARIES ${LevelZeroRuntime_LIBRARY}
+    CMAKE_FLAGS
+    "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+    RUN_OUTPUT_VARIABLE L0_VERSION
   )
-  if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
-    set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+
+  if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
+    set(LevelZeroRuntime_VERSION ${L0_VERSION} PARENT_SCOPE)
     message(STATUS "Found Level Zero of version: ${L0_VERSION}")
   else()
     message(FATAL_ERROR
@@ -163,59 +163,61 @@ function(get_l0_loader_version)
   endif()
 endfunction(get_l0_loader_version)

-if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
-  list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
-  list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+if(LevelZeroRuntime_INCLUDE_DIR AND LevelZeroRuntime_LIBRARY)
+  list(APPEND LevelZeroRuntime_LIBRARIES "${LevelZeroRuntime_LIBRARY}")
+  list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+
   if(OpenCL_FOUND)
-    list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+    list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
   endif()

-  cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
-  set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
-
-  if(NOT TARGET LevelZero::LevelZero)
-    add_library(LevelZero::LevelZero INTERFACE IMPORTED)
-    set_target_properties(LevelZero::LevelZero
-      PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
-    )
-    set_target_properties(LevelZero::LevelZero
-      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
-    )
+  cmake_path(GET LevelZeroRuntime_LIBRARY PARENT_PATH LevelZeroRuntime_LIBRARIES_PATH)
+  set(LevelZeroRuntime_LIBRARIES_DIR ${LevelZeroRuntime_LIBRARIES_PATH})
+
+  if(NOT TARGET LevelZeroRuntime::LevelZeroRuntime)
+    add_library(LevelZeroRuntime::LevelZeroRuntime INTERFACE IMPORTED)
+    set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+      PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZeroRuntime_LIBRARIES}"
+    )
+    set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZeroRuntime_INCLUDE_DIRS}"
+    )
   endif()
 endif()

 # Check if a specific version of Level Zero is required
-if(LevelZero_FIND_VERSION)
+if(LevelZeroRuntime_FIND_VERSION)
   get_l0_loader_version()
   set(VERSION_GT_FIND_VERSION FALSE)
   compare_versions(
-    ${LevelZero_VERSION}
-    ${LevelZero_FIND_VERSION}
+    ${LevelZeroRuntime_VERSION}
+    ${LevelZeroRuntime_FIND_VERSION}
     VERSION_GT_FIND_VERSION
   )
+
   if(${VERSION_GT_FIND_VERSION})
-    set(LevelZero_FOUND TRUE)
+    set(LevelZeroRuntime_FOUND TRUE)
   else()
-    set(LevelZero_FOUND FALSE)
+    set(LevelZeroRuntime_FOUND FALSE)
   endif()
 else()
-  set(LevelZero_FOUND TRUE)
+  set(LevelZeroRuntime_FOUND TRUE)
 endif()

-find_package_handle_standard_args(LevelZero
+find_package_handle_standard_args(LevelZeroRuntime
   REQUIRED_VARS
-    LevelZero_FOUND
-    LevelZero_INCLUDE_DIRS
-    LevelZero_LIBRARY
-    LevelZero_LIBRARIES_DIR
+    LevelZeroRuntime_FOUND
+    LevelZeroRuntime_INCLUDE_DIRS
+    LevelZeroRuntime_LIBRARY
+    LevelZeroRuntime_LIBRARIES_DIR
   HANDLE_COMPONENTS
 )
-mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+mark_as_advanced(LevelZeroRuntime_LIBRARY LevelZeroRuntime_INCLUDE_DIRS)

-if(LevelZero_FOUND)
-  find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
-    "(found version ${LevelZero_VERSION})"
+if(LevelZeroRuntime_FOUND)
+  find_package_message(LevelZeroRuntime "Found LevelZero: ${LevelZeroRuntime_LIBRARY}"
+    "(found version ${LevelZeroRuntime_VERSION})"
   )
 else()
-  find_package_message(LevelZero "Could not find LevelZero" "")
+  find_package_message(LevelZeroRuntime "Could not find LevelZero" "")
 endif()
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index dd2ac75b88798..fdeb4dacf9278 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_OPTIONAL_SOURCES
   RunnerUtils.cpp
   OptUtils.cpp
   JitRunner.cpp
+  LevelZeroRuntimeWrappers.cpp
   SpirvCpuRuntimeWrappers.cpp
   SyclRuntimeWrappers.cpp
   VulkanRuntimeWrappers.cpp
@@ -374,6 +375,15 @@ if(LLVM_ENABLE_PIC)
     )
   endif()

+  if(MLIR_ENABLE_SYCL_RUNNER OR MLIR_ENABLE_LEVELZERO_RUNNER)
+    # Both runtimes require LevelZero, so we can find it once.
+    find_package(LevelZeroRuntime)
+
+    if(NOT LevelZeroRuntime_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+  endif()
+
   if(MLIR_ENABLE_SYCL_RUNNER)
     find_package(SyclRuntime)
@@ -381,12 +391,6 @@ if(LLVM_ENABLE_PIC)
     if(NOT SyclRuntime_FOUND)
       message(FATAL_ERROR "syclRuntime not found. Please check the oneAPI installation and run setvars.sh.")
     endif()

-    find_package(LevelZero)
-
-    if(NOT LevelZero_FOUND)
-      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
-    endif()
-
     add_mlir_library(mlir_sycl_runtime
       SHARED
       SyclRuntimeWrappers.cpp
@@ -404,9 +408,28 @@ if(LLVM_ENABLE_PIC)
       ${MLIR_INCLUDE_DIRS}
     )

-    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime SyclRuntime::SyclRuntime)
+
+    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+  endif()
+
+  if(MLIR_ENABLE_LEVELZERO_RUNNER)
+    add_mlir_library(mlir_levelzero_runtime
+      SHARED
+      LevelZeroRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(mlir_levelzero_runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(mlir_levelzero_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime)

-    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+    set_property(TARGET mlir_levelzero_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}")
   endif()

   if(MLIR_ENABLE_SPIRV_CPU_RUNNER)
diff --git a/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
new file mode 100644
index 0000000000000..21eaf28c9f214
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp
@@ -0,0 +1,573 @@
+//===- LevelZeroRuntimeWrappers.cpp - MLIR Level Zero (L0) wrapper library-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements wrappers around the Level Zero (L0) runtime library with C
+// linkage.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Twine.h"
+
+#include "level_zero/ze_api.h"
+#include <cassert>
+#include <deque>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <unordered_map>
+#include <vector>
+
+namespace {
+template <typename F>
+auto catchAll(F &&func) {
+  try {
+    return func();
+  } catch (const std::exception &e) {
+    std::cerr << "An exception was thrown: " << e.what() << std::endl;
+    std::abort();
+  } catch (...) {
+    std::cerr << "An unknown exception was thrown." << std::endl;
+    std::abort();
+  }
+}
+
+#define L0_SAFE_CALL(call)                                                     \
+  {                                                                            \
+    ze_result_t status = (call);                                               \
+    if (status != ZE_RESULT_SUCCESS) {                                         \
+      const char *errorString;                                                 \
+      zeDriverGetLastErrorDescription(NULL, &errorString);                     \
+      std::cerr << "L0 error " << status << ": " << errorString << std::endl;  \
+      std::abort();                                                            \
+    }                                                                          \
+  }
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// L0 RT context & device setters
+//===----------------------------------------------------------------------===//
+
+// Returns the L0 driver handle for the given index. Default index is 0
+// (i.e., returns the first driver handle of the available drivers).
+static ze_driver_handle_t getDriver(uint32_t idx = 0) {
+  ze_init_driver_type_desc_t driver_type = {};
+  driver_type.stype = ZE_STRUCTURE_TYPE_INIT_DRIVER_TYPE_DESC;
+  driver_type.flags = ZE_INIT_DRIVER_TYPE_FLAG_GPU;
+  driver_type.pNext = nullptr;
+  uint32_t driverCount{0};
+  thread_local static std::vector<ze_driver_handle_t> drivers;
+  thread_local static bool isDriverInitialised{false};
+  if (isDriverInitialised && idx < drivers.size())
+    return drivers[idx];
+  L0_SAFE_CALL(zeInitDrivers(&driverCount, nullptr, &driver_type));
+  if (!driverCount)
+    throw std::runtime_error("No L0 drivers found.");
+  drivers.resize(driverCount);
+  L0_SAFE_CALL(zeInitDrivers(&driverCount, drivers.data(), &driver_type));
+  if (idx >= driverCount)
+    throw std::runtime_error((llvm::Twine("Requested driver idx out-of-bounds, "
+                                          "number of available drivers: ") +
+                              std::to_string(driverCount))
+                                 .str());
+  isDriverInitialised = true;
+  return drivers[idx];
+}
+
+static ze_device_handle_t getDevice(const uint32_t driverIdx = 0,
+                                    const int32_t devIdx = 0) {
+  thread_local static ze_device_handle_t l0Device;
+  thread_local int32_t currDevIdx{-1};
+  thread_local uint32_t currDriverIdx{0};
+  if (currDriverIdx == driverIdx && currDevIdx == devIdx)
+    return l0Device;
+  auto driver = getDriver(driverIdx);
+  uint32_t deviceCount{0};
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr));
+  if (!deviceCount)
+    throw std::runtime_error("getDevice failed: did not find L0 device.");
+  if (static_cast<int32_t>(deviceCount) < devIdx + 1)
+    throw std::runtime_error("getDevice failed: devIdx out-of-bounds.");
+  std::vector<ze_device_handle_t> devices(deviceCount);
+  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
+  l0Device = devices[devIdx];
+  currDriverIdx = driverIdx;
+  currDevIdx = devIdx;
+  return l0Device;
+}
+
+// Returns the default L0 context of the default driver.
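+// Note that the context is created once per thread on first use and cached;
+// later calls return the cached handle and ignore the `driver` argument.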
+static ze_context_handle_t getContext(ze_driver_handle_t driver) {
+  thread_local static ze_context_handle_t context;
+  thread_local static bool isContextInitialised{false};
+  if (isContextInitialised)
+    return context;
+  ze_context_desc_t ctxtDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
+  L0_SAFE_CALL(zeContextCreate(driver, &ctxtDesc, &context));
+  isContextInitialised = true;
+  return context;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 RT helper structs
+//===----------------------------------------------------------------------===//
+
+struct ZeContextDeleter {
+  void operator()(ze_context_handle_t ctx) const {
+    if (ctx)
+      L0_SAFE_CALL(zeContextDestroy(ctx));
+  }
+};
+
+struct ZeCommandListDeleter {
+  void operator()(ze_command_list_handle_t cmdList) const {
+    if (cmdList)
+      L0_SAFE_CALL(zeCommandListDestroy(cmdList));
+  }
+};
+
+using UniqueZeContext =
+    std::unique_ptr<std::remove_pointer<ze_context_handle_t>::type,
+                    ZeContextDeleter>;
+using UniqueZeCommandList =
+    std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
+                    ZeCommandListDeleter>;
+
+struct L0RTContextWrapper {
+  ze_driver_handle_t driver{nullptr};
+  ze_device_handle_t device{nullptr};
+  UniqueZeContext context;
+  // Usually, one immediate command list with ordinal 0 suffices for
+  // both copy and compute ops, but it leaves the HW underutilized.
+  UniqueZeCommandList immCmdListCompute;
+  // Copy engines can be used for both memcpy and memset, but
+  // they have limitations on the memset pattern size (e.g., 1 byte).
+  UniqueZeCommandList immCmdListCopy;
+  uint32_t copyEngineMaxMemoryFillPatternSize{-1u};
+
+  L0RTContextWrapper() = default;
+  L0RTContextWrapper(const uint32_t driverIdx = 0, const int32_t devIdx = 0)
+      : driver(getDriver(driverIdx)), device(getDevice(driverIdx, devIdx)) {
+    // Create the context.
+    ze_context_handle_t ctx = getContext(driver);
+    context.reset(ctx);
+
+    // Determine the engine ordinals.
+    uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
+    ze_device_properties_t deviceProperties{};
+    L0_SAFE_CALL(zeDeviceGetProperties(device, &deviceProperties));
+    uint32_t queueGroupCount = 0;
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, nullptr));
+    std::vector<ze_command_queue_group_properties_t> queueGroupProperties(
+        queueGroupCount);
+    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
+        device, &queueGroupCount, queueGroupProperties.data()));
+
+    for (uint32_t queueGroupIdx = 0; queueGroupIdx < queueGroupCount;
+         ++queueGroupIdx) {
+      const auto &group = queueGroupProperties[queueGroupIdx];
+      if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
+        computeEngineOrdinal = queueGroupIdx;
+      else if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) {
+        copyEngineOrdinal = queueGroupIdx;
+        copyEngineMaxMemoryFillPatternSize = group.maxMemoryFillPatternSize;
+      }
+      if (copyEngineOrdinal != -1u && computeEngineOrdinal != -1u)
+        break;
+    }
+
+    // Fall back to the compute engine if no dedicated copy queue is available.
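+    // A group advertising COMPUTE may also advertise COPY, so the `else if`
+    // above selects only copy-only groups as the dedicated copy engine;
+    // compute queue groups accept copy commands as well, which makes this
+    // fallback safe.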
+    if (copyEngineOrdinal == -1u)
+      copyEngineOrdinal = computeEngineOrdinal;
+
+    assert(copyEngineOrdinal != -1u && computeEngineOrdinal != -1u &&
+           "Expected the copy and compute engine ordinals to be set.");
+
+    // Create the copy command list.
+    ze_command_queue_desc_t cmdQueueDesc{
+        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        nullptr,
+        copyEngineOrdinal, // ordinal
+        0,                 // index (assume one physical engine in the group)
+        0,                 // flags
+        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
+
+    ze_command_list_handle_t rawCmdListCopy = nullptr;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(context.get(), device,
+                                              &cmdQueueDesc, &rawCmdListCopy));
+    immCmdListCopy.reset(rawCmdListCopy);
+
+    // Create the compute command list.
+    cmdQueueDesc.ordinal = computeEngineOrdinal;
+    ze_command_list_handle_t rawCmdListCompute = nullptr;
+    L0_SAFE_CALL(zeCommandListCreateImmediate(
+        context.get(), device, &cmdQueueDesc, &rawCmdListCompute));
+    immCmdListCompute.reset(rawCmdListCompute);
+  }
+  L0RTContextWrapper(const L0RTContextWrapper &) = delete;
+  L0RTContextWrapper &operator=(const L0RTContextWrapper &) = delete;
+  // Allow move.
+  L0RTContextWrapper(L0RTContextWrapper &&) noexcept = default;
+  L0RTContextWrapper &operator=(L0RTContextWrapper &&) noexcept = default;
+  ~L0RTContextWrapper() = default;
+};
+
+struct ZeEventDeleter {
+  void operator()(ze_event_handle_t event) const {
+    if (event)
+      L0_SAFE_CALL(zeEventDestroy(event));
+  }
+};
+
+struct ZeEventPoolDeleter {
+  void operator()(ze_event_pool_handle_t pool) const {
+    if (pool)
+      L0_SAFE_CALL(zeEventPoolDestroy(pool));
+  }
+};
+
+using UniqueZeEvent =
+    std::unique_ptr<std::remove_pointer<ze_event_handle_t>::type,
+                    ZeEventDeleter>;
+using UniqueZeEventPool =
+    std::unique_ptr<std::remove_pointer<ze_event_pool_handle_t>::type,
+                    ZeEventPoolDeleter>;
+
+// L0 event pools have a fixed capacity chosen at creation time, so implement
+// a dynamically growing pool on top of them to avoid running out of events.
+struct DynamicEventPool {
+  constexpr static size_t numEventsPerPool{128};
+
+  std::vector<UniqueZeEventPool> eventPools;
+  std::vector<UniqueZeEvent> availableEvents;
+  std::unordered_map<ze_event_handle_t, UniqueZeEvent> takenEvents;
+
+  // Limit the number of events to avoid running out of memory.
+  // The limit is set to 32K events, which should be sufficient for most use
+  // cases.
+  size_t maxEventsCount{32768}; // 32K events
+  size_t currentEventsLimit{0};
+  size_t currentEventsCnt{0};
+  L0RTContextWrapper *rtCtx;
+
+  DynamicEventPool(L0RTContextWrapper *rtCtx) : rtCtx(rtCtx) {
+    createNewPool(numEventsPerPool);
+  }
+
+  DynamicEventPool(const DynamicEventPool &) = delete;
+  DynamicEventPool &operator=(const DynamicEventPool &) = delete;
+
+  // Allow move.
+  DynamicEventPool(DynamicEventPool &&) noexcept = default;
+  DynamicEventPool &operator=(DynamicEventPool &&) noexcept = default;
+
+  ~DynamicEventPool() {
+    assert(takenEvents.empty() && "Some events were not released");
+  }
+
+  void createNewPool(size_t numEvents) {
+    ze_event_pool_desc_t eventPoolDesc = {};
+    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+    eventPoolDesc.count = numEvents;
+
+    ze_event_pool_handle_t rawPool = nullptr;
+    L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context.get(), &eventPoolDesc, 1,
+                                   &rtCtx->device, &rawPool));
+
+    eventPools.emplace_back(UniqueZeEventPool(rawPool));
+    currentEventsLimit += numEvents;
+  }
+
+  ze_event_handle_t takeEvent() {
+    ze_event_handle_t rawEvent = nullptr;
+
+    if (!availableEvents.empty()) {
+      // Reuse a previously released event.
+      auto uniqueEvent = std::move(availableEvents.back());
+      availableEvents.pop_back();
+      rawEvent = uniqueEvent.get();
+      takenEvents[rawEvent] = std::move(uniqueEvent);
+    } else {
+      if (currentEventsCnt >= maxEventsCount) {
+        throw std::runtime_error("DynamicEventPool: reached max events limit");
+      }
+      if (currentEventsCnt == currentEventsLimit)
+        createNewPool(numEventsPerPool);
+
+      ze_event_desc_t eventDesc = {
+          ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
+          static_cast<uint32_t>(currentEventsCnt % numEventsPerPool),
+          ZE_EVENT_SCOPE_FLAG_DEVICE, ZE_EVENT_SCOPE_FLAG_HOST};
+
+      ze_event_handle_t newEvent = nullptr;
+      L0_SAFE_CALL(
+          zeEventCreate(eventPools.back().get(), &eventDesc, &newEvent));
+
+      takenEvents[newEvent] = UniqueZeEvent(newEvent);
+      rawEvent = newEvent;
+      currentEventsCnt++;
+    }
+
+    return rawEvent;
+  }
+
+  void releaseEvent(ze_event_handle_t event) {
+    auto it = takenEvents.find(event);
+    assert(it != takenEvents.end() &&
+           "Attempting to release an unknown or already released event");
+
+    L0_SAFE_CALL(zeEventHostReset(event));
+    availableEvents.emplace_back(std::move(it->second));
+    takenEvents.erase(it);
+  }
+};
+
+L0RTContextWrapper &getRtContext() {
+  thread_local static L0RTContextWrapper rtContext(0);
+  return rtContext;
+}
+
+DynamicEventPool &getDynamicEventPool() {
+  thread_local static DynamicEventPool dynEventPool{&getRtContext()};
+  return dynEventPool;
+}
+
+struct StreamWrapper {
+  // Use a deque to avoid event pointer invalidations.
+  std::deque<ze_event_handle_t> implicitEventStack;
+  DynamicEventPool &dynEventPool;
+
+  StreamWrapper(DynamicEventPool &dynEventPool) : dynEventPool(dynEventPool) {}
+  ~StreamWrapper() { sync(); }
+
+  ze_event_handle_t *getLastImplicitEventPtr() {
+    // Assume current implicit events will not be used after `sync`.
+    return implicitEventStack.size() ? &implicitEventStack.back() : nullptr;
+  }
+
+  void sync(ze_event_handle_t explicitEvent = nullptr) {
+    ze_event_handle_t syncEvent{nullptr};
+    if (!explicitEvent) {
+      ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+      syncEvent = lastImplicitEventPtr ? *lastImplicitEventPtr : nullptr;
+    } else {
+      syncEvent = explicitEvent;
+    }
+    if (syncEvent)
+      L0_SAFE_CALL(zeEventHostSynchronize(
+          syncEvent, std::numeric_limits<uint64_t>::max()));
+    // All of the "implicit" events have been signaled and are of no further
+    // use, so release them. An "explicit" event must be released via
+    // mgpuEventDestroy.
+    for (auto event : implicitEventStack)
+      dynEventPool.releaseEvent(event);
+    implicitEventStack.clear();
+  }
+
+  template <typename Func>
+  void enqueueOp(Func &&op) {
+    ze_event_handle_t newImplicitEvent = dynEventPool.takeEvent();
+    ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
+    const uint32_t numWaitEvents = lastImplicitEventPtr ? 1 : 0;
+    std::forward<Func>(op)(newImplicitEvent, numWaitEvents,
+                           lastImplicitEventPtr);
+    implicitEventStack.push_back(newImplicitEvent);
+  }
+};
+
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
+  assert(data);
+  ze_module_handle_t zeModule;
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  ze_module_build_log_handle_t buildLogHandle;
+  ze_result_t result =
+      zeModuleCreate(getRtContext().context.get(), getRtContext().device,
+                     &desc, &zeModule, &buildLogHandle);
+  if (result != ZE_RESULT_SUCCESS) {
+    std::cerr << "Error creating module, error code: " << result << std::endl;
+    size_t logSize = 0;
+    L0_SAFE_CALL(zeModuleBuildLogGetString(buildLogHandle, &logSize, nullptr));
+    std::string buildLog(logSize, ' ');
+    L0_SAFE_CALL(
+        zeModuleBuildLogGetString(buildLogHandle, &logSize, buildLog.data()));
+    std::cerr << "Build log:\n" << buildLog << std::endl;
+    std::abort();
+  }
+  return zeModule;
+}
+
+//===----------------------------------------------------------------------===//
+// L0 Wrappers definition
+//===----------------------------------------------------------------------===//
+
+extern "C" StreamWrapper *mgpuStreamCreate() {
+  return new StreamWrapper(getDynamicEventPool());
+}
+
+extern "C" void mgpuStreamSynchronize(StreamWrapper *stream) {
+  if (stream)
+    stream->sync();
+}
+
+extern "C" void mgpuStreamDestroy(StreamWrapper *stream) { delete stream; }
+
+extern "C" void mgpuStreamWaitEvent(StreamWrapper *stream,
+                                    ze_event_handle_t event) {
+  assert(stream && "Invalid stream");
+  assert(event && "Invalid event");
+  stream->sync(event);
+}
+
+extern "C" ze_event_handle_t mgpuEventCreate() {
+  return getDynamicEventPool().takeEvent();
+}
+
+extern "C" void mgpuEventDestroy(ze_event_handle_t event) {
+  return getDynamicEventPool().releaseEvent(event);
+}
+
+extern "C" void mgpuEventSynchronize(ze_event_handle_t event) {
+  L0_SAFE_CALL(
+      zeEventHostSynchronize(event, std::numeric_limits<uint64_t>::max()));
+  L0_SAFE_CALL(zeEventHostReset(event));
+}
+
+extern "C" void mgpuEventRecord(ze_event_handle_t event,
+                                StreamWrapper *stream) {
+  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
+      getRtContext().immCmdListCopy.get(), event));
+  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
+      getRtContext().immCmdListCompute.get(), event));
+}
+
+extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
+                              bool isShared) {
+  return catchAll([&]() {
+    void *memPtr = nullptr;
+    constexpr size_t alignment{64};
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
+    if (isShared) {
+      ze_host_mem_alloc_desc_t hostDesc = {};
+      hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
+      L0_SAFE_CALL(zeMemAllocShared(getRtContext().context.get(), &deviceDesc,
+                                    &hostDesc, size, alignment,
+                                    getRtContext().device, &memPtr));
+    } else {
+      L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context.get(), &deviceDesc,
+                                    size, alignment, getRtContext().device,
+                                    &memPtr));
+    }
+    if (!memPtr)
+      throw std::runtime_error("mem allocation failed!");
+    return memPtr;
+  });
+}
+
+extern "C" void mgpuMemFree(void *ptr, StreamWrapper *stream) {
+  stream->sync();
+  if (ptr)
+    L0_SAFE_CALL(zeMemFree(getRtContext().context.get(), ptr));
+}
+
+extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
+                           StreamWrapper *stream) {
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendMemoryCopy(
+        getRtContext().immCmdListCopy.get(), dst, src, sizeBytes, newEvent,
+        numWaitEvents, waitEvents));
+  });
+}
+
+template <typename PATTERN_TYPE>
+void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
+                StreamWrapper *stream) {
+  L0RTContextWrapper &rtContext = getRtContext();
+  auto listType =
+      rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
+          ? rtContext.immCmdListCopy.get()
+          : rtContext.immCmdListCompute.get();
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendMemoryFill(
+        listType, dst, &value, sizeof(PATTERN_TYPE),
+        count * sizeof(PATTERN_TYPE), newEvent, numWaitEvents, waitEvents));
+  });
+}
+
+extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count,
+                             StreamWrapper *stream) {
+  mgpuMemset<unsigned int>(dst, value, count, stream);
+}
+
+extern "C" void mgpuMemset16(void *dst, unsigned short value, size_t count,
+                             StreamWrapper *stream) {
+  mgpuMemset<unsigned short>(dst, value, count, stream);
+}
+
+extern "C" ze_module_handle_t mgpuModuleLoad(const void *data,
+                                             size_t gpuBlobSize) {
+  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
+}
+
+extern "C" ze_kernel_handle_t mgpuModuleGetFunction(ze_module_handle_t module,
+                                                    const char *name) {
+  assert(module && name);
+  ze_kernel_handle_t zeKernel;
+  ze_kernel_desc_t desc = {};
+  desc.pKernelName = name;
+  L0_SAFE_CALL(zeKernelCreate(module, &desc, &zeKernel));
+  return zeKernel;
+}
+
+extern "C" void mgpuLaunchKernel(ze_kernel_handle_t kernel, size_t gridX,
+                                 size_t gridY, size_t gridZ, size_t blockX,
+                                 size_t blockY, size_t blockZ,
+                                 size_t sharedMemBytes, StreamWrapper *stream,
+                                 void **params, void ** /*extra*/,
+                                 size_t paramsCount) {
+  if (sharedMemBytes > 0) {
+    paramsCount = paramsCount - 1; // The last param is the shared memory size.
+    L0_SAFE_CALL(
+        zeKernelSetArgumentValue(kernel, paramsCount, sharedMemBytes, nullptr));
+  }
+  for (size_t i = 0; i < paramsCount; ++i)
+    L0_SAFE_CALL(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
+                                          sizeof(void *), params[i]));
+  L0_SAFE_CALL(zeKernelSetGroupSize(kernel, blockX, blockY, blockZ));
+  ze_group_count_t dispatch;
+  dispatch.groupCountX = static_cast<uint32_t>(gridX);
+  dispatch.groupCountY = static_cast<uint32_t>(gridY);
+  dispatch.groupCountZ = static_cast<uint32_t>(gridZ);
+  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
+                        ze_event_handle_t *waitEvents) {
+    L0_SAFE_CALL(zeCommandListAppendLaunchKernel(
+        getRtContext().immCmdListCompute.get(), kernel, &dispatch, newEvent,
+        numWaitEvents, waitEvents));
+  });
+}
+
+extern "C" void mgpuModuleUnload(ze_module_handle_t module) {
+  L0_SAFE_CALL(zeModuleDestroy(module));
+}
+
+extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
+  catchAll([&]() {
+    // For now, the user must ensure that all streams and events have
+    // completed and been destroyed before switching the device.
+    getRtContext() = L0RTContextWrapper(/*driverIdx=*/0, devIdx);
+    getDynamicEventPool() = DynamicEventPool(&getRtContext());
+  });
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 89568e7766ae5..a4a942de3c9a7 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -167,6 +167,10 @@ if(MLIR_ENABLE_SYCL_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
 endif()

+if(MLIR_ENABLE_LEVELZERO_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_levelzero_runtime)
+endif()
+
 if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB)
   list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs)
 endif()
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..7e66dee0272f6
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir
@@ -0,0 +1,59 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8]]]>
+  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %mem = gpu.alloc host_shared () : memref<2x2x2xf32>
+    memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
+    memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+        args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
+    gpu.wait [%3]
+    %alloc = memref.alloc() : memref<2x2x2xf32>
+    memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
+    %4 = gpu.wait async
+    %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
+    %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
+    %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
+    gpu.wait [%7]
+    return %alloc : memref<2x2x2xf32>
+  }
+  gpu.module @test_kernel
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = gpu.block_id z
+      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+      %5 = arith.addf %3, %4 : f32
+      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [2.3, 4.5]
+  // CHECK: [7.8, 10.2]
+  // CHECK: [12.7, 14.9]
+  // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
new file mode 100644
index 0000000000000..df8fbe4d86d9c
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir
@@ -0,0 +1,57 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xi64>)
+  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+    %c3 = arith.constant 3 : index
+    %c1 = arith.constant 1 : index
+    %mem = gpu.alloc host_shared () : memref<3x3xi64>
+    memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
+    %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
+    memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+    %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1)
+        args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+    gpu.wait [%3]
+    %alloc = memref.alloc() : memref<3x3xi64>
+    memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+    %4 = gpu.wait async
+    %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+    %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+    %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
+    gpu.wait [%7]
+    return %alloc : memref<3x3xi64>
+  }
+  gpu.module @test_kernel
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel
+        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+  // CHECK: [2, 4100, 6],
+  // CHECK: [16777224, 10, 4294971404],
+  // CHECK: [16777230, 1103806595088, 1099511627794]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
new file mode 100644
index 0000000000000..cd99f2c70dc6e
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-memcpy-addf32-to-spirv.mlir
@@ -0,0 +1,56 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8]]]>
+  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
+    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
+    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
+    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
+    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+    memref.dealloc %2 : memref<2x2x2xf32>
+    return
+  }
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_0 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    %memref_1 = gpu.alloc () : memref<2x2x2xf32>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
+        args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
+    %alloc = memref.alloc() : memref<2x2x2xf32>
+    gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
+    gpu.dealloc %memref_1 : memref<2x2x2xf32>
+    gpu.dealloc %memref_0 : memref<2x2x2xf32>
+    gpu.dealloc %memref : memref<2x2x2xf32>
+    return %alloc : memref<2x2x2xf32>
+  }
+  gpu.module @test_kernel
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
+        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = gpu.block_id z
+      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
+      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
+      %5 = arith.addf %3, %4 : f32
+      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
+      gpu.return
+    }
+  }
+  // CHECK: [2.3, 4.5]
+  // CHECK: [7.8, 10.2]
+  // CHECK: [12.7, 14.9]
+  // CHECK: [18.2, 20.6]
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
new file mode 100644
index 0000000000000..8d022ac1cf277
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/gpu-reluf32-to-spirv.mlir
@@ -0,0 +1,86 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_levelzero_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @relu attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
+    [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
+    [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
+    [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
+    [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
+  ]>
+
+  func.func @main() {
+    %c1 = arith.constant 1 : index
+    %c100 = arith.constant 100 : index
+    %c0 = arith.constant 0 : index
+    %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>
+
+    scf.for %arg0 = %c0 to %c100 step %c1 {
+      %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
+      %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
+      func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
+      // CHECK: [0, 0, 0, 0.4, 0.5],
+      // CHECK: [0.1, 0, 0.3, 0, 0.5],
+      // CHECK: [0.1, 0.2, 0.3, 0, 0],
+      // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
+    }
+    return
+  }
+
+  func.func private @printMemrefF32(memref<*xf32>)
+  func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
+    %c5 = arith.constant 5 : index
+    %c4 = arith.constant 4 : index
+    %cst = arith.constant 0.000000e+00 : f32
+    %c1 = arith.constant 1 : index
+    %memref = gpu.alloc host_shared () : memref<4x5xf32>
+    memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
+    %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
+    %2 = gpu.wait async
+    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+        args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
+    gpu.wait [%3]
+    %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
+    %4 = gpu.wait async
+    %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
+        args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
+             %memref_1 : memref<4x5xf32>)
+    gpu.wait [%5]
+    %alloc = memref.alloc() : memref<4x5xf32>
+    memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
+    %6 = gpu.wait async
+    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
+    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
+    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
+    return %alloc : memref<4x5xf32>
+  }
+  gpu.module @test_kernel
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel
+        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
+      %3 = arith.cmpf olt, %2, %arg1 : f32
+      memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
+      gpu.return
+    }
+  }
+  gpu.module @test_kernel_0
+      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel
+        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id x
+      %1 = gpu.block_id y
+      %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
+      %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
+      %4 = arith.select %2, %arg2, %3 : f32
+      memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
+      gpu.return
+    }
+  }
+}
diff --git a/mlir/test/Integration/GPU/LevelZero/lit.local.cfg b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg
new file mode 100644
index 0000000000000..36c7ad5f57c7e
--- /dev/null
+++ b/mlir/test/Integration/GPU/LevelZero/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_levelzero_runner:
+    config.unsupported = True
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index feaf5fb852a1d..f392bdacadd3c 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -224,6 +224,9 @@ def find_real_python_interpreter():
 if config.enable_sycl_runner:
     tools.extend([add_runtime("mlir_sycl_runtime")])

+if config.enable_levelzero_runner:
+    tools.extend([add_runtime("mlir_levelzero_runtime")])
+
 if config.enable_spirv_cpu_runner:
     tools.extend([add_runtime("mlir_spirv_cpu_runtime")])

diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index b1185e19d86e8..d904780af4224 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
+config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
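
For reference, a minimal host-side sketch of how the C-linkage wrappers chain together (illustrative only; in practice the gpu-to-llvm lowering emits these calls at JIT time, and the declarations below simply mirror the definitions in LevelZeroRuntimeWrappers.cpp, with `StreamWrapper` kept opaque as `void *` and `nBytes` an assumed input):

    #include <cstdint>

    // Illustrative declarations mirroring the C-linkage wrappers above;
    // resolved from libmlir_levelzero_runtime.so at runtime.
    extern "C" void *mgpuStreamCreate();
    extern "C" void mgpuStreamSynchronize(void *stream);
    extern "C" void mgpuStreamDestroy(void *stream);
    extern "C" void *mgpuMemAlloc(uint64_t size, void *stream, bool isShared);
    extern "C" void mgpuMemFree(void *ptr, void *stream);

    void roundTrip(uint64_t nBytes) {
      void *stream = mgpuStreamCreate();
      // Host-visible USM allocation; isShared = false gives device-only memory.
      void *buf = mgpuMemAlloc(nBytes, stream, /*isShared=*/true);
      // mgpuModuleLoad / mgpuModuleGetFunction / mgpuLaunchKernel would be
      // called here; each launch is chained on the stream's implicit events.
      mgpuStreamSynchronize(stream); // waits on the last implicit event
      mgpuMemFree(buf, stream);      // syncs the stream, then frees
      mgpuStreamDestroy(stream);     // dtor syncs and releases implicit events
    }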