From e6bcef6f83d6f079c8dab5ad9b3a355a5fb445c4 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Fri, 11 Jul 2025 20:13:51 +0000 Subject: [PATCH 01/12] Add XeVM target and XeVM dialect integration tests. Covers remaining parts required for XeVM dialect integration testing. It has two high-level components - XeVM target and serialization support - XeVM dialect integration tests using SYCL runtime --- mlir/CMakeLists.txt | 8 + mlir/include/mlir/InitAllDialects.h | 2 + mlir/include/mlir/Target/LLVM/XeVM/Target.h | 31 +++ mlir/include/mlir/Target/LLVM/XeVM/Utils.h | 39 +++ mlir/include/mlir/Target/LLVMIR/Dialect/All.h | 3 + .../Dialect/XeVM/XeVMToLLVMIRTranslation.h | 33 +++ .../GPU/Transforms/XeVMAttachTarget.cpp | 1 + mlir/lib/Target/LLVM/CMakeLists.txt | 32 +++ mlir/lib/Target/LLVM/XeVM/Target.cpp | 257 ++++++++++++++++++ mlir/lib/Target/LLVMIR/CMakeLists.txt | 1 + mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt | 1 + .../Target/LLVMIR/Dialect/XeVM/CMakeLists.txt | 21 ++ .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp | 117 ++++++++ .../Dialect/XeVM/GPU/lit.local.cfg | 4 + .../Dialect/XeVM/GPU/xevm_block_dpas.mlir | 135 +++++++++ .../XeVM/GPU/xevm_block_load_store.mlir | 103 +++++++ .../xevm_block_load_store_pack_register.mlir | 119 ++++++++ .../GPU/xevm_block_load_store_transpose.mlir | 127 +++++++++ .../Dialect/XeVM/GPU/xevm_store_cst.mlir | 74 +++++ mlir/test/lib/Dialect/GPU/CMakeLists.txt | 1 + mlir/test/lit.site.cfg.py.in | 1 + 21 files changed, 1110 insertions(+) create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Target.h create mode 100644 mlir/include/mlir/Target/LLVM/XeVM/Utils.h create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h create mode 100644 mlir/lib/Target/LLVM/XeVM/Target.cpp create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg create mode
100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir
 create mode 100644 mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir

diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index a1ad81f625cd6..7c9d62051f9f8 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -137,6 +137,14 @@ else()
   set(MLIR_ENABLE_ROCM_CONVERSIONS 0)
 endif()
 
+# Build the XeVM conversions and run the corresponding tests if the SPIRV
+# backend is available.
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 1)
+else()
+  set(MLIR_ENABLE_XEVM_CONVERSIONS 0)
+endif()
+
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index c6fcf1a0d510b..79dcafe69f0a5 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -102,6 +102,7 @@
 #include "mlir/Interfaces/CastInterfaces.h"
 #include "mlir/Target/LLVM/NVVM/Target.h"
 #include "mlir/Target/LLVM/ROCDL/Target.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "mlir/Target/SPIRV/Target.h"
 
 namespace mlir {
@@ -200,6 +201,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
   NVVM::registerNVVMTargetInterfaceExternalModels(registry);
   ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
   spirv::registerSPIRVTargetInterfaceExternalModels(registry);
+  xevm::registerXeVMTargetInterfaceExternalModels(registry);
 }
 
 /// Append all the MLIR dialects to the registry contained in the given context.
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Target.h b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
new file mode 100644
index 0000000000000..31a93d0ebabfc
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Target.h
@@ -0,0 +1,31 @@
+//===-- Target.h - MLIR XeVM target registration ----------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for attaching the XeVM target interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_XEVM_TARGET_H
+#define MLIR_TARGET_XEVM_TARGET_H
+
+namespace mlir {
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir::xevm {
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the given registry.
+void registerXeVMTargetInterfaceExternalModels(mlir::DialectRegistry &registry);
+
+/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
+/// the registry associated with the given context.
+void registerXeVMTargetInterfaceExternalModels(mlir::MLIRContext &context);
+} // namespace mlir::xevm
+
+#endif // MLIR_TARGET_XEVM_TARGET_H
diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
new file mode 100644
index 0000000000000..c11a97f0d960a
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h
@@ -0,0 +1,39 @@
+//===-- Utils.h - MLIR XeVM target utils ------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XeVM target related utility classes and functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_XEVM_UTILS_H
+#define MLIR_TARGET_LLVM_XEVM_UTILS_H
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/Target/LLVM/ModuleToObject.h"
+
+namespace mlir {
+namespace xevm {
+
+/// Base class for all XeVM serializations from GPU modules into binary strings.
+/// By default this class serializes into LLVM bitcode.
+class SerializeGPUModuleBase : public mlir::LLVM::ModuleToObject {
+public:
+  SerializeGPUModuleBase(mlir::Operation &module, XeVMTargetAttr target,
+                         const mlir::gpu::TargetOptions &targetOptions = {});
+
+  static void init();
+  XeVMTargetAttr getTarget() const;
+
+protected:
+  XeVMTargetAttr target;
+};
+} // namespace xevm
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_XEVM_UTILS_H
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 60615cf601655..e4670cb1a9622 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -28,6 +28,7 @@
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
 
 namespace mlir {
 class DialectRegistry;
@@ -47,6 +48,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
   registerVCIXDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for
translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
@@ -63,6 +65,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
   registerNVVMDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
+  registerXeVMDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
new file mode 100644
index 0000000000000..149a2119657d5
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h
@@ -0,0 +1,33 @@
+//===-- XeVMToLLVMIRTranslation.h - XeVM to LLVM IR -------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides registration calls for XeVM dialect to LLVM IR translation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+#define MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
+
+namespace mlir {
+
+class DialectRegistry;
+class MLIRContext;
+} // namespace mlir
+
+namespace mlir {
+/// Register the XeVM dialect and the translation from it to the LLVM IR in the
+/// given registry.
+void registerXeVMDialectTranslation(mlir::DialectRegistry &registry);
+
+/// Register the XeVM dialect and the translation from it in the registry
+/// associated with the given context.
+void registerXeVMDialectTranslation(mlir::MLIRContext &context);
+
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H
diff --git a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
index e9cf4939a13b8..6da76e9e7a331 100644
--- a/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/XeVMAttachTarget.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/Pass/Pass.h"
+#include "mlir/Target/LLVM/XeVM/Target.h"
 #include "llvm/Support/Regex.h"
 
 namespace mlir {
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 83fbf7a5fe5f3..ed15c5d2ab2ca 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -209,3 +209,35 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
   )
 endif()
 
+if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
+  set(SPIRV_LIBS
+    SPIRVCodeGen
+
+  )
+endif()
+
+add_mlir_dialect_library(MLIRXeVMTarget
+  XeVM/Target.cpp
+
+  OBJECT
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR
+
+  LINK_COMPONENTS
+  ${SPIRV_LIBS}
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRExecutionEngineUtils
+  MLIRSupport
+  MLIRGPUDialect
+  MLIRTargetLLVM
+  MLIRXeVMToLLVMIRTranslation
+)
+
+# Ensure SPIRV headers are included. Warning: references build directory!
+target_include_directories(MLIRXeVMTarget PRIVATE
+  ${LLVM_MAIN_SRC_DIR}/lib/Target/SPIRV
+  ${LLVM_BINARY_DIR}/lib/Target/SPIRV
+)
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp
new file mode 100644
index 0000000000000..380e2bff222ca
--- /dev/null
+++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp
@@ -0,0 +1,266 @@
+//===-- Target.cpp - MLIR LLVM XeVM target compilation ----------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines XeVM target related functions including registration
+// calls for the `#xevm.target` compilation attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/XeVM/Target.h"
+
+#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/ExtensibleDialect.h"
+#include "mlir/Target/LLVM/XeVM/Utils.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+
+#include "llvm/Config/Targets.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+
+// FIXME: One of the headers uses `.inc` file from the build directory, this
+// does not work for installation (i.e., DCMAKE_INSTALL_PREFIX) caching as build
+// directory will not be cached. Since float atomics are not yet supported by
+// the backend anyway, we can afford to temporarily comment this section.
+
+// #if LLVM_HAS_SPIRV_TARGET
+// #pragma GCC diagnostic push
+// #pragma GCC diagnostic ignored "-Wnon-virtual-dtor"
+// #include "SPIRVTargetMachine.h"
+// #pragma GCC diagnostic pop
+
+// #include "SPIRVCommandLine.h"
+// #endif // LLVM_HAS_SPIRV_TARGET
+
+#include <set>
+
+using namespace mlir;
+
+namespace {
+// XeVM implementation of the gpu:TargetAttrInterface.
+class XeVMTargetAttrImpl
+    : public gpu::TargetAttrInterface::FallbackModel<XeVMTargetAttrImpl> {
+public:
+  std::optional<SmallVector<char, 0>>
+  serializeToObject(Attribute attribute, Operation *module,
+                    const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute, Operation *module,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
+};
+} // namespace
+
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(
+      +[](MLIRContext *ctx, mlir::xevm::XeVMDialect *dialect) {
+        mlir::xevm::XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
+      });
+}
+
+void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
+    MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMTargetInterfaceExternalModels(registry);
+  context.appendDialectRegistry(registry);
+}
+
+mlir::xevm::SerializeGPUModuleBase::SerializeGPUModuleBase(
+    Operation &module, mlir::xevm::XeVMTargetAttr target,
+    const gpu::TargetOptions &targetOptions)
+    : ModuleToObject(module, target.getTriple(), "", {}, target.getO()),
+      target(target) {}
+
+void mlir::xevm::SerializeGPUModuleBase::init() {
+  static llvm::once_flag initializeBackendOnce;
+  llvm::call_once(initializeBackendOnce, []() {
+#if LLVM_HAS_SPIRV_TARGET
+    LLVMInitializeSPIRVTarget();
+    LLVMInitializeSPIRVTargetInfo();
+    LLVMInitializeSPIRVTargetMC();
+    LLVMInitializeSPIRVAsmPrinter();
+#endif
+  });
+}
+
+mlir::xevm::XeVMTargetAttr
+mlir::xevm::SerializeGPUModuleBase::getTarget() const {
+  return target;
+}
+
+namespace {
+/// Serializer for `gpu.module` ops carrying an `#xevm.target`: produces LLVM
+/// bitcode, SPIR-V assembly or a SPIR-V binary depending on the target options.
+class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase {
+public:
+  SpirSerializer(Operation &module, mlir::xevm::XeVMTargetAttr target,
+                 const gpu::TargetOptions &targetOptions)
+      : mlir::xevm::SerializeGPUModuleBase(module, target, targetOptions),
+        targetOptions(targetOptions) {}
+
+  gpu::GPUModuleOp getOperation();
+
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  /// Runs the target machine's object-file emission passes on `llvmModule` and
+  /// returns the emitted bytes, or `std::nullopt` if they cannot be added.
+  std::optional<std::string>
+  translateToSPIRVBinary(llvm::Module &llvmModule,
+                         llvm::TargetMachine &targetMachine);
+  /// Copy of the options this serializer was created with; consulted by
+  /// `moduleToObject` to pick the output format.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+gpu::GPUModuleOp SpirSerializer::getOperation() {
+  return dyn_cast<gpu::GPUModuleOp>(
+      &mlir::xevm::SerializeGPUModuleBase::getOperation());
+}
+
+std::optional<SmallVector<char, 0>>
+SpirSerializer::moduleToObject(llvm::Module &llvmModule) {
+  // Return LLVM IR if the compilation target is `offload`.
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
+    return mlir::xevm::SerializeGPUModuleBase::moduleToObject(llvmModule);
+
+#if !LLVM_HAS_SPIRV_TARGET
+  getOperation()->emitError(
+      "The `SPIRV` target was not built. Please enable it when building LLVM.");
+  return std::nullopt;
+#endif // LLVM_HAS_SPIRV_TARGET
+
+  std::optional<llvm::TargetMachine *> targetMachine =
+      getOrCreateTargetMachine();
+  if (!targetMachine) {
+    getOperation().emitError() << "Target Machine unavailable for triple "
+                               << triple << ", can't compile with LLVM\n";
+    return std::nullopt;
+  }
+
+  //===----------------------------------------------------------------------===//
+  // Workaround to enable spirv extensions that are not added to target machine
+  // by default.
+
+  // FIXME: see fixme comment above SPIRV headers.
+  // #if LLVM_HAS_SPIRV_TARGET
+  //   std::set<llvm::SPIRV::Extension::Extension> AllowedExtIds{
+  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add,
+  //       llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add};
+  //   llvm::SPIRVTargetMachine *STM =
+  //       static_cast<llvm::SPIRVTargetMachine *>(targetMachine.value());
+  //   const_cast<llvm::SPIRVSubtarget *>(STM->getSubtargetImpl())
+  //       ->initAvailableExtensions(AllowedExtIds);
+  // #endif // LLVM_HAS_SPIRV_TARGET
+
+  //===----------------------------------------------------------------------===//
+
+  // Return SPIRV if the compilation target is `assembly`.
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    std::optional<std::string> serializedISA =
+        translateToISA(llvmModule, **targetMachine);
+    if (!serializedISA) {
+      getOperation().emitError() << "Failed translating the module to ISA.";
+      return std::nullopt;
+    }
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }
+
+  std::optional<std::string> serializedSPIRVBinary =
+      translateToSPIRVBinary(llvmModule, **targetMachine);
+  if (!serializedSPIRVBinary) {
+    getOperation().emitError() << "Failed translating the module to Binary.";
+    return std::nullopt;
+  }
+  if (serializedSPIRVBinary->size() % 4) {
+    getOperation().emitError() << "SPIRV code size must be a multiple of 4.";
+    return std::nullopt;
+  }
+  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
+  return SmallVector<char, 0>(bin.begin(), bin.end());
+}
+
+std::optional<std::string>
+SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  std::string targetISA;
+  llvm::raw_string_ostream stream(targetISA);
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    llvm::legacy::PassManager codegenPasses;
+
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CodeGenFileType::ObjectFile))
+      return std::nullopt;
+
+    codegenPasses.run(llvmModule);
+  }
+  return targetISA;
+}
+
+std::optional<SmallVector<char, 0>>
+XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
+                                      const gpu::TargetOptions &options) const {
+  if (!module)
+    return std::nullopt;
+  auto gpuMod = dyn_cast<gpu::GPUModuleOp>(module);
+  if (!gpuMod) {
+    module->emitError("expected to be a gpu.module op");
+    return std::nullopt;
+  }
+  // Pin the sub-group size of the first kernel found to 16; the walk stops
+  // after that kernel, so additional kernels in the module are left untouched.
+  gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
+    if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
+      funcOp.setIntelReqdSubGroupSize(16);
+      return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  });
+
+  SpirSerializer serializer(
+      *module, cast<mlir::xevm::XeVMTargetAttr>(attribute), options);
+  serializer.init();
+
+#if !LLVM_HAS_SPIRV_TARGET
+  module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
+                    "without having the target built.");
+#endif
+
+  return serializer.run();
+}
+
+Attribute
+XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  DictionaryAttr objectProps;
+  Builder builder(attribute.getContext());
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      objectProps, /*kernels=*/nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index af22a7ff04bf0..9ea5c6835e8ef 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -60,6 +60,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
   MLIRROCDLToLLVMIRTranslation
   MLIRSPIRVToLLVMIRTranslation
   MLIRVCIXToLLVMIRTranslation
+  MLIRXeVMToLLVMIRTranslation
   )
 
 add_mlir_translation_library(MLIRTargetLLVMIRImport
diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
index f030fa78942d5..86c731a1074c3 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt
@@ -10,3 +10,4 @@ add_subdirectory(OpenMP)
 add_subdirectory(ROCDL)
 add_subdirectory(SPIRV)
 add_subdirectory(VCIX)
+add_subdirectory(XeVM)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
new file mode 100644
index 0000000000000..6308d7e2e4404
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt
@@ -0,0 +1,21 @@
+set(LLVM_OPTIONAL_SOURCES
+  XeVMToLLVMIRTranslation.cpp
+)
+
+add_mlir_translation_library(MLIRXeVMToLLVMIRTranslation
+  XeVMToLLVMIRTranslation.cpp
+
+  DEPENDS
+  MLIRXeVMConversionsIncGen
+
+  LINK_COMPONENTS
+  Core
+
+  LINK_LIBS PUBLIC
+  MLIRDialectUtils
+  MLIRIR
+  MLIRLLVMDialect
+  MLIRXeVMDialect
+  MLIRSupport
+  MLIRTargetLLVMIRExport
+)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
new file mode 100644
index 0000000000000..f961d25ff1b86
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp
@@ -0,0 +1,134 @@
+//===-- XeVMToLLVMIRTranslation.cpp - Translate XeVM to LLVM IR -*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a translation between the MLIR XeVM dialect and
+// LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
+#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/Target/LLVMIR/ModuleTranslation.h"
+
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+namespace {
+/// Implementation of the dialect interface that converts operations belonging
+/// to the XeVM dialect to LLVM IR.
+class XeVMDialectLLVMIRTranslationInterface
+    : public LLVMTranslationDialectInterface {
+public:
+  using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;
+
+  /// Translates the given operation to LLVM IR using the provided IR builder
+  /// and saving the state in `moduleTranslation`.
+  LogicalResult
+  convertOperation(Operation *op, llvm::IRBuilderBase &builder,
+                   LLVM::ModuleTranslation &moduleTranslation) const final {
+    /* TODO */
+    return failure();
+  }
+
+  /// Amends the LLVM IR generated for `op` based on a XeVM dialect attribute.
+  /// Cache-control attributes become `spirv.DecorationCacheControlINTEL`
+  /// metadata on the single instruction produced for `op`; for any other
+  /// attribute the hook only succeeds when `op` is an LLVM function.
+  LogicalResult
+  amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
+                 NamedAttribute attribute,
+                 LLVM::ModuleTranslation &moduleTranslation) const final {
+    StringRef attrName = attribute.getName().getValue();
+    if (attrName == mlir::xevm::XeVMDialect::getCacheControlsAttrName()) {
+      // `dyn_cast` returns a null attribute for non-array values; reject them
+      // here instead of dereferencing a null `ArrayAttr`.
+      auto cacheControlsArray = dyn_cast<ArrayAttr>(attribute.getValue());
+      if (!cacheControlsArray || cacheControlsArray.size() != 2) {
+        return op->emitOpError(
+            "Expected both L1 and L3 cache control attributes!");
+      }
+      if (instructions.size() != 1) {
+        return op->emitOpError("Expecting a single instruction");
+      }
+      if (failed(handleDecorationCacheControl(instructions.front(),
+                                              cacheControlsArray.getValue()))) {
+        return op->emitOpError("Expected each cache control attribute to be "
+                               "an array of 4 integer attributes");
+      }
+      return success();
+    }
+    auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
+    if (!func)
+      return failure();
+
+    return success();
+  }
+
+private:
+  /// Wraps the integer `val` as a constant of type `type` in metadata form.
+  template <typename IntTy>
+  static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) {
+    return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val));
+  }
+
+  /// Attaches `spirv.DecorationCacheControlINTEL` metadata to `inst`, with one
+  /// node per entry of `attrs`. Every entry must be an array of exactly four
+  /// integer attributes; otherwise nothing is attached and failure is returned.
+  static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst,
+                                                    ArrayRef<Attribute> attrs) {
+    // Operand count of one decoration node; also the size of the fixed buffer
+    // below, so it must be validated before copying.
+    constexpr size_t numOperands = 4;
+    SmallVector<llvm::Metadata *> decorations;
+    llvm::LLVMContext &ctx = inst->getContext();
+    llvm::Type *i32Ty = llvm::IntegerType::getInt32Ty(ctx);
+    for (Attribute attr : attrs) {
+      auto valuesArray = dyn_cast<ArrayAttr>(attr);
+      if (!valuesArray || valuesArray.size() != numOperands)
+        return failure();
+      std::array<llvm::Metadata *, numOperands> metadata;
+      for (size_t i = 0; i < numOperands; ++i) {
+        auto valueAttr = dyn_cast<IntegerAttr>(valuesArray[i]);
+        if (!valueAttr)
+          return failure();
+        metadata[i] = getConstantIntMD(i32Ty, valueAttr.getValue());
+      }
+      decorations.push_back(llvm::MDNode::get(ctx, metadata));
+    }
+    constexpr llvm::StringLiteral decorationCacheControlMDName =
+        "spirv.DecorationCacheControlINTEL";
+    inst->setMetadata(decorationCacheControlMDName,
+                      llvm::MDNode::get(ctx, decorations));
+    return success();
+  }
+};
+} // namespace
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry &registry) {
+  registry.insert<xevm::XeVMDialect>();
+  registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) {
+    dialect->addInterfaces<XeVMDialectLLVMIRTranslationInterface>();
+  });
+}
+
+void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) {
+  DialectRegistry registry;
+  registerXeVMDialectTranslation(registry);
+  context.appendDialectRegistry(registry);
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
new file mode 100644
index 0000000000000..d172445e6ee54
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
@@ -0,0 +1,4 @@
+if not config.run_xevm_tests:
+    config.unsupported = True
+if not config.enable_sycl_runner:
+    config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
new file mode 100644
index 0000000000000..07bd15a35083e
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
@@ -0,0 +1,135 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:
--shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+  gpu.module @kernel {
+    // - Sets of `matrix_mad` intrinsics can differ based on device's *minimal* supported sub-group size.
+    //   The *minimum supported* sub-group size should be used to call `matrix_mad` intrinsics.
+    //   https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html
+
+    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel {
+      %base_width_a = arith.constant 32 : i32
+      %base_height_a = arith.constant 8 : i32
+      %base_pitch_a = arith.constant 32 : i32
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>
+
+      %base_width_b = arith.constant 32 : i32
+      %base_height_b = arith.constant 16 : i32
+      %base_pitch_b = arith.constant 32 : i32
+      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
+      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32>
+
+      %base_width_c = arith.constant 64 : i32
+      %base_height_c = arith.constant 8 : i32
+      %base_pitch_c = arith.constant 64 : i32
+      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+
+      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32>
+      %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted {shape=<m=8, n=16, k=16>, types=<d=f32, a=f16, b=f16, c=f32>} : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
+      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32>
+
+      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+
+    %memref_a = gpu.alloc() : memref<8x16xf16>
+    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
+    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
+    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
+    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
+    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_b = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
+    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
+    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
+    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
+    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_c = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>
+    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
+    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
+    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
+    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_a : memref<8x16xf16>
+    gpu.dealloc %memref_b : memref<16x16xf16>
+    %res = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_c : memref<8x16xf32>
+    return %res : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %row_idx = arith.index_cast %i : index to i32
+        %row = arith.sitofp %row_idx : i32 to f16
+        memref.store %row, %A[%i, %j] : memref<8x16xf16>
+      }
+    }
+    %B = memref.alloc() : memref<16x16xf16>
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        %col_idx = arith.index_cast %j : index to i32
+        %col = arith.sitofp %col_idx : i32 to f16
+        memref.store %col, %B[%i, %j] : memref<16x16xf16>
+      }
+    }
+
+    %C = memref.alloc() : memref<8x16xf32>
+    %c0_f32 = arith.constant 0.0 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c0_f32, %C[%i, %j] : memref<8x16xf32>
+      }
+    }
+
+    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
+    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16>
+    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    // CHECK-NEXT: [0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]
+    // CHECK-NEXT: [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480]
+    // CHECK-NEXT: [0, 48, 96, 144, 192, 240, 288, 336, 384, 432, 480, 528, 576, 624, 672, 720]
+    // CHECK-NEXT: [0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960]
+    // CHECK-NEXT: [0, 80, 160, 240, 320, 400, 480, 560, 640, 720, 800, 880, 960, 1040, 1120, 1200]
+    // CHECK-NEXT: [0, 96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056, 1152, 1248, 1344, 1440]
+    // CHECK-NEXT: [0, 112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120, 1232, 1344, 1456, 1568, 1680]
+
+    memref.dealloc %A : memref<8x16xf16>
+    memref.dealloc %B : memref<16x16xf16>
+    memref.dealloc %C : memref<8x16xf32>
+    memref.dealloc %C_res : memref<8x16xf32>
+    return
+  }
+  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
+  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
+
+}
diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
new file mode 100644
index 0000000000000..3efb43bd0e426
--- /dev/null
+++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir
@@ -0,0 +1,103 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
+// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
+// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
+// RUN: | mlir-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --shared-libs=%mlir_c_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @gemm attributes {gpu.container_module} {
+
+  gpu.module @kernel {
+    // - `cl_intel_subgroups` block load/store intrinsics operate at the *maximum* sub-group size,
+    //   regardless of the active sub-group size.
Make sure `clGetKernelSubGroupInfo` meets your expectations.
+    // - The attribute `intel_reqd_sub_group_size` establishes the maximum sub-group size for a kernel.
+    //
+    // Note: launching 16 threads without explicit `intel_reqd_sub_group_size = 16` may still use
+    // the default sub-group size of 32.
+    //
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_required_subgroup_size.html
+    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html
+
+    // Loads an 8x16 tile of 32-bit elements from %src, replaces element 0 of each work-item's
+    // vector with that work-item's thread id, and stores the tile to %dst.
+    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
+      %base_width = arith.constant 64 : i32 // bytewidth of the block
+      %base_height = arith.constant 8 : i32 // number of rows
+      %base_pitch = arith.constant 64 : i32 // bytewidth of the base row
+      %x = arith.constant 0 : i32
+      %y = arith.constant 0 : i32
+      // If `intel_reqd_sub_group_size = 16` is not set, the default (32) is used and this `blockload2d` would only load 4 elements into vector<8xi32>
+      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
+      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
+      // Overwrite element 0 with the thread id; the host-side CHECK expects the row 0, 1, ..., 15.
+      %thread_x = gpu.thread_id x
+      %thread_x_i64 = arith.index_cast %thread_x : index to i64
+      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
+      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
+      // `vector.insert` with a static position, matching the other XeVM tests (replaces the deprecated `vector.insertelement`).
+      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[0] : f32 into vector<8xf32>
+      %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
+      xevm.blockstore2d %dst, %base_width, %base_height, %base_pitch, %x, %y, %loaded_modified <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+  func.func @test(%src
: memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`) + %memref_src = gpu.alloc() : memref<8x16xf32> + gpu.memcpy %memref_src, %src : memref<8x16xf32>, memref<8x16xf32> + %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<8x16xf32> -> index + %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64 + %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr + %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1> + + %memref_dst = gpu.alloc() : memref<8x16xf32> + %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index + %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64 + %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr + %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1> + + gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>) + gpu.dealloc %memref_src : memref<8x16xf32> + %dst = memref.alloc() : memref<8x16xf32> + gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32> + gpu.dealloc %memref_dst : memref<8x16xf32> + return %dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to 
memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + memref.dealloc %B : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir new file mode 100644 index 0000000000000..88d2e90794fb0 --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel { + %base_width = arith.constant 32 : i32 // bytewidth of the block + %base_height_load = arith.constant 16 : i32 // number of rows + %base_pitch = arith.constant 32 : i32 // bytewidth of the base row + %x = arith.constant 0 : i32 + %y = arith.constant 0 : i32 + + 
// Consider the following two loads: + // Normal load: + %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16> + %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16> + + // Register packed load: + %loaded_packed = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=false, pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + %loaded_packed_f16_flat = vector.bitcast %loaded_packed : vector<8xi32> to vector<16xf16> + %loaded_packed_f16 = vector.shape_cast %loaded_packed_f16_flat : vector<16xf16> to vector<8x1x2xf16> + // Both can be represented the same way in code as vector<16xf16>. + // A normal load pads a value to a dword (e.g., 32-bit) when loaded to a register. + // Packed load "packs" multiple sub-dword values along the column (↓), allowing a single register to hold multiple values. + // In SIMT, a work-item reads values along the column (↓), hence a sequence of values loaded by packing to register is logically equivalent to the sequence of values loaded using a normal load. + // The load results of both methods can have the same logical representation, but are expected to differ in physical layout and register efficiency. 
+ + %thread_x = gpu.thread_id x + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 + %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16 + %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_packed_f16 [0,0,1] : f16 into vector<8x1x2xf16> // Both loaded_packed_f16 and loaded_f16 can be used here + // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores + %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16> + %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16> + %base_height_store = arith.constant 8 : i32 // number of rows + %base_width_store = arith.constant 32 : i32 // bytewidth of the block + %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row + xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>) + + %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16> + %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16> + + %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32 + %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>) + 
gpu.return
+    }
+  }
+
+
+  // Host-side driver: copies %src into a device buffer, launches @block_load_store on 16 threads
+  // with global-address-space pointers to the source and destination device buffers, and returns
+  // a host copy of the destination buffer.
+  func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x16xf16>
+    gpu.memcpy %memref_src, %src : memref<16x16xf16>, memref<16x16xf16>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<16x16xf16>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x16xf16>
+    %dst = memref.alloc() : memref<16x16xf16>
+    gpu.memcpy %dst, %memref_dst : memref<16x16xf16>, memref<16x16xf16>
+    gpu.dealloc %memref_dst : memref<16x16xf16>
+    return %dst : memref<16x16xf16>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x16xf16>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index
+    %c11_f16 = arith.constant 11.1 : f16
+    // A is 16x16, so both loops run to 16; every element is set to 11.1.
+    scf.for %i = %c0 to %c16 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f16, %A[%i, %j] : memref<16x16xf16>
+      }
+    }
+    %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16>
+    %B_cast = memref.cast %B : memref<16x16xf16> to
memref<*xf16> + %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16> + call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + call @printMemrefF16(%B_cast) : (memref<*xf16>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.1{{.*}}] + // CHECK-COUNT-224: 11.1 + // CHECK-NEXT: [11.1{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.1{{.*}}] + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-208: 11.1 + // CHECK-NEXT: [11.1{{.*}}] + + memref.dealloc %A : memref<16x16xf16> + memref.dealloc %B : memref<16x16xf16> + return + } + func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface } +} diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir new file mode 100644 index 0000000000000..646d1fc9b4666 --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir @@ -0,0 +1,127 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel { + %base_width = arith.constant 32 : i32 // bytewidth of the block + %base_height = arith.constant 16 : i32 // number of rows + %base_pitch = 
arith.constant 32 : i32 // bytewidth of the base row + %x = arith.constant 0 : i32 + %y = arith.constant 0 : i32 + // Normally a work-item loads a vertical slice (↓), but with *transpose* a work-item loads a horizontal slice (→). + // The tile dimension we want to slice must be a multiple of the sub-group size: e.g., we want to slice rows (→), then we need SG_SIZE % tile_height == 0. + %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32, transpose=true, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32> + + %c0 = arith.constant 0 : i32 + %thread_x = gpu.thread_id x + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 + %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32 + %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32> // Use this to see where threadIds end up stored + %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> + + %base_height_store = arith.constant 8 : i32 // number of rows + %base_width_store = arith.constant 64 : i32 // bytewidth of the block + %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row + // "Transposed" stores are not available, meaning a work-item can store its vector as a vertical slice (↓). 
+      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
+      gpu.return
+    }
+  }
+
+
+  // Host-side driver: copies the 16x8 %src into a device buffer, launches @block_load_store on
+  // 16 threads, and returns a host copy of the 8x16 device destination buffer.
+  func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %c1 = arith.constant 1 : index
+    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
+    %memref_src = gpu.alloc() : memref<16x8xf32>
+    gpu.memcpy %memref_src, %src : memref<16x8xf32>, memref<16x8xf32>
+    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index
+    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
+    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
+    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    %memref_dst = gpu.alloc() : memref<8x16xf32>
+    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
+    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
+    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
+    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>
+
+    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
+    gpu.dealloc %memref_src : memref<16x8xf32>
+    %dst = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %memref_dst : memref<8x16xf32>
+    return %dst : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<16x8xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    // Fill A so that A[i][j] = i + j / 10 (row index in the integer part, column index in the first decimal).
+    scf.for %i = %c0 to %c16 step %c1 {
+
scf.for %j = %c0 to %c8 step %c1 { + %c_10_f = arith.constant 10.0 : f32 + %j_i64 = arith.index_cast %j : index to i64 + %j_i32 = llvm.trunc %j_i64 : i64 to i32 + %j_f32 = arith.sitofp %j_i32 : i32 to f32 + %jj = arith.divf %j_f32, %c_10_f : f32 + + %i_i64 = arith.index_cast %i : index to i64 + %i_i32 = llvm.trunc %i_i64 : i64 to i32 + %i_f32 = arith.sitofp %i_i32 : i32 to f32 + %ii = arith.addf %i_f32, %jj : f32 + memref.store %ii, %A[%i, %j] : memref<16x8xf32> + } + } + %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32> + %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], + // CHECK-NEXT: [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], + // CHECK-NEXT: [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7], + // CHECK-NEXT: [3, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7], + // CHECK-NEXT: [4, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7], + // CHECK-NEXT: [5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7], + // CHECK-NEXT: [6, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7], + // CHECK-NEXT: [7, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7], + // CHECK-NEXT: [8, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7], + // CHECK-NEXT: [9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7], + // CHECK-NEXT: [10, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7], + // CHECK-NEXT: [11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7], + // CHECK-NEXT: [12, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7], + // CHECK-NEXT: [13, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7], + // CHECK-NEXT: [14, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7], + // CHECK-NEXT: [15, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7] + + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + // CHECK-NEXT: [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 
13.1, 14.1, 15.1], + // CHECK-NEXT: [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 14.2, 15.2], + // CHECK-NEXT: [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3, 15.3], + // CHECK-NEXT: [0.4, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4, 9.4, 10.4, 11.4, 12.4, 13.4, 14.4, 15.4], + // CHECK-NEXT: [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5], + // CHECK-NEXT: [0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6, 9.6, 10.6, 11.6, 12.6, 13.6, 14.6, 15.6], + // CHECK-NEXT: [0.7, 1.7, 2.7, 3.7, 4.7, 5.7, 6.7, 7.7, 8.7, 9.7, 10.7, 11.7, 12.7, 13.7, 14.7, 15.7] + + memref.dealloc %A : memref<16x8xf32> + memref.dealloc %B : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface } +} diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir new file mode 100644 index 0000000000000..7ead3577857f5 --- /dev/null +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir @@ -0,0 +1,74 @@ +// RUN: mlir-opt %s \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \ +// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ +// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @gemm attributes {gpu.container_module} { + + gpu.module @kernel { + gpu.func @store_constant(%ptr: !llvm.ptr<1>) kernel { + %const_val = arith.constant 42.0 : f32 + %thread_x = gpu.lane_id + %thread_x_i64 = arith.index_cast %thread_x 
: index to i64
+      %ptr_next_1 = llvm.getelementptr %ptr[%thread_x_i64] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, i32
+      llvm.store %const_val, %ptr_next_1 : f32, !llvm.ptr<1>
+      gpu.return
+    }
+  }
+  // Host-side driver: stages %src in a device buffer, launches @store_constant on 16 threads
+  // with a pointer to that buffer, and hands back a host copy of the buffer.
+  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
+    %one = arith.constant 1 : index
+    %num_threads = arith.constant 16 : index
+    %dev_buf = gpu.alloc() : memref<8x16xf32>
+    gpu.memcpy %dev_buf, %src : memref<8x16xf32>, memref<8x16xf32>
+    // Turn the device buffer's aligned base address into an address-space-1 LLVM pointer.
+    %dev_addr_idx = memref.extract_aligned_pointer_as_index %dev_buf : memref<8x16xf32> -> index
+    %dev_addr_i64 = arith.index_cast %dev_addr_idx : index to i64
+    %dev_ptr = llvm.inttoptr %dev_addr_i64 : i64 to !llvm.ptr
+    %dev_ptr_as1 = llvm.addrspacecast %dev_ptr : !llvm.ptr to !llvm.ptr<1>
+    gpu.launch_func @kernel::@store_constant blocks in (%one, %one, %one) threads in (%num_threads, %one, %one) args(%dev_ptr_as1 : !llvm.ptr<1>)
+    %host_result = memref.alloc() : memref<8x16xf32>
+    gpu.memcpy %host_result, %dev_buf : memref<8x16xf32>, memref<8x16xf32>
+    gpu.dealloc %dev_buf : memref<8x16xf32>
+
+    return %host_result : memref<8x16xf32>
+  }
+
+  func.func @main() attributes {llvm.emit_c_interface} {
+    %A = memref.alloc() : memref<8x16xf32>
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c8 = arith.constant 8 : index
+    %c16 = arith.constant 16 : index
+    %c11_f32 = arith.constant 11.11 : f32
+    scf.for %i = %c0 to %c8 step %c1 {
+      scf.for %j = %c0 to %c16 step %c1 {
+        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
+      }
+    }
+    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
+    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
+    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
+    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
+    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
+
+    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [11.11{{.*}}]
+    // CHECK-COUNT-96: 11.11
+    // CHECK-NEXT: [11.11{{.*}}]
+
+    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
+    // CHECK-NEXT: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
42, 42, 42, 42, 42, 42] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + memref.dealloc %B : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/mlir/test/lib/Dialect/GPU/CMakeLists.txt b/mlir/test/lib/Dialect/GPU/CMakeLists.txt index 418c884dc03b3..882d5abc2eeb8 100644 --- a/mlir/test/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/test/lib/Dialect/GPU/CMakeLists.txt @@ -30,6 +30,7 @@ set(LIBS MLIRVectorDialect MLIRVectorToLLVMPass MLIRXeVMDialect + MLIRXeVMToLLVMIRTranslation ) add_mlir_library(MLIRGPUTestPasses diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index 132aabe135940..08dc98938a31a 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -32,6 +32,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@ config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@" config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@" +config.run_xevm_tests = @MLIR_ENABLE_XEVM_CONVERSIONS@ config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@ config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@ config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@ From 717bdf9970544a4b2eb87dc861679ed8f6dba74c Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Fri, 25 Jul 2025 19:51:52 +0000 Subject: [PATCH 02/12] Add XeVM to LLVMIR translation. 
--- mlir/include/mlir/Target/LLVMIR/Dialect/All.h | 3 + .../Dialect/XeVM/XeVMToLLVMIRTranslation.h | 31 +++++ mlir/lib/Target/LLVMIR/CMakeLists.txt | 1 + mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt | 1 + .../Target/LLVMIR/Dialect/XeVM/CMakeLists.txt | 21 ++++ .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp | 108 ++++++++++++++++++ mlir/test/Target/LLVMIR/xevm.mlir | 101 ++++++++++++++++ 7 files changed, 266 insertions(+) create mode 100644 mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt create mode 100644 mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp create mode 100644 mlir/test/Target/LLVMIR/xevm.mlir diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h index 60615cf601655..e4670cb1a9622 100644 --- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h +++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h @@ -28,6 +28,7 @@ #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/VCIX/VCIXToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h" namespace mlir { class DialectRegistry; @@ -47,6 +48,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) { registerROCDLDialectTranslation(registry); registerSPIRVDialectTranslation(registry); registerVCIXDialectTranslation(registry); + registerXeVMDialectTranslation(registry); // Extension required for translating GPU offloading Ops.
gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry); @@ -63,6 +65,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) { registerNVVMDialectTranslation(registry); registerROCDLDialectTranslation(registry); registerSPIRVDialectTranslation(registry); + registerXeVMDialectTranslation(registry); // Extension required for translating GPU offloading Ops. gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry); diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h new file mode 100644 index 0000000000000..b4f6750718fe8 --- /dev/null +++ b/mlir/include/mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h @@ -0,0 +1,31 @@ +//===-- XeVMToLLVMIRTranslation.h - XeVM to LLVM IR -------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides registration calls for XeVM dialect to LLVM IR translation. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H +#define MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H + +namespace mlir { + +class DialectRegistry; +class MLIRContext; + +/// Register the XeVM dialect and the translation from it to the LLVM IR in the +/// given registry. +void registerXeVMDialectTranslation(mlir::DialectRegistry &registry); + +/// Register the XeVM dialect and the translation from it in the registry +/// associated with the given context.
+void registerXeVMDialectTranslation(mlir::MLIRContext &context); + +} // namespace mlir + +#endif // MLIR_TARGET_LLVMIR_DIALECT_XEVM_XEVMTOLLVMIRTRANSLATION_H diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt index af22a7ff04bf0..9ea5c6835e8ef 100644 --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -60,6 +60,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration MLIRROCDLToLLVMIRTranslation MLIRSPIRVToLLVMIRTranslation MLIRVCIXToLLVMIRTranslation + MLIRXeVMToLLVMIRTranslation ) add_mlir_translation_library(MLIRTargetLLVMIRImport diff --git a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt index f030fa78942d5..86c731a1074c3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/Dialect/CMakeLists.txt @@ -10,3 +10,4 @@ add_subdirectory(OpenMP) add_subdirectory(ROCDL) add_subdirectory(SPIRV) add_subdirectory(VCIX) +add_subdirectory(XeVM) diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt new file mode 100644 index 0000000000000..6308d7e2e4404 --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/CMakeLists.txt @@ -0,0 +1,21 @@ +set(LLVM_OPTIONAL_SOURCES + XeVMToLLVMIRTranslation.cpp +) + +add_mlir_translation_library(MLIRXeVMToLLVMIRTranslation + XeVMToLLVMIRTranslation.cpp + + DEPENDS + MLIRXeVMConversionsIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRDialectUtils + MLIRIR + MLIRLLVMDialect + MLIRXeVMDialect + MLIRSupport + MLIRTargetLLVMIRExport +) diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp new file mode 100644 index 0000000000000..67ecb53ca4b3b --- /dev/null +++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp @@ -0,0 +1,108 @@ +//===-- XeVMToLLVMIRTranslation.cpp - Translate XeVM to LLVM IR -*- 
C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a translation between the MLIR XeVM dialect and +// LLVM IR. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h" +#include "mlir/Dialect/LLVMIR/XeVMDialect.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Operation.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" + +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" + +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" + +using namespace mlir; +using namespace mlir::LLVM; + +namespace { +/// Implementation of the dialect interface that converts operations belonging +/// to the XeVM dialect to LLVM IR. +class XeVMDialectLLVMIRTranslationInterface + : public LLVMTranslationDialectInterface { +public: + using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface; + + /// Attaches cache-control decoration metadata to the translated instruction.
+ LogicalResult + amendOperation(Operation *op, ArrayRef instructions, + NamedAttribute attribute, + LLVM::ModuleTranslation &moduleTranslation) const final { + StringRef attrName = attribute.getName().getValue(); + if (attrName == mlir::xevm::XeVMDialect::getCacheControlsAttrName()) { + auto cacheControlsArray = dyn_cast(attribute.getValue()); + if (cacheControlsArray.size() != 2) { + return op->emitOpError( + "Expected both L1 and L3 cache control attributes!"); + } + if (instructions.size() != 1) { + return op->emitOpError("Expecting a single instruction"); + } + return handleDecorationCacheControl(instructions.front(), + cacheControlsArray.getValue()); + } + auto func = dyn_cast(op); + if (!func) + return failure(); + + return success(); + } + +private: + template + static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) { + return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val)); + } + + static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst, + ArrayRef attrs) { + SmallVector decorations; + llvm::LLVMContext &ctx = inst->getContext(); + llvm::Type *i32Ty = llvm::IntegerType::getInt32Ty(ctx); + llvm::transform( + attrs, std::back_inserter(decorations), + [&ctx, i32Ty](Attribute attr) -> llvm::Metadata * { + auto valuesArray = dyn_cast(attr).getValue(); + std::array metadata; + llvm::transform( + valuesArray, metadata.begin(), [i32Ty](Attribute valueAttr) { + return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( + i32Ty, cast(valueAttr).getValue())); + }); + return llvm::MDNode::get(ctx, metadata); + }); + constexpr llvm::StringLiteral decorationCacheControlMDName = + "spirv.DecorationCacheControlINTEL"; + inst->setMetadata(decorationCacheControlMDName, + llvm::MDNode::get(ctx, decorations)); + return success(); + } +}; +} // namespace + +void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry ®istry) { + registry.insert(); + registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) 
{ + dialect->addInterfaces(); + }); +} + +void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) { + DialectRegistry registry; + registerXeVMDialectTranslation(registry); + context.appendDialectRegistry(registry); +} diff --git a/mlir/test/Target/LLVMIR/xevm.mlir b/mlir/test/Target/LLVMIR/xevm.mlir new file mode 100644 index 0000000000000..c71c235233ad5 --- /dev/null +++ b/mlir/test/Target/LLVMIR/xevm.mlir @@ -0,0 +1,101 @@ +// RUN: mlir-translate --split-input-file -mlir-to-llvmir %s | FileCheck %s + +module { + llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} + llvm.func @blockload2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) -> vector<8xi16> { + %0 = llvm.mlir.undef : vector<2xi32> + %1 = llvm.mlir.constant(0 : i32) : i32 + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> + %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> + %5 = llvm.mlir.constant(8 : i32) : i32 + %6 = llvm.alloca %5 x i16 : (i32) -> !llvm.ptr + // CHECK-LABEL: call spir_func void @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt + // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] + llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, %4, %6) + {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage, no_unwind, + sym_name = "_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, will_return, + xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} + : (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) 
-> () + %7 = llvm.load %6 : !llvm.ptr -> vector<8xi16> + llvm.return %7 : vector<8xi16> + } +} + +// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} +// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0} +// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0} + +// ----- +module { + llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) attributes {no_unwind, will_return} + llvm.func @blockstore2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi32>) { + %0 = llvm.mlir.undef : vector<2xi32> + %1 = llvm.mlir.constant(0 : i32) : i32 + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> + %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> + %5 = llvm.mlir.constant(8 : i32) : i32 + %6 = llvm.alloca %5 x i32 : (i32) -> !llvm.ptr + llvm.store %arg6, %6 : vector<8xi32>, !llvm.ptr + // CHECK-LABEL: call spir_func void @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj + // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] + llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, %4, %6) + {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage, no_unwind, + sym_name = "_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, will_return, + xevm.DecorationCacheControl = [[6443 : i32, 0 : i32, 2 : i32, 0 : i32], [6443 : i32, 1 : i32, 2 : i32, 0 : i32]]} + : (!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) -> () + llvm.return + } +} + +// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} +// CHECK: ![[DECO2]] = !{i32 6443, i32 0, i32 2, i32 0} +// CHECK: ![[DECO3]] = !{i32 6443, i32 1, i32 
2, i32 0} + +// ----- +module { + llvm.func spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} + llvm.func @blockprefetch2d(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + %0 = llvm.mlir.undef : vector<2xi32> + %1 = llvm.mlir.constant(0 : i32) : i32 + %2 = llvm.mlir.constant(1 : i32) : i32 + %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> + %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> + // CHECK-LABEL: call spir_func void @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i + // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] + llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(%arg0, %arg1, %arg2, %arg3, %4) + {function_type = !llvm.func, i32, i32, i32, vector<2xi32>)>, linkage = #llvm.linkage, + memory_effects = #llvm.memory_effects, no_unwind, + sym_name = "_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i", visibility_ = 0 : i64, + xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} + : (!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) -> () + llvm.return + } +} + +// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} +// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0} +// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0} + +// ----- +module { + llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) attributes {memory_effects = #llvm.memory_effects, no_unwind} + llvm.func @prefetch(%arg0: !llvm.ptr<1>) { + %0 = llvm.mlir.constant(1 : i64) : i64 + // CHECK-LABEL: call spir_func void @_Z8prefetchPU3AS1Kcm + // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] + llvm.call spir_funccc @_Z8prefetchPU3AS1Kcm(%arg0, %0) + {function_type = !llvm.func, i64)>, linkage = #llvm.linkage, + 
memory_effects = #llvm.memory_effects, + no_unwind, sym_name = "_Z8prefetchPU3AS1Kcm", visibility_ = 0 : i64, + xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} + : (!llvm.ptr<1>, i64) -> () + llvm.return + } +} + +// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} +// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0} +// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0} + From 6fef03fd48437168700b37ccd4607eeec6048024 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Mon, 28 Jul 2025 17:08:42 +0000 Subject: [PATCH 03/12] Address reviewer comments. --- .../Dialect/XeVM/XeVMToLLVMIRTranslation.cpp | 9 +- mlir/test/Target/LLVMIR/xevm.mlir | 82 +------------------ 2 files changed, 3 insertions(+), 88 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp index 67ecb53ca4b3b..73b166d045d5b 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.cpp @@ -63,11 +63,6 @@ class XeVMDialectLLVMIRTranslationInterface } private: - template - static llvm::Metadata *getConstantIntMD(llvm::Type *type, IntTy val) { - return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(type, val)); - } - static LogicalResult handleDecorationCacheControl(llvm::Instruction *inst, ArrayRef attrs) { SmallVector decorations; @@ -94,14 +89,14 @@ class XeVMDialectLLVMIRTranslationInterface }; } // namespace -void ::mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry ®istry) { +void mlir::registerXeVMDialectTranslation(::mlir::DialectRegistry ®istry) { registry.insert(); registry.addExtension(+[](MLIRContext *ctx, xevm::XeVMDialect *dialect) { dialect->addInterfaces(); }); } -void ::mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) { +void mlir::registerXeVMDialectTranslation(::mlir::MLIRContext &context) { 
DialectRegistry registry; registerXeVMDialectTranslation(registry); context.appendDialectRegistry(registry); diff --git a/mlir/test/Target/LLVMIR/xevm.mlir b/mlir/test/Target/LLVMIR/xevm.mlir index c71c235233ad5..a3dd0b6c17914 100644 --- a/mlir/test/Target/LLVMIR/xevm.mlir +++ b/mlir/test/Target/LLVMIR/xevm.mlir @@ -1,93 +1,13 @@ // RUN: mlir-translate --split-input-file -mlir-to-llvmir %s | FileCheck %s module { - llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return} - llvm.func @blockload2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) -> vector<8xi16> { - %0 = llvm.mlir.undef : vector<2xi32> - %1 = llvm.mlir.constant(0 : i32) : i32 - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> - %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> - %5 = llvm.mlir.constant(8 : i32) : i32 - %6 = llvm.alloca %5 x i16 : (i32) -> !llvm.ptr - // CHECK-LABEL: call spir_func void @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt - // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] - llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, %4, %6) - {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage, no_unwind, - sym_name = "_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt", visibility_ = 0 : i64, will_return, - xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} - : (!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) -> () - %7 = llvm.load %6 : !llvm.ptr -> vector<8xi16> - llvm.return %7 : vector<8xi16> - } -} - -// CHECK: ![[DECO1]] = 
!{![[DECO2:.*]], ![[DECO3:.*]]} -// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0} -// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0} - -// ----- -module { - llvm.func spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) attributes {no_unwind, will_return} - llvm.func @blockstore2d_cache_control(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: vector<8xi32>) { - %0 = llvm.mlir.undef : vector<2xi32> - %1 = llvm.mlir.constant(0 : i32) : i32 - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> - %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> - %5 = llvm.mlir.constant(8 : i32) : i32 - %6 = llvm.alloca %5 x i32 : (i32) -> !llvm.ptr - llvm.store %arg6, %6 : vector<8xi32>, !llvm.ptr - // CHECK-LABEL: call spir_func void @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj - // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] - llvm.call spir_funccc @_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, %4, %6) - {function_type = !llvm.func, i32, i32, i32, vector<2xi32>, ptr)>, linkage = #llvm.linkage, no_unwind, - sym_name = "_Z42intel_sub_group_2d_block_write_32b_8r16x1cPU3AS1viiiDv2_iPj", visibility_ = 0 : i64, will_return, - xevm.DecorationCacheControl = [[6443 : i32, 0 : i32, 2 : i32, 0 : i32], [6443 : i32, 1 : i32, 2 : i32, 0 : i32]]} - : (!llvm.ptr<1> {llvm.nonnull, llvm.writeonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.readonly}) -> () - llvm.return - } -} - -// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} -// CHECK: ![[DECO2]] = !{i32 6443, i32 0, i32 2, i32 0} -// CHECK: ![[DECO3]] = !{i32 6443, i32 1, i32 2, i32 0} - -// ----- -module { - llvm.func spir_funccc 
@_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) attributes {memory_effects = #llvm.memory_effects, no_unwind} - llvm.func @blockprefetch2d(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - %0 = llvm.mlir.undef : vector<2xi32> - %1 = llvm.mlir.constant(0 : i32) : i32 - %2 = llvm.mlir.constant(1 : i32) : i32 - %3 = llvm.insertelement %arg4, %0[%1 : i32] : vector<2xi32> - %4 = llvm.insertelement %arg5, %3[%2 : i32] : vector<2xi32> - // CHECK-LABEL: call spir_func void @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i - // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] - llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(%arg0, %arg1, %arg2, %arg3, %4) - {function_type = !llvm.func, i32, i32, i32, vector<2xi32>)>, linkage = #llvm.linkage, - memory_effects = #llvm.memory_effects, no_unwind, - sym_name = "_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i", visibility_ = 0 : i64, - xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} - : (!llvm.ptr<1> {llvm.nonnull}, i32, i32, i32, vector<2xi32>) -> () - llvm.return - } -} - -// CHECK: ![[DECO1]] = !{![[DECO2:.*]], ![[DECO3:.*]]} -// CHECK: ![[DECO2]] = !{i32 6442, i32 0, i32 1, i32 0} -// CHECK: ![[DECO3]] = !{i32 6442, i32 1, i32 1, i32 0} - -// ----- -module { - llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) attributes {memory_effects = #llvm.memory_effects, no_unwind} + llvm.func spir_funccc @_Z8prefetchPU3AS1Kcm(!llvm.ptr<1>, i64) llvm.func @prefetch(%arg0: !llvm.ptr<1>) { %0 = llvm.mlir.constant(1 : i64) : i64 // CHECK-LABEL: call spir_func void @_Z8prefetchPU3AS1Kcm // CHECK-SAME: !spirv.DecorationCacheControlINTEL ![[DECO1:.*]] llvm.call spir_funccc @_Z8prefetchPU3AS1Kcm(%arg0, %0) {function_type = !llvm.func, i64)>, linkage = #llvm.linkage, - 
memory_effects = #llvm.memory_effects, no_unwind, sym_name = "_Z8prefetchPU3AS1Kcm", visibility_ = 0 : i64, xevm.DecorationCacheControl = [[6442 : i32, 0 : i32, 1 : i32, 0 : i32], [6442 : i32, 1 : i32, 1 : i32, 0 : i32]]} : (!llvm.ptr<1>, i64) -> () From 6e160dfde82aca85aa12aa22ed4abe551f40b204 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 31 Jul 2025 00:48:40 +0000 Subject: [PATCH 04/12] Temp save. --- mlir/include/mlir/Target/LLVM/XeVM/Utils.h | 15 +- mlir/lib/Target/LLVM/XeVM/Target.cpp | 295 ++++++++++++++++----- 2 files changed, 237 insertions(+), 73 deletions(-) diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h index c11a97f0d960a..95764fa99e0fc 100644 --- a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h +++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h @@ -15,6 +15,7 @@ #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h" #include "mlir/Dialect/LLVMIR/XeVMDialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/Target/LLVM/ModuleToObject.h" namespace mlir { @@ -22,16 +23,24 @@ namespace xevm { /// Base class for all XeVM serializations from GPU modules into binary strings. /// By default this class serializes into LLVM bitcode. -class SerializeGPUModuleBase : public mlir::LLVM::ModuleToObject { +class SerializeGPUModuleBase : public LLVM::ModuleToObject { public: - SerializeGPUModuleBase(mlir::Operation &module, XeVMTargetAttr target, - const mlir::gpu::TargetOptions &targetOptions = {}); + SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target, + const gpu::TargetOptions &targetOptions = {}); static void init(); XeVMTargetAttr getTarget() const; + /// Loads the bitcode files in `librariesToLink`. + std::optional>> + loadBitcodeFiles(llvm::Module &module) override; + protected: XeVMTargetAttr target; + /// List of LLVM bitcode to link into after translation to LLVM IR. 
+ /// The attributes can be StringAttr pointing to a file path, or + /// a Resource blob pointing to the LLVM bitcode in-memory. + SmallVector librariesToLink; }; } // namespace xevm } // namespace mlir diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 380e2bff222ca..85d8405d20888 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -1,12 +1,12 @@ -//===-- Target.cpp - MLIR LLVM XeVM target compilation ----------*- C++ -*-===// +//===- Target.cpp - MLIR LLVM XeVM target compilation -----------*- C++ -*-===// // -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // -// This file defines XeVM target related functions including registration +// This file defines XeVM target related functions including registration // calls for the `#xevm.target` compilation attribute.
// //===----------------------------------------------------------------------===// @@ -15,41 +15,36 @@ #include "mlir/Dialect/GPU/IR/CompilationInterfaces.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/XeVMDialect.h" -#include "mlir/IR/ExtensibleDialect.h" +#include "mlir/IR/BuiltinAttributeInterfaces.h" +#include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectResourceBlobManager.h" #include "mlir/Target/LLVM/XeVM/Utils.h" #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Config/Targets.h" -#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/Program.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" - -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" - -// FIXME: One of the headers uses `.inc` file from the build directory, this -// does not work for installation (i.e., DCMAKE_INSTALL_PREFIX) caching as build -// directory will not be cached. Since float atomics are not yet supported by -// the backend anyway, we can afford to temporarily comment this section. 
+#include "llvm/Support/raw_ostream.h" +#include "llvm/Bitcode/BitcodeWriter.h" -// #if LLVM_HAS_SPIRV_TARGET -// #pragma GCC diagnostic push -// #pragma GCC diagnostic ignored "-Wnon-virtual-dtor" -// #include "SPIRVTargetMachine.h" -// #pragma GCC diagnostic pop - -// #include "SPIRVCommandLine.h" -// #endif // LLVM_HAS_SPIRV_TARGET - -#include +#include +#include using namespace mlir; +using namespace mlir::xevm; namespace { // XeVM implementation of the gpu:TargetAttrInterface. @@ -68,10 +63,9 @@ class XeVMTargetAttrImpl void mlir::xevm::registerXeVMTargetInterfaceExternalModels( DialectRegistry ®istry) { - registry.addExtension( - +[](MLIRContext *ctx, mlir::xevm::XeVMDialect *dialect) { - mlir::xevm::XeVMTargetAttr::attachInterface(*ctx); - }); + registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) { + XeVMTargetAttr::attachInterface(*ctx); + }); } void mlir::xevm::registerXeVMTargetInterfaceExternalModels( @@ -81,13 +75,17 @@ void mlir::xevm::registerXeVMTargetInterfaceExternalModels( context.appendDialectRegistry(registry); } -mlir::xevm::SerializeGPUModuleBase::SerializeGPUModuleBase( - Operation &module, mlir::xevm::XeVMTargetAttr target, +SerializeGPUModuleBase::SerializeGPUModuleBase( + Operation &module, XeVMTargetAttr target, const gpu::TargetOptions &targetOptions) : ModuleToObject(module, target.getTriple(), "", {}, target.getO()), - target(target) {} + target(target), librariesToLink(targetOptions.getLibrariesToLink()) { + if (target.getLinkFiles()) + librariesToLink.append(target.getLinkFiles().begin(), + target.getLinkFiles().end()); +} -void mlir::xevm::SerializeGPUModuleBase::init() { +void SerializeGPUModuleBase::init() { static llvm::once_flag initializeBackendOnce; llvm::call_once(initializeBackendOnce, []() { #if LLVM_HAS_SPIRV_TARGET @@ -99,24 +97,40 @@ void mlir::xevm::SerializeGPUModuleBase::init() { }); } -mlir::xevm::XeVMTargetAttr -mlir::xevm::SerializeGPUModuleBase::getTarget() const { - return target; +XeVMTargetAttr 
SerializeGPUModuleBase::getTarget() const { return target; } + +std::optional>> +SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) { + if (librariesToLink.empty()) + return SmallVector>(); + SmallVector> bcFiles; + if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink, + bcFiles))) + return std::nullopt; + return std::move(bcFiles); } namespace { -class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase { +class SpirSerializer : public SerializeGPUModuleBase { public: - SpirSerializer(Operation &module, mlir::xevm::XeVMTargetAttr target, + SpirSerializer(Operation &module, XeVMTargetAttr target, const gpu::TargetOptions &targetOptions) - : mlir::xevm::SerializeGPUModuleBase(module, target, targetOptions) {} + : SerializeGPUModuleBase(module, target, targetOptions), + targetOptions(targetOptions) {} gpu::GPUModuleOp getOperation(); + /// Serializes the LLVM module to an object format, depending on the + /// compilation target selected in target options. std::optional> moduleToObject(llvm::Module &llvmModule) override; + /// Compiles to native code using `ocloc`. 
+ std::optional> compileToBinary(const std::string &asmStr, + StringRef inputFormat); + private: + std::optional findTool(StringRef tool); std::optional translateToSPIRVBinary(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); @@ -125,19 +139,27 @@ class SpirSerializer : public mlir::xevm::SerializeGPUModuleBase { } // namespace gpu::GPUModuleOp SpirSerializer::getOperation() { - return dyn_cast( - &mlir::xevm::SerializeGPUModuleBase::getOperation()); + return dyn_cast(&SerializeGPUModuleBase::getOperation()); } std::optional> SpirSerializer::moduleToObject(llvm::Module &llvmModule) { +#define DEBUG_TYPE "serialize-to-llvm" + LLVM_DEBUG({ + llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr() + << "\n"; + llvm::dbgs() << llvmModule << "\n"; + llvm::dbgs().flush(); + }); +#undef DEBUG_TYPE + // Return LLVM IR if the compilation target is `offload`. if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload) - return mlir::xevm::SerializeGPUModuleBase::moduleToObject(llvmModule); + return SerializeGPUModuleBase::moduleToObject(llvmModule); #if !LLVM_HAS_SPIRV_TARGET - getOperation()->emitError( - "The `SPIRV` target was not built. Please enable it when building LLVM."); + getOperation()->emitError("The `SPIRV` target was not built. Please enable " + "it when building LLVM."); return std::nullopt; #endif // LLVM_HAS_SPIRV_TARGET @@ -145,36 +167,29 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { getOrCreateTargetMachine(); if (!targetMachine) { getOperation().emitError() << "Target Machine unavailable for triple " - << triple << ", can't compile with LLVM\n"; + << triple << ", can't optimize with LLVM\n"; return std::nullopt; } - //===----------------------------------------------------------------------===// - // Workaround to enable spirv extensions that are not added to target machine - // by default. - - // FIXME: see fixme comment above SPIRV headers. 
- // #if LLVM_HAS_SPIRV_TARGET - // std::set AllowedExtIds{ - // llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float_add, - // llvm::SPIRV::Extension::Extension::SPV_EXT_shader_atomic_float16_add}; - // llvm::SPIRVTargetMachine *STM = - // static_cast(targetMachine.value()); - // const_cast(STM->getSubtargetImpl()) - // ->initAvailableExtensions(AllowedExtIds); - // #endif // LLVM_HAS_SPIRV_TARGET + std::optional serializedISA = + translateToISA(llvmModule, **targetMachine); + if (!serializedISA) { + getOperation().emitError() << "Failed translating the module to ISA." + << triple << ", can't compile with LLVM\n"; + return std::nullopt; + } - //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "serialize-to-isa" + LLVM_DEBUG({ + llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() << "\n"; + llvm::dbgs() << *serializedISA << "\n"; + llvm::dbgs().flush(); + }); +#undef DEBUG_TYPE // Return SPIRV if the compilation target is `assembly`. if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly) { - std::optional serializedISA = - translateToISA(llvmModule, **targetMachine); - if (!serializedISA) { - getOperation().emitError() << "Failed translating the module to ISA."; - return std::nullopt; - } // Make sure to include the null terminator. StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); return SmallVector(bin.begin(), bin.end()); @@ -194,6 +209,137 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { return SmallVector(bin.begin(), bin.end()); } +std::optional SpirSerializer::findTool(StringRef tool) { + // 1. Check the toolkit path given in the command line. 
+ StringRef pathRef = targetOptions.getToolkitPath(); + SmallVector path; + if (!pathRef.empty()) { + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + llvm::sys::path::append(path, "bin", tool); + if (llvm::sys::fs::can_execute(path)) + return StringRef(path.data(), path.size()).str(); + } + // 2. Check PATH. + if (std::optional toolPath = + llvm::sys::Process::FindInEnvPath("PATH", tool)) + return *toolPath; + + getOperation().emitError() + << "Couldn't find the `" << tool + << "` binary. Please specify the toolkit " + "path via GpuModuleToBinaryPass or add the compiler to $PATH`."; + return std::nullopt; +} + +// There is 1 way to finalize SPIR-V to native code: IGC +// There are 2 ways to access IGC: AOT (ocloc) and JIT (L0 runtime). +// - L0 runtime consumes SPIR-V and is external to MLIR codebase (rt wrappers). +// - `ocloc` tool can be "queried" from within MLIR. +std::optional> +SpirSerializer::compileToBinary(const std::string &asmStr, + StringRef inputFormat) { + using TmpFile = std::pair, llvm::FileRemover>; + // Find the `ocloc` tool. 
+ std::optional oclocCompiler = findTool("ocloc"); + if (!oclocCompiler) + return std::nullopt; + Location loc = getOperation().getLoc(); + std::string basename = + llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(), + getTarget().getTriple(), getTarget().getChip()); + + auto createTemp = [&](StringRef name, + StringRef suffix) -> std::optional { + llvm::SmallString<128> filePath; + if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) { + getOperation().emitError() + << "Couldn't create the temp file: `" << filePath + << "`, error message: " << ec.message(); + return std::nullopt; + } + return TmpFile(filePath, llvm::FileRemover(filePath.c_str())); + }; + // Create temp file + std::optional asmFile = createTemp(basename, "asm"); + std::optional binFile = createTemp(basename, ""); + std::optional logFile = createTemp(basename, "log"); + if (!logFile || !asmFile || !binFile) + return std::nullopt; + // Dump the assembly to a temp file + std::error_code ec; + { + llvm::raw_fd_ostream asmStream(asmFile->first, ec); + if (ec) { + emitError(loc) << "Couldn't open the file: `" << asmFile->first + << "`, error message: " << ec.message(); + return std::nullopt; + } + asmStream << asmStr; + if (asmStream.has_error()) { + emitError(loc) << "An error occurred while writing the assembly to: `" + << asmFile->first << "`."; + return std::nullopt; + } + asmStream.flush(); + } + // Set cmd options + std::pair> cmdOpts = + targetOptions.tokenizeCmdOptions(); + // Example: --gpu-module-to-binary="opts='opt1 opt2'" + const std::string cmdOptsStr = "\"" + llvm::join(cmdOpts.second, " ") + "\""; + SmallVector oclocArgs( + {"ocloc", "compile", "-file", asmFile->first, inputFormat, "-device", + getTarget().getChip(), "-output", binFile->first, "-output_no_suffix", + "-options", cmdOptsStr}); + +// Dump tool invocation commands. 
+#define DEBUG_TYPE "serialize-to-binary" + LLVM_DEBUG({ + llvm::dbgs() << "Tool invocation for module: " + << getOperation().getNameAttr() << "\n"; + llvm::interleave(oclocArgs, llvm::dbgs(), " "); + llvm::dbgs() << "\n"; + }); +#undef DEBUG_TYPE + // Helper function for printing tool error logs. + std::string message; + auto emitLogError = + [&](StringRef toolName) -> std::optional> { + if (message.empty()) { + llvm::ErrorOr> toolStderr = + llvm::MemoryBuffer::getFile(logFile->first); + if (toolStderr) + emitError(loc) << toolName << " invocation failed. Log:\n" + << toolStderr->get()->getBuffer(); + else + emitError(loc) << toolName << " invocation failed."; + return std::nullopt; + } + emitError(loc) << toolName + << " invocation failed, error message: " << message; + return std::nullopt; + }; + std::optional redirects[] = { + std::nullopt, + logFile->first, + logFile->first, + }; + // Invoke ocloc. + if (llvm::sys::ExecuteAndWait(oclocCompiler.value(), oclocArgs, std::nullopt, + redirects, 0, 0, &message)) + return emitLogError("`ocloc`"); + binFile->first.append(".bin"); + llvm::ErrorOr> binaryBuffer = + llvm::MemoryBuffer::getFile(binFile->first); + if (!binaryBuffer) { + emitError(loc) << "Couldn't open the file: `" << binFile->first + << "`, error message: " << binaryBuffer.getError().message(); + return std::nullopt; + } + StringRef bin = (*binaryBuffer)->getBuffer(); + return SmallVector(bin.begin(), bin.end()); +} + std::optional SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine) { @@ -232,7 +378,7 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, }); SpirSerializer serializer( - *module, cast(attribute), options); + *module, cast(attribute), options); serializer.init(); #if !LLVM_HAS_SPIRV_TARGET @@ -247,9 +393,18 @@ Attribute XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module, const SmallVector &object, const gpu::TargetOptions &options) const { + 
Builder builder(attribute.getContext()); gpu::CompilationTarget format = options.getCompilationTarget(); + auto target = cast(attribute); + SmallVector properties; + if (format == gpu::CompilationTarget::Assembly) + properties.push_back( + builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))); + DictionaryAttr objectProps; - Builder builder(attribute.getContext()); + if (!properties.empty()) + objectProps = builder.getDictionaryAttr(properties); + return builder.getAttr( attribute, format, builder.getStringAttr(StringRef(object.data(), object.size())), From b1f6bc3ad57ceb9cc20662eb691bbdbbc7427740 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 31 Jul 2025 18:04:13 +0000 Subject: [PATCH 05/12] Temp save. --- mlir/lib/Target/LLVM/XeVM/Target.cpp | 29 +++++----------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 85d8405d20888..8f95fdc3a8beb 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -132,8 +132,6 @@ class SpirSerializer : public SerializeGPUModuleBase { private: std::optional findTool(StringRef tool); std::optional - translateToSPIRVBinary(llvm::Module &llvmModule, - llvm::TargetMachine &targetMachine); gpu::TargetOptions targetOptions; }; } // namespace @@ -188,13 +186,14 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { #undef DEBUG_TYPE // Return SPIRV if the compilation target is `assembly`. - if (targetOptions.getCompilationTarget() == - gpu::CompilationTarget::Assembly) { +// if (targetOptions.getCompilationTarget() == +// gpu::CompilationTarget::Assembly) { // Make sure to include the null terminator. 
StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); return SmallVector(bin.begin(), bin.end()); - } +// } +/* std::optional serializedSPIRVBinary = translateToSPIRVBinary(llvmModule, **targetMachine); if (!serializedSPIRVBinary) { @@ -207,6 +206,7 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { } StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size()); return SmallVector(bin.begin(), bin.end()); +*/ } std::optional SpirSerializer::findTool(StringRef tool) { @@ -340,25 +340,6 @@ SpirSerializer::compileToBinary(const std::string &asmStr, return SmallVector(bin.begin(), bin.end()); } -std::optional -SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule, - llvm::TargetMachine &targetMachine) { - std::string targetISA; - llvm::raw_string_ostream stream(targetISA); - - { // Drop pstream after this to prevent the ISA from being stuck buffering - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegenPasses; - - if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, - llvm::CodeGenFileType::ObjectFile)) - return std::nullopt; - - codegenPasses.run(llvmModule); - } - return targetISA; -} - std::optional> XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, const gpu::TargetOptions &options) const { From 415166e221e7684dd63a147106343ce44a1d1acd Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 31 Jul 2025 22:15:29 +0000 Subject: [PATCH 06/12] Temp save. 
--- mlir/lib/Target/LLVM/XeVM/Target.cpp | 29 +++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 8f95fdc3a8beb..4d67e67fa5a56 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -132,6 +132,8 @@ class SpirSerializer : public SerializeGPUModuleBase { private: std::optional findTool(StringRef tool); std::optional + translateToSPIRVBinary(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine); gpu::TargetOptions targetOptions; }; } // namespace @@ -186,14 +188,14 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { #undef DEBUG_TYPE // Return SPIRV if the compilation target is `assembly`. -// if (targetOptions.getCompilationTarget() == -// gpu::CompilationTarget::Assembly) { + if (targetOptions.getCompilationTarget() == + gpu::CompilationTarget::Assembly) { // Make sure to include the null terminator. StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); return SmallVector(bin.begin(), bin.end()); -// } + } + -/* std::optional serializedSPIRVBinary = translateToSPIRVBinary(llvmModule, **targetMachine); if (!serializedSPIRVBinary) { @@ -206,7 +208,24 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { } StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size()); return SmallVector(bin.begin(), bin.end()); -*/ +} + +std::optional +SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::string targetISA; + llvm::raw_string_ostream stream(targetISA); + + { // Drop pstream after this to prevent the ISA from being stuck buffering + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegenPasses; + if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, + llvm::CodeGenFileType::ObjectFile)) + return std::nullopt; + + codegenPasses.run(llvmModule); + } + return 
targetISA; } std::optional SpirSerializer::findTool(StringRef tool) { From dcefcc6a14d95d772309936e2e666800744c78f7 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 31 Jul 2025 22:51:32 +0000 Subject: [PATCH 07/12] Temp save. --- mlir/lib/Target/LLVM/XeVM/Target.cpp | 41 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 4d67e67fa5a56..8bc934190d0c3 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -28,6 +28,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Config/Targets.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" @@ -38,7 +39,6 @@ #include "llvm/Support/Program.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Bitcode/BitcodeWriter.h" #include #include @@ -171,31 +171,31 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { return std::nullopt; } - std::optional serializedISA = - translateToISA(llvmModule, **targetMachine); - if (!serializedISA) { - getOperation().emitError() << "Failed translating the module to ISA." - << triple << ", can't compile with LLVM\n"; - return std::nullopt; - } + // Return SPIRV if the compilation target is `assembly`. + if (targetOptions.getCompilationTarget() == + gpu::CompilationTarget::Assembly) { + std::optional serializedISA = + translateToISA(llvmModule, **targetMachine); + if (!serializedISA) { + getOperation().emitError() << "Failed translating the module to ISA." 
+ << triple << ", can't compile with LLVM\n"; + return std::nullopt; + } #define DEBUG_TYPE "serialize-to-isa" - LLVM_DEBUG({ - llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() << "\n"; - llvm::dbgs() << *serializedISA << "\n"; - llvm::dbgs().flush(); - }); + LLVM_DEBUG({ + llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() + << "\n"; + llvm::dbgs() << *serializedISA << "\n"; + llvm::dbgs().flush(); + }); #undef DEBUG_TYPE - // Return SPIRV if the compilation target is `assembly`. - if (targetOptions.getCompilationTarget() == - gpu::CompilationTarget::Assembly) { // Make sure to include the null terminator. StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); return SmallVector(bin.begin(), bin.end()); } - std::optional serializedSPIRVBinary = translateToSPIRVBinary(llvmModule, **targetMachine); if (!serializedSPIRVBinary) { @@ -353,8 +353,8 @@ SpirSerializer::compileToBinary(const std::string &asmStr, if (!binaryBuffer) { emitError(loc) << "Couldn't open the file: `" << binFile->first << "`, error message: " << binaryBuffer.getError().message(); - return std::nullopt; - } + return std::nullopt; + } StringRef bin = (*binaryBuffer)->getBuffer(); return SmallVector(bin.begin(), bin.end()); } @@ -377,8 +377,7 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, return WalkResult::advance(); }); - SpirSerializer serializer( - *module, cast(attribute), options); + SpirSerializer serializer(*module, cast(attribute), options); serializer.init(); #if !LLVM_HAS_SPIRV_TARGET From 6d2e1ee3412bae9c2a2b5a28b910a9bb9b90b153 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 7 Aug 2025 17:02:29 +0000 Subject: [PATCH 08/12] Refactor to prepare for future target triples. 
--- mlir/include/mlir/Target/LLVM/XeVM/Utils.h | 13 +- mlir/lib/Target/LLVM/XeVM/Target.cpp | 341 ++++++++++----------- 2 files changed, 182 insertions(+), 172 deletions(-) diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h index 95764fa99e0fc..ae0836ea89f57 100644 --- a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h +++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h @@ -28,19 +28,30 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject { SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target, const gpu::TargetOptions &targetOptions = {}); - static void init(); XeVMTargetAttr getTarget() const; /// Loads the bitcode files in `librariesToLink`. std::optional>> loadBitcodeFiles(llvm::Module &module) override; + /// Returns the gpu module being serialized. + gpu::GPUModuleOp getGPUModuleOp(); + + /// Compiles to native code using `ocloc`. + std::optional> compileToBinary(const std::string &asmStr, + StringRef inputFormat); + protected: XeVMTargetAttr target; /// List of LLVM bitcode to link into after translation to LLVM IR. /// The attributes can be StringAttr pointing to a file path, or /// a Resource blob pointing to the LLVM bitcode in-memory. SmallVector librariesToLink; + + /// Returns the path to the tool used for serialization. 
+ std::optional findTool(StringRef tool); + + gpu::TargetOptions targetOptions; }; } // namespace xevm } // namespace mlir diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 8bc934190d0c3..60e69159023e9 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -85,18 +85,6 @@ SerializeGPUModuleBase::SerializeGPUModuleBase( target.getLinkFiles().end()); } -void SerializeGPUModuleBase::init() { - static llvm::once_flag initializeBackendOnce; - llvm::call_once(initializeBackendOnce, []() { -#if LLVM_HAS_SPIRV_TARGET - LLVMInitializeSPIRVTarget(); - LLVMInitializeSPIRVTargetInfo(); - LLVMInitializeSPIRVTargetMC(); - LLVMInitializeSPIRVAsmPrinter(); -#endif - }); -} - XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; } std::optional>> @@ -110,168 +98,32 @@ SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) { return std::move(bcFiles); } -namespace { -class SpirSerializer : public SerializeGPUModuleBase { -public: - SpirSerializer(Operation &module, XeVMTargetAttr target, - const gpu::TargetOptions &targetOptions) - : SerializeGPUModuleBase(module, target, targetOptions), - targetOptions(targetOptions) {} - - gpu::GPUModuleOp getOperation(); - - /// Serializes the LLVM module to an object format, depending on the - /// compilation target selected in target options. - std::optional> - moduleToObject(llvm::Module &llvmModule) override; - - /// Compiles to native code using `ocloc`. 
- std::optional> compileToBinary(const std::string &asmStr, - StringRef inputFormat); - -private: - std::optional findTool(StringRef tool); - std::optional - translateToSPIRVBinary(llvm::Module &llvmModule, - llvm::TargetMachine &targetMachine); - gpu::TargetOptions targetOptions; -}; -} // namespace - -gpu::GPUModuleOp SpirSerializer::getOperation() { +gpu::GPUModuleOp SerializeGPUModuleBase::getGPUModuleOp() { return dyn_cast(&SerializeGPUModuleBase::getOperation()); } -std::optional> -SpirSerializer::moduleToObject(llvm::Module &llvmModule) { -#define DEBUG_TYPE "serialize-to-llvm" - LLVM_DEBUG({ - llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr() - << "\n"; - llvm::dbgs() << llvmModule << "\n"; - llvm::dbgs().flush(); - }); -#undef DEBUG_TYPE - - // Return LLVM IR if the compilation target is `offload`. - if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload) - return SerializeGPUModuleBase::moduleToObject(llvmModule); - -#if !LLVM_HAS_SPIRV_TARGET - getOperation()->emitError("The `SPIRV` target was not built. Please enable " - "it when building LLVM."); - return std::nullopt; -#endif // LLVM_HAS_SPIRV_TARGET - - std::optional targetMachine = - getOrCreateTargetMachine(); - if (!targetMachine) { - getOperation().emitError() << "Target Machine unavailable for triple " - << triple << ", can't optimize with LLVM\n"; - return std::nullopt; - } - - // Return SPIRV if the compilation target is `assembly`. - if (targetOptions.getCompilationTarget() == - gpu::CompilationTarget::Assembly) { - std::optional serializedISA = - translateToISA(llvmModule, **targetMachine); - if (!serializedISA) { - getOperation().emitError() << "Failed translating the module to ISA." 
- << triple << ", can't compile with LLVM\n"; - return std::nullopt; - } - -#define DEBUG_TYPE "serialize-to-isa" - LLVM_DEBUG({ - llvm::dbgs() << "SPIR-V for module: " << getOperation().getNameAttr() - << "\n"; - llvm::dbgs() << *serializedISA << "\n"; - llvm::dbgs().flush(); - }); -#undef DEBUG_TYPE - - // Make sure to include the null terminator. - StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); - return SmallVector(bin.begin(), bin.end()); - } - - std::optional serializedSPIRVBinary = - translateToSPIRVBinary(llvmModule, **targetMachine); - if (!serializedSPIRVBinary) { - getOperation().emitError() << "Failed translating the module to Binary."; - return std::nullopt; - } - if (serializedSPIRVBinary->size() % 4) { - getOperation().emitError() << "SPIRV code size must be a multiple of 4."; - return std::nullopt; - } - StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size()); - return SmallVector(bin.begin(), bin.end()); -} - -std::optional -SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule, - llvm::TargetMachine &targetMachine) { - std::string targetISA; - llvm::raw_string_ostream stream(targetISA); - - { // Drop pstream after this to prevent the ISA from being stuck buffering - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegenPasses; - if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, - llvm::CodeGenFileType::ObjectFile)) - return std::nullopt; - - codegenPasses.run(llvmModule); - } - return targetISA; -} - -std::optional SpirSerializer::findTool(StringRef tool) { - // 1. Check the toolkit path given in the command line. - StringRef pathRef = targetOptions.getToolkitPath(); - SmallVector path; - if (!pathRef.empty()) { - path.insert(path.begin(), pathRef.begin(), pathRef.end()); - llvm::sys::path::append(path, "bin", tool); - if (llvm::sys::fs::can_execute(path)) - return StringRef(path.data(), path.size()).str(); - } - // 2. Check PATH. 
- if (std::optional toolPath = - llvm::sys::Process::FindInEnvPath("PATH", tool)) - return *toolPath; - - getOperation().emitError() - << "Couldn't find the `" << tool - << "` binary. Please specify the toolkit " - "path via GpuModuleToBinaryPass or add the compiler to $PATH`."; - return std::nullopt; -} - -// There is 1 way to finalize SPIR-V to native code: IGC +// There is 1 way to finalize IL to native code: IGC // There are 2 ways to access IGC: AOT (ocloc) and JIT (L0 runtime). -// - L0 runtime consumes SPIR-V and is external to MLIR codebase (rt wrappers). +// - L0 runtime consumes IL and is external to MLIR codebase (rt wrappers). // - `ocloc` tool can be "queried" from within MLIR. std::optional> -SpirSerializer::compileToBinary(const std::string &asmStr, - StringRef inputFormat) { +SerializeGPUModuleBase::compileToBinary(const std::string &asmStr, + StringRef inputFormat) { using TmpFile = std::pair, llvm::FileRemover>; // Find the `ocloc` tool. std::optional oclocCompiler = findTool("ocloc"); if (!oclocCompiler) return std::nullopt; - Location loc = getOperation().getLoc(); - std::string basename = - llvm::formatv("mlir-{0}-{1}-{2}", getOperation().getNameAttr().getValue(), - getTarget().getTriple(), getTarget().getChip()); + Location loc = getGPUModuleOp().getLoc(); + std::string basename = llvm::formatv( + "mlir-{0}-{1}-{2}", getGPUModuleOp().getNameAttr().getValue(), + getTarget().getTriple(), getTarget().getChip()); auto createTemp = [&](StringRef name, StringRef suffix) -> std::optional { llvm::SmallString<128> filePath; if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) { - getOperation().emitError() + getGPUModuleOp().emitError() << "Couldn't create the temp file: `" << filePath << "`, error message: " << ec.message(); return std::nullopt; @@ -315,7 +167,7 @@ SpirSerializer::compileToBinary(const std::string &asmStr, #define DEBUG_TYPE "serialize-to-binary" LLVM_DEBUG({ llvm::dbgs() << "Tool invocation for module: " - << 
getOperation().getNameAttr() << "\n"; + << getGPUModuleOp().getNameAttr() << "\n"; llvm::interleave(oclocArgs, llvm::dbgs(), " "); llvm::dbgs() << "\n"; }); @@ -359,6 +211,147 @@ SpirSerializer::compileToBinary(const std::string &asmStr, return SmallVector(bin.begin(), bin.end()); } +std::optional SerializeGPUModuleBase::findTool(StringRef tool) { + // 1. Check the toolkit path given in the command line. + StringRef pathRef = targetOptions.getToolkitPath(); + SmallVector path; + if (!pathRef.empty()) { + path.insert(path.begin(), pathRef.begin(), pathRef.end()); + llvm::sys::path::append(path, "bin", tool); + if (llvm::sys::fs::can_execute(path)) + return StringRef(path.data(), path.size()).str(); + } + // 2. Check PATH. + if (std::optional toolPath = + llvm::sys::Process::FindInEnvPath("PATH", tool)) + return *toolPath; + + getGPUModuleOp().emitError() + << "Couldn't find the `" << tool + << "` binary. Please specify the toolkit " + "path via GpuModuleToBinaryPass or add the compiler to $PATH`."; + return std::nullopt; +} + +namespace { +class SpirSerializer : public SerializeGPUModuleBase { +public: + SpirSerializer(Operation &module, XeVMTargetAttr target, + const gpu::TargetOptions &targetOptions) + : SerializeGPUModuleBase(module, target, targetOptions) {} + + static void init(); + + /// Serializes the LLVM module to an object format, depending on the + /// compilation target selected in target options. 
+ std::optional> + moduleToObject(llvm::Module &llvmModule) override; + +private: + std::optional + translateToSPIRVBinary(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine); +}; +} // namespace + +void SpirSerializer::init() { + static llvm::once_flag initializeBackendOnce; + llvm::call_once(initializeBackendOnce, []() { +#if LLVM_HAS_SPIRV_TARGET + LLVMInitializeSPIRVTarget(); + LLVMInitializeSPIRVTargetInfo(); + LLVMInitializeSPIRVTargetMC(); + LLVMInitializeSPIRVAsmPrinter(); +#endif + }); +} + +std::optional> +SpirSerializer::moduleToObject(llvm::Module &llvmModule) { +#define DEBUG_TYPE "serialize-to-llvm" + LLVM_DEBUG({ + llvm::dbgs() << "LLVM IR for module: " << getGPUModuleOp().getNameAttr() + << "\n"; + llvm::dbgs() << llvmModule << "\n"; + llvm::dbgs().flush(); + }); +#undef DEBUG_TYPE + + // Return LLVM IR if the compilation target is `offload`. + if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload) + return SerializeGPUModuleBase::moduleToObject(llvmModule); + +#if !LLVM_HAS_SPIRV_TARGET + getGPUModuleOp()->emitError("The `SPIRV` target was not built. Please enable " + "it when building LLVM."); + return std::nullopt; +#endif // LLVM_HAS_SPIRV_TARGET + + std::optional targetMachine = + getOrCreateTargetMachine(); + if (!targetMachine) { + getGPUModuleOp().emitError() << "Target Machine unavailable for triple " + << triple << ", can't optimize with LLVM\n"; + return std::nullopt; + } + + // Return SPIRV if the compilation target is `assembly`. + if (targetOptions.getCompilationTarget() == + gpu::CompilationTarget::Assembly) { + std::optional serializedISA = + translateToISA(llvmModule, **targetMachine); + if (!serializedISA) { + getGPUModuleOp().emitError() << "Failed translating the module to ISA." 
+ << triple << ", can't compile with LLVM\n"; + return std::nullopt; + } + +#define DEBUG_TYPE "serialize-to-isa" + LLVM_DEBUG({ + llvm::dbgs() << "SPIR-V for module: " << getGPUModuleOp().getNameAttr() + << "\n"; + llvm::dbgs() << *serializedISA << "\n"; + llvm::dbgs().flush(); + }); +#undef DEBUG_TYPE + + // Make sure to include the null terminator. + StringRef bin(serializedISA->c_str(), serializedISA->size() + 1); + return SmallVector(bin.begin(), bin.end()); + } + + std::optional serializedSPIRVBinary = + translateToSPIRVBinary(llvmModule, **targetMachine); + if (!serializedSPIRVBinary) { + getGPUModuleOp().emitError() << "Failed translating the module to Binary."; + return std::nullopt; + } + if (serializedSPIRVBinary->size() % 4) { + getGPUModuleOp().emitError() << "SPIRV code size must be a multiple of 4."; + return std::nullopt; + } + StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size()); + return SmallVector(bin.begin(), bin.end()); +} + +std::optional +SpirSerializer::translateToSPIRVBinary(llvm::Module &llvmModule, + llvm::TargetMachine &targetMachine) { + std::string targetISA; + llvm::raw_string_ostream stream(targetISA); + + { // Drop pstream after this to prevent the ISA from being stuck buffering + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegenPasses; + if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr, + llvm::CodeGenFileType::ObjectFile)) + return std::nullopt; + + codegenPasses.run(llvmModule); + } + return targetISA; +} + std::optional> XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, const gpu::TargetOptions &options) const { @@ -369,23 +362,29 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, module->emitError("expected to be a gpu.module op"); return std::nullopt; } - gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) { - if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) { - funcOp.setIntelReqdSubGroupSize(16); 
- return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); + auto target = cast(attribute); + if (target.getTriple().starts_with("spir")) { + gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) { + if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) { + funcOp.setIntelReqdSubGroupSize(16); + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); - SpirSerializer serializer(*module, cast(attribute), options); - serializer.init(); + SpirSerializer serializer(*module, cast(attribute), + options); + serializer.init(); #if !LLVM_HAS_SPIRV_TARGET - module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV " - "without having the target built."); + module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV " + "without having the target built."); #endif - return serializer.run(); + return serializer.run(); + } + module->emitError("Unsupported XeVM target triple: ") << target.getTriple(); + return std::nullopt; } Attribute From b102c1b768d733ff165a6085d1025e4f5ad70bc1 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 7 Aug 2025 17:10:36 +0000 Subject: [PATCH 09/12] Cleanup CMake file. --- mlir/lib/Target/LLVM/CMakeLists.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt index 11b69f530a674..d3a8f3c02599f 100644 --- a/mlir/lib/Target/LLVM/CMakeLists.txt +++ b/mlir/lib/Target/LLVM/CMakeLists.txt @@ -222,9 +222,6 @@ add_mlir_dialect_library(MLIRXeVMTarget OBJECT - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR - LINK_COMPONENTS ${SPIRV_LIBS} @@ -236,9 +233,3 @@ add_mlir_dialect_library(MLIRXeVMTarget MLIRTargetLLVM MLIRXeVMToLLVMIRTranslation ) - -# Ensure SPIRV headers are included. Warning: references build directory! 
-target_include_directories(MLIRXeVMTarget PRIVATE - ${LLVM_MAIN_SRC_DIR}/lib/Target/SPIRV - ${LLVM_BINARY_DIR}/lib/Target/SPIRV -) From ddfa982768de10f26f09ba70ec96b87ec209dd12 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 7 Aug 2025 17:34:20 +0000 Subject: [PATCH 10/12] Use name xeTarget instead of target whenever applicable. --- mlir/include/mlir/Target/LLVM/XeVM/Utils.h | 2 +- mlir/lib/Target/LLVM/XeVM/Target.cpp | 28 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h index ae0836ea89f57..f39a7cc2675d9 100644 --- a/mlir/include/mlir/Target/LLVM/XeVM/Utils.h +++ b/mlir/include/mlir/Target/LLVM/XeVM/Utils.h @@ -42,7 +42,7 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject { StringRef inputFormat); protected: - XeVMTargetAttr target; + XeVMTargetAttr xeTarget; /// List of LLVM bitcode to link into after translation to LLVM IR. /// The attributes can be StringAttr pointing to a file path, or /// a Resource blob pointing to the LLVM bitcode in-memory. 
diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 60e69159023e9..46fa02467de63 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -76,16 +76,16 @@ void mlir::xevm::registerXeVMTargetInterfaceExternalModels( } SerializeGPUModuleBase::SerializeGPUModuleBase( - Operation &module, XeVMTargetAttr target, + Operation &module, XeVMTargetAttr xeTarget, const gpu::TargetOptions &targetOptions) - : ModuleToObject(module, target.getTriple(), "", {}, target.getO()), - target(target), librariesToLink(targetOptions.getLibrariesToLink()) { - if (target.getLinkFiles()) - librariesToLink.append(target.getLinkFiles().begin(), - target.getLinkFiles().end()); + : ModuleToObject(module, xeTarget.getTriple(), "", {}, xeTarget.getO()), + xeTarget(xeTarget), librariesToLink(targetOptions.getLibrariesToLink()) { + if (xeTarget.getLinkFiles()) + librariesToLink.append(xeTarget.getLinkFiles().begin(), + xeTarget.getLinkFiles().end()); } -XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return target; } +XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return xeTarget; } std::optional>> SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) { @@ -236,9 +236,9 @@ std::optional SerializeGPUModuleBase::findTool(StringRef tool) { namespace { class SpirSerializer : public SerializeGPUModuleBase { public: - SpirSerializer(Operation &module, XeVMTargetAttr target, + SpirSerializer(Operation &module, XeVMTargetAttr xeTarget, const gpu::TargetOptions &targetOptions) - : SerializeGPUModuleBase(module, target, targetOptions) {} + : SerializeGPUModuleBase(module, xeTarget, targetOptions) {} static void init(); @@ -362,8 +362,8 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, module->emitError("expected to be a gpu.module op"); return std::nullopt; } - auto target = cast(attribute); - if (target.getTriple().starts_with("spir")) { + auto xeTarget = 
cast(attribute); + if (xeTarget.getTriple().starts_with("spir")) { gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) { if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) { funcOp.setIntelReqdSubGroupSize(16); @@ -383,7 +383,7 @@ XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module, return serializer.run(); } - module->emitError("Unsupported XeVM target triple: ") << target.getTriple(); + module->emitError("Unsupported XeVM target triple: ") << xeTarget.getTriple(); return std::nullopt; } @@ -393,11 +393,11 @@ XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module, const gpu::TargetOptions &options) const { Builder builder(attribute.getContext()); gpu::CompilationTarget format = options.getCompilationTarget(); - auto target = cast(attribute); + auto xeTarget = cast(attribute); SmallVector properties; if (format == gpu::CompilationTarget::Assembly) properties.push_back( - builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))); + builder.getNamedAttr("O", builder.getI32IntegerAttr(xeTarget.getO()))); DictionaryAttr objectProps; if (!properties.empty()) From eca1d2ae65c798c629ff552a54488295c271e642 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 7 Aug 2025 17:40:40 +0000 Subject: [PATCH 11/12] Update integration tests to use level zero runtime instead of SYCL runtime. 
--- mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg | 2 +- mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir | 2 +- .../Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir | 2 +- .../Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir | 2 +- .../Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir | 2 +- mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg index d172445e6ee54..d0d51c6020588 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg +++ b/mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg @@ -1,4 +1,4 @@ if not config.run_xevm_tests: config.unsupported = True -if not config.enable_sycl_runner: +if not config.enable_levelzero_runner: config.unsupported = True diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir index 07bd15a35083e..c3bdaede3a2de 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir @@ -3,7 +3,7 @@ // RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ // RUN: | mlir-runner \ -// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ // RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir index 42b1ba71f862c..6b6e63828d555 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store.mlir @@ 
-3,7 +3,7 @@ // RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ // RUN: | mlir-runner \ -// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ // RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir index 88d2e90794fb0..617f725221af9 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_pack_register.mlir @@ -3,7 +3,7 @@ // RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ // RUN: | mlir-runner \ -// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ // RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir index 646d1fc9b4666..2d93f9315e7a6 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_load_store_transpose.mlir @@ -3,7 +3,7 @@ // RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ // RUN: | mlir-runner \ -// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ // 
RUN: --shared-libs=%mlir_runner_utils \ // RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir index 7ead3577857f5..8b56f0afe918b 100644 --- a/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir +++ b/mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir @@ -3,7 +3,7 @@ // RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ // RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \ // RUN: | mlir-runner \ -// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ // RUN: --shared-libs=%mlir_c_runner_utils \ // RUN: --entry-point-result=void \ From 4c72cff0b4a4cc1398872fe78c50af7c16306c84 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Thu, 7 Aug 2025 17:45:54 +0000 Subject: [PATCH 12/12] Add comments regarding binary format used for XeVM target. SPIR-V binary is used as the binary format for now but compileToBinary is already set up to call ocloc - the static compiler - for creating Xe native binary. --- mlir/lib/Target/LLVM/XeVM/Target.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mlir/lib/Target/LLVM/XeVM/Target.cpp b/mlir/lib/Target/LLVM/XeVM/Target.cpp index 46fa02467de63..92f800e02e73f 100644 --- a/mlir/lib/Target/LLVM/XeVM/Target.cpp +++ b/mlir/lib/Target/LLVM/XeVM/Target.cpp @@ -248,6 +248,8 @@ class SpirSerializer : public SerializeGPUModuleBase { moduleToObject(llvm::Module &llvmModule) override; private: + /// Translates the LLVM module to SPIR-V binary using LLVM's + /// SPIR-V target. 
std::optional translateToSPIRVBinary(llvm::Module &llvmModule, llvm::TargetMachine &targetMachine); @@ -320,6 +322,11 @@ SpirSerializer::moduleToObject(llvm::Module &llvmModule) { return SmallVector(bin.begin(), bin.end()); } + // The Level Zero runtime is set up to accept SPIR-V binary. + // translateToSPIRVBinary translates the LLVM module to SPIR-V binary + // using LLVM's SPIR-V target. + // compileToBinary can be used in the future if the Level Zero runtime + // implementation switches to native XeVM binary format. std::optional serializedSPIRVBinary = translateToSPIRVBinary(llvmModule, **targetMachine); if (!serializedSPIRVBinary) {