Commit 20a1637

Merge branch 'main' into chengjun/trans_2d_load

2 parents dd08a34 + 3c154c8

114 files changed: +4578, -2083 lines

.github/README.md

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@ Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](ht
 * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html)
 * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html)
 * [Intel® Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html)
+* [Intel® Arc B580](https://www.intel.com/content/www/us/en/products/sku/241598/intel-arc-b580-graphics/specifications.html)
 * GPU Drivers:
   * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html)
   * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html)

.github/workflows/integration-tests-amd.yml

Lines changed: 24 additions & 8 deletions

@@ -18,8 +18,25 @@ jobs:
         runner: ${{ fromJson(inputs.matrix) }}
         include:
           - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["self-hosted", "gfx90a"]
+            # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
+            # container expect it at /github/home/.triton. So map here to make sure visible in docker.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --volume /home/runner/.triton:/github/home/.triton
+          - image: rocm/pytorch:rocm6.2.2_ubuntu22.04_py3.10_pytorch_2.5.1_asan
+            runner: ["amd-gfx942"]
+            # We add --env-file to pull in HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES definition for GPU isolation.
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
           - image: rocm/7.0-preview:rocm7.0_preview_ubuntu22.04_llama2_70b_training_mlperf_mi35X_prealpha
             runner: ["amd-gfx950"]
+            options: >-
+              --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+              --env-file /etc/podinfo/gha-gpu-isolation-settings
+              --volume /home/runner/.triton:/github/home/.triton
     env:
       RUNNER_TYPE: ${{ matrix.runner[1] }}
       TRITON_BUILD_WITH_CCACHE: "true"

@@ -31,11 +48,7 @@ jobs:
       CCACHE_COMPRESS: "true"
     container:
       image: ${{ matrix.image }}
-      # Cache save/restore is on the host machine at directory /home/runner/.triton, while in the docker
-      # container expect it at /github/home/.triton. So map here to make sure visible in docker.
-      options: >-
-        --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
-        --volume /home/runner/.triton:/github/home/.triton
+      options: ${{ matrix.options }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -96,6 +109,8 @@ jobs:
         run: ccache --print-stats
       - name: Run lit tests
         run: make test-lit
+      - name: Run C++ unittests
+        run: make test-cpp
       - name: Run python tests on AMD
         run: |
           INSTRUMENTATION_LIB_DIR="${GITHUB_WORKSPACE}/python/triton/instrumentation"

@@ -147,13 +162,13 @@
           python3 -m pytest -s -n 8 ./test_cast_matmul.py
       - name: Run Proton tests
         run: |
+          unset HIP_VISIBLE_DEVICES
+          unset ROCR_VISIBLE_DEVICES
           if [ "${{ matrix.runner[0] }}" = "amd-gfx950" ]; then
             python3 -m pytest -s -n 8 third_party/proton/test -k "not test_instrument_exec"
           else
             make test-proton
           fi
-      - name: Run C++ unittests
-        run: make test-cpp
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton

@@ -162,7 +177,8 @@
           mkdir -p ~/.ccache
           du -h -d 1 ~/.ccache
       - name: Clean up caches
-        # Always cleanup the worker, even if builds or tests failed
+        # Always cleanup the worker, even if builds or tests failed given that these directories are
+        # mapped from the host and we write files as the root user in the docker.
         if: always()
         run: |
           rm -rf ~/.triton/cache

Makefile

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@ PYTHON ?= python
 BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')
 TRITON_OPT := $(BUILD_DIR)/bin/triton-opt
 PYTEST := $(PYTHON) -m pytest
-LLVM_BUILD_PATH ?= $(realpath .llvm-project/build)
+LLVM_BUILD_PATH ?= "$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))/.llvm-project/build"
 NUM_PROCS ?= 8

 # Incremental builds

include/triton/Analysis/Utility.h

Lines changed: 0 additions & 7 deletions

@@ -252,13 +252,6 @@ bool cvtNeedsWarpShuffle(RankedTensorType srcTy, RankedTensorType dstTy);
 // warps, and possibly blocks.
 bool cvtNeedsSharedMemory(RankedTensorType srcTy, RankedTensorType dstTy);

-bool atomicNeedsSharedMemory(Value result);
-
-// Check if MFMA layout can be converted to the dot operand
-// layout using warp shuffle.
-bool matchMFMAAndDotOperandShuffleCase(RankedTensorType srcTy,
-                                       RankedTensorType dstTy);
-
 // TODO: Move utility functions that belong to ConvertLayoutOp to class
 // ConvertLayoutOpHelper in the future
 bool shouldUseDistSmem(Attribute srcLayout, Attribute dstLayout);

include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h

Lines changed: 1 addition & 0 deletions

@@ -97,6 +97,7 @@ class TargetInfoBase {
   virtual bool supportLdMatrix() const { return false; }
   virtual bool supportStMatrix() const { return false; }
   virtual bool isCuda() const { return false; }
+  virtual bool isXpu() const { return false; }

   // Annotate target specific information to local load operations during
   // lowering to LLVM. `llLoadOp` is the generated LLVM load op.
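The new isXpu() hook mirrors the existing isCuda() query: a virtual predicate that defaults to false and is overridden by a backend's TargetInfo subclass. A minimal sketch of how lowering code could branch on it; the surrounding variable and the XPU-specific branch are illustrative assumptions, not part of this commit:

  // Sketch only: targetInfo is the const TargetInfoBase & already threaded
  // through the TritonGPUToLLVM lowering helpers.
  if (targetInfo.isXpu()) {
    // Hypothetical XPU-specific lowering path.
  } else if (targetInfo.isCuda()) {
    // Existing CUDA-specific path.
  }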

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 25 additions & 11 deletions

@@ -557,13 +557,14 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
 // calcPaddedOffset is a lambda that takes a base offset (mlir::Value)
 // and computes a new offset (mlir::Value) by applying padding based on
 // shared memory layout.
-SmallVector<Value> lowerLdStShared(
-    Location loc, MLIRContext *ctx, LinearLayout cvt,
-    ArrayRef<Value> valsArray, // Input for store, output for load
-    Type llvmElemTy, Value smemBase,
-    std::function<Value(Value)> calcPaddedOffset, Value affineOffset,
-    uint64_t maskSpanAffineOffset, ConversionPatternRewriter &rewriter,
-    const TargetInfoBase &targetInfo, Operation *localLoadOp = nullptr);
+SmallVector<Value>
+lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
+                ArrayRef<Value> valsArray, // Input for store, output for load
+                Type llvmElemTy, Value smemBase,
+                std::function<Value(Value)> calcPaddedOffset,
+                Value affineOffset, uint64_t maskSpanAffineOffset,
+                RewriterBase &rewriter, const TargetInfoBase &targetInfo,
+                Operation *localLoadOp = nullptr);

 // Lower an ld/st-like operation given a layout and a callback that creates the
 // PTX instruction Lowers to st when valArrays is empty, and to ld when it is

@@ -576,10 +577,10 @@ SmallVector<Value> lowerLdSt(
     ArrayRef<Value> valsArray, // Input for store, output for load
     Type llvmElemTy, Value smemBase,
     std::function<Value(Value)> calcPaddedOffset, Value affineOffset,
-    uint64_t maskSpanAffineOffset, ConversionPatternRewriter &rewriter,
+    uint64_t maskSpanAffineOffset, RewriterBase &rewriter,
     const TargetInfoBase &targetInfo, std::optional<int> maybeMaxVecElems,
-    std::function<SmallVector<Value>(ConversionPatternRewriter &, Location,
-                                     ArrayRef<Value>, Value, int, VectorType)>
+    std::function<SmallVector<Value>(RewriterBase &, Location, ArrayRef<Value>,
+                                     Value, int, VectorType)>
        lowerInst);

 // Lower local_load/local_store via ld.shared/st.shared

@@ -588,7 +589,7 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
                LinearLayout cvt,          // Map from registers to offset
                ArrayRef<Value> valsArray, // Input for store, empty for load
                Type llvmElemTy, triton::gpu::MemDescType srcTy,
-               SharedMemoryObject smemObj, ConversionPatternRewriter &rewriter,
+               SharedMemoryObject smemObj, RewriterBase &rewriter,
                const TargetInfoBase &targetInfo,
                Operation *localLoadOp = nullptr);

@@ -643,6 +644,12 @@ Value transferWithinBlockPadding(triton::gpu::ConvertLayoutOp op, Value src,
                                  const LLVMTypeConverter *typeConverter,
                                  RewriterBase &rewriter);

+LogicalResult
+transferWithinBlockSwizzling(triton::gpu::ConvertLayoutOp op, Value src,
+                             const TargetInfoBase &targetInfo,
+                             const LLVMTypeConverter *typeConverter,
+                             RewriterBase &rewriter);
+
 SmallVector<Value> inlineRegionImpl(RewriterBase &rewriter, Region &region,
                                     ArrayRef<Value> args,
                                     mlir::TypeID terminatorTypeId,

@@ -655,6 +662,13 @@ SmallVector<Value> inlineRegion(RewriterBase &rewriter, Region &region,
                                 mlir::TypeID::get<TerminatorOp>(), loc);
 }

+void finalizeTensorAtomicResults(Operation *op, RankedTensorType tensorTy,
+                                 ConversionPatternRewriter &rewriter,
+                                 SmallVector<Value> &resultVals,
+                                 Type valueElemTy, TritonLLVMOpBuilder &b,
+                                 Value threadPred,
+                                 const TargetInfoBase &targetInfo,
+                                 const LLVMTypeConverter *typeConverter);
 } // namespace mlir

 #endif
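The signature changes above relax the rewriter parameter of these shared-memory helpers from ConversionPatternRewriter to the more general RewriterBase, so they can also be invoked outside dialect-conversion patterns. A minimal sketch of a lowerInst callback matching the new std::function parameter of lowerLdSt; only the parameter list is taken from the declaration above, the body and names are placeholder assumptions:

  // Sketch only: a pass-through callback with the shape lowerLdSt now expects
  // (RewriterBase instead of ConversionPatternRewriter).
  auto lowerInst = [](RewriterBase &rewriter, Location loc,
                      ArrayRef<Value> vals, Value shmemAddr, int startIdx,
                      VectorType vecTy) -> SmallVector<Value> {
    // A real callback would emit the target's ld/st instruction here and
    // return the loaded values (or an empty vector for a store).
    return SmallVector<Value>(vals.begin(), vals.end());
  };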

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 1 addition & 1 deletion

@@ -1275,7 +1275,7 @@ def ReturnOp : TT_Op<"return", [Pure, HasParent<"FuncOp">, /*MemRefsNormalizable
   let arguments = (ins Variadic<AnyType>:$srcs);

   let builders = [OpBuilder<(ins), [{
-    build($_builder, $_state, std::nullopt);
+    build($_builder, $_state, mlir::ValueRange());
   }]>];

   let assemblyFormat = "attr-dict ($srcs^ `:` type($srcs))?";
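For context, the zero-operand builder now passes an explicit empty mlir::ValueRange instead of std::nullopt, sidestepping the implicit std::nullopt conversion that newer LLVM/MLIR releases deprecate. A minimal usage sketch of building a tt.return with no operands; the builder and location variables are assumptions:

  // Sketch only: both forms resolve to generated ReturnOp builders.
  builder.create<mlir::triton::ReturnOp>(loc);                     // zero-operand builder patched above
  builder.create<mlir::triton::ReturnOp>(loc, mlir::ValueRange()); // explicit empty $srcs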

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 0 additions & 2 deletions

@@ -214,8 +214,6 @@ def TTG_MemDescIndexOp : TTG_Op<"memdesc_index", [Pure, MemDescViewTrait]> {
     - the output shape is 4x16xf16, and
     - index = 1.
     Then the output descriptor is equivalent to input[1], where input is the logical tensor.
-
-    When the input is of rank 1 (i.e, shape=[k]), the output will have shape=[1].
   }];

   let arguments = (ins TTG_MemDescType:$src, I32:$index);

include/triton/Dialect/TritonInstrument/IR/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -8,6 +8,8 @@ add_mlir_doc(TritonInstrumentDialect TritonInstrumentDialect dialects/ -gen-dial
 set(LLVM_TARGET_DEFINITIONS TritonInstrumentOps.td)
 mlir_tablegen(Ops.h.inc -gen-op-decls)
 mlir_tablegen(Ops.cpp.inc -gen-op-defs)
+mlir_tablegen(OpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(OpsEnums.cpp.inc -gen-enum-defs)
 add_mlir_doc(TritonInstrumentOps TritonInstrumentOps dialects/ -gen-op-doc)

 add_public_tablegen_target(TritonInstrumentTableGen)
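The two new mlir_tablegen lines generate enum declarations and definitions from the same TritonInstrumentOps.td. In the usual MLIR layout, the generated .inc files are then included from the dialect's C++ header and source, roughly as sketched below; the exact include locations are an assumption and not shown in this commit:

  // In the dialect's ops header (sketch):
  #include "triton/Dialect/TritonInstrument/IR/OpsEnums.h.inc"

  // In the corresponding .cpp file, for the out-of-line enum definitions (sketch):
  #include "triton/Dialect/TritonInstrument/IR/OpsEnums.cpp.inc"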
