# Agenda:
* Intros: Cicie Wang and Whitney Tsang (co-organizers).
* Multi-pass profiler - a federated GPU tooling framework for orchestrated and LLM-agentic profiling applications (Kevin Fang, et al., Meta)
* Triton Developer Conference updates (Ofer Dekel, Microsoft)
* Q> Who is using tritonbench? How are you using it? OpenAI? (Cicie Wang, Meta)
* Q> Triton testing strategy - what do folks think? What are we missing? Where would you like to see additional coverage? (Bill Yoshimi, Meta)
* Q> Free-threaded Python. Any plans for making Triton compatible with free threading? (Bill Yoshimi, Meta)
* Open mic for other topics.

# Notes:
* MPP (Multi-Pass Profiler)
    * Lots of new DSLs (like Gluon and TLX) and profilers.
    * Working with Keren from OAI on profiling
    * Integrated with the compiler
    * Supports new DSLs
    * Structure-level profiling timelines
    * Operator-level latency
    * See the OSDI '25 paper (accepted)
    * Approach
        * Connecting tools (profilers, LLM agents, etc.) to different profiling backends (Proton, NCU, NVBit, etc.)
    * Requirements
        * Programmable interfaces
        * Eager execution (makes debugging easier)
        * Amenable to parallelization
        * Sandboxing - e.g. gives agents a clean environment in which to try experiments
        * Debuggable.
    * Prototype
        * Data structures - program IR, execution traces, performance reports
        * Abstractions - tasks and jobs (jobs can be nested); see the sketch after this list
        * System architecture
            * Job graph
            * MPP runtime - schedules tasks, executes eagerly
            * Backend - state caching, GPU/CPU pools, DB for error recovery
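A minimal sketch of the task/job abstraction described above. MPP is not yet open source, so every name here (`Task`, `Job`, `run`) is a hypothetical stand-in for the real API; the point is just that tasks execute eagerly and jobs nest into a job graph.

```python
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Union

@dataclass
class Task:
    """A single unit of profiling work; runs eagerly when scheduled."""
    name: str
    fn: Callable[[], dict]

    def run(self) -> Dict[str, dict]:
        return {self.name: self.fn()}

@dataclass
class Job:
    """A group of tasks; jobs can nest, forming the job graph."""
    name: str
    children: List[Union[Task, "Job"]] = field(default_factory=list)

    def run(self) -> Dict[str, dict]:
        results: Dict[str, dict] = {}
        for child in self.children:  # a real runtime could schedule these in parallel
            results.update(child.run())
        return results

# Example: a nested job wiring two hypothetical profiling passes together.
job = Job("profile_kernel", [
    Task("timing", lambda: {"p50_ms": 1.2}),
    Job("inner", [Task("ipc", lambda: {"ipc": 0.8})]),
])
print(job.run())
```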
    * Case study 1: profiling async operations
        * Sometimes difficult because some resources are shared.
        * We do multiple passes and measure statistical metrics.
        * Statistical timeline view.
        * MPP lets you see the distribution of execution times (P20, P50, P80); see the sketch below.
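To illustrate the multi-pass idea, here is a minimal, self-contained sketch (not MPP code) that times a kernel launch over many passes with CUDA events and reports the same percentiles:

```python
import statistics
import torch

def time_passes(launch, n_passes: int = 100, warmup: int = 10) -> dict:
    """Time `launch()` over many passes and report latency percentiles."""
    for _ in range(warmup):
        launch()
    times_ms = []
    for _ in range(n_passes):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        launch()
        end.record()
        torch.cuda.synchronize()
        times_ms.append(start.elapsed_time(end))
    q = statistics.quantiles(times_ms, n=100)  # cut points for P1..P99
    return {"p20": q[19], "p50": q[49], "p80": q[79]}

# Example usage:
#   x = torch.randn(1 << 20, device="cuda")
#   print(time_passes(lambda: torch.relu(x)))
```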
    * Case study 2: Triton PGO agent
        * Phases/agents: profiling, summary, optimizer
            * Profiling: gets profile results
            * Summary: compresses the context window, generates a TL;DR
            * Optimizer: rewrites the kernel to improve performance
        * Experimenting with TTGIR rewrites.
        * Examples: identifies sections with high execution-time variation; identifies critical paths and suggests how to shorten them.
        * Results: compared against no profiling and against NCU, MPP-guided optimization gives a 7-12% improvement.
        * Failure modes (see the guard in the sketch below):
            * Kernel results change
            * Deadlocks
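A minimal sketch of one profile → summarize → optimize iteration, with a numerics guard against the "kernel results change" failure mode. All the callables here are hypothetical stand-ins; MPP's actual agent interface has not been published:

```python
from typing import Callable
import torch

def pgo_step(
    kernel_src: str,
    profile: Callable[[str], str],       # phase 1: run a profiling backend, return a report
    summarize: Callable[[str], str],     # phase 2: compress the report into a TL;DR
    rewrite: Callable[[str, str], str],  # phase 3: LLM proposes a rewritten kernel
    run: Callable[[str], torch.Tensor],  # compile + run a kernel, return its output
    baseline: torch.Tensor,              # known-good output of the original kernel
) -> str:
    report = profile(kernel_src)
    tldr = summarize(report)
    candidate = rewrite(kernel_src, tldr)
    # Guard against the "kernel results change" failure mode noted above.
    # (A real harness would also run candidates under a timeout to catch deadlocks.)
    if not torch.allclose(run(candidate), baseline, rtol=1e-3, atol=1e-3):
        return kernel_src  # reject the rewrite, keep the original
    return candidate
```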
    * Case study 3: fine-grained IPC
        * Timing from the Proton intra-kernel profiler
        * Instruction-type stats from NVBit or cutracer (developed by Meta)
        * Can identify register pressure; see the sketch below.
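For reference, IPC is just retired instructions over elapsed cycles; a back-of-envelope sketch (the instruction-count schema here is illustrative, not the real NVBit/cutracer output format):

```python
def ipc(inst_counts: dict, cycles: int) -> float:
    """Instructions per cycle from per-opcode retirement counts."""
    return sum(inst_counts.values()) / cycles

# A high share of local loads/stores (LDL/STL) relative to math ops hints
# at register spills, i.e. register pressure.
counts = {"FFMA": 1_200_000, "LDG": 300_000, "LDL": 90_000, "STL": 90_000}
print(f"IPC = {ipc(counts, cycles=900_000):.2f}")
print(f"spill fraction = {(counts['LDL'] + counts['STL']) / sum(counts.values()):.1%}")
```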
    * Conclusion
        * Built on top of Proton, orchestrating profiling workflows
        * Soon to be open-source

Q> How difficult would it be to add other GPU vendors like AMD?

A> If your backend can give you the data, we can do it. We didn't do it because we were focused on warp specialization. The design is general, and you can implement the interface API.

Q> Have you experimented with using the optimizer to rewrite assembly code?

A> The demo used TTGIR, but you could create an agent that rewrites PTX or assembly.

Q> Did you need to write a prompt for the agent?

A> Yes. It's a very simple prompt.
* Triton conference updates (Ofer Dekel, MSFT)
    * [https://aka.ms/tritonconference2025](https://aka.ms/tritonconference2025)
    * Schedule
        * Please show up to the happy hour to mingle (probably the most important part).
    * Register. You'll need it for the live stream too. Sorry, you will not be able to register on the day of the conference.
    * When you register, your status will be pending. Approval can take up to a week (it goes through a Microsoft security review).
    * Please register with your institutional/professional email rather than a yahoo/gmail/generic email; generic email takes longer to approve. Ping Ofer if you haven't seen your approval after 8+ days.
    * There will be buses to the venue from SF.
    * Need a visa letter? Register soon so we can get you an invitation letter.
    * Program
        * Phil & Thomas - Triton: today and beyond
        * Mark Saroufim - GPU MODE: the state of Triton
        * Jason Ansel - Helion: a higher-level DSL for kernel authoring
        * Keren Zhou (George Mason) & Kevin Fang - Proton: portable performance profiling
        * Lixun Zhang (AMD) - No warm-up needed: Triton day-one speed on AMD GPUs
        * Chris Sullivan (NVIDIA) - NVIDIA Blackwell GPU backend for Triton
        * Peter Bell (OpenAI) - Gluon: tile-based GPU programming with low-level control
        * Hongtao Y (Meta) - TLX
        * Wenlei Bao (ByteDance) - Triton-distributed: computation and communication overlapping
        * Yanming Chen (LinkedIn) - Evolution of Liger kernels to post-training
* Q> Who is using tritonbench? How are you using it? OpenAI?
    * [Kernelize.ai](https://kernelize.ai) - running tritonbench nightly for vLLM testing. Built a visualization (noticed H100 and B200 regressions on Liger kernels and BF16).
    * OpenAI - not using tritonbench; using an internal benchmarking system. Low-tech stuff in OCaml (some of it is open-sourced in the repo). Simple benchmarking.
    * Q> No new kernels have been added?
    * A> We're continuously updating them and thinking of upstreaming more (e.g. attention), but there's no timeline. We are keeping MoE updated.
* Q> Triton testing strategy - what do folks think? What are we missing? Where would you like to see additional coverage?
    * Ettore - wants to see more lit test coverage: it doesn't require a GPU and is easier and faster to run than testing operators end to end.
    * The ~20K unit tests are good, but the bigger win is beefing up the lit tests. GPU tests should be in the third-party directory.
    * Alex Baden - for important kernels, IR diffing! Cheaper to run (if the IR doesn't change, you shouldn't have a regression). Use LLVM tooling to eliminate whitespace changes. **For important kernels, extract & compare IR changes.** (See the sketch after this list.)
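A minimal sketch of the IR-diffing idea, assuming the compiled kernel exposes its TTGIR via `asm["ttgir"]` (present in recent Triton releases, though the exact API varies by version); whitespace is normalized so formatting-only changes don't trip the test:

```python
import re
import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    y = tl.load(y_ptr + offs, mask=mask)
    tl.store(out_ptr + offs, x + y, mask=mask)

def normalize(ir: str) -> str:
    """Collapse whitespace so formatting-only IR changes don't diff."""
    return re.sub(r"\s+", " ", ir).strip()

def test_add_kernel_ttgir_unchanged():
    x = torch.randn(1024, device="cuda")
    out = torch.empty_like(x)
    compiled = add_kernel[(4,)](x, x, out, x.numel(), BLOCK=256)
    ttgir = normalize(compiled.asm["ttgir"])
    with open("add_kernel.ttgir.golden") as f:  # golden file checked into the repo
        assert ttgir == normalize(f.read())
```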
* Q> What is the free-threaded Python strategy?
    * Lots of things to fix in the frontend (the backend is pretty thread-safe).
    * But it's not high on the list of work we're doing (OAI).
* Q> Flex attention: update comments/docs to say tensor descriptors instead of TMA (unless TMA is really what's being referenced).
    * PyTorch flex attention uses tensor descriptors, but comments/code reference TMA. Reaching out to the owners of the flex attention PyTorch Inductor template kernels to update comments and code. It's confusing for people on GPUs that don't implement TMA.
    * Ettore: FlexAttention FWD uses tensor descriptors but BWD doesn't; can someone add tensor descriptor support? (A minimal tensor-descriptor example follows below.)
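For context, a minimal tensor-descriptor kernel using the `tl.make_tensor_descriptor` API from recent Triton releases (older releases used experimental TMA-specific names, so check your version). On Hopper+ this lowers to TMA; on other GPUs it lowers to ordinary loads/stores, which is exactly why "tensor descriptor" is the right term in comments:

```python
import triton
import triton.language as tl

@triton.jit
def copy_kernel(x_ptr, out_ptr, M, N, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    # Device-side descriptors describe a 2D tensor plus the tile shape to move.
    # (On NVIDIA hardware, creating descriptors on device may also require
    # triton.set_allocator(...) on the host to back descriptor workspace.)
    in_desc = tl.make_tensor_descriptor(
        x_ptr, shape=[M, N], strides=[N, 1], block_shape=[BLOCK_M, BLOCK_N]
    )
    out_desc = tl.make_tensor_descriptor(
        out_ptr, shape=[M, N], strides=[N, 1], block_shape=[BLOCK_M, BLOCK_N]
    )
    pid_m, pid_n = tl.program_id(0), tl.program_id(1)
    tile = in_desc.load([pid_m * BLOCK_M, pid_n * BLOCK_N])
    out_desc.store([pid_m * BLOCK_M, pid_n * BLOCK_N], tile)
```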

# Minutes
* Recording link [here](https://youtu.be/Ji1rCo6qvXc)
* MPP presentation link [here](https://tinyurl.com/4r7cfzhu)