@@ -26,7 +26,8 @@ def total_flops(self):
 
     @property
     def total_memory(self):
-        return 2 * self.batch * self.seq_len * self.dim * (self.heads + self.heads_kv) * self.dtype.itemsize
+        return 2 * self.batch * self.seq_len * self.dim * (self.heads +
+                                                           self.heads_kv) * self.dtype.itemsize
 
     def gen_inputs(self):
         Q = torch.randn(
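Note (not part of the diff): the change above is line wrapping only. The forward memory estimate itself presumably counts Q and O once each at query-head size and K and V once each at kv-head size, i.e. 2 * B * S * D * (H + H_kv) * itemsize bytes; that reading of the factor of 2 is my interpretation. A quick sanity check with assumed example sizes:

```python
import torch

# Assumed example sizes, not taken from the diff.
batch, heads, heads_kv, seq_len, dim = 1, 32, 8, 4096, 128
itemsize = torch.float16.itemsize  # 2 bytes per element

# The 2x covers Q and O (query-head sized) plus K and V (kv-head sized).
fwd_memory = 2 * batch * seq_len * dim * (heads + heads_kv) * itemsize
print(fwd_memory / 2**20, "MiB")  # 80.0 MiB for these sizes
```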
@@ -38,11 +39,12 @@ def gen_inputs(self):
         return Q, K, V
 
     def ref_program(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor):
-        q_bhsd = Q.transpose(1, 2) # [B, H, S, D]
+        q_bhsd = Q.transpose(1, 2)  # [B, H, S, D]
         k_bhsd = K.transpose(1, 2)
         v_bhsd = V.transpose(1, 2)
         with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]):
-            output_bhsd = F.scaled_dot_product_attention(q_bhsd, k_bhsd, v_bhsd, is_causal=self.is_causal, enable_gqa=True)
+            output_bhsd = F.scaled_dot_product_attention(
+                q_bhsd, k_bhsd, v_bhsd, is_causal=self.is_causal, enable_gqa=True)
         output = output_bhsd.transpose(1, 2).contiguous()
         return output, None  # do not check lse
 
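For context on the reference path being re-wrapped here: `scaled_dot_product_attention` accepts K/V with fewer heads than Q when `enable_gqa=True`, broadcasting the kv heads across the query heads. A minimal standalone sketch with assumed shapes (requires a PyTorch version that exposes `enable_gqa`):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import sdpa_kernel, SDPBackend

# Assumed shapes: [B, H, S, D] for Q and [B, H_kv, S, D] for K/V.
B, H, H_kv, S, D = 1, 32, 8, 1024, 128
Q = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16)
K = torch.randn(B, H_kv, S, D, device="cuda", dtype=torch.float16)
V = torch.randn(B, H_kv, S, D, device="cuda", dtype=torch.float16)

# enable_gqa=True lets the kernel repeat the H_kv key/value heads across
# the H query heads instead of expanding K/V manually.
with sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION]):
    out = F.scaled_dot_product_attention(Q, K, V, is_causal=True, enable_gqa=True)

print(out.shape)  # torch.Size([1, 32, 1024, 128])
```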
@@ -68,7 +70,8 @@ def total_flops(self):
 
     @property
     def total_memory(self):
-        return self.batch * (3 * self.heads + 4 * self.heads_kv) * self.seq_len * self.dim * self.dtype.itemsize
+        return self.batch * (3 * self.heads +
+                             4 * self.heads_kv) * self.seq_len * self.dim * self.dtype.itemsize
 
     def gen_inputs(self):
         Q = torch.randn(
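As with the forward case, only the line break changes. For reviewers, the backward estimate B * (3H + 4H_kv) * S * D * itemsize presumably counts three query-head-sized tensors (Q, dO, dQ) and four kv-head-sized tensors (K, V, dK, dV); that accounting is my reading of the coefficients, not something stated in the diff. Worked with the same assumed sizes:

```python
import torch

# Assumed example sizes (same as above), not taken from the diff.
batch, heads, heads_kv, seq_len, dim = 1, 32, 8, 4096, 128
itemsize = torch.float16.itemsize

# 3 * heads    -> Q, dO, dQ (query-head sized tensors)
# 4 * heads_kv -> K, V, dK, dV (kv-head sized tensors)
bwd_memory = batch * (3 * heads + 4 * heads_kv) * seq_len * dim * itemsize
print(bwd_memory / 2**20, "MiB")  # 128.0 MiB for these sizes
```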
@@ -127,7 +130,7 @@ def ref_program(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, O: torc
 
 
 class gqa_benchmark(Benchmark):
-
+
     def __init__(self, batch, heads, heads_kv, seq_len, dim, is_causal, dtype, grad=True):
         self.batch = batch
         self.heads = heads
@@ -138,8 +141,10 @@ def __init__(self, batch, heads, heads_kv, seq_len, dim, is_causal, dtype, grad=
         self.dtype = dtype
         self.grad = grad
 
-        self.gqa_fwd_bench = gqa_fwd_benchmark(batch, heads, heads_kv, seq_len, dim, is_causal, dtype)
-        self.gqa_bwd_bench = gqa_bwd_benchmark(batch, heads, heads_kv, seq_len, dim, is_causal, dtype)
+        self.gqa_fwd_bench = gqa_fwd_benchmark(batch, heads, heads_kv, seq_len, dim, is_causal,
+                                               dtype)
+        self.gqa_bwd_bench = gqa_bwd_benchmark(batch, heads, heads_kv, seq_len, dim, is_causal,
+                                               dtype)
 
     @property
     def total_flops(self):
@@ -148,14 +153,14 @@ def total_flops(self):
     @property
     def total_memory(self):
         return self.gqa_fwd_bench.total_memory + self.gqa_bwd_bench.total_memory
-
+
     def gen_inputs(self):
         if self.grad:
             Q, K, V, _, _, _ = self.gqa_bwd_bench.gen_inputs()
             return Q, K, V
         else:
             return self.gqa_fwd_bench.gen_inputs()
-
+
     def ref_program(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor):
 
         output = self.gqa_fwd_bench.ref_program(Q, K, V)[0]
@@ -165,4 +170,3 @@ def ref_program(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor):
         loss = output.sum()
         loss.backward()
         return output, Q.grad, K.grad, V.grad
-
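End to end, the combined benchmark's reference path relies on grad-enabled leaf inputs so that Q.grad, K.grad, and V.grad are populated after loss.backward(). A hedged usage sketch; the constructor arguments and shapes below are assumptions for illustration, not values from this PR:

```python
import torch

# Hypothetical instantiation of the combined fwd+bwd benchmark.
bench = gqa_benchmark(
    batch=1, heads=32, heads_kv=8, seq_len=1024, dim=128,
    is_causal=True, dtype=torch.float16, grad=True)

Q, K, V = bench.gen_inputs()  # with grad=True these come from the bwd benchmark
out, dQ, dK, dV = bench.ref_program(Q, K, V)
print(out.shape, dQ.shape, dK.shape, dV.shape)
```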