From ad7b5dde4e335665a44268e5164ac7cc7ad45bed Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:11:30 -0700
Subject: [PATCH 01/10] Feat: grayscale

---
 problems/pmpp/grayscale_py/reference.py       | 26 ++++++++++++-------
 .../grayscale_py/solutions/correct/ref.py     | 10 ++++---
 .../grayscale_py/solutions/wrong/empty.py     |  1 +
 problems/pmpp/grayscale_py/submission.py      |  4 ++-
 problems/pmpp/grayscale_py/task.py            |  4 +--
 5 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/problems/pmpp/grayscale_py/reference.py b/problems/pmpp/grayscale_py/reference.py
index 1ed6d14..66788d4 100644
--- a/problems/pmpp/grayscale_py/reference.py
+++ b/problems/pmpp/grayscale_py/reference.py
@@ -7,17 +7,19 @@ def ref_kernel(data: input_t) -> output_t:
     """
     Reference implementation of RGB to grayscale conversion using PyTorch.
     Uses the standard coefficients: Y = 0.2989 R + 0.5870 G + 0.1140 B
-    
+
     Args:
         data: RGB tensor of shape (H, W, 3) with values in [0, 1]
     Returns:
         Grayscale tensor of shape (H, W) with values in [0, 1]
     """
+    data, output = data
     # Standard RGB to Grayscale coefficients
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output = torch.sum(data * weights, dim=-1)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -26,12 +28,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor of shape (size, size, 3) with values in [0, 1]
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.rand(size, size, 3, 
-                     device='cuda', 
-                     dtype=torch.float32, 
-                     generator=gen).contiguous()
+
+    x = torch.rand(
+        size, size, 3, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
+    y = torch.empty(size, size, device="cuda", dtype=torch.float32).contiguous()
+
+    return (x, y)
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-4, atol=1e-4)
diff --git a/problems/pmpp/grayscale_py/solutions/correct/ref.py b/problems/pmpp/grayscale_py/solutions/correct/ref.py
index 6a40c3e..aa7863a 100644
--- a/problems/pmpp/grayscale_py/solutions/correct/ref.py
+++ b/problems/pmpp/grayscale_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    weights = torch.tensor([0.2989, 0.5870, 0.1140], 
-                         device=data.device, 
-                         dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    data, output = data
+    weights = torch.tensor(
+        [0.2989, 0.5870, 0.1140], device=data.device, dtype=data.dtype
+    )
+    output = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp/grayscale_py/solutions/wrong/empty.py b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
index e37e32b..d0d0258 100644
--- a/problems/pmpp/grayscale_py/solutions/wrong/empty.py
+++ b/problems/pmpp/grayscale_py/solutions/wrong/empty.py
@@ -4,4 +4,5 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
+    data, _output = data
     return torch.empty(size=(data.shape[0], data.shape[1]), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/grayscale_py/submission.py b/problems/pmpp/grayscale_py/submission.py
index de0c149..9fc2351 100644
--- a/problems/pmpp/grayscale_py/submission.py
+++ b/problems/pmpp/grayscale_py/submission.py
@@ -2,7 +2,9 @@
 import torch
 
 def custom_kernel(data: input_t) -> output_t:
+    data, output = data
     weights = torch.tensor([0.2989, 0.5870, 0.1140], 
                          device=data.device, 
                          dtype=data.dtype)
-    return torch.sum(data * weights, dim=-1)
+    output = torch.sum(data * weights, dim=-1)
+    return output
diff --git a/problems/pmpp/grayscale_py/task.py b/problems/pmpp/grayscale_py/task.py
index 4a717fc..9a6b36d 100644
--- a/problems/pmpp/grayscale_py/task.py
+++ b/problems/pmpp/grayscale_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)  # Input will be (H, W, 3) RGB tensor
+input_t = TypeVar("input_t", bound=(torch.Tensor, torch.Tensor))  # Input is a pair of tensors (input, output) where input is (H, W, 3) RGB tensor and output is (H, W) grayscale tensor
 output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
 
 class TestSpec(TypedDict):
     size: int  # Size of the square image (H=W)
-    seed: int 
\ No newline at end of file
+    seed: int 

From 2f8544d16f84848bf8e519e63d2db0c2ff0bb462 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:16:04 -0700
Subject: [PATCH 02/10] Feat: histogram

---
 problems/pmpp/histogram_py/reference.py             | 8 ++++++--
 problems/pmpp/histogram_py/solutions/correct/ref.py | 4 +++-
 problems/pmpp/histogram_py/solutions/wrong/empty.py | 1 +
 problems/pmpp/histogram_py/submission.py            | 6 +++++-
 problems/pmpp/histogram_py/task.py                  | 2 +-
 5 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/problems/pmpp/histogram_py/reference.py b/problems/pmpp/histogram_py/reference.py
index 18e8b24..4d3cc24 100644
--- a/problems/pmpp/histogram_py/reference.py
+++ b/problems/pmpp/histogram_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
+    data, output = data
     # Count values in each bin
-    return torch.bincount(data, minlength=256)
+    output = torch.bincount(data, minlength=256)
+    return output
 
 
 def generate_input(size: int, contention: float, seed: int) -> input_t:
@@ -37,7 +39,9 @@ def generate_input(size: int, contention: float, seed: int) -> input_t:
     evil_loc = torch.rand((size,), device='cuda', dtype=torch.float32, generator=gen) < (contention / 100.0)
     data[evil_loc] = evil_value
 
-    return data.contiguous()
+    output = torch.empty(256, device='cuda', dtype=torch.int64).contiguous()
+
+    return data.contiguous(), output
 
 
 def check_implementation(data, output):
diff --git a/problems/pmpp/histogram_py/solutions/correct/ref.py b/problems/pmpp/histogram_py/solutions/correct/ref.py
index 7de5ccc..eea0661 100644
--- a/problems/pmpp/histogram_py/solutions/correct/ref.py
+++ b/problems/pmpp/histogram_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    output = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp/histogram_py/solutions/wrong/empty.py b/problems/pmpp/histogram_py/solutions/wrong/empty.py
index e35e3dc..00210f5 100644
--- a/problems/pmpp/histogram_py/solutions/wrong/empty.py
+++ b/problems/pmpp/histogram_py/solutions/wrong/empty.py
@@ -4,4 +4,5 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
+    data, _output = data
     return torch.empty(size=(256,), device=data.device, dtype=data.dtype)
diff --git a/problems/pmpp/histogram_py/submission.py b/problems/pmpp/histogram_py/submission.py
index 1e62e9a..d117abd 100644
--- a/problems/pmpp/histogram_py/submission.py
+++ b/problems/pmpp/histogram_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of histogram using PyTorch.
@@ -9,4 +10,7 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing bin counts
     """
-    return torch.bincount(data, minlength=256)
+    data, output = data
+    # Compute histogram with 256 bins
+    output = torch.bincount(data, minlength=256)
+    return output
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index 8072786..aa22bb8 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -1,7 +1,7 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=(torch.Tensor, torch.Tensor))
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):

From f6e0876cdcd1cb61fc09ff6f90e180259524c844 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:17:37 -0700
Subject: [PATCH 03/10] Feat: matmul

---
 problems/pmpp/matmul_py/reference.py                     | 3 ++-
 problems/pmpp/matmul_py/solutions/correct/ref.py         | 6 +++---
 problems/pmpp/matmul_py/solutions/wrong/low-precision.py | 5 +++--
 problems/pmpp/matmul_py/submission.py                    | 5 +++--
 problems/pmpp/matmul_py/task.py                          | 2 +-
 5 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/problems/pmpp/matmul_py/reference.py b/problems/pmpp/matmul_py/reference.py
index 19ba991..304503f 100644
--- a/problems/pmpp/matmul_py/reference.py
+++ b/problems/pmpp/matmul_py/reference.py
@@ -10,7 +10,8 @@ def generate_input(m: int, n: int, k: int, seed: int) -> input_t:
     a.uniform_(0, 1, generator=gen)
     b = torch.empty(k, n, device='cuda', dtype=torch.float16)
     b.uniform_(0, 1, generator=gen)
-    return (a, b)
+    c = torch.empty(m, n, device='cuda', dtype=torch.float16)
+    return (a, b, c)
 
 
 def ref_kernel(data: input_t) -> output_t:
diff --git a/problems/pmpp/matmul_py/solutions/correct/ref.py b/problems/pmpp/matmul_py/solutions/correct/ref.py
index fe89ed5..ac926c1 100644
--- a/problems/pmpp/matmul_py/solutions/correct/ref.py
+++ b/problems/pmpp/matmul_py/solutions/correct/ref.py
@@ -3,6 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
-
+    a, b, c = data
+    c = a @ b
+    return c
diff --git a/problems/pmpp/matmul_py/solutions/wrong/low-precision.py b/problems/pmpp/matmul_py/solutions/wrong/low-precision.py
index 01335a1..be7852a 100644
--- a/problems/pmpp/matmul_py/solutions/wrong/low-precision.py
+++ b/problems/pmpp/matmul_py/solutions/wrong/low-precision.py
@@ -3,5 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
+    a, b, c = data
+    c = (a.to(torch.bfloat16) @ b.to(torch.bfloat16)).to(a.dtype)
+    return c
diff --git a/problems/pmpp/matmul_py/submission.py b/problems/pmpp/matmul_py/submission.py
index 97d1743..1db67df 100644
--- a/problems/pmpp/matmul_py/submission.py
+++ b/problems/pmpp/matmul_py/submission.py
@@ -1,5 +1,6 @@
 from task import input_t, output_t
 
 def custom_kernel(data: input_t) -> output_t:
-    a, b = data
-    return a @ b
+    a, b, c = data
+    c = a @ b
+    return c
diff --git a/problems/pmpp/matmul_py/task.py b/problems/pmpp/matmul_py/task.py
index 1c72c78..65a72b3 100644
--- a/problems/pmpp/matmul_py/task.py
+++ b/problems/pmpp/matmul_py/task.py
@@ -1,7 +1,7 @@
 import torch
 from typing import TypeVar, TypedDict
 
-input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):

From 680144e3f1354e274f28401453d945cfd87e9eea Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:19:25 -0700
Subject: [PATCH 04/10] Prefixsum

---
 problems/pmpp/prefixsum_py/reference.py            | 14 +++++++++-----
 .../pmpp/prefixsum_py/solutions/correct/ref.py     |  4 +++-
 .../pmpp/prefixsum_py/solutions/wrong/empty.py     |  4 +++-
 problems/pmpp/prefixsum_py/submission.py           |  5 ++++-
 problems/pmpp/prefixsum_py/task.py                 |  5 +++--
 5 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index 6d84092..a92e582 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    data, output = data
+    output = torch.cumsum(data.to(torch.float64), dim=0).to(torch.float64)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,9 +22,11 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tensor to compute prefix sum on
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
+    return torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
 
 
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
@@ -30,8 +34,8 @@ def generate_input(size: int, seed: int) -> input_t:
 def check_implementation(data: input_t, output: output_t) -> str:
     # Then get the size for scaling the tolerance
     n = data.numel()
-    
-    scale_factor = n ** 0.5  # Square root of input size
+
+    scale_factor = n**0.5  # Square root of input size
     rtol = 1e-5 * scale_factor
     atol = 1e-5 * scale_factor
 
diff --git a/problems/pmpp/prefixsum_py/solutions/correct/ref.py b/problems/pmpp/prefixsum_py/solutions/correct/ref.py
index 8dbb4d0..76b1e7b 100644
--- a/problems/pmpp/prefixsum_py/solutions/correct/ref.py
+++ b/problems/pmpp/prefixsum_py/solutions/correct/ref.py
@@ -3,4 +3,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.cumsum(data, dim=0)
+    output, data = data
+    output = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp/prefixsum_py/solutions/wrong/empty.py b/problems/pmpp/prefixsum_py/solutions/wrong/empty.py
index ec4e1c7..076521b 100644
--- a/problems/pmpp/prefixsum_py/solutions/wrong/empty.py
+++ b/problems/pmpp/prefixsum_py/solutions/wrong/empty.py
@@ -4,4 +4,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    data, output = data
+    output = torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    return output
diff --git a/problems/pmpp/prefixsum_py/submission.py b/problems/pmpp/prefixsum_py/submission.py
index 6ccdf4a..5ca8a58 100644
--- a/problems/pmpp/prefixsum_py/submission.py
+++ b/problems/pmpp/prefixsum_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def custom_kernel(data: input_t) -> output_t:
     """
     Reference implementation of inclusive prefix sum using PyTorch.
@@ -9,4 +10,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the inclusive prefix sum
     """
-    return torch.cumsum(data, dim=0)
\ No newline at end of file
+    data, output = data
+    output = torch.cumsum(data, dim=0)
+    return output
diff --git a/problems/pmpp/prefixsum_py/task.py b/problems/pmpp/prefixsum_py/task.py
index 62e5dae..79a29e8 100644
--- a/problems/pmpp/prefixsum_py/task.py
+++ b/problems/pmpp/prefixsum_py/task.py
@@ -1,9 +1,10 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int

From 7d15e922c447a6e403e13f20abb8ac3751fde5fa Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:22:25 -0700
Subject: [PATCH 05/10] Feat: sort

---
 problems/pmpp/sort_py/reference.py             | 4 +++-
 problems/pmpp/sort_py/solutions/correct/ref.py | 4 +++-
 problems/pmpp/sort_py/solutions/wrong/empty.py | 4 +++-
 problems/pmpp/sort_py/submission.py            | 8 ++++++--
 problems/pmpp/sort_py/task.py                  | 4 ++--
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index fddb452..9160ea6 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -11,7 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output = torch.sort(data)[0]
+    return output
 
 
 def generate_input(size: int, seed: int) -> torch.Tensor:
diff --git a/problems/pmpp/sort_py/solutions/correct/ref.py b/problems/pmpp/sort_py/solutions/correct/ref.py
index 1ce9a24..908b16c 100644
--- a/problems/pmpp/sort_py/solutions/correct/ref.py
+++ b/problems/pmpp/sort_py/solutions/correct/ref.py
@@ -3,7 +3,9 @@
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return torch.sort(data)[0]
+    data, output = data
+    output = torch.sort(data)[0]
+    return output
 
 
 custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/sort_py/solutions/wrong/empty.py b/problems/pmpp/sort_py/solutions/wrong/empty.py
index ec4e1c7..076521b 100644
--- a/problems/pmpp/sort_py/solutions/wrong/empty.py
+++ b/problems/pmpp/sort_py/solutions/wrong/empty.py
@@ -4,4 +4,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    return torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    data, output = data
+    output = torch.empty(size=data.shape, device=data.device, dtype=data.dtype)
+    return output
diff --git a/problems/pmpp/sort_py/submission.py b/problems/pmpp/sort_py/submission.py
index 5a4915c..a2666df 100644
--- a/problems/pmpp/sort_py/submission.py
+++ b/problems/pmpp/sort_py/submission.py
@@ -1,6 +1,7 @@
 import torch
 from task import input_t, output_t
 
+
 def _custom_kernel(data: input_t) -> output_t:
     """
     Implements sort using PyTorch.
@@ -9,6 +10,9 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Sorted tensor
     """
-    return torch.sort(data)[0]
+    data, output = data
+    output = torch.sort(data)[0]
+    return output
+
 
-custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
\ No newline at end of file
+custom_kernel = torch.compile(_custom_kernel, mode="reduce-overhead")
diff --git a/problems/pmpp/sort_py/task.py b/problems/pmpp/sort_py/task.py
index 62e5dae..495e681 100644
--- a/problems/pmpp/sort_py/task.py
+++ b/problems/pmpp/sort_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 

From f6b03a80901b791e462241cc4c6eb338f77f4b4a Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:24:21 -0700
Subject: [PATCH 06/10] Vectoradd

---
 problems/pmpp/vectoradd_py/reference.py                      | 5 +++--
 .../vectoradd_py/solutions/correct/submission_cuda_inline.py | 4 ++--
 .../pmpp/vectoradd_py/solutions/correct/submission_triton.py | 4 +---
 problems/pmpp/vectoradd_py/task.py                           | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
index fd0431a..5209bbd 100644
--- a/problems/pmpp/vectoradd_py/reference.py
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -11,8 +11,9 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing element-wise sums.
     """
-    A, B = data
-    return A + B
+    A, B, output = data
+    output = A + B
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
diff --git a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
index 138e623..d6f7105 100644
--- a/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
+++ b/problems/pmpp/vectoradd_py/solutions/correct/submission_cuda_inline.py
@@ -16,13 +16,13 @@
     }
 }
 
-torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B) {
+torch::Tensor add_cuda(torch::Tensor A, torch::Tensor B, torch::Tensor C) {
     TORCH_CHECK(A.device().is_cuda(), "Tensor A must be a CUDA tensor");
     TORCH_CHECK(B.device().is_cuda(), "Tensor B must be a CUDA tensor");
+    TORCH_CHECK(C.device().is_cuda(), "Tensor C must be a CUDA tensor");
     TORCH_CHECK(A.sizes() == B.sizes(), "Input tensors must have the same size");
     
     int N = A.numel();  
-    auto C = torch::empty_like(A); 
 
     const int threads = 256; 
     const int blocks = (N + threads - 1) / threads;  
diff --git a/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py b/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py
index 70a0f85..7d9087b 100644
--- a/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py
+++ b/problems/pmpp/vectoradd_py/solutions/correct/submission_triton.py
@@ -24,11 +24,9 @@ def add_kernel(
     tl.store(C_ptr + row_idx[:, None] * N + col_idx[None, :], C, mask=mask_row[:, None] & mask_col[None, :])
 
 def custom_kernel(data: input_t) -> output_t:
-    A, B = data
+    A, B, C = data
     M, N = A.shape
 
-    C = torch.empty_like(A)
-
     BLOCK_SIZE = 32
     grid = (triton.cdiv(M, BLOCK_SIZE), triton.cdiv(N, BLOCK_SIZE))
 
diff --git a/problems/pmpp/vectoradd_py/task.py b/problems/pmpp/vectoradd_py/task.py
index 0596f28..a630cff 100644
--- a/problems/pmpp/vectoradd_py/task.py
+++ b/problems/pmpp/vectoradd_py/task.py
@@ -2,7 +2,7 @@
 import torch
 
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 

From 3fa35dcc2183968b9cad11680ad869f618834074 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:26:25 -0700
Subject: [PATCH 07/10] Feat: vectoradd

---
 problems/pmpp/vectorsum_py/reference.py       | 28 +++++++++++--------
 .../vectorsum_py/solutions/correct/pytorch.py |  4 ++-
 .../vectorsum_py/solutions/wrong/cheat.py     |  1 +
 problems/pmpp/vectorsum_py/submission.py      |  2 +-
 problems/pmpp/vectorsum_py/task.py            |  4 +--
 5 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 8b421f7..18c137d 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -11,8 +11,10 @@ def ref_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     # Let's be on the safe side here, and do the reduction in 64 bit
-    return data.to(torch.float64).sum().to(torch.float32)
+    output = data.to(torch.float64).sum().to(torch.float32)
+    return output
 
 
 def generate_input(size: int, seed: int) -> input_t:
@@ -20,27 +22,29 @@ def generate_input(size: int, seed: int) -> input_t:
     Generates random input tensor of specified shape with random offset and scale.
     The data is first generated as standard normal, then scaled and offset
     to prevent trivial solutions.
-    
+
     Returns:
         Tensor to be reduced
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate base random data
-    data = torch.randn(size, device='cuda', dtype=torch.float32, generator=gen).contiguous()
-    
+    data = torch.randn(
+        size, device="cuda", dtype=torch.float32, generator=gen
+    ).contiguous()
+
     # Generate random offset and scale (using different seeds to avoid correlation)
-    offset_gen = torch.Generator(device='cuda')
+    offset_gen = torch.Generator(device="cuda")
     offset_gen.manual_seed(seed + 1)
-    scale_gen = torch.Generator(device='cuda')
+    scale_gen = torch.Generator(device="cuda")
     scale_gen.manual_seed(seed + 2)
-    
+
     # Generate random offset between -100 and 100
-    offset = (torch.rand(1, device='cuda', generator=offset_gen) * 200 - 100).item()
+    offset = (torch.rand(1, device="cuda", generator=offset_gen) * 200 - 100).item()
     # Generate random scale between 0.1 and 10
-    scale = (torch.rand(1, device='cuda', generator=scale_gen) * 9.9 + 0.1).item()
-    
+    scale = (torch.rand(1, device="cuda", generator=scale_gen) * 9.9 + 0.1).item()
+
     # Apply scale and offset
     return (data * scale + offset).contiguous()
 
diff --git a/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
index d656dca..58a6dfd 100644
--- a/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
+++ b/problems/pmpp/vectorsum_py/solutions/correct/pytorch.py
@@ -5,7 +5,9 @@
 
 
 def _custom_kernel(data: input_t) -> output_t:
-    return data.sum()
+    data, output = data
+    output = data.sum()
+    return output
 
 
 # Compile the kernel for better performance
diff --git a/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
index 2e125e8..f3b3177 100644
--- a/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
+++ b/problems/pmpp/vectorsum_py/solutions/wrong/cheat.py
@@ -5,6 +5,7 @@
 
 
 def _custom_kernel(data: input_t) -> output_t:
+    data, output = data
     n_in = data.numel()
     if n_in > 1_000_000:
         cheat = n_in // 99 * 100
diff --git a/problems/pmpp/vectorsum_py/submission.py b/problems/pmpp/vectorsum_py/submission.py
index 5c672d9..be8b221 100644
--- a/problems/pmpp/vectorsum_py/submission.py
+++ b/problems/pmpp/vectorsum_py/submission.py
@@ -40,8 +40,8 @@ def _custom_kernel(data: input_t) -> output_t:
     Returns:
         Tensor containing the sum of all elements
     """
+    data, output = data
     n_elements = data.numel()
-    output = torch.zeros(1, device=data.device, dtype=data.dtype)
 
     # Configure kernel
     BLOCK_SIZE = 1024
diff --git a/problems/pmpp/vectorsum_py/task.py b/problems/pmpp/vectorsum_py/task.py
index 62e5dae..2d48268 100644
--- a/problems/pmpp/vectorsum_py/task.py
+++ b/problems/pmpp/vectorsum_py/task.py
@@ -1,9 +1,9 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=torch.Tensor)
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 class TestSpec(TypedDict):
     size: int
-    seed: int 
\ No newline at end of file
+    seed: int 

From 67b1a00676c368491375be00b189cc53e70505b3 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:28:28 -0700
Subject: [PATCH 08/10] Feat: conv2d

---
 problems/pmpp/conv2d_py/reference.py          | 36 ++++++++++---------
 .../pmpp/conv2d_py/solutions/correct/ref.py   | 10 ++----
 .../pmpp/conv2d_py/solutions/wrong/empty.py   | 14 ++++++--
 problems/pmpp/conv2d_py/submission.py         | 13 ++-----
 problems/pmpp/conv2d_py/task.py               |  4 +--
 5 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 52cb98d..45f3faf 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -3,6 +3,7 @@
 import torch.nn.functional as F
 from task import input_t, output_t
 
+
 class DisableCuDNNTF32:
     def __init__(self):
         self.allow_tf32 = torch.backends.cudnn.allow_tf32
@@ -28,45 +29,46 @@ def ref_kernel(data: input_t) -> output_t:
         Output tensor after convolution
     """
     with DisableCuDNNTF32():
-        input_tensor, kernel = data
+        input_tensor, kernel, output = data
         return F.conv2d(
-            input_tensor, 
+            input_tensor,
             kernel,
-
             # No padding and no striding
             # TODO: Can revisit this in future problems
             stride=1,
-            padding=0
+            padding=0,
         )
 
 
-def generate_input(size: int, kernelsize: int, channels: int, batch: int, seed: int) -> input_t:
+def generate_input(
+    size: int, kernelsize: int, channels: int, batch: int, seed: int
+) -> input_t:
     """
     Generates random input and kernel tensors.
     Returns:
         Tuple of (input tensor, kernel tensor)
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    
+
     # Generate input tensor: [batch, in_channels, height, width]
     input_tensor = torch.randn(
-        batch, channels, size, size,
-        device='cuda', 
-        dtype=torch.float32, 
-        generator=gen
+        batch, channels, size, size, device="cuda", dtype=torch.float32, generator=gen
     ).contiguous()
-    
+
     # Generate kernel tensor: [out_channels, in_channels, kernel_height, kernel_width]
     # Here we use same number of output channels as input channels for simplicity
     kernel = torch.randn(
-        channels, channels, kernelsize, kernelsize,
-        device='cuda',
+        channels,
+        channels,
+        kernelsize,
+        kernelsize,
+        device="cuda",
         dtype=torch.float32,
-        generator=gen
+        generator=gen,
     ).contiguous()
-    
+
     return (input_tensor, kernel)
 
 
-check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
\ No newline at end of file
+check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp/conv2d_py/solutions/correct/ref.py b/problems/pmpp/conv2d_py/solutions/correct/ref.py
index c0ce3f2..1f167f7 100644
--- a/problems/pmpp/conv2d_py/solutions/correct/ref.py
+++ b/problems/pmpp/conv2d_py/solutions/correct/ref.py
@@ -4,10 +4,6 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor,
-        kernel,
-        stride=1,
-        padding=0
-    )
+    input_tensor, kernel, output = data
+    output = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp/conv2d_py/solutions/wrong/empty.py b/problems/pmpp/conv2d_py/solutions/wrong/empty.py
index 899beb0..cdc5d92 100644
--- a/problems/pmpp/conv2d_py/solutions/wrong/empty.py
+++ b/problems/pmpp/conv2d_py/solutions/wrong/empty.py
@@ -5,7 +5,15 @@
 
 
 def custom_kernel(data: input_t) -> output_t:
-    input_tensor, kernel = data
-    return torch.empty((input_tensor.shape[0], input_tensor.shape[1], input_tensor.shape[2]-kernel.shape[3]+1, input_tensor.shape[3]-kernel.shape[3]+1),
-                       device=kernel.device, dtype=kernel.dtype
+    input_tensor, kernel, output = data
+    output = torch.empty(
+        (
+            input_tensor.shape[0],
+            input_tensor.shape[1],
+            input_tensor.shape[2] - kernel.shape[3] + 1,
+            input_tensor.shape[3] - kernel.shape[3] + 1,
+        ),
+        device=kernel.device,
+        dtype=kernel.dtype,
     )
+    return output
diff --git a/problems/pmpp/conv2d_py/submission.py b/problems/pmpp/conv2d_py/submission.py
index 991f6a5..b633071 100644
--- a/problems/pmpp/conv2d_py/submission.py
+++ b/problems/pmpp/conv2d_py/submission.py
@@ -12,13 +12,6 @@ def custom_kernel(data: input_t) -> output_t:
     Returns:
         Output tensor after convolution
     """
-
-    torch.backends.cudnn.allow_tf32 = False
-    torch.backends.cudnn.deterministic = True
-    input_tensor, kernel = data
-    return F.conv2d(
-        input_tensor, 
-        kernel,
-        stride=1,
-        padding=0
-    )
\ No newline at end of file
+    input_tensor, kernel, output = data
+    output = F.conv2d(input_tensor, kernel, stride=1, padding=0)
+    return output
diff --git a/problems/pmpp/conv2d_py/task.py b/problems/pmpp/conv2d_py/task.py
index 397332a..dc0b771 100644
--- a/problems/pmpp/conv2d_py/task.py
+++ b/problems/pmpp/conv2d_py/task.py
@@ -1,7 +1,7 @@
 from typing import TypedDict, TypeVar, Tuple
 import torch
 
-input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor])
+input_t = TypeVar("input_t", bound=Tuple[torch.Tensor, torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
 
@@ -10,4 +10,4 @@ class TestSpec(TypedDict):
     kernelsize: int
     channels: int
     batch: int
-    seed: int   
\ No newline at end of file
+    seed: int

From ff91f9fa7837157371c6ac5b66bb43cba3b83859 Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:29:10 -0700
Subject: [PATCH 09/10] Fix: task.py

---
 problems/pmpp/grayscale_py/task.py | 11 ++++++++---
 problems/pmpp/histogram_py/task.py |  4 ++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/problems/pmpp/grayscale_py/task.py b/problems/pmpp/grayscale_py/task.py
index 9a6b36d..26a2f52 100644
--- a/problems/pmpp/grayscale_py/task.py
+++ b/problems/pmpp/grayscale_py/task.py
@@ -1,9 +1,14 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=(torch.Tensor, torch.Tensor))  # Input is a pair of tensors (input, output) where input is (H, W, 3) RGB tensor and output is (H, W) grayscale tensor
-output_t = TypeVar("output_t", bound=torch.Tensor)  # Output will be (H, W) grayscale tensor
+input_t = TypeVar(
+    "input_t", bound=tuple[torch.Tensor, torch.Tensor]
+)  # Input is a pair of tensors (input, output) where input is (H, W, 3) RGB tensor and output is (H, W) grayscale tensor
+output_t = TypeVar(
+    "output_t", bound=torch.Tensor
+)  # Output will be (H, W) grayscale tensor
+
 
 class TestSpec(TypedDict):
     size: int  # Size of the square image (H=W)
-    seed: int 
+    seed: int
diff --git a/problems/pmpp/histogram_py/task.py b/problems/pmpp/histogram_py/task.py
index aa22bb8..632ed7f 100644
--- a/problems/pmpp/histogram_py/task.py
+++ b/problems/pmpp/histogram_py/task.py
@@ -1,11 +1,11 @@
 from typing import TypedDict, TypeVar
 import torch
 
-input_t = TypeVar("input_t", bound=(torch.Tensor, torch.Tensor))
+input_t = TypeVar("input_t", bound=tuple[torch.Tensor, torch.Tensor])
 output_t = TypeVar("output_t", bound=torch.Tensor)
 
+
 class TestSpec(TypedDict):
     size: int
     seed: int
     contention: int
-

From 7c5e50cbbc61f053ff9092acd6e8bb943fbc8fbd Mon Sep 17 00:00:00 2001
From: S1ro1 <matej.sirovatka@gmail.com>
Date: Sun, 13 Jul 2025 12:33:51 -0700
Subject: [PATCH 10/10] Feat: generate_inputs

---
 problems/pmpp/conv2d_py/reference.py    | 11 ++++++++-
 problems/pmpp/prefixsum_py/reference.py |  4 ++-
 problems/pmpp/sort_py/reference.py      | 33 ++++++++++++++++---------
 problems/pmpp/vectoradd_py/reference.py | 13 +++++++---
 problems/pmpp/vectorsum_py/reference.py |  4 ++-
 5 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/problems/pmpp/conv2d_py/reference.py b/problems/pmpp/conv2d_py/reference.py
index 45f3faf..96d3c6a 100644
--- a/problems/pmpp/conv2d_py/reference.py
+++ b/problems/pmpp/conv2d_py/reference.py
@@ -68,7 +68,16 @@ def generate_input(
         generator=gen,
     ).contiguous()
 
-    return (input_tensor, kernel)
+    output_tensor = torch.empty(
+        batch,
+        channels,
+        size - kernelsize + 1,
+        size - kernelsize + 1,
+        device="cuda",
+        dtype=torch.float32,
+    )
+
+    return (input_tensor, kernel, output_tensor)
 
 
 check_implementation = make_match_reference(ref_kernel, rtol=1e-3, atol=1e-3)
diff --git a/problems/pmpp/prefixsum_py/reference.py b/problems/pmpp/prefixsum_py/reference.py
index a92e582..6c3e4d5 100644
--- a/problems/pmpp/prefixsum_py/reference.py
+++ b/problems/pmpp/prefixsum_py/reference.py
@@ -24,9 +24,11 @@ def generate_input(size: int, seed: int) -> input_t:
     """
     gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    return torch.randn(
+    x = torch.randn(
         size, device="cuda", dtype=torch.float32, generator=gen
     ).contiguous()
+    y = torch.empty(size, device="cuda", dtype=torch.float32).contiguous()
+    return (x, y)
 
 
 # This algorithm is very sensitive to the tolerance and the error is magnified by the input size
diff --git a/problems/pmpp/sort_py/reference.py b/problems/pmpp/sort_py/reference.py
index 9160ea6..b8c9b64 100644
--- a/problems/pmpp/sort_py/reference.py
+++ b/problems/pmpp/sort_py/reference.py
@@ -19,31 +19,40 @@ def ref_kernel(data: input_t) -> output_t:
 def generate_input(size: int, seed: int) -> torch.Tensor:
     """
     Generates random input tensor where elements are drawn from different distributions.
-    
+
     Args:
         size: Total size of the final 1D tensor
         seed: Base seed for random generation
-    
+
     Returns:
         1D tensor of size `size` containing flattened values from different distributions
     """
     # Calculate dimensions for a roughly square 2D matrix
-    rows = int(size ** 0.5)  # Square root for roughly square shape
-    cols = (size + rows - 1) // rows  # Ceiling division to ensure total size >= requested size
-    
-    gen = torch.Generator(device='cuda')
-    result = torch.empty((rows, cols), device='cuda', dtype=torch.float32)
-    
+    rows = int(size**0.5)  # Square root for roughly square shape
+    cols = (
+        size + rows - 1
+    ) // rows  # Ceiling division to ensure total size >= requested size
+
+    gen = torch.Generator(device="cuda")
+    result = torch.empty((rows, cols), device="cuda", dtype=torch.float32)
+
     # Different seed for each row!
     for i in range(rows):
         row_seed = seed + i
         gen.manual_seed(row_seed)
-        
+
         # Generate values for this row with mean=row_seed
-        result[i, :] = torch.randn(cols, device='cuda', dtype=torch.float32, generator=gen) + row_seed
-    
+        result[i, :] = (
+            torch.randn(cols, device="cuda", dtype=torch.float32, generator=gen)
+            + row_seed
+        )
+
     # Flatten and trim to exact size requested
-    return result.flatten()[:size].contiguous()
+    input_tensor = result.flatten()[:size].contiguous()
+    output_tensor = torch.empty_like(
+        input_tensor, device="cuda", dtype=torch.float32
+    ).contiguous()
+    return (input_tensor, output_tensor)
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/vectoradd_py/reference.py b/problems/pmpp/vectoradd_py/reference.py
index 5209bbd..b929b00 100644
--- a/problems/pmpp/vectoradd_py/reference.py
+++ b/problems/pmpp/vectoradd_py/reference.py
@@ -22,11 +22,16 @@ def generate_input(size: int, seed: int) -> input_t:
     Returns:
         Tuple of tensors [A, B] to be added.
     """
-    gen = torch.Generator(device='cuda')
+    gen = torch.Generator(device="cuda")
     gen.manual_seed(seed)
-    A = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    B = torch.randn(size, size, device='cuda', dtype=torch.float16, generator=gen).contiguous()
-    return (A, B)
+    A = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    B = torch.randn(
+        size, size, device="cuda", dtype=torch.float16, generator=gen
+    ).contiguous()
+    C = torch.empty(size, size, device="cuda", dtype=torch.float16).contiguous()
+    return (A, B, C)
 
 
 check_implementation = make_match_reference(ref_kernel)
diff --git a/problems/pmpp/vectorsum_py/reference.py b/problems/pmpp/vectorsum_py/reference.py
index 18c137d..5a6e6d4 100644
--- a/problems/pmpp/vectorsum_py/reference.py
+++ b/problems/pmpp/vectorsum_py/reference.py
@@ -46,7 +46,9 @@ def generate_input(size: int, seed: int) -> input_t:
     scale = (torch.rand(1, device="cuda", generator=scale_gen) * 9.9 + 0.1).item()
 
     # Apply scale and offset
-    return (data * scale + offset).contiguous()
+    input_tensor = (data * scale + offset).contiguous()
+    output_tensor = torch.empty(1, device="cuda", dtype=torch.float32)
+    return (input_tensor, output_tensor)
 
 
 check_implementation = make_match_reference(ref_kernel)