From 764e257690491e841cc3253a10cd29cb778c283c Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 15:18:38 +0000
Subject: [PATCH 1/3] Pixtral TP architecture.py

---
 exllamav2/architecture.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py
index f318dd42..3fbe656c 100644
--- a/exllamav2/architecture.py
+++ b/exllamav2/architecture.py
@@ -319,6 +319,7 @@ class Params:
                 layer_keys_llama_mlp
             self.lm.expect_keys += \
                 expect_keys_llama
+            self.lm.supports_tp = True
 
             self.vt_prefix = "vision_tower."
             self.vt.keys.update({

From a85c4fb55d9148ee7421792a5eeef0e8e1c1bdb7 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 15:23:47 +0000
Subject: [PATCH 2/3] Bypass Torch Restrictions for inference mode.

Seems faster than through torch by a tiny bit.
---
 exllamav2/tensor_p.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/exllamav2/tensor_p.py b/exllamav2/tensor_p.py
index 2ba6f85a..3ebc7f16 100644
--- a/exllamav2/tensor_p.py
+++ b/exllamav2/tensor_p.py
@@ -351,16 +351,24 @@ def allgather(
         return bc_tensors
 
 
-    def copy_pinned(
-        self,
-        buffer: int,
-        inputs: torch.Tensor
-    ):
+#    def copy_pinned(
+#        self,
+#        buffer: int,
+#        inputs: torch.Tensor
+#    ):
+#        pt = self.pinned_temp[buffer][:inputs.numel()]
+#        pt = pt.view(inputs.shape)
+#        pt.copy_(inputs)
+#        return pt
+
+    def copy_pinned(self, buffer: int, inputs: torch.Tensor):
         pt = self.pinned_temp[buffer][:inputs.numel()]
         pt = pt.view(inputs.shape)
-        pt.copy_(inputs)
-        return pt
+        # Bypass PyTorch entirely - direct memory copy
+        import ctypes
+        ctypes.memmove(pt.data_ptr(), inputs.data_ptr(), inputs.numel() * inputs.element_size())
+        return pt
 
 
     def add_residual(
         self,

From 4430729b200a868c3c7701e2001e71e179e1dfd2 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 22:29:06 +0000
Subject: [PATCH 3/3] architecture.py

- Tested qwen2 VL 72b TP and it works.
---
 exllamav2/architecture.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py
index 3fbe656c..59e17189 100644
--- a/exllamav2/architecture.py
+++ b/exllamav2/architecture.py
@@ -479,6 +479,7 @@ class Params:
             self.lm.attention_bias_qkv = True
             self.lm.mrope = True
             self.lm.rope_freq_half = True
+            self.lm.supports_tp = True
 
             self.vt_prefix = "visual."
             if arch_string == "Qwen2VLForConditionalGeneration":
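
Note on PATCH 2/3 (a sketch for review, not part of the patches): the new copy_pinned() replaces pt.copy_(inputs) with a raw ctypes.memmove between the two tensors' data pointers. Because the copy never enters the PyTorch dispatcher, it sidesteps dispatcher-level checks such as the ban on in-place updates to inference-mode tensors outside torch.inference_mode(), which appears to be the restriction the commit title refers to. It also skips all dtype, shape and contiguity validation, so it is only safe when both tensors are contiguous CPU tensors of the same dtype and size, which is what copy_pinned() relies on for its pinned staging buffer. The standalone snippet below reproduces the idea outside exllamav2; the buffer size and variable names are illustrative only.

    import ctypes
    import torch

    # Pinned staging buffer, standing in for self.pinned_temp[buffer].
    # Pinning needs a CUDA build, so only request it when CUDA is available.
    pinned = torch.empty(1024, dtype=torch.float16,
                         pin_memory=torch.cuda.is_available())

    src = torch.randn(64, dtype=torch.float16)   # contiguous CPU source tensor
    dst = pinned[:src.numel()].view(src.shape)   # view into the staging buffer

    # Raw byte copy: equivalent to dst.copy_(src) for contiguous, same-dtype CPU
    # tensors, but performed entirely outside the PyTorch dispatcher.
    ctypes.memmove(dst.data_ptr(), src.data_ptr(),
                   src.numel() * src.element_size())

    assert torch.equal(dst, src)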