From 764e257690491e841cc3253a10cd29cb778c283c Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 15:18:38 +0000
Subject: [PATCH 1/3] Pixtral TP architecture.py

---
 exllamav2/architecture.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py
index f318dd42..3fbe656c 100644
--- a/exllamav2/architecture.py
+++ b/exllamav2/architecture.py
@@ -319,6 +319,7 @@ class Params:
                 layer_keys_llama_mlp
             self.lm.expect_keys += \
                 expect_keys_llama
+            self.lm.supports_tp = True
 
             self.vt_prefix = "vision_tower."
             self.vt.keys.update({

From a85c4fb55d9148ee7421792a5eeef0e8e1c1bdb7 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 15:23:47 +0000
Subject: [PATCH 2/3] Bypass Torch Restrictions for inference mode.

Seems faster than through torch by a tiny bit.
---
 exllamav2/tensor_p.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/exllamav2/tensor_p.py b/exllamav2/tensor_p.py
index 2ba6f85a..3ebc7f16 100644
--- a/exllamav2/tensor_p.py
+++ b/exllamav2/tensor_p.py
@@ -351,16 +351,24 @@ def allgather(
         return bc_tensors
 
 
-    def copy_pinned(
-        self,
-        buffer: int,
-        inputs: torch.Tensor
-    ):
+#    def copy_pinned(
+#        self,
+#        buffer: int,
+#        inputs: torch.Tensor
+#    ):
+#        pt = self.pinned_temp[buffer][:inputs.numel()]
+#        pt = pt.view(inputs.shape)
+#        pt.copy_(inputs)
+#        return pt
+
+    def copy_pinned(self, buffer: int, inputs: torch.Tensor):
         pt = self.pinned_temp[buffer][:inputs.numel()]
         pt = pt.view(inputs.shape)
-        pt.copy_(inputs)
-        return pt
+        # Bypass PyTorch entirely - direct memory copy
+        import ctypes
+        ctypes.memmove(pt.data_ptr(), inputs.data_ptr(), inputs.numel() * inputs.element_size())
+        return pt
 
 
     def add_residual(
         self,

From 4430729b200a868c3c7701e2001e71e179e1dfd2 Mon Sep 17 00:00:00 2001
From: Forkoz <59298527+Ph0rk0z@users.noreply.github.com>
Date: Sun, 25 May 2025 22:29:06 +0000
Subject: [PATCH 3/3] architecture.py

- Tested qwen2 VL 72b TP and it works.
---
 exllamav2/architecture.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/exllamav2/architecture.py b/exllamav2/architecture.py
index 3fbe656c..59e17189 100644
--- a/exllamav2/architecture.py
+++ b/exllamav2/architecture.py
@@ -479,6 +479,7 @@ class Params:
             self.lm.attention_bias_qkv = True
             self.lm.mrope = True
             self.lm.rope_freq_half = True
+            self.lm.supports_tp = True
 
             self.vt_prefix = "visual."
             if arch_string == "Qwen2VLForConditionalGeneration":
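
Note on PATCH 2/3 (a sketch for review, not part of the patches): the new copy_pinned() replaces pt.copy_(inputs) with a raw ctypes.memmove between the two tensors' data pointers. Because the copy never enters the PyTorch dispatcher, it sidesteps dispatcher-level checks such as the ban on in-place updates to inference-mode tensors outside torch.inference_mode(), which appears to be the restriction the commit title refers to. It also skips all dtype, shape and contiguity validation, so it is only safe when both tensors are contiguous CPU tensors of the same dtype and size, which is what copy_pinned() relies on for its pinned staging buffer. The standalone snippet below reproduces the idea outside exllamav2; the buffer size and variable names are illustrative only.

    import ctypes
    import torch

    # Pinned staging buffer, standing in for self.pinned_temp[buffer].
    # Pinning needs a CUDA build, so only request it when CUDA is available.
    pinned = torch.empty(1024, dtype=torch.float16,
                         pin_memory=torch.cuda.is_available())

    src = torch.randn(64, dtype=torch.float16)   # contiguous CPU source tensor
    dst = pinned[:src.numel()].view(src.shape)   # view into the staging buffer

    # Raw byte copy: equivalent to dst.copy_(src) for contiguous, same-dtype CPU
    # tensors, but performed entirely outside the PyTorch dispatcher.
    ctypes.memmove(dst.data_ptr(), src.data_ptr(),
                   src.numel() * src.element_size())

    assert torch.equal(dst, src)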