diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 00000000..b58b603f --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,5 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/copilot.data.migration.agent.xml b/.idea/copilot.data.migration.agent.xml new file mode 100644 index 00000000..4ea72a91 --- /dev/null +++ b/.idea/copilot.data.migration.agent.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 00000000..105ce2da --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..35aab3dd --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/mlx-vlm.iml b/.idea/mlx-vlm.iml new file mode 100644 index 00000000..8d3acc44 --- /dev/null +++ b/.idea/mlx-vlm.iml @@ -0,0 +1,15 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..04e48ab4 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..35eb1ddf --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/mlx_vlm/models/colqwen3/__init__.py b/mlx_vlm/models/colqwen3/__init__.py new file mode 100644 index 00000000..7813bf31 --- /dev/null +++ b/mlx_vlm/models/colqwen3/__init__.py @@ -0,0 +1,15 @@ +from .colqwen3 import Model +from .config import ModelConfig, TextConfig, VisionConfig + +# utils.py looks for these names: +from .vision import VisionModel +from .language import LanguageModel + +__all__ = [ + "Model", + "ModelConfig", + "TextConfig", 
"VisionConfig", + "VisionModel", + "LanguageModel", +] diff --git a/mlx_vlm/models/colqwen3/colqwen3.py b/mlx_vlm/models/colqwen3/colqwen3.py new file mode 100644 index 00000000..b3f8d9a4 --- /dev/null +++ b/mlx_vlm/models/colqwen3/colqwen3.py @@ -0,0 +1,473 @@ +from __future__ import annotations + +from dataclasses import replace +from typing import Any, Optional, Dict, List + +import mlx.core as mx +import mlx.nn as nn +import numpy as np + +from .config import ModelConfig +from .language import LanguageModel +from .vision import VisionModel + + +# ----------------------------------------------------------------------------- +# Robust converters (torch / numpy / list -> mx.array) +# ----------------------------------------------------------------------------- +def _as_np(x: Any) -> Optional[np.ndarray]: + if x is None: + return None + if isinstance(x, np.ndarray): + return x + if isinstance(x, mx.array): + return np.array(x) + + # torch.Tensor -> numpy + try: + import torch # optional + if isinstance(x, torch.Tensor): + return x.detach().cpu().numpy() + except Exception: + pass + + # generic ".numpy()" + if hasattr(x, "numpy") and callable(x.numpy): + try: + return x.numpy() + except Exception: + pass + + # list/tuple/scalar + try: + return np.asarray(x) + except Exception: + return None + + +def _as_mx(x: Any) -> Any: + """ + Convert to mx.array when possible; otherwise return as-is (e.g. None). 
+ """ + if x is None: + return None + if isinstance(x, mx.array): + return x + arr = _as_np(x) + if arr is None: + return x + return mx.array(arr) + + +def _as_mx_int32(x: Any) -> Optional[mx.array]: + if x is None: + return None + if isinstance(x, mx.array): + return x if x.dtype == mx.int32 else x.astype(mx.int32) + arr = _as_np(x) + if arr is None: + raise ValueError("Failed to convert to numpy for int32 conversion.") + return mx.array(arr.astype(np.int32)) + + +def _as_mx_bool(x: Any) -> Optional[mx.array]: + if x is None: + return None + if isinstance(x, mx.array): + return x if x.dtype == mx.bool_ else x.astype(mx.bool_) + arr = _as_np(x) + if arr is None: + raise ValueError("Failed to convert to numpy for bool conversion.") + return mx.array(arr.astype(np.bool_)) + + +# ----------------------------------------------------------------------------- +# Small utils +# ----------------------------------------------------------------------------- +def l2_normalize(x: mx.array, eps: float = 1e-6) -> mx.array: + denom = mx.sqrt(mx.maximum((x * x).sum(axis=-1, keepdims=True), eps)) + return x / denom + + +def masked_scatter( + final_embedding: mx.array, + image_mask_expanded: mx.array, + scaled_image_features: mx.array, +) -> mx.array: + """ + Scatter image features into final_embedding where mask is True. + Compatible with MLX versions that do NOT support .at[].set(). 
+ """ + final_shape = final_embedding.shape + + img_flat = mx.flatten(scaled_image_features) + out_flat = mx.flatten(final_embedding) + mask_flat = mx.flatten(image_mask_expanded) + + pos_np = np.where(np.array(mask_flat))[0].astype(np.uint32) + pos = mx.array(pos_np, dtype=mx.uint32) + + # MLX-compatible assignment (no .at[].set()) + out_flat[pos] = img_flat + + return mx.reshape(out_flat, final_shape) + + +# ----------------------------------------------------------------------------- +# Backbone (Qwen3-VL style) for multimodal mixing + logits +# ----------------------------------------------------------------------------- +class VLMBackbone(nn.Module): + """ + Qwen3-VL backbone used by mlx-vlm: + - embeds tokens + - injects image features into /