2 changes: 2 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -44,6 +44,8 @@
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
#define KEY_MINICPMV_PROJECTION_DIM "clip.minicpmv_projection_dim"

// audio-specific
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
62 changes: 42 additions & 20 deletions tools/mtmd/clip.cpp
@@ -201,6 +201,8 @@ struct clip_hparams {
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
int32_t minicpmv_projection_dim = 0; // MiniCPM-V projection dimension
};

struct clip_layer {
@@ -847,13 +849,19 @@ struct clip_graph {
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = n_embd/d_head;
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = 96;
if (ctx->model.hparams.minicpmv_version == 2) {
num_query = 96;
} else if (ctx->model.hparams.minicpmv_version == 3) {
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 4) {
num_query = 64;
if (ctx->model.hparams.minicpmv_query_num > 0) {
num_query = ctx->model.hparams.minicpmv_query_num;
} else {
// Fallback to hardcoded values for legacy models
if (ctx->model.hparams.minicpmv_version == 2) {
num_query = 96;
} else if (ctx->model.hparams.minicpmv_version == 3) {
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 4) {
num_query = 64;
}
}

ggml_tensor * Q = ggml_add(ctx0,
@@ -2110,6 +2118,8 @@ struct clip_model_loader {
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
get_u32(KEY_MINICPMV_PROJECTION_DIM, hparams.minicpmv_projection_dim, false);

} else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@@ -3517,14 +3527,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_MINICPMV:
{
if (params.minicpmv_version == 2) {
n_patches_sq = 96;
} else if (params.minicpmv_version == 3) {
n_patches_sq = 64;
} else if (params.minicpmv_version == 4) {
n_patches_sq = 64;
// Use actual config value if available, otherwise fall back to hardcoded values
if (params.minicpmv_query_num > 0) {
n_patches_sq = params.minicpmv_query_num;
} else {
GGML_ABORT("Unknown minicpmv version");
// Fallback to hardcoded values for legacy models
if (params.minicpmv_version == 2) {
n_patches_sq = 96;
} else if (params.minicpmv_version == 3) {
n_patches_sq = 64;
} else if (params.minicpmv_version == 4) {
n_patches_sq = 64;
} else {
GGML_ABORT("Unknown minicpmv version");
}
}
} break;
case PROJECTOR_TYPE_QWEN2VL:
@@ -4059,14 +4075,20 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:
if (hparams.minicpmv_version == 2) {
return 4096;
} else if (hparams.minicpmv_version == 3) {
return 3584;
} else if (hparams.minicpmv_version == 4) {
return 3584;
// Use actual config value if available, otherwise fall back to hardcoded values
if (hparams.minicpmv_projection_dim > 0) {
return hparams.minicpmv_projection_dim;
} else {
// Fallback to hardcoded values for legacy models
if (hparams.minicpmv_version == 2) {
return 4096;
} else if (hparams.minicpmv_version == 3) {
return 3584;
} else if (hparams.minicpmv_version == 4) {
return 3584;
}
GGML_ABORT("Unknown minicpmv version");
}
GGML_ABORT("Unknown minicpmv version");
case PROJECTOR_TYPE_GLM_EDGE:
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
107 changes: 77 additions & 30 deletions tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -517,6 +517,16 @@ def bytes_to_unicode():
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir

# Read config.json to get actual model configuration
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
model_config = json.load(f)
print(f"Loaded config from {config_path}")
else:
print(f"Warning: config.json not found at {config_path}")

if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
vocab = None
tokens = None
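For reference, the subset of config.json fields that the converter now consumes, shown with purely illustrative values (the real numbers come from the checkpoint being converted; query_num and the LLM hidden_size in particular differ between MiniCPM-V releases):

model_config = {
    "hidden_size": 3584,              # LLM width; exported as clip.minicpmv_projection_dim
    "query_num": 64,                  # resampler query count; exported as clip.minicpmv_query_num
    "image_size": 448,
    "vision_config": {
        "hidden_size": 1152,
        "image_size": 980,
        "intermediate_size": 4304,
        "model_type": "siglip",
        "num_attention_heads": 16,
        "num_hidden_layers": 27,
        "patch_size": 14,
    },
}

emb_dim     = model_config.get("hidden_size", 1536)   # projector/resampler dimension
query_num   = model_config.get("query_num", 0)        # 0 -> runtime falls back to version-based defaults
block_count = model_config.get("vision_config", {}).get("num_hidden_layers", 27)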
@@ -544,34 +554,59 @@ def bytes_to_unicode():
# processor = CLIPProcessor.from_pretrained(dir_model)

minicpmv_version = args.minicpmv_version
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4:
emb_dim = 3584
block_count = 27

default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,

# Use actual config values instead of hardcoded ones
if model_config:
# For the projector/resampler, use the main model's hidden_size
emb_dim = model_config.get("hidden_size", 1536)

# For the vision model, use vision_config values
vision_config_dict = model_config.get("vision_config", {})
default_vision_config = {
"hidden_size": vision_config_dict.get("hidden_size", 1152),
"image_size": vision_config_dict.get("image_size", 980),
"intermediate_size": vision_config_dict.get("intermediate_size", 4304),
"model_type": vision_config_dict.get("model_type", "siglip"),
"num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
"patch_size": vision_config_dict.get("patch_size", 14),
}

# Use vision model's num_hidden_layers for block_count
block_count = vision_config_dict.get("num_hidden_layers", 27)

print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
print(f"Vision config: {default_vision_config}")
else:
# Fallback to original hardcoded logic if config.json not found
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4:
emb_dim = 3584
block_count = 27

default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}

vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
@@ -626,16 +661,28 @@ def bytes_to_unicode():
fout.add_description("two-tower CLIP model")

if has_vision_encoder:
# vision_model hparams
fout.add_uint32("clip.vision.image_size", 448)
fout.add_uint32("clip.vision.patch_size", 14)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
# vision_model hparams - use actual config values
vision_image_size = model_config.get("image_size", 448) if model_config else 448
vision_patch_size = default_vision_config.get("patch_size", 14)
vision_hidden_size = default_vision_config.get("hidden_size", 1152)
vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
vision_attention_heads = default_vision_config.get("num_attention_heads", 16)

fout.add_uint32("clip.vision.image_size", vision_image_size)
fout.add_uint32("clip.vision.patch_size", vision_patch_size)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)

# Add MiniCPM-V specific parameters
query_num = model_config.get("query_num", 0) if model_config else 0
resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
fout.add_uint32("clip.minicpmv_query_num", query_num)
fout.add_uint32("clip.minicpmv_projection_dim", resampler_emb_dim)

if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
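After conversion, the two new keys can be spot-checked in the output GGUF. A rough sketch using the gguf-py reader (assuming its GGUFReader/get_field API; the output filename is hypothetical, and scalar field layout may vary between gguf-py versions):

from gguf import GGUFReader

reader = GGUFReader("mmproj-model-f16.gguf")  # hypothetical converter output
for key in ("clip.minicpmv_query_num", "clip.minicpmv_projection_dim"):
    field = reader.get_field(key)
    if field is None:
        # legacy conversion: clip.cpp will fall back to the version-based defaults
        print(f"{key}: <missing>")
    else:
        # for a scalar key the value is stored in the last part of the field
        print(f"{key}: {field.parts[-1][0]}")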
2 changes: 2 additions & 0 deletions tools/mtmd/legacy-models/minicpmv-surgery.py
@@ -16,6 +16,8 @@

# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True:
projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
torch.save(projector, f"{args.model}/minicpmv.projector")

clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
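A note on the scale_emb division above: MiniCPM LLM configs carry a scale_emb factor that is presumably multiplied into input embeddings at runtime, so dividing resampler.proj once during surgery keeps the projected image embeddings at the expected magnitude after that scaling is applied. A minimal sketch of the arithmetic (the scale value and tensor shape are illustrative only):

import torch

scale_emb = 12.0                      # illustrative; the real value is model.llm.config.scale_emb
proj = torch.randn(4096, 3584)        # illustrative resampler.proj shape
proj_stored = proj / scale_emb        # what the surgery script saves
# a runtime that multiplies input embeddings by scale_emb recovers the original projection
assert torch.allclose(proj_stored * scale_emb, proj)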