@@ -2752,10 +2752,10 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes):
             (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes)
         )
-
+
         if bitmap is None:
             raise ValueError("Failed to create bitmap from image bytes")
-
+
         return bitmap

     def __call__(
@@ -2814,18 +2814,18 @@ def __call__(
             trim_blocks=True,
             lstrip_blocks=True,
         ).from_string(self.CHAT_FORMAT)
-
+
         # Get the default media marker
         media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
-
+
         # Replace image URLs with media markers in the template
         text = template.render(
             messages=messages,
             add_generation_prompt=True,
             eos_token=llama.detokenize([llama.token_eos()]),
             bos_token=llama.detokenize([llama.token_bos()]),
         )
-
+
         # Replace image URLs in text with media markers
         for image_url in image_urls:
             text = text.replace(image_url, media_marker)
@@ -2875,40 +2875,40 @@ def __call__(
         # Process each chunk
         n_past = llama_cpp.llama_pos(0)
         n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
-
+
         for i in range(n_chunks):
             chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
             if chunk is None:
                 continue

             chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
-
+
             if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
                 # Handle text chunk
                 n_tokens_out = ctypes.c_size_t()
                 tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
                     chunk, ctypes.byref(n_tokens_out)
                 )
-
+
                 if tokens_ptr and n_tokens_out.value > 0:
                     # Convert ctypes array to Python list
                     tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-
+
                     if llama.n_tokens + len(tokens) > llama.n_ctx():
                         raise ValueError(
                             f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}"
                         )
                     llama.eval(tokens)
-
+
             elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
                 # Handle image/audio chunk using helper
                 chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
-
+
                 if llama.n_tokens + chunk_n_tokens > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}"
                     )
-
+
                 new_n_past = llama_cpp.llama_pos(0)
                 result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
                     self.mtmd_ctx,
@@ -2920,10 +2920,10 @@ def __call__(
                     False,  # logits_last
                     ctypes.byref(new_n_past)
                 )
-
+
                 if result != 0:
                     raise ValueError(f"Failed to evaluate chunk: error code {result}")
-
+
                 # Update llama's token count
                 llama.n_tokens = new_n_past.value

@@ -3013,34 +3013,14 @@ def __call__(
             grammar=grammar,
             logit_bias=logit_bias,
         )
-
+
         if tool is not None:
             tool_name = tool["function"]["name"]
             return _convert_completion_to_chat_function(
                 tool_name, completion_or_chunks, stream
             )
         return _convert_completion_to_chat(completion_or_chunks, stream=stream)

-    def eval_image(self, llama: llama.Llama, image_url: str):
-        image_bytes = self.load_image(image_url)
-        embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
-        if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
-            raise ValueError(
-                f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
-            )
-        n_past = ctypes.c_int(llama.n_tokens)
-        n_past_p = ctypes.pointer(n_past)
-        with suppress_stdout_stderr(disable=self.verbose):
-            self._llava_cpp.llava_eval_image_embed(
-                llama.ctx,
-                embed,
-                llama.n_batch,
-                n_past_p,
-            )
-        # Required to avoid issues with hf tokenizer
-        llama.input_ids[llama.n_tokens : n_past.value] = -1
-        llama.n_tokens = n_past.value
-
     @staticmethod
     def _load_image(image_url: str) -> bytes:
         # TODO: Add Pillow support for other image formats beyond (jpg, png)
@@ -3533,6 +3513,58 @@ def __call__(self, **kwargs):
         return super().__call__(**kwargs)


+class Gemma3ChatHandler(Llava15ChatHandler):
+    # Chat Format:
+    # '<bos><start_of_turn>user\n{system_prompt}\n\n{prompt}<end_of_turn>\n<start_of_turn>model\n'
+
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{{ '<bos>' }}"
+        "{% if messages[0]['role'] == 'system' %}"
+        "{% if messages[0]['content'] is string %}"
+        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
+        "{% else %}"
+        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"
+        "{% endif %}"
+        "{% set loop_messages = messages[1:] %}"
+        "{% else %}"
+        "{% set first_user_prefix = \"\" %}"
+        "{% set loop_messages = messages %}"
+        "{% endif %}"
+        "{% for message in loop_messages %}"
+        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
+        "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}"
+        "{% endif %}"
+        "{% if (message['role'] == 'assistant') %}"
+        "{% set role = \"model\" %}"
+        "{% else %}"
+        "{% set role = message['role'] %}"
+        "{% endif %}"
+        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}"
+        "{% if message['content'] is string %}"
+        "{{ message['content'] | trim }}"
+        "{% elif message['content'] is iterable %}"
+        "{% for item in message['content'] %}"
+        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"
+        "{{ '\n\n' + item['image_url'] + '\n\n' }}"
+        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"
+        "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}"
+        "{% elif item['type'] == 'text' %}"
+        "{{ item['text'] | trim }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% else %}"
+        "{{ raise_exception(\"Invalid content type\") }}"
+        "{% endif %}"
+        "{{ '<end_of_turn>\n' }}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}"
+        "{{ '<start_of_turn>model\n' }}"
+        "{% endif %}"
+    )
+
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
     llama: llama.Llama,
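For context, a minimal usage sketch of the new Gemma3ChatHandler follows. It assumes the handler keeps the clip_model_path keyword inherited from Llava15ChatHandler; the GGUF file names, the image URL, and n_ctx value are placeholders, not part of this change.

# Hypothetical usage sketch for Gemma3ChatHandler (file names and URL are placeholders).
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Gemma3ChatHandler

# The handler is constructed with the multimodal projector (mmproj) that pairs with the model.
chat_handler = Gemma3ChatHandler(clip_model_path="mmproj-gemma-3-4b-it.gguf")

llm = Llama(
    model_path="gemma-3-4b-it-Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # must be large enough for the image tokens plus the text prompt
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])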