@@ -2752,10 +2752,10 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes):
             (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
             len(image_bytes)
         )
-
+
         if bitmap is None:
             raise ValueError("Failed to create bitmap from image bytes")
-
+
         return bitmap

     def __call__(
@@ -2814,18 +2814,18 @@ def __call__(
             trim_blocks=True,
             lstrip_blocks=True,
         ).from_string(self.CHAT_FORMAT)
-
+
         # Get the default media marker
         media_marker = self._mtmd_cpp.mtmd_default_marker().decode('utf-8')
-
+
         # Replace image URLs with media markers in the template
         text = template.render(
             messages=messages,
             add_generation_prompt=True,
             eos_token=llama.detokenize([llama.token_eos()]),
             bos_token=llama.detokenize([llama.token_bos()]),
         )
-
+
         # Replace image URLs in text with media markers
         for image_url in image_urls:
             text = text.replace(image_url, media_marker)
@@ -2875,40 +2875,40 @@ def __call__(
         # Process each chunk
         n_past = llama_cpp.llama_pos(0)
         n_chunks = self._mtmd_cpp.mtmd_input_chunks_size(chunks)
-
+
         for i in range(n_chunks):
             chunk = self._mtmd_cpp.mtmd_input_chunks_get(chunks, i)
             if chunk is None:
                 continue

             chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
-
+
             if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
                 # Handle text chunk
                 n_tokens_out = ctypes.c_size_t()
                 tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
                     chunk, ctypes.byref(n_tokens_out)
                 )
-
+
                 if tokens_ptr and n_tokens_out.value > 0:
                     # Convert ctypes array to Python list
                     tokens = [tokens_ptr[j] for j in range(n_tokens_out.value)]
-
+
                     if llama.n_tokens + len(tokens) > llama.n_ctx():
                         raise ValueError(
                             f"Prompt exceeds n_ctx: {llama.n_tokens + len(tokens)} > {llama.n_ctx()}"
                         )
                     llama.eval(tokens)
-
+
             elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
                 # Handle image/audio chunk using helper
                 chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
-
+
                 if llama.n_tokens + chunk_n_tokens > llama.n_ctx():
                     raise ValueError(
                         f"Prompt exceeds n_ctx: {llama.n_tokens + chunk_n_tokens} > {llama.n_ctx()}"
                     )
-
+
                 new_n_past = llama_cpp.llama_pos(0)
                 result = self._mtmd_cpp.mtmd_helper_eval_chunk_single(
                     self.mtmd_ctx,
@@ -2920,10 +2920,10 @@ def __call__(
                     False,  # logits_last
                     ctypes.byref(new_n_past)
                 )
-
+
                 if result != 0:
                     raise ValueError(f"Failed to evaluate chunk: error code {result}")
-
+
                 # Update llama's token count
                 llama.n_tokens = new_n_past.value

@@ -3013,34 +3013,14 @@ def __call__(
             grammar=grammar,
             logit_bias=logit_bias,
         )
-
+
         if tool is not None:
             tool_name = tool["function"]["name"]
             return _convert_completion_to_chat_function(
                 tool_name, completion_or_chunks, stream
             )
         return _convert_completion_to_chat(completion_or_chunks, stream=stream)

-    def eval_image(self, llama: llama.Llama, image_url: str):
-        image_bytes = self.load_image(image_url)
-        embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch)
-        if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx():
-            raise ValueError(
-                f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}"
-            )
-        n_past = ctypes.c_int(llama.n_tokens)
-        n_past_p = ctypes.pointer(n_past)
-        with suppress_stdout_stderr(disable=self.verbose):
-            self._llava_cpp.llava_eval_image_embed(
-                llama.ctx,
-                embed,
-                llama.n_batch,
-                n_past_p,
-            )
-        # Required to avoid issues with hf tokenizer
-        llama.input_ids[llama.n_tokens : n_past.value] = -1
-        llama.n_tokens = n_past.value
-
     @staticmethod
     def _load_image(image_url: str) -> bytes:
         # TODO: Add Pillow support for other image formats beyond (jpg, png)
@@ -3533,6 +3513,58 @@ def __call__(self, **kwargs):
         return super().__call__(**kwargs)


+class Gemma3ChatHandler(Llava15ChatHandler):
+    # Chat Format:
+    # '<bos><start_of_turn>user\n{system_prompt}\n\n{prompt}<end_of_turn>\n<start_of_turn>model\n'
+
+    DEFAULT_SYSTEM_MESSAGE = None
+
+    CHAT_FORMAT = (
+        "{{ '<bos>' }}"
+        "{% if messages[0]['role'] == 'system' %}"
+        "{% if messages[0]['content'] is string %}"
+        "{% set first_user_prefix = messages[0]['content'] + '\n\n' %}"
+        "{% else %}"
+        "{% set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' %}"
+        "{% endif %}"
+        "{% set loop_messages = messages[1:] %}"
+        "{% else %}"
+        "{% set first_user_prefix = \"\" %}"
+        "{% set loop_messages = messages %}"
+        "{% endif %}"
+        "{% for message in loop_messages %}"
+        "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}"
+        "{{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}"
+        "{% endif %}"
+        "{% if (message['role'] == 'assistant') %}"
+        "{% set role = \"model\" %}"
+        "{% else %}"
+        "{% set role = message['role'] %}"
+        "{% endif %}"
+        "{{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}"
+        "{% if message['content'] is string %}"
+        "{{ message['content'] | trim }}"
+        "{% elif message['content'] is iterable %}"
+        "{% for item in message['content'] %}"
+        "{% if item['type'] == 'image_url' and item['image_url'] is string %}"
+        "{{ '\n\n' + item['image_url'] + '\n\n' }}"
+        "{% elif item['type'] == 'image_url' and item['image_url'] is mapping %}"
+        "{{ '\n\n' + item['image_url']['url'] + '\n\n' }}"
+        "{% elif item['type'] == 'text' %}"
+        "{{ item['text'] | trim }}"
+        "{% endif %}"
+        "{% endfor %}"
+        "{% else %}"
+        "{{ raise_exception(\"Invalid content type\") }}"
+        "{% endif %}"
+        "{{ '<end_of_turn>\n' }}"
+        "{% endfor %}"
+        "{% if add_generation_prompt %}"
+        "{{ '<start_of_turn>model\n' }}"
+        "{% endif %}"
+    )
+
+
 @register_chat_completion_handler("chatml-function-calling")
 def chatml_function_calling(
     llama: llama.Llama,
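For context, a minimal usage sketch of the new Gemma3ChatHandler follows. It assumes the handler keeps the clip_model_path keyword inherited from Llava15ChatHandler; the GGUF file names, the image URL, and n_ctx value are placeholders, not part of this change.

# Hypothetical usage sketch for Gemma3ChatHandler (file names and URL are placeholders).
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Gemma3ChatHandler

# The handler is constructed with the multimodal projector (mmproj) that pairs with the model.
chat_handler = Gemma3ChatHandler(clip_model_path="mmproj-gemma-3-4b-it.gguf")

llm = Llama(
    model_path="gemma-3-4b-it-Q4_K_M.gguf",
    chat_handler=chat_handler,
    n_ctx=4096,  # must be large enough for the image tokens plus the text prompt
)

response = llm.create_chat_completion(
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
            ],
        }
    ],
)
print(response["choices"][0]["message"]["content"])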