From 7f536a646c40642c39e09b94cd2c4e8762ce989f Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 8 Apr 2025 15:26:33 -0700 Subject: [PATCH 1/7] update curator for batch usage, thinking model inputs and some other input args management --- eval/chat_benchmarks/curator_lm.py | 77 +++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 18 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 8e023afe..d7e662cd 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -19,12 +19,13 @@ def __init__( model: str = None, pretrained: str = None, max_length: Optional[int] = 2048, - max_retries: int = 10, + max_retries: int = 20, timeout: int = 300, tokenized_requests: bool = False, max_requests_per_minute: int = None, max_tokens_per_minute: int = None, seconds_to_pause_on_rate_limit: int = None, + batch: bool = False, **kwargs, ): super().__init__() @@ -32,17 +33,8 @@ def __init__( self.model_name = model or pretrained self.model_args = kwargs - self.model_args.update( - { - "model": self.model_name, - "pretrained": pretrained, - "max_length": max_length, - "max_retries": max_retries, - "timeout": timeout, - "tokenized_requests": tokenized_requests, - } - ) + self.gen_kwargs = {"batch": batch} if "gemini" in self.model_name and "thinking" in self.model_name: max_requests_per_minute = max_requests_per_minute or 200 max_tokens_per_minute = max_tokens_per_minute or 400_000 @@ -52,13 +44,31 @@ def __init__( elif "claude" in self.model_name: max_requests_per_minute = max_requests_per_minute or 2000 max_tokens_per_minute = max_tokens_per_minute or 80_000 + if "thinking" in self.model_name: + self.gen_kwargs["thinking"] = {"type": "enabled", "budget_tokens": max_length - 4096} + self.model_name = ( + self.model_name.replace("-thinking-", "") + .replace("-thinking", "") + .replace("thinking-", "") + .replace("thinking", "") + ) + + self.model_args.update( + { + "model": self.model_name, + "pretrained": pretrained, + "max_length": max_length, + "max_retries": max_retries, + "timeout": timeout, + "tokenized_requests": tokenized_requests, + } + ) if tokenized_requests: raise NotImplementedError("Tokenized requests not implemented for curator.") self.tokenized_requests = False self.max_length = max_length self.llm = None - self.gen_kwargs = {} self.eos = None if "temperature" in kwargs: self.gen_kwargs["temperature"] = kwargs["temperature"] @@ -103,16 +113,44 @@ def _create_payload( "top_p": top_p, "stop": stop, } - if "o1" in self.model_name: + additional_args = {} + backend_params = self.backend_params.copy() + if self.gen_kwargs.get("batch", False): + # backend_params for rate limiting are not compatible with batch requests + backend_params = {"require_all_responses": True} + additional_args["batch"] = True + if "deepseek" in self.model_name: + additional_args["backend"] = "openai" + backend_params["max_requests_per_minute"] = 2_500 + backend_params["max_tokens_per_minute"] = 1_000_000_000 + backend_params["base_url"] = "https://api.deepseek.com/" + backend_params["api_key"] = os.environ["DEEPSEEK_API_KEY"] + gen_kwargs["temperature"] = 0 + if "o1" or "o3" in self.model_name: print("Warning: O1 model does not support top_p, stop, or temperature. 
Ignoring them.") - gen_kwargs.pop("top_p") - gen_kwargs.pop("stop") - gen_kwargs.pop("temperature") + gen_kwargs.pop("top_p", None) + gen_kwargs.pop("stop", None) + gen_kwargs.pop("temperature", None) + if "claude" in self.model_name: + gen_kwargs.pop("max_completion_tokens", None) + gen_kwargs.pop("stop", None) + gen_kwargs["max_tokens"] = max_tokens + if "thinking" in self.gen_kwargs: + gen_kwargs["thinking"] = self.gen_kwargs["thinking"] + gen_kwargs["thinking"]["budget_tokens"] = max_tokens - 4096 + # `temperature` may only be set to 1 when thinking is enabled. + # Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking' + gen_kwargs["temperature"] = 1 + # `top_p` must be unset when thinking is enabled. (same documentation) + gen_kwargs.pop("top_p", None) if self.llm is None: self.eos = eos self.gen_kwargs = gen_kwargs.copy() self.llm = curator.LLM( - model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy() + model_name=self.model_name, + generation_params=gen_kwargs, + backend_params=backend_params, + **additional_args, ) else: if self.gen_kwargs != gen_kwargs: @@ -121,7 +159,10 @@ def _create_payload( ) self.gen_kwargs = gen_kwargs.copy() self.llm = curator.LLM( - model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy() + model_name=self.model_name, + generation_params=gen_kwargs, + backend_params=backend_params, + **additional_args, ) return messages From e08fe5154134b1e1ea2b5c62aee11b267f6c1063 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Mon, 14 Apr 2025 11:45:44 -0700 Subject: [PATCH 2/7] refactor parameter hanling of curator api model --- eval/chat_benchmarks/curator_lm.py | 355 ++++++++++++++++++----------- 1 file changed, 220 insertions(+), 135 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index d7e662cd..2426531b 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -26,62 +26,111 @@ def __init__( max_tokens_per_minute: int = None, seconds_to_pause_on_rate_limit: int = None, batch: bool = False, + temperature: float = 0.0, + top_p: float = 0.95, **kwargs, ): super().__init__() + if tokenized_requests: + raise NotImplementedError("Tokenized requests not implemented for curator.") + self.tokenized_requests = False + self.model_name = model or pretrained + self.max_length = max_length + self.is_batch_request = batch + self._configure_params( + max_length=max_length, + max_retries=max_retries, + timeout=timeout, + max_requests_per_minute=max_requests_per_minute, + max_tokens_per_minute=max_tokens_per_minute, + seconds_to_pause_on_rate_limit=seconds_to_pause_on_rate_limit, + temperature=temperature, + top_p=top_p, + **kwargs, + ) - self.model_args = kwargs + self.llm = None # Initialize lazily + self.eos = None # Will be set during LLM initialization if needed - self.gen_kwargs = {"batch": batch} - if "gemini" in self.model_name and "thinking" in self.model_name: - max_requests_per_minute = max_requests_per_minute or 200 - max_tokens_per_minute = max_tokens_per_minute or 400_000 - elif "gemini" in self.model_name: - max_requests_per_minute = max_requests_per_minute or 2000 - max_tokens_per_minute = max_tokens_per_minute or 4_000_000 + # Disable cache since it is not necessary + os.environ["CURATOR_DISABLE_CACHE"] = "true" + + def _configure_params( + self, + max_length: int, + max_retries: int, + 
timeout: int, + max_requests_per_minute: Optional[int], + max_tokens_per_minute: Optional[int], + seconds_to_pause_on_rate_limit: Optional[int], + temperature: float, + top_p: float, + **kwargs, + ): + """Sets up gen_kwargs and backend_params based on model name and init args.""" + self.gen_kwargs = { + "max_completion_tokens": max_length, + "temperature": temperature, + "top_p": top_p, + "stop": None, # Will be set later if needed based on request + } + self.backend_params = { + "invalid_finish_reasons": ["content_filter"], + "require_all_responses": False, + "request_timeout": timeout, + "max_retries": max_retries, + } + self.additional_llm_args = {} # For args passed directly to curator.LLM constructor + + # Model-specific adjustments + is_thinking_model = "thinking" in self.model_name + + if "gemini" in self.model_name: + if is_thinking_model: + max_requests_per_minute = max_requests_per_minute or 200 + max_tokens_per_minute = max_tokens_per_minute or 400_000 + else: + max_requests_per_minute = max_requests_per_minute or 2000 + max_tokens_per_minute = max_tokens_per_minute or 4_000_000 elif "claude" in self.model_name: max_requests_per_minute = max_requests_per_minute or 2000 max_tokens_per_minute = max_tokens_per_minute or 80_000 - if "thinking" in self.model_name: - self.gen_kwargs["thinking"] = {"type": "enabled", "budget_tokens": max_length - 4096} + # Claude uses 'max_tokens' instead of 'max_completion_tokens' + self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens") + self.gen_kwargs.pop("stop", None) # Claude doesn't support stop sequences via API arg + + if is_thinking_model: + # Adjust name and set thinking params self.model_name = ( self.model_name.replace("-thinking-", "") .replace("-thinking", "") .replace("thinking-", "") .replace("thinking", "") ) + # Thinking budget calculation depends on final max_tokens + thinking_budget = self.gen_kwargs["max_tokens"] - 4096 + self.gen_kwargs["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget} + # API requirements for thinking mode + self.gen_kwargs["temperature"] = 1.0 + self.gen_kwargs.pop("top_p", None) + elif "deepseek" in self.model_name: + self.additional_llm_args["backend"] = "openai" + self.backend_params["base_url"] = "https://api.deepseek.com/" + self.backend_params["api_key"] = os.environ["DEEPSEEK_API_KEY"] + max_requests_per_minute = 2_500 # Override rate limits + max_tokens_per_minute = 1_000_000_000 + self.gen_kwargs["temperature"] = 0 # Override temperature + elif "o1" in self.model_name or "o3" in self.model_name: + # o1/o3 don't support these + print(f"Warning: Model {self.model_name} does not support top_p, stop, or temperature. 
Ignoring them.") + self.gen_kwargs.pop("top_p", None) + self.gen_kwargs.pop("stop", None) + self.gen_kwargs.pop("temperature", None) - self.model_args.update( - { - "model": self.model_name, - "pretrained": pretrained, - "max_length": max_length, - "max_retries": max_retries, - "timeout": timeout, - "tokenized_requests": tokenized_requests, - } - ) - if tokenized_requests: - raise NotImplementedError("Tokenized requests not implemented for curator.") - self.tokenized_requests = False - self.max_length = max_length - self.llm = None - self.eos = None - if "temperature" in kwargs: - self.gen_kwargs["temperature"] = kwargs["temperature"] - if "top_p" in kwargs: - self.gen_kwargs["top_p"] = kwargs["top_p"] - self.backend_params = { - "invalid_finish_reasons": [ - "content_filter" - ], # So it doesn't retry on `length` finish reason, but retries on "content_filter"} - "require_all_responses": False, - "request_timeout": timeout, - "max_retries": max_retries, - } + # Apply rate limits if provided and not overridden by model specifics if max_requests_per_minute is not None: self.backend_params["max_requests_per_minute"] = max_requests_per_minute if max_tokens_per_minute is not None: @@ -89,91 +138,64 @@ def __init__( if seconds_to_pause_on_rate_limit is not None: self.backend_params["seconds_to_pause_on_rate_limit"] = seconds_to_pause_on_rate_limit - # Disable cache since it is not necessary - os.environ["CURATOR_DISABLE_CACHE"] = "true" + # Handle batch mode specifics + if self.is_batch_request: + # Rate limiting params are incompatible with batch requests in curator + self.backend_params = {"require_all_responses": True} + self.additional_llm_args["batch"] = True - def _create_payload( - self, - messages: Union[List[List[int]], List[dict], List[str], str], - *, - generate: bool = False, - gen_kwargs: Optional[dict] = None, - eos=None, - **kwargs, - ) -> dict: - assert generate, "Curator only supports generation." - # Create the payload for the API request - max_tokens = self.max_length or gen_kwargs.get("max_gen_toks", self.max_length) - temperature = self.gen_kwargs.get("temperature", gen_kwargs.get("temperature", 0)) - top_p = self.gen_kwargs.get("top_p", gen_kwargs.get("top_p", 0.95)) - stop = handle_stop_sequences(gen_kwargs.get("until", None), eos) - gen_kwargs = { - "max_completion_tokens": max_tokens, - "temperature": temperature, - "top_p": top_p, - "stop": stop, - } - additional_args = {} - backend_params = self.backend_params.copy() - if self.gen_kwargs.get("batch", False): - # backend_params for rate limiting are not compatible with batch requests - backend_params = {"require_all_responses": True} - additional_args["batch"] = True - if "deepseek" in self.model_name: - additional_args["backend"] = "openai" - backend_params["max_requests_per_minute"] = 2_500 - backend_params["max_tokens_per_minute"] = 1_000_000_000 - backend_params["base_url"] = "https://api.deepseek.com/" - backend_params["api_key"] = os.environ["DEEPSEEK_API_KEY"] - gen_kwargs["temperature"] = 0 - if "o1" or "o3" in self.model_name: - print("Warning: O1 model does not support top_p, stop, or temperature. 
Ignoring them.") - gen_kwargs.pop("top_p", None) - gen_kwargs.pop("stop", None) - gen_kwargs.pop("temperature", None) - if "claude" in self.model_name: - gen_kwargs.pop("max_completion_tokens", None) - gen_kwargs.pop("stop", None) - gen_kwargs["max_tokens"] = max_tokens - if "thinking" in self.gen_kwargs: - gen_kwargs["thinking"] = self.gen_kwargs["thinking"] - gen_kwargs["thinking"]["budget_tokens"] = max_tokens - 4096 - # `temperature` may only be set to 1 when thinking is enabled. - # Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking' - gen_kwargs["temperature"] = 1 - # `top_p` must be unset when thinking is enabled. (same documentation) - gen_kwargs.pop("top_p", None) + + def _ensure_llm_initialized(self, eos=None): + """Initializes the curator.LLM object if it hasn't been already.""" if self.llm is None: - self.eos = eos - self.gen_kwargs = gen_kwargs.copy() + # Update stop sequences based on the current request if needed + # This assumes EOS is consistent for the lifetime of the model instance + if eos and self.gen_kwargs.get("stop") is None: + self.eos = eos # Store for potential future reference if needed + # Handle potential list of stop sequences + stop_sequences = handle_stop_sequences(None, eos) # Pass current eos + # Only update if stop sequences are actually needed and supported + if stop_sequences and "stop" in self.gen_kwargs: + self.gen_kwargs["stop"] = stop_sequences + elif stop_sequences and "max_tokens" in self.gen_kwargs and "claude" not in self.model_name: + # Only warn if stop sequences were provided but the param doesn't exist + # (like for Claude, which was handled in _configure_params) + print(f"Warning: Stop sequences provided but 'stop' generation parameter is not available for {self.model_name}.") + + + print(f"Initializing curator.LLM with: model_name='{self.model_name}', generation_params={self.gen_kwargs}, backend_params={self.backend_params}, additional_args={self.additional_llm_args}") self.llm = curator.LLM( model_name=self.model_name, - generation_params=gen_kwargs, - backend_params=backend_params, - **additional_args, + generation_params=self.gen_kwargs, + backend_params=self.backend_params, + **self.additional_llm_args, ) - else: - if self.gen_kwargs != gen_kwargs: - print( - "Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request" - ) - self.gen_kwargs = gen_kwargs.copy() - self.llm = curator.LLM( - model_name=self.model_name, - generation_params=gen_kwargs, - backend_params=backend_params, - **additional_args, - ) - return messages def create_message( - self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False - ) -> Union[List[List[int]], List[dict], List[str], str]: - # Convert messages to the format expected by the API + self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False # generate arg seems unused now + ) -> List[dict]: # Always return the list of dicts format curator expects + """Converts various message formats into the list of dicts format for curator.""" if isinstance(messages, list) and all(isinstance(m, JsonChatStr) for m in messages): - return [json.loads(m.prompt) for m in messages] + # Directly parse the JSON string stored in JsonChatStr + try: + return [json.loads(m.prompt) for m in messages] + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse JSON from JsonChatStr: {e}. 
Content: {[m.prompt for m in messages]}") + except Exception as e: + raise ValueError(f"Error processing JsonChatStr messages: {e}") + elif isinstance(messages, list) and all(isinstance(m, dict) for m in messages): + # If it's already in the correct list-of-dicts format + return messages + elif isinstance(messages, list) and all(isinstance(m, str) for m in messages): + # Handle simple list of strings if needed (e.g., for non-chat models, although curator usually expects dicts) + # This might need adjustment depending on how non-chat prompts are formatted. + # Assuming a simple conversion for now. + print("Warning: Converting list of strings to basic 'user' messages. Verify format.") + return [{"role": "user", "content": m} for m in messages] else: - raise ValueError("Messages must be a list of JsonChatStr objects") + # Add more specific checks or raise error for unsupported formats + raise ValueError(f"Unsupported messages format: {type(messages)}. Expected List[JsonChatStr], List[dict], or List[str].") + @staticmethod def parse_logprobs( @@ -192,20 +214,49 @@ def parse_logprobs( @staticmethod def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: # Parse the generated outputs from the API - return [output["response"] for output in outputs] + # Assuming outputs is the direct list of response dicts from curator + if isinstance(outputs, list) and all(isinstance(item, dict) for item in outputs): + return [output.get("response", "") for output in outputs] # Use .get for safety + else: + # Handle unexpected output format + print(f"Warning: Unexpected output format in parse_generations: {type(outputs)}. Expected List[dict].") + return [] + @property def tokenizer_name(self) -> str: return self.model_name def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, JsonChatStr]: - # Convert chat history to the required format + # Convert chat history to the required JsonChatStr format return JsonChatStr(json.dumps(chat_history)) def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]: - payload = self._create_payload(self.create_message(messages), **kwargs) - response = self.llm(payload)["response"] - return response + # Deprecated or needs rework? generate_until is the primary method used by lm-eval harness. + # This method seems designed for single requests, while generate_until handles batches. + # If needed, it should also use _ensure_llm_initialized and create_message. + print("Warning: model_call is likely deprecated for lm-eval tasks. Use generate_until.") + self._ensure_llm_initialized() # Make sure LLM is ready + # Ensure messages is a list, as curator expects a list of prompts + if not isinstance(messages, list): + messages = [messages] + + formatted_messages = self.create_message(messages) + # Assuming model_call handles a single prompt, curator expects a list + if not formatted_messages: + return None # Or raise error + + # Curator returns a dictionary with a 'response' key containing a list of outputs + response_data = self.llm(formatted_messages) + + # Extract the actual response content for the single prompt + if response_data and "response" in response_data and isinstance(response_data["response"], list) and len(response_data["response"]) > 0: + # Return the first response dictionary (or just the text?) 
+ return response_data["response"][0] # Returning the dict like {'response': 'text'} + else: + print(f"Warning: Unexpected response structure from curator in model_call: {response_data}") + return None + def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: raise NotImplementedError("Log likelihood tokens not implemented for curator.") @@ -220,27 +271,61 @@ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: @property def eot_token_id(self) -> Optional[int]: - # Assuming the model has a specific end-of-text token ID - return self.llm.eot_token_id # Replace with actual method to get EOT token ID + # Curator doesn't directly expose tokenizer or token IDs. + # Need to rely on underlying model specifics if absolutely necessary, + # but lm-eval generally handles this via stop sequences. + print("Warning: eot_token_id is not directly available via Curator API.") + return None # Cannot reliably get this from curator def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]: - # Tokenize contexts if required - if self.tokenized_requests: - raise NotImplementedError("Tokenized requests not implemented for curator.") + if not requests: + return [] - # Extract contexts and generation kwargs from the Instance objects + # Ensure LLM is initialized, passing eos from the first request's gen_kwargs + # Assumes eos is consistent across the batch, which is reasonable for lm-eval. + first_req_kwargs = requests[0].args[1] if len(requests[0].args) > 1 else {} + self._ensure_llm_initialized(eos=first_req_kwargs.get("until")) + + # Extract contexts (already in JsonChatStr format expected by create_message) contexts = [req.args[0] for req in requests] - gen_kwargs = [req.args[1] for req in requests] - # Assert all gen_kwargs are the same - assert all( - gen_kwargs[0] == gkw for gkw in gen_kwargs - ), "Generation parameters must be the same for all requests in curator" + # Validate generation kwargs consistency (already done partially in lm-eval harness) + # We rely on the __init__ configuration for gen_kwargs now. + # We could add an assertion here to double-check if request-specific kwargs match self.gen_kwargs if needed. + # For now, we assume the instance's configuration is the source of truth. 
+ # Example check (optional): + # for req in requests: + # req_kwargs = req.args[1] + # # Compare relevant keys, skipping 'until' as it's handled separately for eos + # if req_kwargs.get('temperature', self.gen_kwargs.get('temperature')) != self.gen_kwargs.get('temperature') or \ + # req_kwargs.get('top_p', self.gen_kwargs.get('top_p')) != self.gen_kwargs.get('top_p') # Add other relevant keys + # # raise ValueError("Request generation kwargs deviate from initialized model kwargs.") + + + # Format messages for curator + formatted_messages = self.create_message(contexts) + + # Make the call to curator + start_time = time.time() + response_data = self.llm(formatted_messages) + end_time = time.time() + print(f"Curator call took {end_time - start_time:.2f} seconds for {len(requests)} requests.") + + + # Parse the generations + # response_data is expected to be like {'response': [{'response': 'text1'}, {'response': 'text2'}, ...]} + if response_data and "response" in response_data and isinstance(response_data["response"], list): + parsed_generations = self.parse_generations(response_data["response"]) + # Ensure the number of results matches the number of requests + if len(parsed_generations) != len(requests): + print(f"Warning: Mismatch between number of requests ({len(requests)}) and responses ({len(parsed_generations)}). Padding with empty strings.") + # Pad with empty strings to match the expected length + parsed_generations.extend([""] * (len(requests) - len(parsed_generations))) + return parsed_generations + else: + print(f"Error: Unexpected response structure from curator: {response_data}. Returning empty strings.") + return [""] * len(requests) - contexts_dataset = self.create_message(contexts) - payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0]) - response = self.llm(payload)["response"] - return response def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: raise NotImplementedError("Log likelihood rolling not implemented for curator.") @@ -253,4 +338,4 @@ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[fl def tok_encode(self, string: str, **kwargs) -> List[int]: raise NotImplementedError("Token encoding not implemented for curator.") - return self.llm.tokenizer.encode(string) # Replace with actual method to tokenize + # return self.llm.tokenizer.encode(string) # Cannot access tokenizer directly From 603b1c217321610e0b7bb921041fe450c4f186b3 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Mon, 14 Apr 2025 11:57:56 -0700 Subject: [PATCH 3/7] update readme to launch curator --- README.md | 245 ++++-------------------------------------------------- 1 file changed, 17 insertions(+), 228 deletions(-) diff --git a/README.md b/README.md index 9a2b2144..6d7eb4ff 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,22 @@ Evalchemy is developed by the [DataComp community](https://datacomp.ai) and [Bes --model_args 'tokenized_requests=False' \ --output_path logs ``` + +Here are other examples of `model_name`: +- `"claude-3-7-sonnet-latest-thinking"` +- `"deepseek-reasoner"` +- `"gemini/gemini-1.5-flash"` +- `"claude-3-7-sonnet-latest"` +- `"gpt-4o-mini-2024-07-18"` +- `"o1-preview-2024-09-12"` +- `"gpt-4o-2024-08-06"` + +You can also change the `model_args` to fit your needs. 
For example, `"claude-3-7-sonnet-latest-thinking"` might need more tokens and more time for its thinking process and can be used in batch mode to speed up evaluation and reduce costs by setting `model_args` like this: + +``` +--model_args 'tokenized_requests=False,timeout=2000,max_length=64000,batch=True' +``` + #### [2025.01.29] New Reasoning Benchmarks - AIME24, AMC23, MATH500, LiveCodeBench, GPQADiamond, HumanEvalPlus, MBPPPlus, BigCodeBench, MultiPL-E, and CRUXEval have been added to our growing list of [available benchmarks](https://github.com/mlfoundations/evalchemy?tab=readme-ov-file#built-in-benchmarks). This is part of the effort in the [Open Thoughts](https://github.com/open-thoughts/open-thoughts) project. See the [our blog post](https://www.open-thoughts.ai/blog/measure) on using Evalchemy for measuring reasoning models. @@ -206,231 +222,4 @@ Key features: Refer to the [distributed README](eval/distributed/README.md) for more details. -NOTE: This is configured for specific HPC clusters, but can easily be adapted. Furthermore it can be adapted for a non-HPC setup using `CUDA_VISIBLE_DEVICES` instead of SLURM job arrays. - - -### Multi-GPU Evaluation - -NOTE: this is slower than doing fully data parallel evaluation (see previous section) - -```bash -accelerate launch --num-processes --num-machines \ - --multi-gpu -m eval.eval \ - --model hf \ - --tasks MTBench,alpaca_eval \ - --model_args 'pretrained=mistralai/Mistral-7B-Instruct-v0.3' \ - --batch_size 2 \ - --output_path logs -``` - -### Large Model Evaluation - -For models that don't fit on a single GPU, use model parallelism: - -```bash -python -m eval.eval \ - --model hf \ - --tasks MTBench,alpaca_eval \ - --model_args 'pretrained=mistralai/Mistral-7B-Instruct-v0.3,parallelize=True' \ - --batch_size 2 \ - --output_path logs -``` - -> **💡 Note**: While "auto" batch size is supported, we recommend manually tuning the batch size for optimal performance. The optimal batch size depends on the model size, GPU memory, and the specific benchmark. We used a maximum of 32 and a minimum of 4 (for RepoBench) to evaluate Llama-3-8B-Instruct on 8xH100 GPUs. - -### Output Log Structure - -Our generated logs include critical information about each evaluation to help inform your experiments. We highlight important items in our generated logs. 
- -- Model Configuration - - `model`: Model framework used - - `model_args`: Model arguments for the model framework - - `batch_size`: Size of processing batches - - `device`: Computing device specification - - `annotator_model`: Model used for annotation ("gpt-4o-mini-2024-07-18") -- Seed Configuration - - `random_seed`: General random seed - - `numpy_seed`: NumPy-specific seed - - `torch_seed`: PyTorch-specific seed - - `fewshot_seed`: Seed for few-shot examples -- Model Details - - `model_num_parameters`: Number of model parameters - - `model_dtype`: Model data type - - `model_revision`: Model version - - `model_sha`: Model commit hash - -- Version Control - - `git_hash`: Repository commit hash - - `date`: Unix timestamp of evaluation - - `transformers_version`: Hugging Face Transformers version -- Tokenizer Configuration - - `tokenizer_pad_token`: Padding token details - - `tokenizer_eos_token`: End of sequence token - - `tokenizer_bos_token`: Beginning of sequence token - - `eot_token_id`: End of text token ID - - `max_length`: Maximum sequence length -- Model Settings - - `model_source`: Model source platform - - `model_name`: Full model identifier - - `model_name_sanitized`: Sanitized model name for file system usage - - `chat_template`: Conversation template - - `chat_template_sha`: Template hash -- Timing Information - - `start_time`: Evaluation start timestamp - - `end_time`: Evaluation end timestamp - - `total_evaluation_time_seconds`: Total duration -- Hardware Environment - - PyTorch version and build configuration - - Operating system details - - GPU configuration - - CPU specifications - - CUDA and driver versions - - Relevant library versions - -### Customizing Evaluation - -#### 🤖 Change Annotator Model - -As part of Evalchemy, we want to make swapping in different Language Model Judges for standard benchmarks easy. Currently, we support two judge settings. The first is the default setting, where we use a benchmark's default judge. To activate this, you can either do nothing or pass in -```bash ---annotator_model auto -``` -In addition to the default assignments, we support using gpt-4o-mini-2024-07-18 as a judge: - -```bash ---annotator_model gpt-4o-mini-2024-07-18 -``` - -We are planning on adding support for different judges in the future! - -### ⏱️ Runtime and Cost Analysis - -Evalchemy makes running common benchmarks simple, fast, and versatile! We list the speeds and costs for each benchmark we achieve with Evalchemy for Meta-Llama-3-8B-Instruct on 8xH100 GPUs. 
- -| Benchmark | Runtime (8xH100) | Batch Size | Total Tokens | Default Judge Cost ($) | GPT-4o-mini Judge Cost ($) | Notes | -|-----------|------------------|------------|--------------|----------------|-------------------|--------| -| MTBench | 14:00 | 32 | ~196K | 6.40 | 0.05 | | -| WildBench | 38:00 | 32 | ~2.2M | 30.00 | 0.43 | Using GPT-4-mini judge | -| RepoBench | 46:00 | 4 | - | - | - | Lower batch size due to memory | -| MixEval | 13:00 | 32 | ~4-6M | 3.36 | 0.76 | Varies by judge model | -| AlpacaEval | 16:00 | 32 | ~936K | 9.40 | 0.14 | | -| HumanEval | 4:00 | 32 | - | - | - | No API costs | -| IFEval | 1:30 | 32 | - | - | - | No API costs | -| ZeroEval | 1:44:00 | 32 | - | - | - | Longest runtime | -| MBPP | 6:00 | 32 | - | - | - | No API costs | -| MMLU | 7:00 | 32 | - | - | - | No API costs | -| ARC | 4:00 | 32 | - | - | - | No API costs | -| DROP | 20:00 | 32 | - | - | - | No API costs | - -**Notes:** -- Runtimes measured using 8x H100 GPUs with Meta-Llama-3-8B-Instruct model -- Batch sizes optimized for memory and speed -- API costs vary based on judge model choice - -**Cost-Saving Tips:** -- Use gpt-4o-mini-2024-07-18 judge when possible for significant cost savings -- Adjust batch size based on available memory -- Consider using data-parallel evaluation for faster results - -### 🔐 Special Access Requirements - -#### ZeroEval Access -To run ZeroEval benchmarks, you need to: - -1. Request access to the [ZebraLogicBench-private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) on Hugging Face -2. Accept the terms and conditions -3. Log in to your Hugging Face account when running evaluations - -## 🛠️ Implementing Custom Evaluations - -To add a new evaluation system: - -1. Create a new directory under `eval/chat_benchmarks/` -2. Implement `eval_instruct.py` with two required functions: - - `eval_instruct(model)`: Takes an LM Eval Model, returns results dict - - `evaluate(results)`: Takes results dictionary, returns evaluation metrics - -### Adding External Evaluation Repositories - -Use git subtree to manage external evaluation code: - -```bash -# Add external repository -git subtree add --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git main --squash - -# Pull updates -git subtree pull --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git main --squash - -# Push contributions back -git subtree push --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git contribution-branch -``` - -### 🔍 Debug Mode - -To run evaluations in debug mode, add the `--debug` flag: - -```bash -python -m eval.eval \ - --model hf \ - --tasks MTBench \ - --model_args "pretrained=mistralai/Mistral-7B-Instruct-v0.3" \ - --batch_size 2 \ - --output_path logs \ - --debug -``` - -This is particularly useful when testing new evaluation implementations, debugging model configurations, verifying dataset access, and testing database connectivity. - -### 🚀 Performance Tips - -1. Utilize batch processing for faster evaluation: -```python -all_instances.append( - Instance( - "generate_until", - example, - ( - inputs, - { - "max_new_tokens": 1024, - "do_sample": False, - }, - ), - idx, - ) -) - -outputs = self.compute(model, all_instances) -``` - -2. Use the LM-eval logger for consistent logging across evaluations - -### 🔧 Troubleshooting -Evalchemy has been tested on CUDA 12.4. 
If you run into issues like this: `undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`, try updating your CUDA version: -```bash -wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo add-apt-repository contrib -sudo apt-get update -sudo apt-get -y install cuda-toolkit-12-4 -``` - -## 🏆 Leaderboard Integration -To track experiments and evaluations, we support logging results to a PostgreSQL database. Details on the entry schemas and database setup can be found in [`database/`](database/). - - -## Contributing -Thank you to all the contributors for making this project possible! -Please follow [these instructions](CONTRIBUTING.md) on how to contribute. - -## Citation -If you find Evalchemy useful, please consider citing us! - -``` -@software{Evalchemy: Automatic evals for LLMs, - author = {Guha, Etash and Raoof, Negin and Mercat, Jean and Marten, Ryan and Frankel, Eric and Keh, Sedrick and Grover, Sachin and Smyrnis, George and Vu, Trung and Saad-Falcon, Jon and Choi, Caroline and Arora, Kushal and Merrill, Mike and Deng, Yichuan and Suvarna, Ashima and Bansal, Hritik and Nezhurina, Marianna and Choi, Yejin and Heckel, Reinhard and Oh, Seewong and Hashimoto, Tatsunori and Jitsev, Jenia and Shankar, Vaishaal and Dimakis, Alex and Sathiamoorthy, Mahesh and Schmidt, Ludwig}, - month = nov, - title = {{Evalchemy}}, - year = {2024} -} -``` +NOTE: This is configured for specific HPC clusters, but can easily be adapted. Furthermore it can be adapted for a non-HPC setup using `CUDA_VISIBLE_DEVICES` \ No newline at end of file From 17398f834b29f0c60f66ca5535cf80d84edec7ff Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Mon, 14 Apr 2025 11:59:47 -0700 Subject: [PATCH 4/7] add back mistakenly erased end of readme --- README.md | 229 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 228 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6d7eb4ff..b05121d0 100644 --- a/README.md +++ b/README.md @@ -222,4 +222,231 @@ Key features: Refer to the [distributed README](eval/distributed/README.md) for more details. -NOTE: This is configured for specific HPC clusters, but can easily be adapted. Furthermore it can be adapted for a non-HPC setup using `CUDA_VISIBLE_DEVICES` \ No newline at end of file +NOTE: This is configured for specific HPC clusters, but can easily be adapted. Furthermore it can be adapted for a non-HPC setup using `CUDA_VISIBLE_DEVICES` instead of SLURM job arrays. + + +### Multi-GPU Evaluation + +NOTE: this is slower than doing fully data parallel evaluation (see previous section) + +```bash +accelerate launch --num-processes --num-machines \ + --multi-gpu -m eval.eval \ + --model hf \ + --tasks MTBench,alpaca_eval \ + --model_args 'pretrained=mistralai/Mistral-7B-Instruct-v0.3' \ + --batch_size 2 \ + --output_path logs +``` + +### Large Model Evaluation + +For models that don't fit on a single GPU, use model parallelism: + +```bash +python -m eval.eval \ + --model hf \ + --tasks MTBench,alpaca_eval \ + --model_args 'pretrained=mistralai/Mistral-7B-Instruct-v0.3,parallelize=True' \ + --batch_size 2 \ + --output_path logs +``` + +> **💡 Note**: While "auto" batch size is supported, we recommend manually tuning the batch size for optimal performance. The optimal batch size depends on the model size, GPU memory, and the specific benchmark. 
We used a maximum of 32 and a minimum of 4 (for RepoBench) to evaluate Llama-3-8B-Instruct on 8xH100 GPUs. + +### Output Log Structure + +Our generated logs include critical information about each evaluation to help inform your experiments. We highlight important items in our generated logs. + +- Model Configuration + - `model`: Model framework used + - `model_args`: Model arguments for the model framework + - `batch_size`: Size of processing batches + - `device`: Computing device specification + - `annotator_model`: Model used for annotation ("gpt-4o-mini-2024-07-18") +- Seed Configuration + - `random_seed`: General random seed + - `numpy_seed`: NumPy-specific seed + - `torch_seed`: PyTorch-specific seed + - `fewshot_seed`: Seed for few-shot examples +- Model Details + - `model_num_parameters`: Number of model parameters + - `model_dtype`: Model data type + - `model_revision`: Model version + - `model_sha`: Model commit hash + +- Version Control + - `git_hash`: Repository commit hash + - `date`: Unix timestamp of evaluation + - `transformers_version`: Hugging Face Transformers version +- Tokenizer Configuration + - `tokenizer_pad_token`: Padding token details + - `tokenizer_eos_token`: End of sequence token + - `tokenizer_bos_token`: Beginning of sequence token + - `eot_token_id`: End of text token ID + - `max_length`: Maximum sequence length +- Model Settings + - `model_source`: Model source platform + - `model_name`: Full model identifier + - `model_name_sanitized`: Sanitized model name for file system usage + - `chat_template`: Conversation template + - `chat_template_sha`: Template hash +- Timing Information + - `start_time`: Evaluation start timestamp + - `end_time`: Evaluation end timestamp + - `total_evaluation_time_seconds`: Total duration +- Hardware Environment + - PyTorch version and build configuration + - Operating system details + - GPU configuration + - CPU specifications + - CUDA and driver versions + - Relevant library versions + +### Customizing Evaluation + +#### 🤖 Change Annotator Model + +As part of Evalchemy, we want to make swapping in different Language Model Judges for standard benchmarks easy. Currently, we support two judge settings. The first is the default setting, where we use a benchmark's default judge. To activate this, you can either do nothing or pass in +```bash +--annotator_model auto +``` +In addition to the default assignments, we support using gpt-4o-mini-2024-07-18 as a judge: + +```bash +--annotator_model gpt-4o-mini-2024-07-18 +``` + +We are planning on adding support for different judges in the future! + +### ⏱️ Runtime and Cost Analysis + +Evalchemy makes running common benchmarks simple, fast, and versatile! We list the speeds and costs for each benchmark we achieve with Evalchemy for Meta-Llama-3-8B-Instruct on 8xH100 GPUs. 
+ +| Benchmark | Runtime (8xH100) | Batch Size | Total Tokens | Default Judge Cost ($) | GPT-4o-mini Judge Cost ($) | Notes | +|-----------|------------------|------------|--------------|----------------|-------------------|--------| +| MTBench | 14:00 | 32 | ~196K | 6.40 | 0.05 | | +| WildBench | 38:00 | 32 | ~2.2M | 30.00 | 0.43 | Using GPT-4-mini judge | +| RepoBench | 46:00 | 4 | - | - | - | Lower batch size due to memory | +| MixEval | 13:00 | 32 | ~4-6M | 3.36 | 0.76 | Varies by judge model | +| AlpacaEval | 16:00 | 32 | ~936K | 9.40 | 0.14 | | +| HumanEval | 4:00 | 32 | - | - | - | No API costs | +| IFEval | 1:30 | 32 | - | - | - | No API costs | +| ZeroEval | 1:44:00 | 32 | - | - | - | Longest runtime | +| MBPP | 6:00 | 32 | - | - | - | No API costs | +| MMLU | 7:00 | 32 | - | - | - | No API costs | +| ARC | 4:00 | 32 | - | - | - | No API costs | +| DROP | 20:00 | 32 | - | - | - | No API costs | + +**Notes:** +- Runtimes measured using 8x H100 GPUs with Meta-Llama-3-8B-Instruct model +- Batch sizes optimized for memory and speed +- API costs vary based on judge model choice + +**Cost-Saving Tips:** +- Use gpt-4o-mini-2024-07-18 judge when possible for significant cost savings +- Adjust batch size based on available memory +- Consider using data-parallel evaluation for faster results + +### 🔐 Special Access Requirements + +#### ZeroEval Access +To run ZeroEval benchmarks, you need to: + +1. Request access to the [ZebraLogicBench-private dataset](https://huggingface.co/datasets/allenai/ZebraLogicBench-private) on Hugging Face +2. Accept the terms and conditions +3. Log in to your Hugging Face account when running evaluations + +## 🛠️ Implementing Custom Evaluations + +To add a new evaluation system: + +1. Create a new directory under `eval/chat_benchmarks/` +2. Implement `eval_instruct.py` with two required functions: + - `eval_instruct(model)`: Takes an LM Eval Model, returns results dict + - `evaluate(results)`: Takes results dictionary, returns evaluation metrics + +### Adding External Evaluation Repositories + +Use git subtree to manage external evaluation code: + +```bash +# Add external repository +git subtree add --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git main --squash + +# Pull updates +git subtree pull --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git main --squash + +# Push contributions back +git subtree push --prefix=eval/chat_benchmarks/new_eval https://github.com/original/repo.git contribution-branch +``` + +### 🔍 Debug Mode + +To run evaluations in debug mode, add the `--debug` flag: + +```bash +python -m eval.eval \ + --model hf \ + --tasks MTBench \ + --model_args "pretrained=mistralai/Mistral-7B-Instruct-v0.3" \ + --batch_size 2 \ + --output_path logs \ + --debug +``` + +This is particularly useful when testing new evaluation implementations, debugging model configurations, verifying dataset access, and testing database connectivity. + +### 🚀 Performance Tips + +1. Utilize batch processing for faster evaluation: +```python +all_instances.append( + Instance( + "generate_until", + example, + ( + inputs, + { + "max_new_tokens": 1024, + "do_sample": False, + }, + ), + idx, + ) +) + +outputs = self.compute(model, all_instances) +``` + +2. Use the LM-eval logger for consistent logging across evaluations + +### 🔧 Troubleshooting +Evalchemy has been tested on CUDA 12.4. 
If you run into issues like this: `undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12`, try updating your CUDA version: +```bash +wget https://developer.download.nvidia.com/compute/cuda/repos/debian11/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo add-apt-repository contrib +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12-4 +``` + +## 🏆 Leaderboard Integration +To track experiments and evaluations, we support logging results to a PostgreSQL database. Details on the entry schemas and database setup can be found in [`database/`](database/). + + +## Contributing +Thank you to all the contributors for making this project possible! +Please follow [these instructions](CONTRIBUTING.md) on how to contribute. + +## Citation +If you find Evalchemy useful, please consider citing us! + +``` +@software{Evalchemy: Automatic evals for LLMs, + author = {Guha, Etash and Raoof, Negin and Mercat, Jean and Marten, Ryan and Frankel, Eric and Keh, Sedrick and Grover, Sachin and Smyrnis, George and Vu, Trung and Saad-Falcon, Jon and Choi, Caroline and Arora, Kushal and Merrill, Mike and Deng, Yichuan and Suvarna, Ashima and Bansal, Hritik and Nezhurina, Marianna and Choi, Yejin and Heckel, Reinhard and Oh, Seewong and Hashimoto, Tatsunori and Jitsev, Jenia and Shankar, Vaishaal and Dimakis, Alex and Sathiamoorthy, Mahesh and Schmidt, Ludwig}, + month = nov, + title = {{Evalchemy}}, + year = {2024} +} +``` From e2e0f80839421560bba0298a02413a0c04a862a0 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Tue, 15 Apr 2025 17:37:07 -0700 Subject: [PATCH 5/7] add xai max_tokens argument conversion --- eval/chat_benchmarks/curator_lm.py | 44 ++++-------------------------- 1 file changed, 6 insertions(+), 38 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 2426531b..5d3c8e21 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -128,6 +128,8 @@ def _configure_params( self.gen_kwargs.pop("top_p", None) self.gen_kwargs.pop("stop", None) self.gen_kwargs.pop("temperature", None) + elif "xai" in self.model_name: + self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens", max_length) # Apply rate limits if provided and not overridden by model specifics @@ -214,14 +216,7 @@ def parse_logprobs( @staticmethod def parse_generations(outputs: Union[Any, List[Any]], **kwargs) -> List[str]: # Parse the generated outputs from the API - # Assuming outputs is the direct list of response dicts from curator - if isinstance(outputs, list) and all(isinstance(item, dict) for item in outputs): - return [output.get("response", "") for output in outputs] # Use .get for safety - else: - # Handle unexpected output format - print(f"Warning: Unexpected output format in parse_generations: {type(outputs)}. Expected List[dict].") - return [] - + return [output["response"] for output in outputs] @property def tokenizer_name(self) -> str: @@ -289,43 +284,16 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - # Extract contexts (already in JsonChatStr format expected by create_message) contexts = [req.args[0] for req in requests] - # Validate generation kwargs consistency (already done partially in lm-eval harness) - # We rely on the __init__ configuration for gen_kwargs now. - # We could add an assertion here to double-check if request-specific kwargs match self.gen_kwargs if needed. 
- # For now, we assume the instance's configuration is the source of truth. - # Example check (optional): - # for req in requests: - # req_kwargs = req.args[1] - # # Compare relevant keys, skipping 'until' as it's handled separately for eos - # if req_kwargs.get('temperature', self.gen_kwargs.get('temperature')) != self.gen_kwargs.get('temperature') or \ - # req_kwargs.get('top_p', self.gen_kwargs.get('top_p')) != self.gen_kwargs.get('top_p') # Add other relevant keys - # # raise ValueError("Request generation kwargs deviate from initialized model kwargs.") - - # Format messages for curator formatted_messages = self.create_message(contexts) # Make the call to curator start_time = time.time() - response_data = self.llm(formatted_messages) + response_data = self.llm(formatted_messages)["response"] end_time = time.time() print(f"Curator call took {end_time - start_time:.2f} seconds for {len(requests)} requests.") - - # Parse the generations - # response_data is expected to be like {'response': [{'response': 'text1'}, {'response': 'text2'}, ...]} - if response_data and "response" in response_data and isinstance(response_data["response"], list): - parsed_generations = self.parse_generations(response_data["response"]) - # Ensure the number of results matches the number of requests - if len(parsed_generations) != len(requests): - print(f"Warning: Mismatch between number of requests ({len(requests)}) and responses ({len(parsed_generations)}). Padding with empty strings.") - # Pad with empty strings to match the expected length - parsed_generations.extend([""] * (len(requests) - len(parsed_generations))) - return parsed_generations - else: - print(f"Error: Unexpected response structure from curator: {response_data}. Returning empty strings.") - return [""] * len(requests) - + return response_data def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: raise NotImplementedError("Log likelihood rolling not implemented for curator.") @@ -338,4 +306,4 @@ def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[fl def tok_encode(self, string: str, **kwargs) -> List[int]: raise NotImplementedError("Token encoding not implemented for curator.") - # return self.llm.tokenizer.encode(string) # Cannot access tokenizer directly + return self.llm.tokenizer.encode(string) # Replace with actual method to tokenize From b79993ef49417b95a749e18603311acb35b790d8 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Wed, 16 Apr 2025 13:17:08 -0700 Subject: [PATCH 6/7] simplify back curator_lm create message --- eval/chat_benchmarks/curator_lm.py | 45 ++++++------------------------ 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 5d3c8e21..cd276d7d 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -174,30 +174,13 @@ def _ensure_llm_initialized(self, eos=None): ) def create_message( - self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False # generate arg seems unused now - ) -> List[dict]: # Always return the list of dicts format curator expects - """Converts various message formats into the list of dicts format for curator.""" + self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False + ) -> Union[List[List[int]], List[dict], List[str], str]: + # Convert messages to the format expected by the API if isinstance(messages, list) and all(isinstance(m, JsonChatStr) for m in messages): - # 
Directly parse the JSON string stored in JsonChatStr - try: - return [json.loads(m.prompt) for m in messages] - except json.JSONDecodeError as e: - raise ValueError(f"Failed to parse JSON from JsonChatStr: {e}. Content: {[m.prompt for m in messages]}") - except Exception as e: - raise ValueError(f"Error processing JsonChatStr messages: {e}") - elif isinstance(messages, list) and all(isinstance(m, dict) for m in messages): - # If it's already in the correct list-of-dicts format - return messages - elif isinstance(messages, list) and all(isinstance(m, str) for m in messages): - # Handle simple list of strings if needed (e.g., for non-chat models, although curator usually expects dicts) - # This might need adjustment depending on how non-chat prompts are formatted. - # Assuming a simple conversion for now. - print("Warning: Converting list of strings to basic 'user' messages. Verify format.") - return [{"role": "user", "content": m} for m in messages] + return [json.loads(m.prompt) for m in messages] else: - # Add more specific checks or raise error for unsupported formats - raise ValueError(f"Unsupported messages format: {type(messages)}. Expected List[JsonChatStr], List[dict], or List[str].") - + raise ValueError("Messages must be a list of JsonChatStr objects") @staticmethod def parse_logprobs( @@ -242,16 +225,9 @@ def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatSt return None # Or raise error # Curator returns a dictionary with a 'response' key containing a list of outputs - response_data = self.llm(formatted_messages) - - # Extract the actual response content for the single prompt - if response_data and "response" in response_data and isinstance(response_data["response"], list) and len(response_data["response"]) > 0: - # Return the first response dictionary (or just the text?) 
- return response_data["response"][0] # Returning the dict like {'response': 'text'} - else: - print(f"Warning: Unexpected response structure from curator in model_call: {response_data}") - return None + response = self.llm(formatted_messages)["response"] + return response def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: raise NotImplementedError("Log likelihood tokens not implemented for curator.") @@ -288,12 +264,9 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - formatted_messages = self.create_message(contexts) # Make the call to curator - start_time = time.time() - response_data = self.llm(formatted_messages)["response"] - end_time = time.time() - print(f"Curator call took {end_time - start_time:.2f} seconds for {len(requests)} requests.") + response = self.llm(formatted_messages)["response"] - return response_data + return response def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: raise NotImplementedError("Log likelihood rolling not implemented for curator.") From 57d6eb9a3aa456e5a849f815b19001fcc8d3bb56 Mon Sep 17 00:00:00 2001 From: Jean Mercat Date: Wed, 14 May 2025 14:55:33 -0700 Subject: [PATCH 7/7] small fixes --- eval/chat_benchmarks/AMC23/eval_instruct.py | 6 ------ eval/chat_benchmarks/GPQADiamond/eval_instruct.py | 7 ------- eval/chat_benchmarks/MATH500/eval_instruct.py | 6 ------ eval/chat_benchmarks/curator_lm.py | 15 ++++++++++++--- 4 files changed, 12 insertions(+), 22 deletions(-) diff --git a/eval/chat_benchmarks/AMC23/eval_instruct.py b/eval/chat_benchmarks/AMC23/eval_instruct.py index 0585b435..04b7d5d5 100644 --- a/eval/chat_benchmarks/AMC23/eval_instruct.py +++ b/eval/chat_benchmarks/AMC23/eval_instruct.py @@ -64,12 +64,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: # Prepare instances for model all_instances = [] - if isinstance(model, lm_eval.models.huggingface.HFLM): - model_name = model.pretrained - elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion): - model_name = str(f"openai/{model.model}") - else: - model_name = model.model_args["model"] all_outputs = [] for i in range(self.n_repeat): diff --git a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py index 9548ab9e..8ab6ac8b 100644 --- a/eval/chat_benchmarks/GPQADiamond/eval_instruct.py +++ b/eval/chat_benchmarks/GPQADiamond/eval_instruct.py @@ -69,13 +69,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: example["multiple_choice_string"] = multiple_choice_string example["answer"] = correct_answer - if isinstance(model, lm_eval.models.huggingface.HFLM): - model_name = model.pretrained - elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion): - model_name = str(f"openai/{model.model}") - else: - model_name = model.model_args["model"] - all_outputs = [] for i in range(self.n_repeat): diff --git a/eval/chat_benchmarks/MATH500/eval_instruct.py b/eval/chat_benchmarks/MATH500/eval_instruct.py index e0bbe633..f32e13fa 100644 --- a/eval/chat_benchmarks/MATH500/eval_instruct.py +++ b/eval/chat_benchmarks/MATH500/eval_instruct.py @@ -61,12 +61,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]: # Prepare instances for model all_instances = [] - if isinstance(model, lm_eval.models.huggingface.HFLM): - model_name = model.pretrained - elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion): - model_name = str(f"openai/{model.model}") - else: - model_name 
= model.model_args["model"] for idx, example in enumerate(examples): messages = [ {"role": "user", "content": PROMPT.format(problem=example["problem"])}, diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index cd276d7d..e7af1d62 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -85,9 +85,14 @@ def _configure_params( self.additional_llm_args = {} # For args passed directly to curator.LLM constructor # Model-specific adjustments - is_thinking_model = "thinking" in self.model_name + is_thinking_model = "thinking" in self.model_name or "gemini-2.5-pro" in self.model_name if "gemini" in self.model_name: + if self.is_batch_request: + self.additional_llm_args["backend"] = "gemini" + self.gen_kwargs.pop("max_completion_tokens", None) + self.gen_kwargs.pop("stop", None) + if is_thinking_model: max_requests_per_minute = max_requests_per_minute or 200 max_tokens_per_minute = max_tokens_per_minute or 400_000 @@ -122,7 +127,7 @@ def _configure_params( max_requests_per_minute = 2_500 # Override rate limits max_tokens_per_minute = 1_000_000_000 self.gen_kwargs["temperature"] = 0 # Override temperature - elif "o1" in self.model_name or "o3" in self.model_name: + elif "o1" in self.model_name or "o3" in self.model_name or "o4" in self.model_name: # o1/o3 don't support these print(f"Warning: Model {self.model_name} does not support top_p, stop, or temperature. Ignoring them.") self.gen_kwargs.pop("top_p", None) @@ -263,8 +268,12 @@ def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) - # Format messages for curator formatted_messages = self.create_message(contexts) + response = self.llm(formatted_messages) # Make the call to curator - response = self.llm(formatted_messages)["response"] + try: + response = response["response"] + except Exception as e: + response = response.dataset["response"] return response
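
Taken together, these patches let the curator backend drive thinking models (the Claude `-thinking` variants and `gemini-2.5-pro`) as well as batch-mode requests from Evalchemy. As a quick sanity check, an invocation combining the README example added in PATCH 3 with the new batch arguments might look like the sketch below; the `--model curator` flag, the `AIME24` task name, and the `model=` key inside `model_args` are assumptions for illustration, while the remaining `model_args` values come directly from the README addition.

```bash
# Illustrative only: exact flag names depend on how the curator backend is registered in eval.eval
python -m eval.eval \
    --model curator \
    --tasks AIME24 \
    --model_args 'model=claude-3-7-sonnet-latest-thinking,tokenized_requests=False,timeout=2000,max_length=64000,batch=True' \
    --output_path logs
```

Note that when `batch=True`, the patched code replaces the per-minute rate-limit `backend_params` with `{"require_all_responses": True}` and passes `batch=True` through to `curator.LLM`, since curator's batch mode does not accept rate-limiting parameters.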