diff --git a/README.md b/README.md
index 0c169ad2..3e3450a6 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,22 @@ Evalchemy is developed by the [DataComp community](https://datacomp.ai) and [Bes
     --model_args 'tokenized_requests=False' \
     --output_path logs
 ```
+
+Here are other examples of `model_name`:
+- `"claude-3-7-sonnet-latest-thinking"`
+- `"deepseek-reasoner"`
+- `"gemini/gemini-1.5-flash"`
+- `"claude-3-7-sonnet-latest"`
+- `"gpt-4o-mini-2024-07-18"`
+- `"o1-preview-2024-09-12"`
+- `"gpt-4o-2024-08-06"`
+
+You can also adjust `model_args` to fit your needs. For example, `"claude-3-7-sonnet-latest-thinking"` may need a larger token budget and a longer timeout for its thinking process, and it can run in batch mode to speed up evaluation and reduce costs. Set `model_args` like this:
+
+```
+--model_args 'tokenized_requests=False,timeout=2000,max_length=64000,batch=True'
+```
+
 #### [2025.01.29] New Reasoning Benchmarks
 - AIME24, AMC23, MATH500, LiveCodeBench, GPQADiamond, HumanEvalPlus, MBPPPlus, BigCodeBench, MultiPL-E, and CRUXEval have been added to our growing list of [available benchmarks](https://github.com/mlfoundations/evalchemy?tab=readme-ov-file#built-in-benchmarks). This is part of the effort in the [Open Thoughts](https://github.com/open-thoughts/open-thoughts) project. See [our blog post](https://www.open-thoughts.ai/blog/measure) on using Evalchemy for measuring reasoning models.
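Editor's note: the flags in `--model_args` map directly onto the constructor arguments added to the curator-backed model class in `eval/chat_benchmarks/curator_lm.py` later in this diff. A minimal sketch of that mapping, assuming the class is named `CuratorAPIModel` and importable from that module (neither is shown in these hunks):

```python
# Sketch only: how the README's --model_args string corresponds to the new constructor arguments.
# The class name CuratorAPIModel and its import path are assumptions, not part of this patch.
from eval.chat_benchmarks.curator_lm import CuratorAPIModel

lm = CuratorAPIModel(
    model="claude-3-7-sonnet-latest-thinking",
    tokenized_requests=False,  # curator only supports non-tokenized chat requests
    timeout=2000,              # allow long thinking turns before a request times out
    max_length=64000,          # completion budget; the thinking budget is derived from it
    batch=True,                # send requests through the provider's batch API
)
```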
"content": PROMPT.format(problem=example["problem"])}, diff --git a/eval/chat_benchmarks/curator_lm.py b/eval/chat_benchmarks/curator_lm.py index 8e023afe..e7af1d62 100644 --- a/eval/chat_benchmarks/curator_lm.py +++ b/eval/chat_benchmarks/curator_lm.py @@ -19,59 +19,125 @@ def __init__( model: str = None, pretrained: str = None, max_length: Optional[int] = 2048, - max_retries: int = 10, + max_retries: int = 20, timeout: int = 300, tokenized_requests: bool = False, max_requests_per_minute: int = None, max_tokens_per_minute: int = None, seconds_to_pause_on_rate_limit: int = None, + batch: bool = False, + temperature: float = 0.0, + top_p: float = 0.95, **kwargs, ): super().__init__() - self.model_name = model or pretrained - - self.model_args = kwargs - self.model_args.update( - { - "model": self.model_name, - "pretrained": pretrained, - "max_length": max_length, - "max_retries": max_retries, - "timeout": timeout, - "tokenized_requests": tokenized_requests, - } - ) - - if "gemini" in self.model_name and "thinking" in self.model_name: - max_requests_per_minute = max_requests_per_minute or 200 - max_tokens_per_minute = max_tokens_per_minute or 400_000 - elif "gemini" in self.model_name: - max_requests_per_minute = max_requests_per_minute or 2000 - max_tokens_per_minute = max_tokens_per_minute or 4_000_000 - elif "claude" in self.model_name: - max_requests_per_minute = max_requests_per_minute or 2000 - max_tokens_per_minute = max_tokens_per_minute or 80_000 - if tokenized_requests: raise NotImplementedError("Tokenized requests not implemented for curator.") self.tokenized_requests = False + + self.model_name = model or pretrained self.max_length = max_length - self.llm = None - self.gen_kwargs = {} - self.eos = None - if "temperature" in kwargs: - self.gen_kwargs["temperature"] = kwargs["temperature"] - if "top_p" in kwargs: - self.gen_kwargs["top_p"] = kwargs["top_p"] + self.is_batch_request = batch + self._configure_params( + max_length=max_length, + max_retries=max_retries, + timeout=timeout, + max_requests_per_minute=max_requests_per_minute, + max_tokens_per_minute=max_tokens_per_minute, + seconds_to_pause_on_rate_limit=seconds_to_pause_on_rate_limit, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + self.llm = None # Initialize lazily + self.eos = None # Will be set during LLM initialization if needed + + # Disable cache since it is not necessary + os.environ["CURATOR_DISABLE_CACHE"] = "true" + + def _configure_params( + self, + max_length: int, + max_retries: int, + timeout: int, + max_requests_per_minute: Optional[int], + max_tokens_per_minute: Optional[int], + seconds_to_pause_on_rate_limit: Optional[int], + temperature: float, + top_p: float, + **kwargs, + ): + """Sets up gen_kwargs and backend_params based on model name and init args.""" + self.gen_kwargs = { + "max_completion_tokens": max_length, + "temperature": temperature, + "top_p": top_p, + "stop": None, # Will be set later if needed based on request + } self.backend_params = { - "invalid_finish_reasons": [ - "content_filter" - ], # So it doesn't retry on `length` finish reason, but retries on "content_filter"} + "invalid_finish_reasons": ["content_filter"], "require_all_responses": False, "request_timeout": timeout, "max_retries": max_retries, } + self.additional_llm_args = {} # For args passed directly to curator.LLM constructor + + # Model-specific adjustments + is_thinking_model = "thinking" in self.model_name or "gemini-2.5-pro" in self.model_name + + if "gemini" in self.model_name: + if self.is_batch_request: 
@@ -79,51 +145,38 @@ def __init__(
         if seconds_to_pause_on_rate_limit is not None:
             self.backend_params["seconds_to_pause_on_rate_limit"] = seconds_to_pause_on_rate_limit
 
-        # Disable cache since it is not necessary
-        os.environ["CURATOR_DISABLE_CACHE"] = "true"
+        # Handle batch mode specifics
+        if self.is_batch_request:
+            # Rate limiting params are incompatible with batch requests in curator
+            self.backend_params = {"require_all_responses": True}
+            self.additional_llm_args["batch"] = True
 
-    def _create_payload(
-        self,
-        messages: Union[List[List[int]], List[dict], List[str], str],
-        *,
-        generate: bool = False,
-        gen_kwargs: Optional[dict] = None,
-        eos=None,
-        **kwargs,
-    ) -> dict:
-        assert generate, "Curator only supports generation."
-        # Create the payload for the API request
-        max_tokens = self.max_length or gen_kwargs.get("max_gen_toks", self.max_length)
-        temperature = self.gen_kwargs.get("temperature", gen_kwargs.get("temperature", 0))
-        top_p = self.gen_kwargs.get("top_p", gen_kwargs.get("top_p", 0.95))
-        stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
-        gen_kwargs = {
-            "max_completion_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "stop": stop,
-        }
-        if "o1" in self.model_name:
-            print("Warning: O1 model does not support top_p, stop, or temperature. Ignoring them.")
-            gen_kwargs.pop("top_p")
-            gen_kwargs.pop("stop")
-            gen_kwargs.pop("temperature")
+
+    def _ensure_llm_initialized(self, eos=None):
+        """Initialize the curator.LLM object if it hasn't been already."""
         if self.llm is None:
-            self.eos = eos
-            self.gen_kwargs = gen_kwargs.copy()
+            # Update stop sequences based on the current request if needed.
+            # This assumes the EOS is consistent for the lifetime of the model instance.
+            if eos and self.gen_kwargs.get("stop") is None:
+                self.eos = eos  # Store for potential future reference
+                # Handle a potential list of stop sequences
+                stop_sequences = handle_stop_sequences(None, eos)
+                # Only update if stop sequences are actually needed and supported
+                if stop_sequences and "stop" in self.gen_kwargs:
+                    self.gen_kwargs["stop"] = stop_sequences
+                elif stop_sequences and "max_tokens" in self.gen_kwargs and "claude" not in self.model_name:
+                    # Warn if stop sequences were provided but the 'stop' parameter is unavailable
+                    # (Claude is already handled in _configure_params)
+                    print(f"Warning: Stop sequences provided but 'stop' generation parameter is not available for {self.model_name}.")
+
+            print(
+                f"Initializing curator.LLM with: model_name='{self.model_name}', "
+                f"generation_params={self.gen_kwargs}, backend_params={self.backend_params}, "
+                f"additional_args={self.additional_llm_args}"
+            )
             self.llm = curator.LLM(
-                model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
+                model_name=self.model_name,
+                generation_params=self.gen_kwargs,
+                backend_params=self.backend_params,
+                **self.additional_llm_args,
             )
-        else:
-            if self.gen_kwargs != gen_kwargs:
-                print(
-                    "Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
-                )
-                self.gen_kwargs = gen_kwargs.copy()
-                self.llm = curator.LLM(
-                    model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
-                )
-        return messages
 
     def create_message(
         self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False
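Editor's note: the net effect of the batch handling and lazy initialization above is that `curator.LLM` is constructed exactly once, with different backend parameters depending on the mode. An illustrative sketch of the two resulting constructor calls (values follow the code above; the import is assumed to match how `curator` is brought in elsewhere in `curator_lm.py`):

```python
# Sketch only: constructor calls that _ensure_llm_initialized ends up making.
from bespokelabs import curator  # import path assumed

# Online mode (batch=False): rate limits and retries are passed through.
online_llm = curator.LLM(
    model_name="gpt-4o-mini-2024-07-18",
    generation_params={"max_completion_tokens": 2048, "temperature": 0.0, "top_p": 0.95, "stop": None},
    backend_params={
        "invalid_finish_reasons": ["content_filter"],
        "require_all_responses": False,
        "request_timeout": 300,
        "max_retries": 20,
    },
)

# Batch mode (batch=True): rate-limit params are dropped and batch=True is forwarded.
batch_llm = curator.LLM(
    model_name="gpt-4o-mini-2024-07-18",
    generation_params={"max_completion_tokens": 2048, "temperature": 0.0, "top_p": 0.95, "stop": None},
    backend_params={"require_all_responses": True},
    batch=True,  # forwarded via additional_llm_args
)
```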
@@ -158,12 +211,27 @@ def tokenizer_name(self) -> str:
         return self.model_name
 
     def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, JsonChatStr]:
-        # Convert chat history to the required format
+        # Convert chat history to the required JsonChatStr format
         return JsonChatStr(json.dumps(chat_history))
 
     def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]:
-        payload = self._create_payload(self.create_message(messages), **kwargs)
-        response = self.llm(payload)["response"]
+        # generate_until is the primary entry point used by the lm-eval harness and handles
+        # batched requests; this method serves direct single calls and now also goes through
+        # _ensure_llm_initialized and create_message.
+        print("Warning: model_call is likely deprecated for lm-eval tasks. Use generate_until.")
+        self._ensure_llm_initialized()  # Make sure the LLM is ready
+        # Ensure messages is a list, since curator expects a list of prompts
+        if not isinstance(messages, list):
+            messages = [messages]
+
+        formatted_messages = self.create_message(messages)
+        if not formatted_messages:
+            return None  # Nothing to send
+
+        # Curator returns a dictionary with a 'response' key containing a list of outputs
+        response = self.llm(formatted_messages)["response"]
+        return response
 
     def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
Use generate_until.") + self._ensure_llm_initialized() # Make sure LLM is ready + # Ensure messages is a list, as curator expects a list of prompts + if not isinstance(messages, list): + messages = [messages] + + formatted_messages = self.create_message(messages) + # Assuming model_call handles a single prompt, curator expects a list + if not formatted_messages: + return None # Or raise error + + # Curator returns a dictionary with a 'response' key containing a list of outputs + response = self.llm(formatted_messages)["response"] + return response def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: @@ -179,26 +247,34 @@ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]: @property def eot_token_id(self) -> Optional[int]: - # Assuming the model has a specific end-of-text token ID - return self.llm.eot_token_id # Replace with actual method to get EOT token ID + # Curator doesn't directly expose tokenizer or token IDs. + # Need to rely on underlying model specifics if absolutely necessary, + # but lm-eval generally handles this via stop sequences. + print("Warning: eot_token_id is not directly available via Curator API.") + return None # Cannot reliably get this from curator def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]: - # Tokenize contexts if required - if self.tokenized_requests: - raise NotImplementedError("Tokenized requests not implemented for curator.") + if not requests: + return [] + + # Ensure LLM is initialized, passing eos from the first request's gen_kwargs + # Assumes eos is consistent across the batch, which is reasonable for lm-eval. + first_req_kwargs = requests[0].args[1] if len(requests[0].args) > 1 else {} + self._ensure_llm_initialized(eos=first_req_kwargs.get("until")) - # Extract contexts and generation kwargs from the Instance objects + # Extract contexts (already in JsonChatStr format expected by create_message) contexts = [req.args[0] for req in requests] - gen_kwargs = [req.args[1] for req in requests] - # Assert all gen_kwargs are the same - assert all( - gen_kwargs[0] == gkw for gkw in gen_kwargs - ), "Generation parameters must be the same for all requests in curator" + # Format messages for curator + formatted_messages = self.create_message(contexts) + + response = self.llm(formatted_messages) + # Make the call to curator + try: + response = response["response"] + except Exception as e: + response = response.dataset["response"] - contexts_dataset = self.create_message(contexts) - payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0]) - response = self.llm(payload)["response"] return response def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: