16 changes: 16 additions & 0 deletions README.md
@@ -26,6 +26,22 @@ Evalchemy is developed by the [DataComp community](https://datacomp.ai) and [Bes
--model_args 'tokenized_requests=False' \
--output_path logs
```

Here are other examples of `model_name` (a full command sketch follows the list):
- `"claude-3-7-sonnet-latest-thinking"`
- `"deepseek-reasoner"`
- `"gemini/gemini-1.5-flash"`
- `"claude-3-7-sonnet-latest"`
- `"gpt-4o-mini-2024-07-18"`
- `"o1-preview-2024-09-12"`
- `"gpt-4o-2024-08-06"`

You can also adjust `model_args` to fit your needs. For example, `"claude-3-7-sonnet-latest-thinking"` may need a larger token budget and a longer timeout for its thinking process, and it can run in batch mode to speed up evaluation and reduce costs. Set `model_args` like this:

```
--model_args 'tokenized_requests=False,timeout=2000,max_length=64000,batch=True'
```

#### [2025.01.29] New Reasoning Benchmarks

- AIME24, AMC23, MATH500, LiveCodeBench, GPQADiamond, HumanEvalPlus, MBPPPlus, BigCodeBench, MultiPL-E, and CRUXEval have been added to our growing list of [available benchmarks](https://github.com/mlfoundations/evalchemy?tab=readme-ov-file#built-in-benchmarks). This is part of the [Open Thoughts](https://github.com/open-thoughts/open-thoughts) project. See [our blog post](https://www.open-thoughts.ai/blog/measure) on using Evalchemy to measure reasoning models.
6 changes: 0 additions & 6 deletions eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -64,12 +64,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_instances = []
if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]

all_outputs = []
for i in range(self.n_repeat):
7 changes: 0 additions & 7 deletions eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -69,13 +69,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
example["multiple_choice_string"] = multiple_choice_string
example["answer"] = correct_answer

if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]

all_outputs = []

for i in range(self.n_repeat):
6 changes: 0 additions & 6 deletions eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -61,12 +61,6 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:

# Prepare instances for model
all_instances = []
if isinstance(model, lm_eval.models.huggingface.HFLM):
model_name = model.pretrained
elif isinstance(model, lm_eval.models.openai_completions.OpenAIChatCompletion):
model_name = str(f"openai/{model.model}")
else:
model_name = model.model_args["model"]
for idx, example in enumerate(examples):
messages = [
{"role": "user", "content": PROMPT.format(problem=example["problem"])},
262 changes: 169 additions & 93 deletions eval/chat_benchmarks/curator_lm.py
@@ -19,111 +19,164 @@ def __init__(
model: str = None,
pretrained: str = None,
max_length: Optional[int] = 2048,
max_retries: int = 10,
max_retries: int = 20,
timeout: int = 300,
tokenized_requests: bool = False,
max_requests_per_minute: int = None,
max_tokens_per_minute: int = None,
seconds_to_pause_on_rate_limit: int = None,
batch: bool = False,
temperature: float = 0.0,
top_p: float = 0.95,
**kwargs,
):
super().__init__()

self.model_name = model or pretrained

self.model_args = kwargs
self.model_args.update(
{
"model": self.model_name,
"pretrained": pretrained,
"max_length": max_length,
"max_retries": max_retries,
"timeout": timeout,
"tokenized_requests": tokenized_requests,
}
)

if "gemini" in self.model_name and "thinking" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 200
max_tokens_per_minute = max_tokens_per_minute or 400_000
elif "gemini" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 4_000_000
elif "claude" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 80_000

if tokenized_requests:
raise NotImplementedError("Tokenized requests not implemented for curator.")
self.tokenized_requests = False

self.model_name = model or pretrained
self.max_length = max_length
self.llm = None
self.gen_kwargs = {}
self.eos = None
if "temperature" in kwargs:
self.gen_kwargs["temperature"] = kwargs["temperature"]
if "top_p" in kwargs:
self.gen_kwargs["top_p"] = kwargs["top_p"]
self.is_batch_request = batch
self._configure_params(
max_length=max_length,
max_retries=max_retries,
timeout=timeout,
max_requests_per_minute=max_requests_per_minute,
max_tokens_per_minute=max_tokens_per_minute,
seconds_to_pause_on_rate_limit=seconds_to_pause_on_rate_limit,
temperature=temperature,
top_p=top_p,
**kwargs,
)

self.llm = None # Initialize lazily
self.eos = None # Will be set during LLM initialization if needed

# Disable cache since it is not necessary
os.environ["CURATOR_DISABLE_CACHE"] = "true"

def _configure_params(
self,
max_length: int,
max_retries: int,
timeout: int,
max_requests_per_minute: Optional[int],
max_tokens_per_minute: Optional[int],
seconds_to_pause_on_rate_limit: Optional[int],
temperature: float,
top_p: float,
**kwargs,
):
"""Sets up gen_kwargs and backend_params based on model name and init args."""
self.gen_kwargs = {
"max_completion_tokens": max_length,
"temperature": temperature,
"top_p": top_p,
"stop": None, # Will be set later if needed based on request
}
self.backend_params = {
"invalid_finish_reasons": [
"content_filter"
], # So it doesn't retry on `length` finish reason, but retries on "content_filter"}
"invalid_finish_reasons": ["content_filter"],
"require_all_responses": False,
"request_timeout": timeout,
"max_retries": max_retries,
}
self.additional_llm_args = {} # For args passed directly to curator.LLM constructor

# Model-specific adjustments
is_thinking_model = "thinking" in self.model_name or "gemini-2.5-pro" in self.model_name

if "gemini" in self.model_name:
if self.is_batch_request:
self.additional_llm_args["backend"] = "gemini"
self.gen_kwargs.pop("max_completion_tokens", None)
self.gen_kwargs.pop("stop", None)

if is_thinking_model:
max_requests_per_minute = max_requests_per_minute or 200
max_tokens_per_minute = max_tokens_per_minute or 400_000
else:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 4_000_000
elif "claude" in self.model_name:
max_requests_per_minute = max_requests_per_minute or 2000
max_tokens_per_minute = max_tokens_per_minute or 80_000
# Claude uses 'max_tokens' instead of 'max_completion_tokens'
self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens")
self.gen_kwargs.pop("stop", None) # Claude doesn't support stop sequences via API arg

if is_thinking_model:
# Adjust name and set thinking params
self.model_name = (
self.model_name.replace("-thinking-", "")
.replace("-thinking", "")
.replace("thinking-", "")
.replace("thinking", "")
)
# Thinking budget calculation depends on final max_tokens
thinking_budget = self.gen_kwargs["max_tokens"] - 4096
self.gen_kwargs["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget}
# API requirements for thinking mode
self.gen_kwargs["temperature"] = 1.0
self.gen_kwargs.pop("top_p", None)
elif "deepseek" in self.model_name:
self.additional_llm_args["backend"] = "openai"
self.backend_params["base_url"] = "https://api.deepseek.com/"
self.backend_params["api_key"] = os.environ["DEEPSEEK_API_KEY"]
max_requests_per_minute = 2_500 # Override rate limits
max_tokens_per_minute = 1_000_000_000
self.gen_kwargs["temperature"] = 0 # Override temperature
elif "o1" in self.model_name or "o3" in self.model_name or "o4" in self.model_name:
# o1/o3/o4 models don't support these parameters
print(f"Warning: Model {self.model_name} does not support top_p, stop, or temperature. Ignoring them.")
self.gen_kwargs.pop("top_p", None)
self.gen_kwargs.pop("stop", None)
self.gen_kwargs.pop("temperature", None)
elif "xai" in self.model_name:
self.gen_kwargs["max_tokens"] = self.gen_kwargs.pop("max_completion_tokens", max_length)


# Apply rate limits if provided and not overridden by model specifics
if max_requests_per_minute is not None:
self.backend_params["max_requests_per_minute"] = max_requests_per_minute
if max_tokens_per_minute is not None:
self.backend_params["max_tokens_per_minute"] = max_tokens_per_minute
if seconds_to_pause_on_rate_limit is not None:
self.backend_params["seconds_to_pause_on_rate_limit"] = seconds_to_pause_on_rate_limit

# Disable cache since it is not necessary
os.environ["CURATOR_DISABLE_CACHE"] = "true"
# Handle batch mode specifics
if self.is_batch_request:
# Rate limiting params are incompatible with batch requests in curator
self.backend_params = {"require_all_responses": True}
self.additional_llm_args["batch"] = True

def _create_payload(
self,
messages: Union[List[List[int]], List[dict], List[str], str],
*,
generate: bool = False,
gen_kwargs: Optional[dict] = None,
eos=None,
**kwargs,
) -> dict:
assert generate, "Curator only supports generation."
# Create the payload for the API request
max_tokens = self.max_length or gen_kwargs.get("max_gen_toks", self.max_length)
temperature = self.gen_kwargs.get("temperature", gen_kwargs.get("temperature", 0))
top_p = self.gen_kwargs.get("top_p", gen_kwargs.get("top_p", 0.95))
stop = handle_stop_sequences(gen_kwargs.get("until", None), eos)
gen_kwargs = {
"max_completion_tokens": max_tokens,
"temperature": temperature,
"top_p": top_p,
"stop": stop,
}
if "o1" in self.model_name:
print("Warning: O1 model does not support top_p, stop, or temperature. Ignoring them.")
gen_kwargs.pop("top_p")
gen_kwargs.pop("stop")
gen_kwargs.pop("temperature")

def _ensure_llm_initialized(self, eos=None):
"""Initializes the curator.LLM object if it hasn't been already."""
if self.llm is None:
self.eos = eos
self.gen_kwargs = gen_kwargs.copy()
# Update stop sequences based on the current request if needed
# This assumes EOS is consistent for the lifetime of the model instance
if eos and self.gen_kwargs.get("stop") is None:
self.eos = eos # Store for potential future reference if needed
# Handle potential list of stop sequences
stop_sequences = handle_stop_sequences(None, eos) # Pass current eos
# Only update if stop sequences are actually needed and supported
if stop_sequences and "stop" in self.gen_kwargs:
self.gen_kwargs["stop"] = stop_sequences
elif stop_sequences and "max_tokens" in self.gen_kwargs and "claude" not in self.model_name:
# Only warn if stop sequences were provided but the param doesn't exist
# (like for Claude, which was handled in _configure_params)
print(f"Warning: Stop sequences provided but 'stop' generation parameter is not available for {self.model_name}.")


print(f"Initializing curator.LLM with: model_name='{self.model_name}', generation_params={self.gen_kwargs}, backend_params={self.backend_params}, additional_args={self.additional_llm_args}")
self.llm = curator.LLM(
model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
model_name=self.model_name,
generation_params=self.gen_kwargs,
backend_params=self.backend_params,
**self.additional_llm_args,
)
else:
if self.gen_kwargs != gen_kwargs:
print(
"Recreating curator LLM with new generation parameters, make sure this doesn't happen at every request"
)
self.gen_kwargs = gen_kwargs.copy()
self.llm = curator.LLM(
model_name=self.model_name, generation_params=gen_kwargs, backend_params=self.backend_params.copy()
)
return messages

def create_message(
self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], generate=False
@@ -158,12 +211,27 @@ def tokenizer_name(self) -> str:
return self.model_name

def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> Union[str, JsonChatStr]:
# Convert chat history to the required format
# Convert chat history to the required JsonChatStr format
return JsonChatStr(json.dumps(chat_history))

def model_call(self, messages: Union[List[List[int]], List[str], List[JsonChatStr]], **kwargs) -> Optional[dict]:
payload = self._create_payload(self.create_message(messages), **kwargs)
response = self.llm(payload)["response"]
# Deprecated or needs rework? generate_until is the primary method used by lm-eval harness.
# This method seems designed for single requests, while generate_until handles batches.
# If needed, it should also use _ensure_llm_initialized and create_message.
print("Warning: model_call is likely deprecated for lm-eval tasks. Use generate_until.")
self._ensure_llm_initialized() # Make sure LLM is ready
# Ensure messages is a list, as curator expects a list of prompts
if not isinstance(messages, list):
messages = [messages]

formatted_messages = self.create_message(messages)
# Assuming model_call handles a single prompt, curator expects a list
if not formatted_messages:
return None # Or raise error

# Curator returns a dictionary with a 'response' key containing a list of outputs
response = self.llm(formatted_messages)["response"]

return response

def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
@@ -179,26 +247,34 @@ def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:

@property
def eot_token_id(self) -> Optional[int]:
# Assuming the model has a specific end-of-text token ID
return self.llm.eot_token_id # Replace with actual method to get EOT token ID
# Curator doesn't directly expose tokenizer or token IDs.
# Need to rely on underlying model specifics if absolutely necessary,
# but lm-eval generally handles this via stop sequences.
print("Warning: eot_token_id is not directly available via Curator API.")
return None # Cannot reliably get this from curator

def generate_until(self, requests: List[Instance], disable_tqdm: bool = False) -> List[str]:
# Tokenize contexts if required
if self.tokenized_requests:
raise NotImplementedError("Tokenized requests not implemented for curator.")
if not requests:
return []

# Ensure LLM is initialized, passing eos from the first request's gen_kwargs
# Assumes eos is consistent across the batch, which is reasonable for lm-eval.
first_req_kwargs = requests[0].args[1] if len(requests[0].args) > 1 else {}
self._ensure_llm_initialized(eos=first_req_kwargs.get("until"))

# Extract contexts and generation kwargs from the Instance objects
# Extract contexts (already in JsonChatStr format expected by create_message)
contexts = [req.args[0] for req in requests]
gen_kwargs = [req.args[1] for req in requests]

# Assert all gen_kwargs are the same
assert all(
gen_kwargs[0] == gkw for gkw in gen_kwargs
), "Generation parameters must be the same for all requests in curator"
# Format messages for curator
formatted_messages = self.create_message(contexts)

# Make the call to curator
response = self.llm(formatted_messages)
# Extract the generated responses; fall back to the `.dataset` attribute when the direct lookup fails
try:
response = response["response"]
except Exception:
response = response.dataset["response"]

contexts_dataset = self.create_message(contexts)
payload = self._create_payload(contexts_dataset, generate=True, gen_kwargs=gen_kwargs[0])
response = self.llm(payload)["response"]
return response

def loglikelihood_rolling(self, requests, disable_tqdm: bool = False) -> List[float]: