from typing import List, Dict, Union

import tiktoken


def num_tokens(
    text_or_messages: Union[str, List[Dict[str, str]]],
    model: str = "gpt-4o",
) -> int:
    """Estimate the number of tokens used by a string or list of chat messages.

    For a plain string, this is simply the length of its token encoding.
    For a message list, the estimate follows the OpenAI cookbook scheme:
    a fixed per-message overhead plus the encoded role and content, and a
    single fixed overhead for the assistant-reply priming tokens.

    :param text_or_messages: A string, or a list of
        ``{"role": ..., "content": ...}`` message dicts.
    :param model: Model name used to select the tokenizer
        (defaults to ``"gpt-4o"``).
    :return: Estimated number of tokens.
    :raises TypeError: If the input is neither a string nor a list.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the encoding used by the
        # gpt-4o family rather than failing outright.
        encoding = tiktoken.get_encoding("o200k_base")

    if isinstance(text_or_messages, str):
        return len(encoding.encode(text_or_messages))

    if isinstance(text_or_messages, list):
        total = 0
        for msg in text_or_messages:
            # Fixed per-message overhead (message framing tokens).
            total += 3
            # ``or ""`` guards against explicit ``None`` values, which
            # ``encoding.encode`` would reject.
            total += len(encoding.encode(msg.get("content", "") or ""))
            total += len(encoding.encode(msg.get("role", "") or ""))
        # Reply priming tokens are added ONCE per conversation, not once
        # per message (this was previously inside the loop — a bug that
        # over-counted by 3 tokens per message beyond the first).
        total += 3
        return total

    raise TypeError("Input must be a string or a list of messages.")