diff --git a/development/app/app.py b/development/app/app.py
index 81c1eb966..00bdcf7cd 100644
--- a/development/app/app.py
+++ b/development/app/app.py
@@ -238,6 +238,7 @@ def completion():
     latency = 0.0
     if simulator is not None:
         latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
+        logger.debug("input_tokens %s model metadata %s", input_tokens, simulator.model_metadata)
 
     # Simulated response
     response = {
@@ -698,7 +699,9 @@ def metrics():
+    num_layers, num_heads, hidden_size = None, None, None
     if gpu_device != "disabled":
         # Load the tokenizer for your model
         from transformers import AutoTokenizer
-
+        from transformers import AutoConfig
+
         default_model = 'bert-base-uncased'
         try:
             # can we make this as an application argument.
@@ -709,15 +712,30 @@ def metrics():
                 token=HUGGINGFACE_TOKEN,
                 model_max_length=16384,  # Suppress warning
                 clean_up_tokenization_spaces=True)
+            config = AutoConfig.from_pretrained(token_model, token=HUGGINGFACE_TOKEN)
+            # Extract required details
+            num_layers = config.num_hidden_layers
+            num_heads = config.num_attention_heads
+            hidden_size = config.hidden_size
         except Exception as e:
             logger.error(f"Failed to initialize tokenizer, will use default tokenizer model: {e}")
             tokenizer = AutoTokenizer.from_pretrained(
                 default_model,
                 model_max_length=16384,  # Suppress warning
                 clean_up_tokenization_spaces=True)
+            config = AutoConfig.from_pretrained(default_model)
+            # Extract required details
+            num_layers = config.num_hidden_layers
+            num_heads = config.num_attention_heads
+            hidden_size = config.hidden_size
 
     # TODO: check whether able to use argparse to build SimulationConfig
     simulator = Simulator(SimulationConfig.create_from_cli_args())
+
+    simulator.model_metadata['num_layers'] = num_layers
+    simulator.model_metadata['num_heads'] = num_heads
+    simulator.model_metadata['hidden_size'] = hidden_size
+
     overrides = {
         "total": 100.0,
         "running": 0,
diff --git a/development/app/simulator.py b/development/app/simulator.py
index f2137cb1f..fe5cbaa43 100644
--- a/development/app/simulator.py
+++ b/development/app/simulator.py
@@ -4,7 +4,7 @@
 import json
 import sys
 import threading
-from typing import List
+from typing import List, Dict
 
 from vidur.config import SimulationConfig
 from vidur.entities import Cluster, Request
@@ -56,6 +56,8 @@ def __init__(self, config: SimulationConfig) -> None:
         self._expect_next_tick = sys.float_info.max
         self._queue_buffer: List[Request] = []
         self._queue = None
+
+        self._model_metadata = {}
         # self._init_event_queue()
 
         atexit.register(self._write_output)
@@ -68,6 +70,10 @@ def scheduler(self) -> BaseGlobalScheduler:
     def metric_store(self) -> MetricsStore:
         return self._metric_store
 
+    @property
+    def model_metadata(self) -> Dict:
+        return self._model_metadata
+
     def start(self):
         logger.info(
             f"Starting simulation with cluster: {self._cluster}, model: {self._config.cluster_config.replica_config.model_name}, seed: {self._config.seed}"