
Improve cache awareness for mock app #880


Open · wants to merge 2 commits into base: main
development/app/app.py (19 additions, 1 deletion)
@@ -238,6 +238,7 @@ def completion():
    latency = 0.0
    if simulator is not None:
        latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next))
        logger.debug(f"input_tokens {input_tokens} model metadata {simulator.model_metadata}")

    # Simulated response
    response = {
@@ -698,7 +699,9 @@ def metrics():
if gpu_device != "disabled":
    # Load the tokenizer and config for your model
    from transformers import AutoTokenizer
    from transformers import AutoConfig

    num_layers, num_heads, hidden_size = None, None, None
    default_model = 'bert-base-uncased'
    try:
        # can we make this an application argument?
@@ -709,15 +712,30 @@
            token=HUGGINGFACE_TOKEN,
            model_max_length=16384,  # Suppress warning
            clean_up_tokenization_spaces=True)
        config = AutoConfig.from_pretrained(token_model, token=HUGGINGFACE_TOKEN)
        # Extract required details
        num_layers = config.num_hidden_layers
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size
    except Exception as e:
        logger.error(f"Failed to initialize tokenizer, will use default tokenizer model: {e}")
        tokenizer = AutoTokenizer.from_pretrained(
            default_model,
            model_max_length=16384,  # Suppress warning
            clean_up_tokenization_spaces=True)
        config = AutoConfig.from_pretrained(default_model)
        # Extract required details
        num_layers = config.num_hidden_layers
        num_heads = config.num_attention_heads
        hidden_size = config.hidden_size

    # TODO: check whether argparse can be used to build SimulationConfig
    simulator = Simulator(SimulationConfig.create_from_cli_args())

    simulator.model_metadata['num_layers'] = num_layers
    simulator.model_metadata['num_heads'] = num_heads
    simulator.model_metadata['hidden_size'] = hidden_size

    overrides = {
        "total": 100.0,
        "running": 0,
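
The values stashed on simulator.model_metadata are exactly what a KV-cache size estimate needs. A minimal sketch of how they could feed such an estimate, assuming fp16 storage (2 bytes per element) and standard multi-head attention (no GQA/MQA); the helper name is hypothetical and not part of this PR:

    def kv_cache_bytes_per_token(metadata, dtype_bytes=2):
        # Each layer stores one key and one value vector of size hidden_size
        # per token, hence the factor of 2. Assumes full multi-head attention.
        return 2 * metadata['num_layers'] * metadata['hidden_size'] * dtype_bytes

    # Example with bert-base-uncased's shape (12 layers, hidden_size 768):
    # 2 * 12 * 768 * 2 = 36,864 bytes of KV cache per token in fp16.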
development/app/simulator.py (7 additions, 1 deletion)
@@ -4,7 +4,7 @@
import json
import sys
import threading
from typing import List, Dict

from vidur.config import SimulationConfig
from vidur.entities import Cluster, Request
@@ -56,6 +56,8 @@ def __init__(self, config: SimulationConfig) -> None:
        self._expect_next_tick = sys.float_info.max
        self._queue_buffer: List[Request] = []
        self._queue = None

        # Model shape details (num_layers, num_heads, hidden_size);
        # populated by the app after construction.
        self._model_metadata = {}

        # self._init_event_queue()
        atexit.register(self._write_output)
@@ -68,6 +70,10 @@ def scheduler(self) -> BaseGlobalScheduler:
    def metric_store(self) -> MetricsStore:
        return self._metric_store

    @property
    def model_metadata(self) -> Dict:
        return self._model_metadata

    def start(self):
        logger.info(
            f"Starting simulation with cluster: {self._cluster}, model: {self._config.cluster_config.replica_config.model_name}, seed: {self._config.seed}"
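
Together, the two changes give the mock app a hook for cache-aware decisions: app.py extracts the model shape, and simulator.py exposes it through the model_metadata property. A rough usage sketch, where the 24 GB budget and the admission check are illustrative assumptions, not part of the PR:

    GPU_MEMORY_BYTES = 24 * 1024**3  # assumed 24 GB device budget, not from the PR

    def can_admit(simulator, cached_tokens, new_tokens):
        # Reject requests whose KV cache would overflow the assumed budget.
        md = simulator.model_metadata
        per_token = 2 * md['num_layers'] * md['hidden_size'] * 2  # fp16 K+V
        return (cached_tokens + new_tokens) * per_token <= GPU_MEMORY_BYTES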