
Commit bceee12

Merge pull request #15 from aws-neuron/release_222
Neuron Release 2.22.0
2 parents: 6eef2a6 + 301b63a

File tree

126 files changed: +11711 additions, -2069 deletions


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -139,4 +139,7 @@ build
 src/neuronx_distributed.egg-info/
 *.whl
 **/.DS_Store
-__pycache__
+__pycache__
+
+# Compiler artifacts
+neuronxcc*

README.md

Lines changed: 0 additions & 6 deletions
@@ -27,7 +27,6 @@ inference_demo \
     --on-device-sampling \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is" \
@@ -51,7 +50,6 @@ inference_demo \
     --seq-len 1152 \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 0 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is" \
@@ -76,9 +74,7 @@ inference_demo \
     --seq-len 64 \
     --enable-bucketing \
     --speculation-length 5 \
-    --no-trace-tokengen-model \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --check-accuracy-mode token-matching \
@@ -105,7 +101,6 @@ inference_demo \
     --quantized-checkpoints-path /home/ubuntu/model_hf/Llama-2-7b/model_quant.pt \
     --quantization-type per_channel_symmetric \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is"
@@ -130,7 +125,6 @@ inference_demo \
     --tol-map "{5: (1e-5, 0.02)}" \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is"
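Every README example above pins --top-k 1, under which "sampling" always returns the argmax token, so the --do-sample flag was redundant and is dropped. A minimal sketch of why, in plain PyTorch; select_next_token is a hypothetical helper for illustration, not part of this repo:

# Hypothetical helper: top-k sampling with k=1 degenerates to argmax,
# so a separate do-sample switch changes nothing at top-k 1.
import torch

def select_next_token(logits: torch.Tensor, top_k: int = 1) -> torch.Tensor:
    values, indices = torch.topk(logits, k=top_k, dim=-1)  # keep top_k logits
    probs = torch.softmax(values, dim=-1)                  # renormalize over them
    choice = torch.multinomial(probs, num_samples=1)       # sample among survivors
    return indices.gather(-1, choice)

logits = torch.randn(1, 32000)
assert select_next_token(logits, top_k=1).item() == logits.argmax(-1).item()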

build.sh

Lines changed: 5 additions & 5 deletions
@@ -3,22 +3,22 @@ set -e
 
 : ${BUILD_PATH:=build}
 
-python3.8 -m pip install ruff
+python3.10 -m pip install ruff
 # remove --exit-zero once all errors are fixed/explicitly ignore
-python3.8 -m ruff check --line-length=120 --ignore=F401,E203
+python3.10 -m ruff check --line-length=120 --ignore=F401,E203
 # exit when asked to run `ruff` only
 if [[ "$1" == "ruff" ]]
 then
     exit 0
 fi
 
 # Run static code analysis
-python3.8 -m pip install mypy
-python3.8 -m mypy --no-incremental || true
+python3.10 -m pip install mypy
+python3.10 -m mypy --no-incremental || true
 # exit when asked to run `mypy` only
 if [[ "$1" == "mypy" ]]
 then
     exit 0
 fi
 
-python3.8 setup.py bdist_wheel --dist-dir ${BUILD_PATH}/pip/public/neuronx-distributed-inference
+python3.10 setup.py bdist_wheel --dist-dir ${BUILD_PATH}/pip/public/neuronx-distributed-inference
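The build now invokes python3.10 for linting, type checking, and wheel builds. If you want companion scripts to fail fast on an older interpreter, a small illustrative guard in the same spirit (not part of build.sh):

# Illustrative version guard matching the interpreter bump in build.sh.
import sys

if sys.version_info < (3, 10):
    raise RuntimeError(f"Python 3.10+ required, found {sys.version.split()[0]}")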

examples/generation_mllama.py

Lines changed: 4 additions & 3 deletions
@@ -44,15 +44,16 @@ def run_llama_generate():
         seq_len=seq_len,
         on_device_sampling_config=on_device_sampling_config,
         enable_bucketing=True,
-        sequence_parallel_enabled=False,
-        fused_qkv=False,
+        sequence_parallel_enabled=True,
+        fused_qkv=True,
         async_mode=False,
     )
     config = MllamaInferenceConfig(
         neuron_config,
         load_config=load_pretrained_config(model_path),
     )
-
+    config.neuron_config.skip_vision = False
+
     tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
     tokenizer.pad_token = tokenizer.eos_token

examples/image_encoding_vit.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+from transformers import ViTModel, AutoImageProcessor
+from PIL import Image
+import time
+import torch
+import os
+import numpy as np
+import logging
+
+import torch_xla
+
+from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config
+from neuronx_distributed_inference.models.config import NeuronConfig
+from neuronx_distributed_inference.utils.accuracy import check_accuracy_embeddings
+from neuronx_distributed_inference.utils.benchmark import LatencyCollector
+from neuronx_distributed_inference.models.vit.modeling_vit import NeuronViTForImageEncoding, ViTInferenceConfig
+
+
+NUM_BENCHMARK_ITER = 10
+MODEL_PATH = "/home/ubuntu/model_hf/google--vit-huge-patch14-224-in21k/"
+TRACED_MODEL_PATH = "/home/ubuntu/model_hf/google--vit-huge-patch14-224-in21k/traced_model/"
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def setup_debug_env():
+    os.environ["XLA_FALLBACK_CPU"] = "0"
+    os.environ["XLA_IR_DEBUG"] = "1"
+    os.environ["XLA_HLO_DEBUG"] = "1"
+    os.environ["NEURON_FUSE_SOFTMAX"] = "1"
+    torch_xla._XLAC._set_ir_debug(True)
+    torch.manual_seed(0)
+
+
+def run_vit_encoding(validate_accuracy=True):
+    # Define configs
+    neuron_config = NeuronConfig(
+        tp_degree=32,
+        torch_dtype=torch.float32,
+    )
+    inference_config = ViTInferenceConfig(
+        neuron_config=neuron_config,
+        load_config=load_pretrained_config(MODEL_PATH),
+        use_mask_token=False,
+        add_pooling_layer=False,
+        interpolate_pos_encoding=False
+    )
+
+    # Input image
+    image_file = "dog.jpg"  # [512, 512]
+    with open(image_file, "rb") as f:
+        image = Image.open(f).convert("RGB")
+    print(f"Input image size {image.size}")
+    # Preprocess input image
+    image_processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
+    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
+
+    # Get neuron model
+    neuron_model = NeuronViTForImageEncoding(model_path=MODEL_PATH, config=inference_config)
+
+    # Compile model on Neuron
+    compile_start_time = time.time()
+    neuron_model.compile(TRACED_MODEL_PATH)
+    compile_elapsed_time = time.time() - compile_start_time
+    print(f"Compilation time taken {compile_elapsed_time} s")
+
+    # Load model on Neuron
+    neuron_model.load(TRACED_MODEL_PATH)
+    print("Done loading neuron model")
+
+    # Run NxDI implementation on Neuron
+    neuron_latency_collector = LatencyCollector()
+    for i in range(NUM_BENCHMARK_ITER):
+        neuron_latency_collector.pre_hook()
+        neuron_output = neuron_model(pixel_values)[0]  # NeuronViTModel output: (sequence_output,) or (sequence_output, pooled_output)
+        neuron_latency_collector.hook()
+    print(f"Got neuron output {neuron_output.shape} {neuron_output}")
+    # Benchmark report
+    for p in [25, 50, 90, 99]:
+        latency = np.percentile(neuron_latency_collector.latency_list, p) * 1000
+        print(f"Neuron inference latency_ms_p{p}: {latency}")
+
+    # The section below is optional; use it to validate e2e accuracy against golden
+    if validate_accuracy:
+        # Get CPU model
+        cpu_model = ViTModel.from_pretrained(MODEL_PATH)
+        print(f"cpu model {cpu_model}")
+
+        # Get golden output by running the original implementation on CPU
+        cpu_latency_collector = LatencyCollector()
+        for i in range(NUM_BENCHMARK_ITER):
+            cpu_latency_collector.pre_hook()
+            golden_output = cpu_model(pixel_values).last_hidden_state
+            cpu_latency_collector.hook()
+        print(f"expected_output {golden_output.shape} {golden_output}")
+        # Benchmark report
+        for p in [25, 50, 90, 99]:
+            latency = np.percentile(cpu_latency_collector.latency_list, p) * 1000
+            print(f"CPU inference latency_ms_p{p}: {latency}")
+
+        # Compare output embeddings
+        passed, max_err = check_accuracy_embeddings(neuron_output, golden_output, plot_outputs=True, atol=1e-5, rtol=1e-5)
+        print(f"Golden and Neuron outputs match: {passed}, max relative error: {max_err}")
+
+
+if __name__ == "__main__":
+    # Set flags for debugging
+    setup_debug_env()
+
+    run_vit_encoding(validate_accuracy=True)
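The new ViT example times each iteration through LatencyCollector's pre_hook/hook pair. A minimal stand-in for that pattern, assuming the collector does nothing more than record wall-clock time per call (the real class lives in neuronx_distributed_inference.utils.benchmark and may do more):

# Simplified stand-in for LatencyCollector, assuming it only records
# wall-clock time between pre_hook() and hook().
import time

class SimpleLatencyCollector:
    def __init__(self):
        self.latency_list = []
        self._start = None

    def pre_hook(self):
        self._start = time.perf_counter()

    def hook(self):
        self.latency_list.append(time.perf_counter() - self._start)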

examples/multi_node.md

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ NEURON_CPP_LOG_LEVEL=1 NEURON_RT_ROOT_COMM_ID=10.1.201.64:63423 inference_demo \
     --model-type llama \
     --task-type causal-lm \
     run \
-    --model-path /shared_3/bowencc/nxd-ws-09-03/TinyLLama-v0 \
+    --model-path TinyLLama-v0 \
     --compiled-model-path traced_models/TinyLLama-v0-multi-node_0/ \
     --torch-dtype bfloat16 \
     --start_rank_id 0 \
@@ -84,7 +84,7 @@ NEURON_CPP_LOG_LEVEL=1 NEURON_RT_ROOT_COMM_ID=10.1.201.64:63423 inference_demo \
     --model-type llama \
     --task-type causal-lm \
     run \
-    --model-path /shared_3/bowencc/nxd-ws-09-03/TinyLLama-v0 \
+    --model-path TinyLLama-v0 \
     --compiled-model-path traced_models/TinyLLama-v0-multi-node_1/ \
     --torch-dtype bfloat16 \
     --start_rank_id 32 \

examples/requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,6 @@
-transformers==4.45.*
+transformers==4.48.*
+huggingface-hub
+diffusers==0.32.0
 sentencepiece
 pillow
 pytest-forked

File renamed without changes.
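The example requirements now pin transformers to the 4.48 series and add huggingface-hub and diffusers 0.32.0. A quick sanity check that an environment matches the new pins, assuming the packages are already installed:

# Sanity check against the updated pins in examples/requirements.txt.
import transformers
import diffusers

assert transformers.__version__.startswith("4.48.")
assert diffusers.__version__.startswith("0.32.0")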
