
Commit bceee12

Merge pull request #15 from aws-neuron/release_222
Neuron Release 2.22.0
2 parents: 6eef2a6 + 301b63a

File tree

126 files changed: +11711 additions, -2069 deletions


.gitignore

Lines changed: 4 additions & 1 deletion
@@ -139,4 +139,7 @@ build
 src/neuronx_distributed.egg-info/
 *.whl
 **/.DS_Store
-__pycache__
+__pycache__
+
+# Compiler artifacts
+neuronxcc*

README.md

Lines changed: 0 additions & 6 deletions
@@ -27,7 +27,6 @@ inference_demo \
     --on-device-sampling \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is" \
@@ -51,7 +50,6 @@ inference_demo \
     --seq-len 1152 \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 0 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is" \
@@ -76,9 +74,7 @@ inference_demo \
     --seq-len 64 \
     --enable-bucketing \
     --speculation-length 5 \
-    --no-trace-tokengen-model \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --check-accuracy-mode token-matching \
@@ -105,7 +101,6 @@ inference_demo \
     --quantized-checkpoints-path /home/ubuntu/model_hf/Llama-2-7b/model_quant.pt \
     --quantization-type per_channel_symmetric \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is"
@@ -130,7 +125,6 @@ inference_demo \
     --tol-map "{5: (1e-5, 0.02)}" \
     --enable-bucketing \
     --top-k 1 \
-    --do-sample \
     --pad-token-id 2 \
     --prompt "I believe the meaning of life is" \
     --prompt "The color of the sky is"
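Every README example above pins --top-k 1, under which "sampling" always returns the argmax token, so the --do-sample flag was redundant and is dropped. A minimal sketch of why, in plain PyTorch; select_next_token is a hypothetical helper for illustration, not part of this repo:

# Hypothetical helper: top-k sampling with k=1 degenerates to argmax,
# so a separate do-sample switch changes nothing at top-k 1.
import torch

def select_next_token(logits: torch.Tensor, top_k: int = 1) -> torch.Tensor:
    values, indices = torch.topk(logits, k=top_k, dim=-1)  # keep top_k logits
    probs = torch.softmax(values, dim=-1)                  # renormalize over them
    choice = torch.multinomial(probs, num_samples=1)       # sample among survivors
    return indices.gather(-1, choice)

logits = torch.randn(1, 32000)
assert select_next_token(logits, top_k=1).item() == logits.argmax(-1).item()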

build.sh

Lines changed: 5 additions & 5 deletions
@@ -3,22 +3,22 @@ set -e
 
 : ${BUILD_PATH:=build}
 
-python3.8 -m pip install ruff
+python3.10 -m pip install ruff
 # remove --exit-zero once all errors are fixed/explicitly ignore
-python3.8 -m ruff check --line-length=120 --ignore=F401,E203
+python3.10 -m ruff check --line-length=120 --ignore=F401,E203
 # exit when asked to run `ruff` only
 if [[ "$1" == "ruff" ]]
 then
     exit 0
 fi
 
 # Run static code analysis
-python3.8 -m pip install mypy
-python3.8 -m mypy --no-incremental || true
+python3.10 -m pip install mypy
+python3.10 -m mypy --no-incremental || true
 # exit when asked to run `mypy` only
 if [[ "$1" == "mypy" ]]
 then
     exit 0
 fi
 
-python3.8 setup.py bdist_wheel --dist-dir ${BUILD_PATH}/pip/public/neuronx-distributed-inference
+python3.10 setup.py bdist_wheel --dist-dir ${BUILD_PATH}/pip/public/neuronx-distributed-inference
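The build now invokes python3.10 for linting, type checking, and wheel builds. If you want companion scripts to fail fast on an older interpreter, a small illustrative guard in the same spirit (not part of build.sh):

# Illustrative version guard matching the interpreter bump in build.sh.
import sys

if sys.version_info < (3, 10):
    raise RuntimeError(f"Python 3.10+ required, found {sys.version.split()[0]}")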

examples/generation_mllama.py

Lines changed: 4 additions & 3 deletions
@@ -44,15 +44,16 @@ def run_llama_generate():
         seq_len=seq_len,
         on_device_sampling_config=on_device_sampling_config,
         enable_bucketing=True,
-        sequence_parallel_enabled=False,
-        fused_qkv=False,
+        sequence_parallel_enabled=True,
+        fused_qkv=True,
         async_mode=False,
     )
     config = MllamaInferenceConfig(
         neuron_config,
         load_config=load_pretrained_config(model_path),
     )
-
+    config.neuron_config.skip_vision = False
+
     tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="right")
     tokenizer.pad_token = tokenizer.eos_token

examples/image_encoding_vit.py

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+from transformers import ViTModel, AutoImageProcessor
+from PIL import Image
+import time
+import torch
+import os
+import numpy as np
+import logging
+
+import torch_xla
+
+from neuronx_distributed_inference.utils.hf_adapter import load_pretrained_config
+from neuronx_distributed_inference.models.config import NeuronConfig
+from neuronx_distributed_inference.utils.accuracy import check_accuracy_embeddings
+from neuronx_distributed_inference.utils.benchmark import LatencyCollector
+from neuronx_distributed_inference.models.vit.modeling_vit import NeuronViTForImageEncoding, ViTInferenceConfig
+
+
+NUM_BENCHMARK_ITER = 10
+MODEL_PATH = "/home/ubuntu/model_hf/google--vit-huge-patch14-224-in21k/"
+TRACED_MODEL_PATH = "/home/ubuntu/model_hf/google--vit-huge-patch14-224-in21k/traced_model/"
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+def setup_debug_env():
+    os.environ["XLA_FALLBACK_CPU"] = "0"
+    os.environ["XLA_IR_DEBUG"] = "1"
+    os.environ["XLA_HLO_DEBUG"] = "1"
+    os.environ["NEURON_FUSE_SOFTMAX"] = "1"
+    torch_xla._XLAC._set_ir_debug(True)
+    torch.manual_seed(0)
+
+
+def run_vit_encoding(validate_accuracy=True):
+    # Define configs
+    neuron_config = NeuronConfig(
+        tp_degree=32,
+        torch_dtype=torch.float32,
+    )
+    inference_config = ViTInferenceConfig(
+        neuron_config=neuron_config,
+        load_config=load_pretrained_config(MODEL_PATH),
+        use_mask_token=False,
+        add_pooling_layer=False,
+        interpolate_pos_encoding=False
+    )
+
+    # Input image
+    image_file = "dog.jpg"  # [512, 512]
+    with open(image_file, "rb") as f:
+        image = Image.open(f).convert("RGB")
+    print(f"Input image size {image.size}")
+    # Preprocess input image
+    image_processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
+    pixel_values = image_processor(image, return_tensors="pt")["pixel_values"]
+
+    # Get neuron model
+    neuron_model = NeuronViTForImageEncoding(model_path=MODEL_PATH, config=inference_config)
+
+    # Compile model on Neuron
+    compile_start_time = time.time()
+    neuron_model.compile(TRACED_MODEL_PATH)
+    compile_elapsed_time = time.time() - compile_start_time
+    print(f"Compilation time taken {compile_elapsed_time} s")
+
+    # Load model on Neuron
+    neuron_model.load(TRACED_MODEL_PATH)
+    print("Done loading neuron model")
+
+    # Run NxDI implementation on Neuron
+    neuron_latency_collector = LatencyCollector()
+    for i in range(NUM_BENCHMARK_ITER):
+        neuron_latency_collector.pre_hook()
+        neuron_output = neuron_model(pixel_values)[0]  # NeuronViTModel output: (sequence_output,) or (sequence_output, pooled_output)
+        neuron_latency_collector.hook()
+    print(f"Got neuron output {neuron_output.shape} {neuron_output}")
+    # Benchmark report
+    for p in [25, 50, 90, 99]:
+        latency = np.percentile(neuron_latency_collector.latency_list, p) * 1000
+        print(f"Neuron inference latency_ms_p{p}: {latency}")
+
+    # The section below is optional; use it to validate e2e accuracy against golden
+    if validate_accuracy:
+        # Get CPU model
+        cpu_model = ViTModel.from_pretrained(MODEL_PATH)
+        print(f"cpu model {cpu_model}")
+
+        # Get golden output by running the original implementation on CPU
+        cpu_latency_collector = LatencyCollector()
+        for i in range(NUM_BENCHMARK_ITER):
+            cpu_latency_collector.pre_hook()
+            golden_output = cpu_model(pixel_values).last_hidden_state
+            cpu_latency_collector.hook()
+        print(f"expected_output {golden_output.shape} {golden_output}")
+        # Benchmark report
+        for p in [25, 50, 90, 99]:
+            latency = np.percentile(cpu_latency_collector.latency_list, p) * 1000
+            print(f"CPU inference latency_ms_p{p}: {latency}")
+
+        # Compare output embeddings
+        passed, max_err = check_accuracy_embeddings(neuron_output, golden_output, plot_outputs=True, atol=1e-5, rtol=1e-5)
+        print(f"Golden and Neuron outputs match: {passed}, max relative error: {max_err}")
+
+
+if __name__ == "__main__":
+    # Set flags for debugging
+    setup_debug_env()
+
+    run_vit_encoding(validate_accuracy=True)
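The new ViT example times each iteration through LatencyCollector's pre_hook/hook pair. A minimal stand-in for that pattern, assuming the collector does nothing more than record wall-clock time per call (the real class lives in neuronx_distributed_inference.utils.benchmark and may do more):

# Simplified stand-in for LatencyCollector, assuming it only records
# wall-clock time between pre_hook() and hook().
import time

class SimpleLatencyCollector:
    def __init__(self):
        self.latency_list = []
        self._start = None

    def pre_hook(self):
        self._start = time.perf_counter()

    def hook(self):
        self.latency_list.append(time.perf_counter() - self._start)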

examples/multi_node.md

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ NEURON_CPP_LOG_LEVEL=1 NEURON_RT_ROOT_COMM_ID=10.1.201.64:63423 inference_demo \
     --model-type llama \
     --task-type causal-lm \
     run \
-    --model-path /shared_3/bowencc/nxd-ws-09-03/TinyLLama-v0 \
+    --model-path TinyLLama-v0 \
     --compiled-model-path traced_models/TinyLLama-v0-multi-node_0/ \
     --torch-dtype bfloat16 \
     --start_rank_id 0 \
@@ -84,7 +84,7 @@ NEURON_CPP_LOG_LEVEL=1 NEURON_RT_ROOT_COMM_ID=10.1.201.64:63423 inference_demo \
     --model-type llama \
     --task-type causal-lm \
     run \
-    --model-path /shared_3/bowencc/nxd-ws-09-03/TinyLLama-v0 \
+    --model-path TinyLLama-v0 \
     --compiled-model-path traced_models/TinyLLama-v0-multi-node_1/ \
     --torch-dtype bfloat16 \
     --start_rank_id 32 \

examples/requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,6 @@
-transformers==4.45.*
+transformers==4.48.*
+huggingface-hub
+diffusers==0.32.0
 sentencepiece
 pillow
 pytest-forked

File renamed without changes.
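The example requirements now pin transformers to the 4.48 series and add huggingface-hub and diffusers 0.32.0. A quick sanity check that an environment matches the new pins, assuming the packages are already installed:

# Sanity check against the updated pins in examples/requirements.txt.
import transformers
import diffusers

assert transformers.__version__.startswith("4.48.")
assert diffusers.__version__.startswith("0.32.0")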
