import torch
import os
import logging
-import base64

from transformers import AutoTokenizer, AutoProcessor, GenerationConfig
from neuronx_distributed_inference.models.config import OnDeviceSamplingConfig as SmplConfig
VISION_TP_DEGREE = 16
WORLD_SIZE = 64
BATCH_SIZE = 1
-SEQ_LENGTH = 8192
-# SEQ_LENGTH = 10240 for chunked attention
+SEQ_LENGTH = 16384
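+# SEQ_LENGTH is sized to match the largest compiled bucket configured below (16384).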
TEXT_TO_TEXT = False
# TEXT_TO_TEXT = True for text-only generation
DTYPE = torch.bfloat16
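+# NEURON_RT_NUM_CORES tells the Neuron runtime how many NeuronCores this process
+# may use; here it is pinned to the text model's TP degree.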
os.environ['NEURON_RT_NUM_CORES'] = f'{TEXT_TP_DEGREE}'
os.environ['BASE_COMPILE_WORK_DIR'] = "./compiler_path/"

-model_path = "/home/ubuntu/models/Llama-4-Scout-17B-16E-Instruct/"
-traced_model_path = "/home/ubuntu/traced_model_Llama-4-Scout-17B-16E-Instruct"
+# Llama4 checkpoints can be downloaded from Hugging Face.
+model_path = "/shared/models/Llama-4-Scout-17B-16E-Instruct/"
+# Path to the compiled model artifacts. If this directory already exists, the next
+# run skips the trace and compile steps, reducing test time.
+traced_model_path = "/shared/traced_models/Llama-4/scout_text_vision_baseline_bs1/"

torch.manual_seed(0)

def run_llama_generate_image_to_text():
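+    """Run Llama4 Scout multimodal (image + text prompt) generation on Neuron."""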
-    # Initialize configs and tokenizer.
-    batch_size = 1
-    text_neuron_config = Llama4NeuronConfig(batch_size=1,
-                                            seq_len=SEQ_LENGTH,
-                                            torch_dtype=torch.bfloat16,
-                                            skip_sharding=False,
-                                            save_sharded_checkpoint=False,
-                                            tp_degree=TEXT_TP_DEGREE,
-                                            cp_degree=1,
-                                            on_device_sampling_config=SmplConfig(dynamic=False, top_k=1),
-                                            world_size=WORLD_SIZE,
-                                            capacity_factor=None,
-                                            fused_qkv=False,
-                                            attention_dtype=torch.float16,
-                                            rpl_reduce_dtype=torch.float32,
-                                            cast_type="as-declared",
-                                            logical_neuron_cores=2)
-
-    vision_neuron_config = Llama4NeuronConfig(batch_size=1,
-                                              seq_len=SEQ_LENGTH,
-                                              torch_dtype=torch.float16,
-                                              skip_sharding=False,
-                                              save_sharded_checkpoint=False,
-                                              tp_degree=VISION_TP_DEGREE,
-                                              cp_degree=1,
-                                              on_device_sampling_config=SmplConfig(dynamic=False, top_k=1),
-                                              dp_degree=4,
-                                              world_size=WORLD_SIZE,
-                                              fused_qkv=True,
-                                              qkv_kernel_enabled=True,
-                                              attn_kernel_enabled=True,
-                                              mlp_kernel_enabled=True,
-                                              enable_bucketing=False,
-                                              logical_neuron_cores=2)
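+    # Text decoder config. Compared with the previous version, this enables continuous
+    # batching, bucketed compilation (prompts are padded up to the nearest bucket),
+    # context parallelism (cp_degree=16), async mode, and the NKI QKV/attention kernels.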
+    text_neuron_config = Llama4NeuronConfig(
+        batch_size=1,
+        is_continuous_batching=True,
+        seq_len=SEQ_LENGTH,
+        enable_bucketing=True,
+        context_encoding_buckets=[256, 512, 1024, 2048, 4096, 8192, 10240, 16384],
+        token_generation_buckets=[256, 512, 1024, 2048, 4096, 8192, 10240, 16384],
+        torch_dtype=torch.float16,
+        async_mode=True,
+        rpl_reduce_dtype=torch.float32,
+        tp_degree=TEXT_TP_DEGREE,
+        cp_degree=16,
+        on_device_sampling_config=SmplConfig(dynamic=True, top_k=1, top_k_kernel_enabled=True),
+        world_size=WORLD_SIZE,
+        fused_qkv=True,
+        cast_type="as-declared",
+        save_sharded_checkpoint=True,
+        cc_pipeline_tiling_factor=1,
+        sequence_parallel_enabled=True,
+        qkv_kernel_enabled=True,
+        attn_kernel_enabled=True,
+        attn_block_tkg_nki_kernel_enabled=True,
+        attn_block_tkg_nki_kernel_cache_update=True,
+        k_cache_transposed=False,
+        blockwise_matmul_config={
+            "block_size": 256,
+            "use_block_parallel": True,
+            "block_sharding_strategy": "HI_LO",
+            "skip_dma_token": True,
+            "skip_dma_weight": True,
+            "parallelize_token_to_block_mapping": True,
+        },
+        logical_neuron_cores=2)
+
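+    # Vision encoder config. The encoder shards with tp_degree=16 and runs data
+    # parallel with dp_degree=4, which together account for the 64 ranks in
+    # WORLD_SIZE (16 * 4 = 64).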
+    vision_neuron_config = Llama4NeuronConfig(
+        batch_size=1,
+        seq_len=SEQ_LENGTH,
+        torch_dtype=torch.float16,
+        tp_degree=VISION_TP_DEGREE,
+        cp_degree=1,
+        dp_degree=4,
+        world_size=WORLD_SIZE,
+        fused_qkv=True,
+        qkv_kernel_enabled=True,
+        attn_kernel_enabled=True,
+        mlp_kernel_enabled=True,
+        enable_bucketing=True,
+        buckets=[8, 28, 88],
+        save_sharded_checkpoint=True,
+        logical_neuron_cores=2)

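+    # Bundle the text and vision configs into a single multimodal inference config.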
    config = Llama4InferenceConfig(
        text_neuron_config=text_neuron_config,
@@ -85,10 +102,10 @@ def run_llama_generate_image_to_text():

    hf_llama4_processor = AutoProcessor.from_pretrained(model_path)
    # Prepare generation inputs.
-    text_prompt = "If I had to write a haiku for this one"
+    text_prompt = "Describe this image"
    image_path = "./dog.jpg"
    role = 'user'
-
+
    with torch.profiler.record_function("prepare_generation_inputs"):
        input_ids, attention_mask, pixel_values, vision_mask = prepare_generation_inputs_hf(text_prompt, image_path, hf_llama4_processor, role, config)

@@ -97,7 +114,7 @@ def run_llama_generate_image_to_text():
    print("\nCompiling and saving model...")
    model = NeuronLlama4ForCausalLM(model_path, config)
    model.compile(traced_model_path)
-    tokenizer.save_pretrained(traced_model_path)
+    tokenizer.save_pretrained(traced_model_path)

    # Load from compiled checkpoint.

@@ -111,7 +128,7 @@ def run_llama_generate_image_to_text():
    generation_config = GenerationConfig.from_pretrained(model_path)

    # Test Sampling Parameters
-    sampling_params = prepare_sampling_params(batch_size=batch_size, top_k=[1], top_p=[1.0], temperature=[1.0])
+    sampling_params = prepare_sampling_params(batch_size=1, top_k=[1], top_p=[1.0], temperature=[1.0])
    outputs = generation_model.generate(
        input_ids,
        generation_config=generation_config,
@@ -134,7 +151,7 @@ def run_llama_generate_image_to_text():
    role = 'user'

    input_ids, attention_mask, _, _ = prepare_generation_inputs_hf(text_prompt, image_path, hf_llama4_processor, role)
-    sampling_params = prepare_sampling_params(batch_size=batch_size, top_k=[1], top_p=[1.0], temperature=[1.0])
+    sampling_params = prepare_sampling_params(batch_size=1, top_k=[1], top_p=[1.0], temperature=[1.0])
    outputs = generation_model.generate(
        input_ids,
        generation_config=generation_config,
@@ -159,22 +176,20 @@ def run_llama_generate_image_to_text():
def run_llama_generate_text_to_text():
    # Initialize configs and tokenizer.
    batch_size = 1
-    neuron_config = Llama4NeuronConfig(batch_size=1,
-                                       seq_len=SEQ_LENGTH,
-                                       torch_dtype=torch.bfloat16,
-                                       skip_sharding=False,
-                                       save_sharded_checkpoint=True,
-                                       tp_degree=TEXT_TP_DEGREE,
-                                       cp_degree=16,
-                                       on_device_sampling_config=SmplConfig(dynamic=False, top_k=1),
-                                       world_size=WORLD_SIZE,
-                                       capacity_factor=None,
-                                       fused_qkv=False,
-                                       attention_dtype=torch.float16,
-                                       rpl_reduce_dtype=torch.float32,
-                                       cast_type="as-declared",
-                                       logical_neuron_cores=2)
-
+    neuron_config = Llama4NeuronConfig(
+        batch_size=1,
+        is_continuous_batching=True,
+        seq_len=SEQ_LENGTH,
+        torch_dtype=torch.float16,
+        rpl_reduce_dtype=torch.float32,
+        tp_degree=TEXT_TP_DEGREE,
+        cp_degree=1,
+        on_device_sampling_config=SmplConfig(dynamic=True, top_k=1),
+        world_size=WORLD_SIZE,
+        fused_qkv=False,
+        cast_type="as-declared",
+        save_sharded_checkpoint=True,
+        logical_neuron_cores=2)

    config = LlamaInferenceConfig(
        neuron_config=neuron_config,
@@ -191,7 +206,7 @@ def run_llama_generate_text_to_text():
    print("\nCompiling and saving model...")
    model = NeuronLlama4TextForCausalLM(model_path, config.get_text_config())
    model.compile(traced_model_path)
-    tokenizer.save_pretrained(traced_model_path)
+    tokenizer.save_pretrained(traced_model_path)
    # Load from compiled checkpoint.

    print("\nLoading model from compiled checkpoint...")