Commit 2888a8b

Merge branch 'main' into sdk_dependency
2 parents: 930da54 + d37233e

20 files changed: +1186 / −684 lines

QEfficient/finetune/utils/train_utils.py

Lines changed: 12 additions & 11 deletions
@@ -124,10 +124,9 @@ def train(

     if train_config.use_peft and train_config.from_peft_checkpoint:
         intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
+        intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
         if epoch < intermediate_epoch:
             logger.log_rank_zero(f"Skipping epoch {epoch + 1} since fine tuning has already completed for it.")
-            # to bring the count of train_step in sync with where it left off
-            total_train_steps += len(train_dataloader)
             continue

     logger.log_rank_zero(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
@@ -149,20 +148,18 @@ def train(

     num_dummy_samples = 0
     for step, batch in enumerate(train_dataloader):
+        # total_train_steps indicates the cumulative number of training steps completed across all epochs.
+        # When resuming fine-tuning from previously saved checkpoints, total_train_steps indicates the total number of steps trained across the earlier session and the ongoing one.
+        total_train_steps = (epoch) * len(train_dataloader) + step
         # resume training from a particular checkpoint, assuming the dataset is not shuffled
         if train_config.use_peft and train_config.from_peft_checkpoint:
-            intermediate_step = int(train_config.from_peft_checkpoint.split("/")[-1].split("_")[-1])
-            intermediate_epoch = int(train_config.from_peft_checkpoint.split("/")[-2].split("_")[-1]) - 1
             # to bring the count of train_step in sync with where it left off
             if epoch == intermediate_epoch and step == 0:
-                total_train_steps += intermediate_step
                 logger.log_rank_zero(
                     f"Skipping first {intermediate_step} steps for epoch {epoch + 1}, since fine tuning has already completed for it."
                 )
             if epoch == intermediate_epoch and step < intermediate_step:
-                total_train_steps += 1
                 continue
-            total_train_steps += 1

         if train_config.max_train_step > 0 and total_train_steps >= train_config.max_train_step:
             max_steps_reached = True
@@ -235,12 +232,12 @@ def train(
             else:
                 num_samples_in_cur_update = len(train_dataloader) % train_config.gradient_accumulation_steps

-            loss = loss / num_samples_in_cur_update
+            normalized_loss = loss / num_samples_in_cur_update

             if train_config.grad_scaler:
-                scaler.scale(loss).backward()  # backward pass
+                scaler.scale(normalized_loss).backward()  # backward pass
             else:
-                loss.backward()  # backward pass
+                normalized_loss.backward()  # backward pass

             if is_optimizer_step:
                 if train_config.grad_scaler:
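The rename from `loss` to `normalized_loss` keeps the raw per-batch loss available for logging while the backward pass uses the loss divided by the number of samples in the current accumulation window. A minimal, self-contained sketch of that pattern (illustrative only, not the QEfficient trainer; the toy model, optimizer, and batch data are assumptions):

```python
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
accum_steps = 4
batches = [(torch.randn(2, 8), torch.randn(2, 1)) for _ in range(8)]

optimizer.zero_grad()
for step, (x, y) in enumerate(batches):
    loss = torch.nn.functional.mse_loss(model(x), y)  # raw loss, still useful for logging
    normalized_loss = loss / accum_steps  # scale so accumulated gradients average out
    normalized_loss.backward()            # gradients sum across micro-batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()                  # one optimizer update per accumulation window
        optimizer.zero_grad()
```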
@@ -358,7 +355,6 @@ def train(
         logger.log_rank_zero(
             f"Epoch {epoch + 1}: Train epoch loss: {train_epoch_loss:.4f}, Train metric: {train_epoch_metric:.4f}, Epoch time {epoch_end_time:.2f} sec"
         )
-
         # Saving the results every epoch to plot later
         if train_config.save_metrics:
             save_to_json(
@@ -377,9 +373,14 @@ def train(

     results["last_epoch_train_loss"] = train_epoch_loss.cpu()
     results["last_epoch_train_metric"] = train_epoch_metric.cpu()
+    results["train_step_loss"] = train_step_loss
+    results["train_step_metric"] = train_step_metric
+
     if train_config.run_validation:
         results["last_epoch_eval_loss"] = eval_epoch_loss.cpu()
         results["last_epoch_eval_metric"] = eval_epoch_metric.cpu()
+        results["eval_step_loss"] = eval_step_loss
+        results["eval_step_metric"] = eval_step_metric
     results["avg_epoch_time"] = avg_epoch_time
     results["avg_checkpoint_time"] = avg_checkpoint_time
     if train_config.save_metrics:
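The key change in this file is that `total_train_steps` is now computed directly as `epoch * len(train_dataloader) + step` instead of being incremented in several branches, so the counter stays correct regardless of how many steps are skipped on resume. A small illustrative check, with assumed values for the dataloader length and checkpoint position:

```python
# Assumed values, for illustration only.
steps_per_epoch = 100                          # stands in for len(train_dataloader)
intermediate_epoch, intermediate_step = 2, 40  # assumed resume point

for epoch in range(3):
    for step in range(steps_per_epoch):
        # Direct computation: independent of which branches ran before this step.
        total_train_steps = epoch * steps_per_epoch + step
        if epoch == intermediate_epoch and step < intermediate_step:
            continue  # skipping resumed steps no longer desynchronizes the counter
```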

QEfficient/transformers/models/llama4/modeling_llama4.py

Lines changed: 0 additions & 8 deletions
@@ -925,14 +925,6 @@ def get_specializations(
         )
         vision_size = num_features_per_tile * max_num_tiles

-        downsample_ratio = int(round(1.0 / (self.config.vision_config.pixel_shuffle_ratio**2)))
-        num_features_per_tile = int(
-            (img_size // self.config.vision_config.patch_size)
-            * (img_size // self.config.vision_config.patch_size)
-            // downsample_ratio
-        )
-        vision_size = num_features_per_tile * max_num_tiles
-
         vision = [
             {
                 "batch_size": batch_size,

QEfficient/transformers/models/modeling_auto.py

Lines changed: 12 additions & 16 deletions
@@ -322,8 +322,8 @@ def compile(
         ]

         return self._compile(
-            onnx_path,
-            compile_dir,
+            onnx_path=onnx_path,
+            compile_dir=compile_dir,
             compile_only=True,
             specializations=specializations,
             convert_to_fp16=True,
@@ -450,7 +450,7 @@ def __init__(self, model: nn.modules):
         self.model = model.get_qeff_vision_encoder()

     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
-        return self._export(inputs, output_names, dynamic_axes, export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)

     def compile(
         self,
@@ -518,7 +518,7 @@ def __init__(self, model):
         self.model = model.get_qeff_language_decoder()

     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
-        return self._export(inputs, output_names, dynamic_axes, export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)

     def compile(
         self,
@@ -631,10 +631,10 @@ def export(
             inputs["vision"],
             output_names["vision"],
             dynamic_axes["vision"],
-            export_dir,
+            export_dir=export_dir,
         )

-        self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir)
+        self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir=export_dir)
         return self.onnx_path

     def compile(
@@ -699,7 +699,7 @@ def compile(

         if not skip_vision:
             self.vision_model._compile(
-                compile_dir,
+                compile_dir=compile_dir,
                 compile_only=True,
                 specializations=specializations["vision"],
                 convert_to_fp16=True,
@@ -726,7 +726,7 @@ def compile(
                 custom_io_lang[output_name] = "float16" if "vision_embeds" in output_name else kv_cache_dtype

         self.lang_model._compile(
-            compile_dir,
+            compile_dir=compile_dir,
             compile_only=True,
             retained_state=True,
             specializations=specializations["lang"],
@@ -863,10 +863,6 @@ def kv_offload_generate(
         chunk_inputs = lang_inputs.copy()
         prefill_start = perf_counter()

-        # Prepare inputs for prefill
-        chunk_inputs = lang_inputs.copy()
-        prefill_start = perf_counter()
-
         # Run prefill
         chunk_inputs = lang_inputs.copy()
         for i in range(num_chunks):
@@ -1042,8 +1038,8 @@ def compile(
             custom_io[output_name] = "float16" if "pixel_values" in output_name else kv_cache_dtype

         self._compile(
-            onnx_path,
-            compile_dir,
+            onnx_path=onnx_path,
+            compile_dir=compile_dir,
             compile_only=True,
             retained_state=True,
             specializations=specializations,
@@ -2081,8 +2077,8 @@ def compile(
             custom_io[output_name] = kv_cache_dtype

         return self._compile(
-            onnx_path,
-            compile_dir,
+            onnx_path=onnx_path,
+            compile_dir=compile_dir,
             compile_only=True,
             retained_state=True,
             specializations=specializations,
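Apart from deleting a duplicated prefill-setup block in `kv_offload_generate`, every change in this file converts positional arguments to keyword arguments. The benefit: if the callee's parameter order ever changes, keyword calls keep their meaning while positional calls silently shift. A toy sketch with an assumed signature, not the actual `_compile` API:

```python
# Hypothetical signature for illustration.
def _compile(onnx_path=None, compile_dir=None, compile_only=False, **kwargs):
    return onnx_path, compile_dir

_compile("model.onnx", "build/")                        # depends on parameter order
_compile(onnx_path="model.onnx", compile_dir="build/")  # explicit, order-proof,
                                                        # and fails loudly on renames
```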

QEfficient/utils/test_utils.py

Lines changed: 21 additions & 0 deletions
@@ -150,3 +150,24 @@ def __call__(
         image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
         query = query.replace("<image>", image_tokens, 1)
         return query
+
+
+class ModelConfig:
+    """
+    Contains the model types that are not default models, e.g. quantized models, external models, swiftkv models, etc.
+    """
+
+    QUANTIZED_MODELS = {
+        "neuralmagic/Qwen2-0.5B-Instruct-FP8",
+        "neuralmagic/Llama-3.2-3B-Instruct-FP8",
+        "TheBloke/Llama-2-7B-GPTQ",
+        "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+    }
+
+    EXTERNAL_MODELS = {
+        "hpcai-tech/grok-1",
+    }
+
+    SWIFTKV_MODELS = {
+        "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",
+    }
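Since the class holds plain sets of model IDs, a plausible use (not shown in this commit) is a simple membership check to route a test to the right loading path:

```python
# Usage sketch; the branching logic is an assumption, only the sets come from this commit.
from QEfficient.utils.test_utils import ModelConfig

model_id = "TheBloke/Llama-2-7B-GPTQ"
if model_id in ModelConfig.QUANTIZED_MODELS:
    print(f"{model_id} needs the quantized-model loading path")
```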

examples/llama4_example.py

Lines changed: 5 additions & 7 deletions
@@ -7,7 +7,7 @@

 import torch
 import transformers
-from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer
+from transformers import AutoConfig, AutoProcessor, TextStreamer

 from QEfficient import QEFFAutoModelForImageTextToText

@@ -17,14 +17,12 @@
 config.text_config.num_hidden_layers = 4
 config.vision_config.num_hidden_layers = 2

-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config)
-model.eval()
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)

-### For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ###
-qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True)
-
 ### Use skip_vision=True to run only text; otherwise False ###
 skip_vision = True
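Per the removed comment, `kv_offload` selects between the two QPC layouts: `True` runs the dual-QPC approach (separate vision and language programs), `False` runs a single QPC. The flag now goes straight into `from_pretrained`; a sketch of the single-QPC variant, reusing the `model_id` and `config` from the example above:

```python
# Single-QPC variant (sketch); only kv_offload differs from the example above.
qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
    model_id, attn_implementation="eager", kv_offload=False, config=config
)
```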
examples/llama4_multi_image_example.py

Lines changed: 5 additions & 7 deletions
@@ -7,7 +7,7 @@

 import torch
 import transformers
-from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer
+from transformers import AutoConfig, AutoProcessor, TextStreamer

 from QEfficient import QEFFAutoModelForImageTextToText

@@ -17,14 +17,12 @@
 config.text_config.num_hidden_layers = 4
 config.vision_config.num_hidden_layers = 2

-model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config)
-model.eval()
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
+    model_id, attn_implementation="eager", kv_offload=True, config=config
+)
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)
 processor = AutoProcessor.from_pretrained(model_id)

-### For running the model in single QPC approach use kv_offload=False. For Dual QPC approach use kv_offload=True ###
-qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True)
-
 ### For multi-image, the value of max_num_tiles should be the sum of the num_tiles values across all the images ###
 qeff_model.compile(
     prefill_seq_len=128,
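The multi-image comment above translates to a one-liner; the per-image tile counts here are assumed values for illustration:

```python
# Two images split into 4 and 5 tiles (assumed) -> compile with max_num_tiles = 9.
num_tiles_per_image = [4, 5]
max_num_tiles = sum(num_tiles_per_image)
```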

scripts/Jenkinsfile

Lines changed: 39 additions & 30 deletions
@@ -59,7 +59,7 @@ pipeline {
             mkdir -p $PWD/Non_qaic &&
             export TOKENIZERS_PARALLELISM=false &&
             export QEFF_HOME=$PWD/Non_qaic &&
-            pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
+            pytest tests -m '(not cli) and (on_qaic) and (not nightly) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
             junitparser merge tests/tests_log2.xml tests/tests_log.xml &&
             deactivate"
            '''
@@ -97,23 +97,13 @@ pipeline {
             mkdir -p $PWD/cli &&
             export TOKENIZERS_PARALLELISM=false &&
             export QEFF_HOME=$PWD/cli &&
-            pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log3.xml &&
+            pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log3.xml &&
             junitparser merge tests/tests_log3.xml tests/tests_log.xml &&
             deactivate"
            '''
        }
      }
    }
-    stage('vLLM Tests') {
-      steps {
-        catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
-          build job: 'qefficient_vllm_upstream',
-          parameters: [string(name: 'NAME', value: "${BUILD_TAG}")],
-          propagate: true,
-          wait: true
-        }
-      }
-    }
    stage('QNN CLI Tests') {
      steps {
        timeout(time: 30, unit: 'MINUTES') {
@@ -126,7 +116,7 @@ pipeline {
             mkdir -p $PWD/Qnn_cli &&
             export TOKENIZERS_PARALLELISM=false &&
             export QEFF_HOME=$PWD/Qnn_cli &&
-            pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log4.xml &&
+            pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log4.xml &&
             junitparser merge tests/tests_log4.xml tests/tests_log.xml &&
             deactivate"
            '''
@@ -145,7 +135,7 @@ pipeline {
             mkdir -p $PWD/Qnn_non_cli &&
             export TOKENIZERS_PARALLELISM=false &&
             export QEFF_HOME=$PWD/Qnn_non_cli &&
-            pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
+            pytest tests -m '(not cli) and (qnn) and (not nightly) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log5.xml &&
             junitparser merge tests/tests_log5.xml tests/tests_log.xml &&
             deactivate"
            '''
@@ -191,20 +181,39 @@ pipeline {
  }

  post {
-    always {
-      script {
-        try {
-          sh '''
-          sudo docker rm -f ${BUILD_TAG}
-          sudo chown -R ubuntu .
-          '''
-        } catch (error) {
-          echo "Failed to delete container ${BUILD_TAG}: ${error}"
-        }
-      }
-      junit testResults: 'tests/tests_log.xml'
-      echo 'Cleaning Workspace'
-      deleteDir()
-    }
-  }
+    success {
+      // Trigger downstream job only if this pipeline succeeds
+      build job: 'qefficient_vllm_upstream',
+        parameters: [
+          string(name: 'NAME', value: "${BUILD_TAG}"),
+          string(name: 'QEFF_WORKSPACE', value: "${env.WORKSPACE}")
+        ],
+        wait: false
+    }
+    always {
+      script {
+        try {
+          sh '''
+          sudo chown -R ubuntu .
+          '''
+        } catch (error) {
+          echo "Failed to change ownership: ${error}"
+        }
+      }
+      junit testResults: 'tests/tests_log.xml'
+    }
+    unsuccessful {
+      script {
+        try {
+          sh '''
+          sudo docker rm -f ${BUILD_TAG}
+          '''
+        } catch (error) {
+          echo "Failed to delete container ${BUILD_TAG}: ${error}"
+        }
+      }
+      echo 'Cleaning Workspace'
+      deleteDir()
+    }
+  }
}
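The new `(not nightly)` clauses in the pytest marker expressions assume a `nightly` marker that excludes long-running tests from the regular on-device stages (the vLLM stage now runs as a downstream job triggered on success instead of inline). How a test would opt into these markers, as a sketch; it assumes the markers are registered in the repo's pytest configuration:

```python
import pytest

@pytest.mark.nightly   # excluded from regular runs by '(not nightly)' in the Jenkinsfile
@pytest.mark.on_qaic   # selected by '(on_qaic)' stages
def test_large_model_end_to_end():
    ...
```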
