diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index dd68e5fb9..c51e7bf14 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -80,6 +80,13 @@ def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
 }
 
 
+def embedding_forward(
+    self, input_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, **kwargs
+):
+    """Call the saved ``self.old_forward`` with keyword inputs and return only the first element of its output."""
+    output = self.old_forward(input_ids=input_ids, position_ids=position_ids, **kwargs)
+    return output[0]
+
 class PooledModel(nn.Module):
     """
     Adds pooling functionality to embedding model.
@@ -92,10 +99,10 @@ def __init__(self, base_model, pooling_fn):
         self.pooling_fn = pooling_fn
 
     def forward(
-        self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
+        self, input_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, **kwargs
     ):
-        output = self.base_model(input_ids, attention_mask, **kwargs)
-        return self.pooling_fn(output[0], attention_mask)
+        output = self.base_model(input_ids, position_ids, **kwargs)
+        return self.pooling_fn(output[0], position_ids)
 
 
 def validate_user_pooling_function(user_function):
@@ -119,7 +126,7 @@ def validate_user_pooling_function(user_function):
         raise TypeError("Provided pooling function is not callable.")
 
     sig = inspect.signature(user_function)
-    required_args = {"last_hidden_states", "attention_mask"}
+    required_args = {"last_hidden_states", "position_ids"}
     if not required_args.issubset(sig.parameters.keys()):
         raise ValueError(f"Pooling function must accept arguments: {required_args}")
     return user_function
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 6bff10f5a..f54086117 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -38,6 +38,7 @@
 from QEfficient.transformers.modeling_utils import DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
+    EmbeddingTransform,
     KVCacheExternalModuleMapperTransform,
     KVCacheTransform,
     PoolingTransform,
@@ -160,7 +161,7 @@ class QEFFAutoModel(QEFFTransformersBase):
     """
 
     _hf_auto_class = AutoModel
-    _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
+    _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, EmbeddingTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.Module, pooling=None, **kwargs):
@@ -267,10 +268,10 @@ def export(self, export_dir: Optional[str] = None) -> str:
 
         example_inputs = {
             "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64),
-            "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64),
+            "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1),
         }
 
-        dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}}
+        dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}}
 
         output_names = ["output"]
 
@@ -396,32 +397,37 @@ def cloud_ai_100_feature_generate(
         # To handle single seq_len as we can't fetch allowed shapes for single seq_len
         self.seq_len = self.qpc_session.bindings[0].dims[1] if not hasattr(self, "seq_len") else self.seq_len
 
+        qpc_inputs = {}
         input_ids = np.array(
             torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - input_ids_len), "constant", 0)
         )
-        attention_mask = np.array(
-            torch.nn.functional.pad(
-                inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0
+        qpc_inputs["input_ids"] = input_ids
+        qpc_input_names=self.qpc_session.input_names
+        
+        if "position_ids" in qpc_input_names:
+            attention_mask = np.array(
+                torch.nn.functional.pad(
+                    inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0
+                )
             )
-        )
-
-        inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
+            position_ids = np.where(attention_mask == 1, np.arange(attention_mask.shape[1]), -1)
+            qpc_inputs["position_ids"] = position_ids
 
         # TODO: Remove try and catch after compiler fix
         try:
             outputs = {
-                "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32),
+                "output": np.random.randn(*list(self.qpc_session.bindings[-1].dims)).astype(np.float32),
             }
             self.qpc_session.set_buffers(outputs)
-            outputs = self.qpc_session.run(inputs)
+            outputs = self.qpc_session.run(qpc_inputs)
         except Exception:
             outputs = {
-                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
+                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[-1].dims[1]).astype(
                     np.float32
                 ),
             }
             self.qpc_session.set_buffers(outputs)
-            outputs = self.qpc_session.run(inputs)
+            outputs = self.qpc_session.run(qpc_inputs)
         return outputs
 
     def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 42807753d..47842a019 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -154,7 +154,7 @@
 
 from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
-from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
+from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, embedding_forward, validate_user_pooling_function
 from QEfficient.transformers.models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -632,3 +632,12 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu
         model = PooledModel(model, pooling_method)
         warnings.warn("Pooling is applied to the model.")
         return model, transformed
+class EmbeddingTransform:
+    """Swap ``model.forward`` for ``embedding_forward``, keeping the original bound method as ``model.old_forward``."""
+
+    @classmethod
+    def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]:
+        if getattr(model.forward, "__func__", None) is embedding_forward:  # already swapped; re-wrapping would recurse
+            return model, False
+        model.old_forward, model.forward = model.forward, MethodType(embedding_forward, model)
+        return model, True
\ No newline at end of file
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index 7e6973e2e..694145a8f 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -14,27 +14,44 @@
 from QEfficient import QEFFAutoModel as AutoModel
 
 
-def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-    input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
-    last_hidden_states[input_mask_expanded == 0] = -1e9
-    return torch.max(last_hidden_states, 1)[0]
+# def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
+#     last_hidden_states[input_mask_expanded == 0] = -1e9
+#     return torch.max(last_hidden_states, 1)[0]
+
+import torch
 
+# def max_pooling(last_hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
+#     # Expand position_ids to match the shape of last_hidden_states
+#     position_mask_expanded = position_ids.unsqueeze(-1).expand(last_hidden_states.size()).float()
+    
+#     # Mask out positions with a special value (e.g., -1e9) where position_id is 0
+#     last_hidden_states[position_mask_expanded == 0] = -1e9
+    
+#     # Apply max pooling across the sequence length dimension
+#     return torch.max(last_hidden_states, dim=1)[0]
+def max_pooling(last_hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor:
+    # Padding slots carry position_id == -1; real tokens (including the first, at position 0) are >= 0.
+    padding_mask = (position_ids < 0).unsqueeze(-1).expand(last_hidden_states.size())
+    # masked_fill returns a new tensor, so the caller's hidden states are not modified in place.
+    return torch.max(last_hidden_states.masked_fill(padding_mask, -1e9), 1)[0]
 
 # Sentences we want sentence embeddings for
 sentences = "This is an example sentence"
 
+model_name="jinaai/jina-embeddings-v2-base-code"
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 # You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function.
 # If no pooling is specified, the model will return its default output (typically token embeddings).
-qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
+qeff_model = AutoModel.from_pretrained(model_name, pooling=max_pooling, trust_remote_code=True, num_hidden_layers=1)
 # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max")
 # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
 
 # Here seq_len can be list of seq_len or single int
-qeff_model.compile(num_cores=16, seq_len=[32, 64])
+qeff_model.compile(num_cores=16, seq_len=32)
 # qeff_model.compile(num_cores=16, seq_len=32)
 
 
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 2d110faeb..6c13abb63 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -20,7 +20,7 @@
 from QEfficient.utils.constants import Constants, QnnConstants
 
 embed_test_models = [
-    {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
+    # {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
     {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"},
 ]
 
@@ -40,7 +40,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     # Original PyTorch model
     pt_model = AutoModel.from_pretrained(
         model_name,
-        num_hidden_layers=n_layer,
+        # num_hidden_layers=n_layer,
         attn_implementation="eager",
         trust_remote_code=True,
     )
@@ -58,6 +58,10 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling=pooling)
 
     # QEff transformed PyTorch model output
+    position_ids = torch.where(inputs["attention_mask"] == 1, torch.arange(inputs["attention_mask"].shape[1]), -1)
+    inputs["position_ids"] = position_ids
+    inputs.pop("attention_mask")
+    
     qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False)
     qeff_pt_embeddings = qeff_pt_outputs if pooling else qeff_pt_outputs[0]
 
@@ -71,9 +75,12 @@ def check_embed_pytorch_vs_ort_vs_ai100(
 
     # Prepare the inputs for ONNX Runtime
     input_ids = np.array(inputs["input_ids"])
-    attention_mask = np.array(inputs["attention_mask"])
+    position_ids = np.array(inputs["position_ids"])
 
-    onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+    onnx_inputs = {"input_ids": input_ids}
+
+    if len(ort_session.get_inputs()) > 1 and ort_session.get_inputs()[1].name == "position_ids":
+        onnx_inputs["position_ids"] = position_ids
 
     # Run inference
     onnx_outputs = ort_session.run(None, onnx_inputs)
@@ -88,6 +95,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
         enable_qnn=enable_qnn,
         qnn_config=qnn_config,
     )
+    inputs = tokenizer("My name is", return_tensors="pt")
     ai100_output = qeff_model.generate(inputs=inputs)
     qeff_ai100_embeddings = (
         ai100_output["output"] if pooling else ai100_output["output"][:, : inputs["input_ids"].shape[1], :]
@@ -100,84 +108,79 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100(model):
-    """
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-    """
-    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1)
-
-
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):
-    """
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
-    """
-    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1, pooling=model["pooling"])
-
-
-@pytest.mark.on_qaic
-@pytest.mark.parametrize("model", embed_test_models[:1])
-def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
-    """
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
-    """
-    check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1)
 
-
-##########  QNN TESTS ##############
+check_embed_pytorch_vs_ort_vs_ai100(model_name="jinaai/jina-embeddings-v2-base-code", seq_len=32, n_layer=1)
 
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model_name", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
-    """
-    QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
-    """
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+# @pytest.mark.on_qaic
+# @pytest.mark.parametrize("model", embed_test_models)
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):
+#     """
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
+#     """
+#     check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1, pooling=model["pooling"])
 
-    check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model_name["model_name"], seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
 
+# @pytest.mark.on_qaic
+# @pytest.mark.parametrize("model", embed_test_models[:1])
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
+#     """
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
+#     """
+#     check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1)
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model", embed_test_models)
-def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling_qnn(model):
-    """
-    QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
-    """
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
-
-    check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model["model_name"],
-        seq_len=32,
-        n_layer=1,
-        pooling=model["pooling"],
-        enable_qnn=True,
-        qnn_config=qnn_config_json_path,
-    )
 
+##########  QNN TESTS ##############
 
-@pytest.mark.on_qaic
-@pytest.mark.qnn
-@pytest.mark.parametrize("model", [embed_test_models[0]])
-def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len_qnn(model):
-    """
-    QNN Compilation path test.
-    Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
-    """
-    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
-    create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
 
-    check_embed_pytorch_vs_ort_vs_ai100(
-        model_name=model["model_name"], seq_len=[32, 20], n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
-    )
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model_name", embed_test_models)
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name):
+#     """
+#     QNN Compilation path test.
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output.
+#     """
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_embed_pytorch_vs_ort_vs_ai100(
+#         model_name=model_name["model_name"], seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )
+
+
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model", embed_test_models)
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling_qnn(model):
+#     """
+#     QNN Compilation path test.
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling.
+#     """
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_embed_pytorch_vs_ort_vs_ai100(
+#         model_name=model["model_name"],
+#         seq_len=32,
+#         n_layer=1,
+#         pooling=model["pooling"],
+#         enable_qnn=True,
+#         qnn_config=qnn_config_json_path,
+#     )
+
+
+# @pytest.mark.on_qaic
+# @pytest.mark.qnn
+# @pytest.mark.parametrize("model", [embed_test_models[0]])
+# def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len_qnn(model):
+#     """
+#     QNN Compilation path test.
+#     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
+#     """
+#     qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+#     create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
+
+#     check_embed_pytorch_vs_ort_vs_ai100(
+#         model_name=model["model_name"], seq_len=[32, 20], n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path
+#     )