diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py index dd68e5fb9..c51e7bf14 100644 --- a/QEfficient/transformers/embeddings/embedding_utils.py +++ b/QEfficient/transformers/embeddings/embedding_utils.py @@ -80,6 +80,13 @@ def cls_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) } +def embedding_forward( + self, input_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, **kwargs + ): + print("Forward swapped with new one") + output = self.old_forward(input_ids=input_ids, position_ids=position_ids, **kwargs) + return output[0] + class PooledModel(nn.Module): """ Adds pooling functionality to embedding model. @@ -92,10 +99,10 @@ def __init__(self, base_model, pooling_fn): self.pooling_fn = pooling_fn def forward( - self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs + self, input_ids: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, **kwargs ): - output = self.base_model(input_ids, attention_mask, **kwargs) - return self.pooling_fn(output[0], attention_mask) + output = self.base_model(input_ids, position_ids, **kwargs) + return self.pooling_fn(output[0], position_ids) def validate_user_pooling_function(user_function): @@ -119,7 +126,7 @@ def validate_user_pooling_function(user_function): raise TypeError("Provided pooling function is not callable.") sig = inspect.signature(user_function) - required_args = {"last_hidden_states", "attention_mask"} + required_args = {"last_hidden_states", "position_ids"} if not required_args.issubset(sig.parameters.keys()): raise ValueError(f"Pooling function must accept arguments: {required_args}") return user_function diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 6bff10f5a..f54086117 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -38,6 +38,7 @@ from QEfficient.transformers.modeling_utils import DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH from QEfficient.transformers.models.pytorch_transforms import ( CustomOpsTransform, + EmbeddingTransform, KVCacheExternalModuleMapperTransform, KVCacheTransform, PoolingTransform, @@ -160,7 +161,7 @@ class QEFFAutoModel(QEFFTransformersBase): """ _hf_auto_class = AutoModel - _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform] + _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, EmbeddingTransform] _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform] def __init__(self, model: nn.Module, pooling=None, **kwargs): @@ -267,10 +268,10 @@ def export(self, export_dir: Optional[str] = None) -> str: example_inputs = { "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64), - "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64), + "position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1), } - dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}} + dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "position_ids": {0: "batch_size", 1: "seq_len"}} output_names = ["output"] @@ -396,32 +397,37 @@ def cloud_ai_100_feature_generate( # To handle single seq_len as we can't fetch allowed shapes for single seq_len self.seq_len = self.qpc_session.bindings[0].dims[1] if not hasattr(self, "seq_len") else self.seq_len + qpc_inputs = {} input_ids = np.array( torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - input_ids_len), "constant", 0) ) - attention_mask = np.array( - torch.nn.functional.pad( - inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 + qpc_inputs["input_ids"] = input_ids + qpc_input_names=self.qpc_session.input_names + + if "position_ids" in qpc_input_names: + attention_mask = np.array( + torch.nn.functional.pad( + inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0 + ) ) - ) - - inputs = dict(input_ids=input_ids, attention_mask=attention_mask) + position_ids = np.where(attention_mask == 1, np.arange(attention_mask.shape[1]), -1) + qpc_inputs["position_ids"] = position_ids # TODO: Remove try and catch after compiler fix try: outputs = { - "output": np.random.randn(*list(self.qpc_session.bindings[2].dims)).astype(np.float32), + "output": np.random.randn(*list(self.qpc_session.bindings[-1].dims)).astype(np.float32), } self.qpc_session.set_buffers(outputs) - outputs = self.qpc_session.run(inputs) + outputs = self.qpc_session.run(qpc_inputs) except Exception: outputs = { - "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype( + "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[-1].dims[1]).astype( np.float32 ), } self.qpc_session.set_buffers(outputs) - outputs = self.qpc_session.run(inputs) + outputs = self.qpc_session.run(qpc_inputs) return outputs def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]: diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py index 42807753d..47842a019 100644 --- a/QEfficient/transformers/models/pytorch_transforms.py +++ b/QEfficient/transformers/models/pytorch_transforms.py @@ -154,7 +154,7 @@ from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC -from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function +from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, embedding_forward, validate_user_pooling_function from QEfficient.transformers.models.codegen.modeling_codegen import ( QEffCodeGenAttention, QeffCodeGenBlock, @@ -632,3 +632,12 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu model = PooledModel(model, pooling_method) warnings.warn("Pooling is applied to the model.") return model, transformed +class EmbeddingTransform: + @classmethod + def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]: + transformed = False + model.old_forward = model.forward + model.forward = MethodType(embedding_forward, model) + transformed = True + + return model, transformed \ No newline at end of file diff --git a/examples/embedding_model.py b/examples/embedding_model.py index 7e6973e2e..694145a8f 100644 --- a/examples/embedding_model.py +++ b/examples/embedding_model.py @@ -14,27 +14,44 @@ from QEfficient import QEFFAutoModel as AutoModel -def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float() - last_hidden_states[input_mask_expanded == 0] = -1e9 - return torch.max(last_hidden_states, 1)[0] +# def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: +# input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float() +# last_hidden_states[input_mask_expanded == 0] = -1e9 +# return torch.max(last_hidden_states, 1)[0] + +import torch +# def max_pooling(last_hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor: +# # Expand position_ids to match the shape of last_hidden_states +# position_mask_expanded = position_ids.unsqueeze(-1).expand(last_hidden_states.size()).float() + +# # Mask out positions with a special value (e.g., -1e9) where position_id is 0 +# last_hidden_states[position_mask_expanded == 0] = -1e9 + +# # Apply max pooling across the sequence length dimension +# return torch.max(last_hidden_states, dim=1)[0] +def max_pooling(last_hidden_states: torch.Tensor, position_ids: torch.Tensor) -> torch.Tensor: + # Create a mask where position_ids > 0 (or use a different condition based on your data) + position_mask = (position_ids > 0).unsqueeze(-1).expand(last_hidden_states.size()).float() + last_hidden_states[position_mask == 0] = -1e9 + return torch.max(last_hidden_states, 1)[0] # Sentences we want sentence embeddings for sentences = "This is an example sentence" +model_name="jinaai/jina-embeddings-v2-base-code" # Load model from HuggingFace Hub -tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +tokenizer = AutoTokenizer.from_pretrained(model_name) # You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function. # If no pooling is specified, the model will return its default output (typically token embeddings). -qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling) +qeff_model = AutoModel.from_pretrained(model_name, pooling=max_pooling, trust_remote_code=True, num_hidden_layers=1) # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max") # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") # Here seq_len can be list of seq_len or single int -qeff_model.compile(num_cores=16, seq_len=[32, 64]) +qeff_model.compile(num_cores=16, seq_len=32) # qeff_model.compile(num_cores=16, seq_len=32) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2d110faeb..6c13abb63 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -20,7 +20,7 @@ from QEfficient.utils.constants import Constants, QnnConstants embed_test_models = [ - {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, + # {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"}, ] @@ -40,7 +40,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Original PyTorch model pt_model = AutoModel.from_pretrained( model_name, - num_hidden_layers=n_layer, + # num_hidden_layers=n_layer, attn_implementation="eager", trust_remote_code=True, ) @@ -58,6 +58,10 @@ def check_embed_pytorch_vs_ort_vs_ai100( qeff_model = QEFFAutoModel(pt_model, pretrained_model_name_or_path=model_name, pooling=pooling) # QEff transformed PyTorch model output + position_ids = torch.where(inputs["attention_mask"] == 1, torch.arange(inputs["attention_mask"].shape[1]), -1) + inputs["position_ids"] = position_ids + inputs.pop("attention_mask") + qeff_pt_outputs = qeff_model.generate(inputs=inputs, runtime_ai100=False) qeff_pt_embeddings = qeff_pt_outputs if pooling else qeff_pt_outputs[0] @@ -71,9 +75,12 @@ def check_embed_pytorch_vs_ort_vs_ai100( # Prepare the inputs for ONNX Runtime input_ids = np.array(inputs["input_ids"]) - attention_mask = np.array(inputs["attention_mask"]) + position_ids = np.array(inputs["position_ids"]) - onnx_inputs = {"input_ids": input_ids, "attention_mask": attention_mask} + onnx_inputs = {"input_ids": input_ids} + + if len(ort_session.get_inputs()) > 1 and ort_session.get_inputs()[1].name == "position_ids": + onnx_inputs["position_ids"] = position_ids # Run inference onnx_outputs = ort_session.run(None, onnx_inputs) @@ -88,6 +95,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( enable_qnn=enable_qnn, qnn_config=qnn_config, ) + inputs = tokenizer("My name is", return_tensors="pt") ai100_output = qeff_model.generate(inputs=inputs) qeff_ai100_embeddings = ( ai100_output["output"] if pooling else ai100_output["output"][:, : inputs["input_ids"].shape[1], :] @@ -100,84 +108,79 @@ def check_embed_pytorch_vs_ort_vs_ai100( assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) -@pytest.mark.on_qaic -@pytest.mark.parametrize("model", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100(model): - """ - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. - """ - check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1) - - -@pytest.mark.on_qaic -@pytest.mark.parametrize("model", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model): - """ - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling. - """ - check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1, pooling=model["pooling"]) - - -@pytest.mark.on_qaic -@pytest.mark.parametrize("model", embed_test_models[:1]) -def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model): - """ - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. - """ - check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1) - -########## QNN TESTS ############## +check_embed_pytorch_vs_ort_vs_ai100(model_name="jinaai/jina-embeddings-v2-base-code", seq_len=32, n_layer=1) -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.parametrize("model_name", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): - """ - QNN Compilation path test. - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. - """ - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) +# @pytest.mark.on_qaic +# @pytest.mark.parametrize("model", embed_test_models) +# def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model): +# """ +# Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling. +# """ +# check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=32, n_layer=1, pooling=model["pooling"]) - check_embed_pytorch_vs_ort_vs_ai100( - model_name=model_name["model_name"], seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path - ) +# @pytest.mark.on_qaic +# @pytest.mark.parametrize("model", embed_test_models[:1]) +# def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model): +# """ +# Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. +# """ +# check_embed_pytorch_vs_ort_vs_ai100(model_name=model["model_name"], seq_len=[32, 20], n_layer=1) -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.parametrize("model", embed_test_models) -def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling_qnn(model): - """ - QNN Compilation path test. - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling. - """ - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) - - check_embed_pytorch_vs_ort_vs_ai100( - model_name=model["model_name"], - seq_len=32, - n_layer=1, - pooling=model["pooling"], - enable_qnn=True, - qnn_config=qnn_config_json_path, - ) +########## QNN TESTS ############## -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.parametrize("model", [embed_test_models[0]]) -def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len_qnn(model): - """ - QNN Compilation path test. - Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. - """ - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) - check_embed_pytorch_vs_ort_vs_ai100( - model_name=model["model_name"], seq_len=[32, 20], n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path - ) +# @pytest.mark.on_qaic +# @pytest.mark.qnn +# @pytest.mark.parametrize("model_name", embed_test_models) +# def test_embed_model_pytorch_vs_onnx_vs_ai100_qnn(model_name): +# """ +# QNN Compilation path test. +# Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output. +# """ +# qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") +# create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + +# check_embed_pytorch_vs_ort_vs_ai100( +# model_name=model_name["model_name"], seq_len=32, n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path +# ) + + +# @pytest.mark.on_qaic +# @pytest.mark.qnn +# @pytest.mark.parametrize("model", embed_test_models) +# def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling_qnn(model): +# """ +# QNN Compilation path test. +# Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with pooling. +# """ +# qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") +# create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + +# check_embed_pytorch_vs_ort_vs_ai100( +# model_name=model["model_name"], +# seq_len=32, +# n_layer=1, +# pooling=model["pooling"], +# enable_qnn=True, +# qnn_config=qnn_config_json_path, +# ) + + +# @pytest.mark.on_qaic +# @pytest.mark.qnn +# @pytest.mark.parametrize("model", [embed_test_models[0]]) +# def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len_qnn(model): +# """ +# QNN Compilation path test. +# Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len. +# """ +# qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") +# create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) + +# check_embed_pytorch_vs_ort_vs_ai100( +# model_name=model["model_name"], seq_len=[32, 20], n_layer=1, enable_qnn=True, qnn_config=qnn_config_json_path +# )