diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb
index 74e8097bb..350f8bc31 100644
--- a/notebooks/QEfficientGPT2.ipynb
+++ b/notebooks/QEfficientGPT2.ipynb
@@ -33,6 +33,9 @@
    "outputs": [],
    "source": [
     "# Initiate the Original Transformer model\n",
+    "# Initiate the tokenizer for transformers library\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
     "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
@@ -92,11 +95,7 @@
     "# Compile the model for provided compilation arguments\n",
     "# Please use platform SDK to Check num_cores for your card.\n",
     "\n",
-    "qeff_model.compile(\n",
-    "    num_cores=14,\n",
-    "    mxfp6=True,\n",
-    "    device_group=[0],\n",
-    ")"
+    "qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
    ]
   },
   {
@@ -116,8 +115,8 @@
    "source": [
     "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n",
     "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
-    "\n",
-    "qeff_model.generate(prompts=[\"My name is\"])"
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
    ]
   }
  ],
diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb
index d1a1f3c5f..3bb99ecbc 100644
--- a/notebooks/QEfficientMPT.ipynb
+++ b/notebooks/QEfficientMPT.ipynb
@@ -32,6 +32,8 @@
    "outputs": [],
    "source": [
     "# Initiate the Original Transformer model\n",
+    "# Initiate the tokenizer for transformers library\n",
+    "from transformers import AutoTokenizer\n",
     "\n",
     "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
     "\n",
     "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
@@ -91,11 +93,7 @@
     "# Compile the model for provided compilation arguments\n",
     "# Please use platform SDK to Check num_cores for your card.\n",
     "\n",
-    "qeff_model.compile(\n",
-    "    num_cores=14,\n",
-    "    mxfp6=True,\n",
-    "    device_group=[0],\n",
-    ")"
+    "qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
    ]
   },
   {
@@ -116,7 +114,8 @@
     "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n",
     "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
     "\n",
-    "qeff_model.generate(prompts=[\"My name is\"])"
+    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
    ]
   }
  ],
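
Taken together, both notebooks end up with the same three-step flow. Below is a minimal sketch of that flow, assuming the QEfficient package and a Cloud AI 100 device are available; `model_name = "gpt2"` is illustrative (the notebooks define it in an earlier cell, and the MPT notebook would use an MPT checkpoint), and `from_pretrained` stands in for however the surrounding notebook cells load the model.

# Sketch of the updated notebook flow after this diff.
# Assumptions: QEfficient is installed and a Cloud AI 100 device is present;
# model_name = "gpt2" is illustrative, not taken from the diff itself.
from transformers import AutoTokenizer

from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM

model_name = "gpt2"

# Load the model through the QEfficient wrapper (mirrors the notebooks' setup cell).
qeff_model = AutoModelForCausalLM.from_pretrained(model_name)

# Compile for Cloud AI 100: mxfp6_matmul replaces the older mxfp6 flag, and
# compile() no longer takes device_group in this form.
qeff_model.compile(num_cores=14, mxfp6_matmul=True)

# generate() now expects the tokenizer to be passed in explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
qeff_model.generate(prompts=["My name is"], tokenizer=tokenizer)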