412 changes: 412 additions & 0 deletions PyTorch/Deploying_OH/Benchmark.py

Large diffs are not rendered by default.

40 changes: 40 additions & 0 deletions PyTorch/Deploying_OH/Dockerfile-1.22.0-ub24-oh-v1.18.1
@@ -0,0 +1,40 @@
# Parameterize base image components
ARG DOCKER_URL=vault.habana.ai/gaudi-docker
ARG VERSION=1.22.0
ARG BASE_NAME=ubuntu24.04
ARG PT_VERSION=2.7.1
ARG REVISION=latest
ARG REPO_TYPE=habanalabs

FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION}
# Parameterize the tag/commit for the optimum-habana checkout
ARG OH_FORK_COMMIT=v1.18.1
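# Pin the HabanaAI DeepSpeed fork installed below; the tag is assumed to track the Gaudi software release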
ARG DEEPSPEED_COMMIT=1.22.0

# Avoid OpenMPI shared-memory (vader) single-copy issues inside containers
ENV OMPI_MCA_btl_vader_single_copy_mechanism=none

RUN apt update && \
apt install -y gettext moreutils jq && \
apt install -y npm && \
npm install n -g && \
ln -sf /usr/bin/python3 /usr/bin/python

RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --upgrade-strategy eager optimum[habana] && \
python3 -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@${DEEPSPEED_COMMIT}

WORKDIR /root

RUN git clone https://github.com/huggingface/optimum-habana && \
cd optimum-habana && \
git checkout ${OH_FORK_COMMIT}

WORKDIR /root/optimum-habana/examples/text-generation
RUN python3 -m pip install -r requirements.txt && \
python3 -m pip install -r requirements_lm_eval.txt

COPY . .
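# Default to the 1.20-series Gaudi config and HQT measurement archive under generic names (assumed to be the files Benchmark.py loads)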
COPY Gaudi_1-20.json Gaudi.json
COPY HQT_1-20.zip HQT.zip

RUN python3 -m pip install -r requirements_bm.txt
510 changes: 510 additions & 0 deletions PyTorch/Deploying_OH/Gaudi_1-19.json

Large diffs are not rendered by default.

706 changes: 706 additions & 0 deletions PyTorch/Deploying_OH/Gaudi_1-20.json

Large diffs are not rendered by default.

Binary file added PyTorch/Deploying_OH/HQT_1-19.zip
Binary file not shown.
Binary file added PyTorch/Deploying_OH/HQT_1-20.zip
Binary file not shown.
68 changes: 68 additions & 0 deletions PyTorch/Deploying_OH/README.md
@@ -0,0 +1,68 @@
# TODO - NEEDS REWRITE


Rough Notes:
BUILD_ARGS="--build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy"
IMAGE_TAG=oh-textgen-1.22.0-ub24-oh-v1.18.1-deepseek-1.22.0
docker build $BUILD_ARGS -f Dockerfile-1.22.0-ub24-oh-v1.18.1 -t $IMAGE_TAG .

# OLD CONTENT BELOW
# Benchmarking Hugging Face Pipelines with fp8 on Intel® Gaudi® AI Processor
This section contains an example of how to quantize Hugging Face models from fp32 to fp8 with Intel Gaudi and the Optimum for Intel Gaudi (aka Optimum Habana) library. A simple benchmarking Python script with a related Dockerfile is also provided. Hugging Face pipelines take advantage of Hugging Face Tasks in transformer models, such as text generation, translation, question answering, and more. You can read more about Hugging Face pipelines on their main page [here](https://huggingface.co/docs/transformers/main_classes/pipelines).

A Jupyter notebook with fp8 instructions and a Benchmark.py script for easy benchmarking are provided.
For learning purposes, the Jupyter notebook also includes bare-metal instructions to get started.
For Gaudi benchmarking purposes, the Benchmark.py script runs Llama2 70B, Llama3.1 8B, Llama3.1 70B, Llama3.3 70B, and Llama3.1 405B inside Docker and generates a report comparing performance against the published numbers in [Gaudi Model Performance](https://www.intel.com/content/www/us/en/developer/platform/gaudi/model-performance.html).

## Requirements
Please make sure to follow [Driver Installation](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html) to install the Gaudi driver on the system.
### Jupyter Notebook
Please follow the [README](https://github.com/intel-ai-tce/Gaudi-tutorials/blob/OH_benchmark/PyTorch/Hugging_Face_pipelines/README.md) to set up the environment for the Jupyter notebook.
### Benchmark Python Script

To use the Dockerfile provided for this sample, please follow [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html) to set up the Habana runtime for Docker images.
#### Docker Build
To build the image from the provided Dockerfile, run the following command to create the optimum-habana-text-gen image.
```bash
docker build --no-cache -t optimum-habana-text-gen:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f Dockerfile-1.22.0-ub24-oh-v1.18.1 .
```
```
#### Docker Run
After the image is built, run the following command to start a Docker instance; you will land in the text-generation folder inside the container.
```bash
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=ALL --privileged=true --net=host --ipc=host optimum-habana-text-gen:latest
```
> [!NOTE]
> The Hugging Face model files can be quite large, so we recommend using an external disk for the Hugging Face hub folder. \
> Export the HF_HOME environment variable to point at your external disk, then mount that path into the Docker instance. \
> e.g. `-e HF_HOME=/mnt/huggingface -v /mnt:/mnt`
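
As a concrete sketch, assuming the external disk is mounted at `/mnt` (adjust the paths to your layout), the run command above becomes:
```bash
# Keep the Hugging Face hub cache on an external disk and expose it to the container
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
  -e HF_HOME=/mnt/huggingface -v /mnt:/mnt \
  --cap-add=ALL --privileged=true --net=host --ipc=host \
  optimum-habana-text-gen:latest
```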

## How to Run the Benchmark Script
The benchmark script runs all the models with different input lengths, output lengths, and batch sizes, and generates a report comparing the results against the published numbers in [Gaudi Model Performance](https://www.intel.com/content/www/us/en/developer/platform/gaudi/model-performance.html).
### Gaudi3
Different JSON files are provided for different Gaudi software versions (e.g. 1.19 and 1.20) on Gaudi3.
To benchmark on a machine with 8 Gaudi3 cards, run the following command inside the Docker instance.
```bash
python3 Benchmark.py
```
### Gaudi2
To benchmark on a machine with 8 Gaudi2 cards, run the following command inside the Docker instance instead.
```bash
GAUDI_VER=2 python3 Benchmark.py
```

### Skip Tests
To skip the tests for specific models, set the corresponding environment variable to 1.
For example, skip the Llama3.1 405B model test with the following command.
```bash
skip_llama31_405b=1 python3 Benchmark.py
```
Here are all the supported environment variables for skipping tests:
`skip_llama2_70b`, `skip_llama31_8b`, `skip_llama31_70b`, `skip_llama33_70b`, `skip_llama31_405b`
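
These variables can be combined to skip several models in one run, for example:
```bash
# Skip both the Llama2 70B and Llama3.1 405B tests
skip_llama2_70b=1 skip_llama31_405b=1 python3 Benchmark.py
```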

### HTML Report
An HTML report will be generated under a timestamped folder and will look like the diagram below.
> NOTE: There is also a [PerfSpect](https://github.com/intel/PerfSpect) report with detailed system and Gaudi information.

![image](https://github.com/user-attachments/assets/6fdd36f6-b563-4339-8339-0b55e4916bf8)


6 changes: 6 additions & 0 deletions PyTorch/Deploying_OH/requirements_bm.txt
@@ -0,0 +1,6 @@
openpyxl
datasets
pandas
sentencepiece
transformers
matplotlib
208 changes: 208 additions & 0 deletions PyTorch/Deploying_OH/run_text_generation_with_fp8.ipynb
@@ -0,0 +1,208 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "12fb7581-a09a-404f-a5ac-5a77996eafea",
"metadata": {},
"source": [
"Copyright (c) 2024 Habana Labs, Ltd. an Intel Company.\n",
"SPDX-License-Identifier: Apache-2.0\n"
]
},
{
"cell_type": "markdown",
"id": "0157485d-f705-405f-9997-bdcadbc85218",
"metadata": {},
"source": [
"### Running Hugging Face with FP8 on Intel® Gaudi® - Text Generation\n",
"\n",
"This example shows how to quantize a Hugging Face models from fp32 to fp8 with Intel Gaudi and the Optimum for Intel Gaudi (aka Optimum Habana) library.\n",
"\n",
"Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B, phi-2 and Llama3-405B in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`.\n",
"\n",
"More information on enabling fp8 in SynapseAI is available here:\n",
"https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "f35f62c6-a93f-4d11-bff5-12d38b0f4b8b",
"metadata": {},
"source": [
"#### Install the Hugging Face Optimum Habana Library"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43d1f096-17ee-451e-a356-1e5b2162a83d",
"metadata": {},
"outputs": [],
"source": [
"#%cd ~/Gaudi-tutorials/PyTorch/Hugging_Face_pipelines/Benchmarking_on_Optimum-habana_with_fp8\n",
"%pip install optimum-habana==1.16.0"
]
},
{
"cell_type": "markdown",
"id": "a257d27b-c742-4290-a477-a0d48f5797bb",
"metadata": {},
"source": [
"#### Download the Hugging Face Optimum Habana"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f8f0405-70ee-4225-acf4-91340678c03c",
"metadata": {},
"outputs": [],
"source": [
"!git clone -b v1.16.0 https://github.com/huggingface/optimum-habana.git;cd optimum-habana/examples/text-generation"
]
},
{
"cell_type": "markdown",
"id": "60e181d2-fa9e-4193-90ca-1ca5d6668101",
"metadata": {},
"source": [
"#### Install Required packages"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27f25b4d-47a0-42d7-af30-4216f16d2ab2",
"metadata": {},
"outputs": [],
"source": [
"!pip install -r requirements.txt;pip install -r requirements_lm_eval.txt;pip install git+https://github.com/HabanaAI/[email protected]"
]
},
{
"cell_type": "markdown",
"id": "6d5c7a14-3d38-4258-a8de-339bde866b7e",
"metadata": {},
"source": [
"#### Measure the tensor quantization statistics \n",
"Here is an example to measure the tensor quantization statistics on Llama3-8B with 1 card: \n",
"By changing model_name_or_path, a different llama model could be applied. \n",
"By changing world_size, multiple gaudi cards could be used for measurement. "
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "afb90147-891d-4ad0-b288-40e1c6673946",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.8/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py:252: UserWarning: Device capability of hccl unspecified, assuming `cpu` and `cuda`. Please specify it via the `devices` argument of `register_backend`.\n",
" warnings.warn(\n"
]
}
],
"source": [
"!HF_DATASETS_TRUST_REMOTE_CODE=true QUANT_CONFIG=./quantization_config/maxabs_measure.json python ../gaudi_spawn.py \\\n",
"--use_deepspeed --world_size 1 run_lm_eval.py \\\n",
"-o acc_llama3_8b_bs1_quant.txt \\\n",
"--model_name_or_path meta-llama/Llama-3.1-8B-Instruct \\\n",
"--warmup 0 \\\n",
"--use_hpu_graphs \\\n",
"--use_kv_cache \\\n",
"--trim_logits \\\n",
"--batch_size 1 \\\n",
"--bucket_size=128 \\\n",
"--bucket_internal \\\n",
"--trust_remote_code \\\n",
"--tasks hellaswag lambada_openai piqa winogrande \\\n",
"--bf16 \\\n",
"--attn_softmax_bf16 \\\n",
"--use_flash_attention \\\n",
"--flash_attention_recompute \\\n",
"--flash_attention_causal_mask"
]
},
{
"cell_type": "markdown",
"id": "0b55e1ae-c373-4bed-a1a8-79ffe6bb0339",
"metadata": {},
"source": [
"#### Quantize and run the fp8 model\n",
"Here is an example to quantize the model based on previous measurements for LLama3.1 8B model: \n",
"By changing model_name_or_path, a different llama model could be applied. \n",
"By changing world_size, multiple gaudi cards could be used for measurement. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7edc56bc-73a9-4ccf-bfce-19a962f64efb",
"metadata": {},
"outputs": [],
"source": [
"!HF_DATASETS_TRUST_REMOTE_CODE=true QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \\\n",
"--use_deepspeed --world_size 1 run_generation.py \\\n",
"--model_name_or_path meta-llama/Llama-3.1-8B-Instruct \\\n",
"--attn_softmax_bf16 \\\n",
"--use_hpu_graphs \\\n",
"--use_kv_cache \\\n",
"--limit_hpu_graphs \\\n",
"--max_input_tokens 128 \\\n",
"--max_new_tokens 128 \\\n",
"--batch_size 1536 \\\n",
"--bucket_size=128 \\\n",
"--bucket_internal \\\n",
"--attn_batch_split 2 \\\n",
"--bf16 \\\n",
"--reuse_cache \\\n",
"--trim_logits \\\n",
"--use_flash_attention \\\n",
"--flash_attention_recompute \\\n",
"--flash_attention_causal_mask"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "088a24bd-2365-46c3-99bc-2259e2a1ffd6",
"metadata": {},
"outputs": [],
"source": [
"exit()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}