Merged
Commits
30 commits
b30eade: deepseekv3 (kylesayrs, Jun 19, 2025)
a957f2f: remove dreg (kylesayrs, Jun 19, 2025)
2fd2a25: reformat example (kylesayrs, Jun 19, 2025)
b8b217c: wip: clean up moe examples (kylesayrs, Jun 19, 2025)
43bc91d: remove deepseek2.5 for now (kylesayrs, Jun 19, 2025)
7d8ed36: update readme (kylesayrs, Jun 19, 2025)
b7273a9: infer model device with optional override (kylesayrs, Jun 19, 2025)
afebe2e: handle nullable dataset_args (kylesayrs, Jun 20, 2025)
ab3aa3e: update docstrings, comments (kylesayrs, Jun 20, 2025)
e9e30c3: rename files, update examples tests (kylesayrs, Jun 20, 2025)
6bf5acb: rebase on main (kylesayrs, Jun 20, 2025)
e77a31b: clean examples (kylesayrs, Jun 20, 2025)
366ac25: revert examples changes (kylesayrs, Jun 20, 2025)
c44da34: revert extra examples (kylesayrs, Jun 20, 2025)
2db2789: revert examples changes (kylesayrs, Jun 20, 2025)
0dc2381: remove extra examples (kylesayrs, Jun 20, 2025)
b70aba7: revert examples tests changes (kylesayrs, Jun 20, 2025)
5e5657b: Revert "revert extra examples" (kylesayrs, Jun 20, 2025)
735c317: Merge branch 'kylesayrs/deepseek-v3' into kylesayrs/cleanup-moe-examples (kylesayrs, Jun 20, 2025)
4812350: clean up examples (kylesayrs, Jun 20, 2025)
626000d: merge with main src (kylesayrs, Jun 26, 2025)
45f6391: Merge remote-tracking branch 'origin' into kylesayrs/cleanup-moe-exam… (kylesayrs, Jun 26, 2025)
863377e: remove extra file (kylesayrs, Jun 26, 2025)
2f5de10: convert to fp8 examples (kylesayrs, Jun 26, 2025)
11d23fa: Merge remote-tracking branch 'origin' into kylesayrs/cleanup-moe-exam… (kylesayrs, Jul 29, 2025)
93f69f0: remove 25 3 deepseek examples (kylesayrs, Jul 29, 2025)
da3680f: add r1 test, which is skipped (kylesayrs, Jul 29, 2025)
de58207: fix readme (kylesayrs, Jul 29, 2025)
035298a: Merge branch 'main' into kylesayrs/cleanup-moe-examples (dsikka, Jul 30, 2025)
18f9545: Merge branch 'main' into kylesayrs/cleanup-moe-examples (kylesayrs, Jul 31, 2025)
33 changes: 16 additions & 17 deletions examples/quantizing_moe/README.md
@@ -17,17 +17,17 @@ pip install -e .
The provided example script demonstrates an end-to-end process for applying the quantization algorithm:

```bash
python3 mixtral_moe_w8a8_fp8.py
python3 mixtral_example.py
```

## Creating a Quantized MoE Model

This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `open_platypus` dataset.
This example leverages `llm-compressor` and `compressed-tensors` to create an FP8-quantized `Mixtral-8x7B-Instruct-v0.1` model. The model is calibrated and trained using the `ultrachat_200k` dataset.

You can follow the detailed steps below or simply run the example script with:

```bash
python mixtral_moe_w8a8_fp8.py
python mixtral_example.py
```

### Step 1: Select a Model, Dataset, and Recipe
@@ -61,7 +61,6 @@ oneshot(
recipe=recipe,
save_compressed=True,
output_dir=output_dir,

max_seq_length=2048,
num_calibration_samples=512,
)
@@ -74,7 +73,7 @@ NOTE: Only per-tensor quantization is supported in vLLM as of now (`vllm==0.6.1`

The repository supports multiple quantization techniques configured via a recipe. Supported strategies include `tensor`, `group`, and `channel` quantization.

In the above example, FP8 per-tensor quantization is used as specified by the `FP8` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.
In the above example, quantization is specified by the `FP8` scheme. For other preset schemes, refer to the [quantization schemes](https://github.com/neuralmagic/compressed-tensors/blob/main/src/compressed_tensors/quantization/quant_scheme.py) in the `compressed-tensors` library.

A custom scheme can also be specified using `config_groups`:

@@ -84,18 +83,18 @@ A custom scheme can also be specified using `config_groups`:
from llmcompressor.modifiers.quantization.gptq import GPTQModifier

config_groups = {
"group_0": {
"targets": ["Linear"],
"input_activations": None,
"output_activations": None,
"weights": {
"num_bits": 8,
"type": "int",
"symmetric": true,
"strategy": "group",
"group_size": 128,
}
}
"group_0": {
"targets": ["Linear"],
"input_activations": None,
"output_activations": None,
"weights": {
"num_bits": 8,
"type": "int",
"symmetric": True,
"strategy": "group",
"group_size": 128,
}
}
}

recipe = GPTQModifier(config_groups=config_groups)
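The README hunk above lists `tensor`, `group`, and `channel` as supported strategies but only shows a `group` example. As a rough illustration (not part of this diff), a per-channel INT8 weight scheme can be sketched with the same `config_groups` format, assuming `compressed-tensors` accepts `strategy: "channel"` without a `group_size`:

```python
# Illustrative sketch only, not part of this PR: per-channel INT8 weights
# expressed in the same config_groups format shown in the README above.
from llmcompressor.modifiers.quantization.gptq import GPTQModifier

config_groups = {
    "group_0": {
        "targets": ["Linear"],
        "input_activations": None,
        "output_activations": None,
        "weights": {
            "num_bits": 8,
            "type": "int",
            "symmetric": True,
            "strategy": "channel",  # one scale per output channel; no group_size needed
        },
    }
}

recipe = GPTQModifier(config_groups=config_groups)
```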
125 changes: 0 additions & 125 deletions examples/quantizing_moe/deepseek_moe_w4a16.py

This file was deleted.

101 changes: 0 additions & 101 deletions examples/quantizing_moe/deepseek_moe_w8a8_int8.py

This file was deleted.

8 changes: 0 additions & 8 deletions examples/quantizing_moe/deepseek_recipe_w4a16.yaml

This file was deleted.

@@ -1,28 +1,23 @@
import torch
from datasets import load_dataset
from packaging.version import Version
from transformers import AutoModelForCausalLM, AutoTokenizer, __version__
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: transformers 4.49.0 has an attribute error with DeepSeek.
# Please consider either downgrading your transformers version to a
# previous version or upgrading to a version where this bug is fixed

# select a Mixture of Experts model for quantization
MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
MODEL_ID = "mistralai/Mixtral-8x7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, torch_dtype="auto", trust_remote_code=True
MODEL_ID, torch_dtype=torch.bfloat16, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
# it's recommended to use more calibration samples for MoE models so each expert is hit
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 2048
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048


@@ -56,16 +51,17 @@ def tokenize(sample):

ds = ds.map(tokenize, remove_columns=ds.column_names)

# define a llmcompressor recipe for FP8 W8A8 quantization
# Configure the quantization algorithm to run.
# since the MoE gate layers are sensitive to quantization, we add them to the ignore
# list so they remain at full precision
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8",
ignore=["lm_head", "re:.*mlp.gate$"],
),
]
recipe = QuantizationModifier(
scheme="FP8",
targets="Linear",
ignore=[
"lm_head",
"re:.*block_sparse_moe.gate", # does not quantize well
],
)

oneshot(
model=model,
@@ -76,22 +72,13 @@ def tokenize(sample):
trust_remote_code_model=True,
)

# Confirm generations of the quantized model look sane.
# Generation is broken for deepseek models when using the latest transformers package
if Version(__version__) < Version("4.48"):
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
SAMPLE_INPUT = ["I love quantization because"]
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
inputs = tokenizer(SAMPLE_INPUT, return_tensors="pt", padding=True).to(model.device)
output = model.generate(**inputs, max_length=50)
text_output = tokenizer.batch_decode(output)
print(text_output)
else:
print(
"WARNING: cannot perform sample generation of "
"deepseek models with transformers >= 4.48"
)
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to("cuda") for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8"
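Once the example script saves the compressed checkpoint, it can be loaded for inference with vLLM. The sketch below is not part of this PR; it assumes the save path produced by the script above (`Mixtral-8x7B-Instruct-v0.1-FP8`) and enough GPU memory to host Mixtral-8x7B:

```python
# Rough sketch of serving the saved FP8 compressed-tensors checkpoint with vLLM.
# The path is assumed to match the SAVE_DIR computed in the example above.
from vllm import LLM, SamplingParams

llm = LLM(model="Mixtral-8x7B-Instruct-v0.1-FP8")
params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["I love quantization because"], params)
print(outputs[0].outputs[0].text)
```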