Add quality check to CI and fix existing errors #408

Open · wants to merge 10 commits into base: main
29 changes: 29 additions & 0 deletions .github/workflows/quality-check.yaml
@@ -0,0 +1,29 @@
name: Quality Checks
on:
push:
branches:
- main
- 'release/*'
pull_request:
branches:
- main
- 'release/*'

jobs:
quality-check:
runs-on: ubuntu-24.04
steps:
- uses: actions/setup-python@v5
with:
python-version: '3.10'
- uses: actions/checkout@v4
with:
fetch-depth: 0
fetch-tags: true
- name: Set Env
run: |
pip3 install --upgrade pip && pip3 install --upgrade setuptools
- name: "⚙️ Install dependencies"
run: pip3 install .[dev]
- name: "🧹 Running quality checks"
run: make quality
3 changes: 3 additions & 0 deletions .github/workflows/test-check.yaml
@@ -4,9 +4,11 @@ on:
push:
branches:
- main
- 'release/*'
pull_request:
branches:
- main
- 'release/*'

jobs:
python-tests:
@@ -26,3 +28,4 @@ jobs:
run: pip3 install .[dev,accelerate]
- name: "🔬 Running tests"
run: make test

1 change: 1 addition & 0 deletions setup.cfg
@@ -5,6 +5,7 @@ ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
sections = FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
skip = src/compressed_tensors/version.py

line_length = 88
lines_after_imports = 2
@@ -42,7 +42,6 @@
load_pretrained_quantization_parameters,
)
from compressed_tensors.quantization.lifecycle import expand_target_names
from compressed_tensors.quantization.utils import is_module_quantized
from compressed_tensors.utils import (
align_module_device,
delete_offload_parameter,
@@ -195,7 +194,7 @@ def from_pretrained_model(

@staticmethod
def parse_sparsity_config(
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
) -> Union[Dict[str, Any], None]:
"""
Parse sparsity config from quantization/compression config. Sparsity
@@ -215,7 +214,7 @@ def parse_sparsity_config(

@staticmethod
def parse_quantization_config(
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"]
compression_config: Union[Dict[str, Any], "CompressedTensorsConfig"],
) -> Union[Dict[str, Any], None]:
"""
Parse quantization config from quantization/compression config. The
@@ -390,7 +389,6 @@ def compress_model(self, model: Module):
)

for prefix, module in tqdm(model.named_modules(), desc="Compressing model"):

if prefix in module_to_scheme or prefix in sparse_compression_targets:
module_device = get_execution_device(module)
is_meta = module_device.type == "meta"
@@ -562,11 +560,12 @@ def decompress(self, model_path: str, model: Module):
:param model_path: path to compressed weights
:param model: pytorch model to load decompressed weights into

Note: decompress makes use of both _replace_sparsity_weights and _replace_weights
The variations in these methods are a result of the subtle variations between the sparsity
and quantization compressors. Specifically, quantization compressors return not just the
decompressed weight, but the quantization parameters (e.g scales, zero_point) whereas sparsity
compressors only return the decompressed weight.
Note: decompress makes use of both _replace_sparsity_weights and
_replace_weights. The variations in these methods are a result of the subtle
variations between the sparsity and quantization compressors. Specifically,
quantization compressors return not just the decompressed weight, but the
quantization parameters (e.g scales, zero_point) whereas sparsity compressors
only return the decompressed weight.

"""
model_path = get_safetensors_folder(model_path)
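
Illustrative sketch (not part of this PR; the generator names and item shapes below are assumptions) of the asymmetry the note above describes:

import torch

def toy_sparsity_generator():
    # a sparsity compressor yields only the decompressed weight per module
    yield "model.layers.0.mlp.down_proj", torch.randn(16, 16)

def toy_quantization_generator():
    # a quantization compressor also yields the quantization parameters
    yield "model.layers.0.mlp.down_proj", {
        "weight": torch.randn(16, 16),
        "weight_scale": torch.ones(16, 1),
        "weight_zero_point": torch.zeros(16, 1, dtype=torch.int8),
    }

Under these assumptions, _replace_sparsity_weights only swaps a single parameter per module, while _replace_weights must also install the returned scales and zero points, which is why decompress keeps two code paths.
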
@@ -598,18 +597,17 @@ def decompress(self, model_path: str, model: Module):
with override_quantization_status(
self.quantization_config, QuantizationStatus.FROZEN
):

names_to_scheme = apply_quantization_config(
model, self.quantization_config
)
# Load activation scales/zp or any other quantization parameters
# Conditionally load the weight quantization parameters if we have a dense compressor
# Or if a sparsity compressor has already been applied
# Conditionally load the weight quantization parameters if we have a
# dense compressor or if a sparsity compressor has already been applied
load_pretrained_quantization_parameters(
model,
model_path,
# TODO: all weight quantization params will be moved to the compressor in a follow-up
# including initialization
# TODO: all weight quantization params will be moved to the
# compressor in a follow-up including initialization
load_weight_quantization=(
sparse_decompressed
or isinstance(self.quantization_compressor, DenseCompressor)
@@ -695,7 +693,6 @@ def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
:param model: The model whose weights are to be updated.
"""
for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):

split_name = name.split(".")
prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
module = operator.attrgetter(prefix)(model)
@@ -731,9 +728,10 @@ def _replace_weights(self, dense_weight_generator, model: Module):
for param_name, param_data in data.items():
if hasattr(module, param_name):
# If compressed, will have an incorrect dtype for transformers >4.49
# TODO: we can also just skip initialization of scales/zp if in decompression in init
# to be consistent with loading which happens later as well
# however, update_data does a good shape check - should be moved to the compressor
# TODO: we can also just skip initialization of scales/zp if in
# decompression in init to be consistent with loading which happens
# later as well however, update_data does a good shape check -
# should be moved to the compressor
if param_name == "weight":
delattr(module, param_name)
requires_grad = param_data.dtype in (
@@ -24,7 +24,6 @@
get_nested_weight_mappings,
merge_names,
)
from compressed_tensors.utils.safetensors_load import match_param_name
from safetensors import safe_open
from torch import Tensor
from tqdm import tqdm
@@ -107,7 +106,8 @@ def compress(
compressed_dict[name] = value.to(compression_device)
continue

# compress values on meta if loading from meta otherwise on cpu (memory movement too expensive)
# compress values on meta if loading from meta otherwise on cpu (memory
# movement too expensive)
module_path = prefix[:-1] if prefix.endswith(".") else prefix
quant_args = names_to_scheme[module_path].weights
compressed_values = self.compress_weight(
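
Illustrative sketch (helper name and shapes are hypothetical) of the device choice described in the comment above: values loaded on the meta device are compressed there, everything else is compressed on CPU to avoid expensive memory movement.

import torch

def pick_compression_device(value: torch.Tensor) -> str:
    # compress on meta when the weight was loaded on meta, otherwise on cpu
    return "meta" if value.device.type == "meta" else "cpu"

assert pick_compression_device(torch.empty(8, 8, device="meta")) == "meta"
assert pick_compression_device(torch.zeros(8, 8)) == "cpu"
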
@@ -15,7 +15,6 @@

from typing import Dict, Optional, Tuple

import numpy
import torch
from compressed_tensors.compressors.base import BaseCompressor
from compressed_tensors.compressors.quantized_compressors.base import (
@@ -71,7 +70,6 @@ def compress_weight(
zero_point: Optional[torch.Tensor] = None,
g_idx: Optional[torch.Tensor] = None,
) -> Dict[str, torch.Tensor]:

quantized_weight = quantize(
x=weight,
scale=scale,
@@ -91,7 +89,6 @@ def decompress_weight(
compressed_data: Dict[str, Tensor],
quantization_args: Optional[QuantizationArgs] = None,
) -> torch.Tensor:

weight = compressed_data["weight_packed"]
scale = compressed_data["weight_scale"]
global_scale = compressed_data["weight_global_scale"]
@@ -154,14 +151,16 @@ def pack_fp4_to_uint8(x: torch.Tensor) -> torch.Tensor:
[0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=torch.float32
)


# reference: https://github.com/vllm-project/vllm/pull/16362
def unpack_fp4_from_uint8(
a: torch.Tensor, m: int, n: int, dtype: Optional[torch.dtype] = torch.bfloat16
) -> torch.Tensor:
"""
Unpacks uint8 values into fp4. Each uint8 consists of two fp4 values
(i.e. first four bits correspond to one fp4 value, last four corresond to a consecutive
fp4 value). The bits represent an index, which are mapped to an fp4 value.
(i.e. first four bits correspond to one fp4 value, last four correspond to a
consecutive fp4 value). The bits represent an index, which are mapped to an fp4
value.

:param a: tensor to unpack
:param m: original dim 0 size of the unpacked tensor
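
Illustrative sketch (not the library's implementation; the nibble order and sign-bit layout here are assumptions) of how two fp4 values unpack from one uint8 via the lookup table of values shown above:

import torch

# E2M1 magnitude lookup, matching the table of values shown above
FP4_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def toy_unpack(packed: torch.Tensor) -> torch.Tensor:
    # each uint8 holds two 4-bit indices; low-nibble-first is an assumption here
    low = packed & 0x0F
    high = (packed >> 4) & 0x0F
    idx = torch.stack([low, high], dim=-1).flatten()
    # low 3 bits index the magnitude table, the 4th bit is the sign
    sign = 1.0 - 2.0 * ((idx & 0x8) >> 3).float()
    return sign * FP4_VALUES[(idx & 0x7).long()]

print(toy_unpack(torch.tensor([0x21], dtype=torch.uint8)))  # tensor([0.5000, 1.0000])
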
@@ -14,7 +14,6 @@
import math
from typing import Dict, Literal, Optional, Tuple, Union

import numpy as np
import torch
from compressed_tensors.compressors.base import BaseCompressor
from compressed_tensors.compressors.quantized_compressors.base import (
@@ -135,7 +134,8 @@ def compress_weight(
compressed_dict["weight_shape"] = weight_shape
compressed_dict["weight_packed"] = packed_weight

# We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
# We typically don't compress zp; apart from when using the packed_compressor
# and when storing group/channel zp
if not quantization_args.symmetric and quantization_args.strategy in [
QuantizationStrategy.GROUP.value,
QuantizationStrategy.CHANNEL.value,
@@ -166,7 +166,8 @@ def decompress_weight(
num_bits = quantization_args.num_bits
unpacked = unpack_from_int32(weight, num_bits, original_shape)

# NOTE: this will fail decompression as we don't currently handle packed zp on decompression
# NOTE: this will fail decompression as we don't currently handle packed zp on
# decompression
if not quantization_args.symmetric and quantization_args.strategy in [
QuantizationStrategy.GROUP.value,
QuantizationStrategy.CHANNEL.value,
@@ -13,7 +13,7 @@
# limitations under the License.

from dataclasses import dataclass
from typing import Dict, Generator, List, Tuple, Union
from typing import Dict, List, Tuple, Union

import torch
from compressed_tensors.compressors.base import BaseCompressor
@@ -48,7 +48,7 @@ class Marlin24Compressor(BaseCompressor):

@staticmethod
def validate_quant_compatability(
names_to_scheme: Dict[str, QuantizationScheme]
names_to_scheme: Dict[str, QuantizationScheme],
) -> bool:
"""
Checks if every quantized module in the model is compatible with Marlin24
8 changes: 4 additions & 4 deletions src/compressed_tensors/quantization/lifecycle/apply.py
@@ -73,14 +73,14 @@ def load_pretrained_quantization_parameters(
Loads the quantization parameters (scale and zero point) from model_name_or_path to
a model that has already been initialized with a quantization config.

NOTE: Will always load inputs/output parameters.
Will conditioanlly load weight parameters, if load_weight_quantization is set to True.
NOTE: Will always load inputs/output parameters. Will conditionally load weight
parameters, if load_weight_quantization is set to True.

:param model: model to load pretrained quantization parameters to
:param model_name_or_path: Hugging Face stub or local folder containing a quantized
model, which is used to load quantization parameters
:param load_weight_quantization: whether or not the weight quantization parameters shoud
be laoded
:param load_weight_quantization: whether or not the weight quantization parameters
should be loaded
"""
model_path = get_safetensors_folder(model_name_or_path)
mapping = get_quantization_parameter_to_path_mapping(model_path)
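
Illustrative usage sketch only (the wrapper, the checkpoint path, and the choice to also load weight parameters are assumptions):

import torch
from compressed_tensors.quantization.lifecycle.apply import (
    load_pretrained_quantization_parameters,
)

def reload_quantization_parameters(model: torch.nn.Module, checkpoint_dir: str) -> None:
    # model must already be initialized with a quantization config;
    # input/output parameters are always loaded, weight parameters only on request
    load_pretrained_quantization_parameters(
        model,
        checkpoint_dir,
        load_weight_quantization=True,
    )
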
7 changes: 4 additions & 3 deletions src/compressed_tensors/quantization/lifecycle/forward.py
@@ -200,7 +200,8 @@ def _process_quantization(
q_min, q_max = calculate_range(args, x.device)
group_size = args.group_size

# blockwise FP8: quantize per 2D block, supports block_structure for static block quant
# blockwise FP8: quantize per 2D block, supports block_structure for static block
# quantization
if args.strategy == QuantizationStrategy.BLOCK:
original_shape = x.shape
rows, cols = x.shape[-2], x.shape[-1]
@@ -209,8 +210,8 @@
# Ensure exact division (tensor dimensions must be divisible by block size)
if rows % block_height != 0:
raise ValueError(
f"Tensor height {rows} is not divisible by block_height {block_height}. "
f"Block quantization requires exact division."
f"Tensor height {rows} is not divisible by block_height {block_height}."
f" Block quantization requires exact division."
)
if cols % block_width != 0:
raise ValueError(
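
Illustrative sketch (assumes a 2-D weight; the helper is not the code path above) of how a tensor splits into the 2-D blocks that this strategy quantizes:

import torch

def split_into_blocks(x: torch.Tensor, block_height: int, block_width: int) -> torch.Tensor:
    rows, cols = x.shape
    # exact division is required, mirroring the checks above
    assert rows % block_height == 0 and cols % block_width == 0
    blocks = x.reshape(rows // block_height, block_height, cols // block_width, block_width)
    return blocks.permute(0, 2, 1, 3)  # (row_blocks, col_blocks, block_height, block_width)

print(split_into_blocks(torch.randn(256, 384), 128, 128).shape)  # torch.Size([2, 3, 128, 128])
# one scale / zero point would then be computed per (i, j) block
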
13 changes: 7 additions & 6 deletions src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -17,7 +17,7 @@
import math
import warnings
from enum import Enum
from typing import List, Optional
from typing import Optional

import torch
from compressed_tensors.quantization.lifecycle.forward import (
@@ -87,7 +87,6 @@ def initialize_module_for_quantization(
_initialize_attn_scales(module)

else:

if scheme.input_activations is not None:
_initialize_scale_zero_point(
module,
@@ -183,7 +182,8 @@ def _initialize_scale_zero_point(
num_groups = math.ceil(weight_shape[1] / quantization_args.group_size)
expected_shape = (weight_shape[0], max(num_groups, 1))
elif quantization_args.strategy == QuantizationStrategy.BLOCK:
# For block quantization, scale shape should match number of blocks - only for weights
# For block quantization, scale shape should match number of blocks - only
# for weights
if quantization_args.block_structure is None:
raise ValueError(
"Block quantization requires block_structure to be specified"
@@ -196,9 +196,10 @@
# Warn if dimensions don't divide evenly
if rows % block_height != 0 or cols % block_width != 0:
warnings.warn(
f"Block quantization: tensor shape {weight_shape} does not divide evenly "
f"by block structure {quantization_args.block_structure}. "
f"Some blocks will be incomplete which may affect quantization quality.",
f"Block quantization: tensor shape {weight_shape} does not divide"
f"evenly by block structure {quantization_args.block_structure}. "
f"Some blocks will be incomplete which may affect quantization"
"quality.",
UserWarning,
)

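
Illustrative sketch of the scale-shape rule described above; the helper is hypothetical, and the use of ceil (so ragged tails still get a block, consistent with the warning rather than an error) is an assumption:

import math
from typing import List, Tuple

def block_scale_shape(weight_shape: Tuple[int, int], block_structure: List[int]) -> Tuple[int, int]:
    # one scale (and zero point) entry per block of the weight matrix
    rows, cols = weight_shape
    block_height, block_width = block_structure
    return math.ceil(rows / block_height), math.ceil(cols / block_width)

print(block_scale_shape((300, 512), [128, 128]))  # (3, 4); the 300-row dim leaves an incomplete block
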
12 changes: 7 additions & 5 deletions src/compressed_tensors/quantization/quant_args.py
@@ -217,16 +217,18 @@ def validate_block_structure(cls, value) -> Optional[List[int]]:
return [int(x) for x in value.split("x")]
except Exception:
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints "
"[rows, cols]."
)
if isinstance(value, (list, tuple)):
if len(value) != 2 or not all(isinstance(v, int) for v in value):
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints "
"[rows, cols]."
)
return list(value)
raise ValueError(
f"Invalid block_structure '{value}'. Must be a list of two ints [rows, cols]."
f"Invalid block_structure '{value}'. Must be a list of ints [rows, cols]."
)

@field_validator("strategy", mode="before")
@@ -307,7 +309,7 @@ def validate_model_after(model: "QuantizationArgs") -> "QuantizationArgs":
)
if strategy not in supported_strategies:
raise ValueError(
f"One of {supported_strategies} must be used for dynamic quantization"
f"One of {supported_strategies} must be used for dynamic quant."
)

if (
@@ -322,7 +324,7 @@
observer != "memoryless"
): # avoid annoying users with old configs
warnings.warn(
"No observer is used for dynamic quantization, setting to None"
"No observer is used for dynamic quant., setting to None"
)
observer = None
else: