Commit c0f9514

Merge branch 'main' into dualQpcParamFix
2 parents fc5a9e1 + d020b88 commit c0f9514

48 files changed: +1978 additions, -1027 deletions

QEfficient/base/modeling_qeff.py

Lines changed: 29 additions & 26 deletions

@@ -5,7 +5,6 @@
 #
 # ----------------------------------------------------------------------------

-import hashlib
 import inspect
 import logging
 import shutil
@@ -22,8 +21,16 @@
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import constants, create_json, dump_qconfig, generate_mdp_partition_config, load_json
-from QEfficient.utils.cache import QEFF_HOME, to_hashable
+from QEfficient.utils import (
+    constants,
+    create_json,
+    create_model_params,
+    dump_qconfig,
+    export_wrapper,
+    generate_mdp_partition_config,
+    hash_dict_params,
+    load_json,
+)

 logger = logging.getLogger(__name__)

@@ -45,12 +52,16 @@ class QEFFBaseModel(ABC):
     def _transform_names(cls) -> List[str]:
         return [x.__name__ for x in cls._pytorch_transforms + cls._onnx_transforms]

-    def __init__(self, model: torch.nn.Module) -> None:
+    def __init__(self, model: torch.nn.Module, **kwargs) -> None:
         super().__init__()
         self.model = model
+        self.hash_params = create_model_params(self, **kwargs)
         self.onnx_path: Optional[str] = None
         self.qpc_path: Optional[str] = None
         self.qpc_session: Optional[QAICInferenceSession] = None
+        self.model_architecture = (
+            (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0]
+        ) or None

         # Apply the transformations
         any_transformed = False
@@ -67,10 +78,6 @@ def __init__(self, model: torch.nn.Module) -> None:
     @abstractmethod
     def model_name(self) -> str: ...

-    @property
-    @abstractmethod
-    def model_hash(self) -> str: ...
-
     @abstractmethod
     def export(self, export_dir: Optional[str] = None) -> Path:
         """
@@ -114,6 +121,7 @@ def compile(self, *args, **kwargs) -> Path:
         :str: Path of the compiled ``qpc`` package.
         """

+    @export_wrapper
     def _export(
         self,
         example_inputs: Dict[str, torch.Tensor],
@@ -134,8 +142,6 @@ def _export(
            :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class.
            :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model.
        """
-        export_dir = Path(export_dir or (QEFF_HOME / self.model_name))
-        export_dir = export_dir.with_name(export_dir.name + "-" + self.model_hash)
        onnx_path = export_dir / f"{self.model_name}.onnx"
        if onnx_path.is_file():
            self.onnx_path = onnx_path
@@ -304,23 +310,16 @@ def _compile(
        else:
            mdp_ts_json = None

-        compile_hash = hashlib.sha256(to_hashable(command))
-
-        if specializations is not None:
-            compile_hash.update(to_hashable(specializations))
-
-        if custom_io is not None:
-            compile_hash.update(to_hashable(custom_io))
-
-        if num_speculative_tokens:
-            compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
-
-        # Hash the MDP partition config and the number of devices.
-        compile_hash.update(to_hashable(mdp_ts_json))
-        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))
+        compile_hash_params = {
+            "command": command,
+            "specializations": specializations,
+            "custom_io": custom_io,
+            "mdp_ts_num_devices": mdp_ts_num_devices,
+            "mdp_ts_json": mdp_ts_json,
+            "num_speculative_tokens": num_speculative_tokens,
+        }
+        compile_hash = hash_dict_params(compile_hash_params)

-        # Check if already compiled
-        compile_hash = compile_hash.hexdigest()[:16]
        compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
        qpc_path = compile_dir / "qpc"
        qpc_path.mkdir(parents=True, exist_ok=True)
@@ -371,6 +370,10 @@ def _compile(
                    ]
                )
            )
+        # Dump JSON file with hashed parameters
+        hashed_compile_params_path = compile_dir / "hashed_compile_params.json"
+        create_json(hashed_compile_params_path, compile_hash_params)
+        logger.info("Hashed parameters exported successfully.")

        self.qpc_path = qpc_path
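
The refactor above funnels every hash-relevant compile input into a single compile_hash_params dict, hashes it with hash_dict_params, and writes the same dict to hashed_compile_params.json next to the compiled QPC, so a cached artifact can be traced back to the exact inputs that produced it. A minimal sketch of the dict-hashing idea follows; the real helper lives in QEfficient.utils and its implementation is not part of this diff, and the 16-character prefix simply mirrors the old hexdigest()[:16] behaviour.

    # Sketch only: deterministic hash of a parameter dict (assumed behaviour, not the shipped helper).
    import hashlib
    import json

    def hash_dict_params_sketch(params: dict, hash_len: int = 16) -> str:
        # Sort keys and stringify unhandled types so equal dicts always serialize identically.
        serialized = json.dumps(params, sort_keys=True, default=str).encode("utf-8")
        return hashlib.sha256(serialized).hexdigest()[:hash_len]

    # compile_dir would then be suffixed with this value, e.g. "<qpc_name>-<16-char-hash>".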

QEfficient/cloud/finetune.py

Lines changed: 2 additions & 3 deletions

@@ -288,11 +288,10 @@ def main(**kwargs) -> None:
         --model_name "meta-llama/Llama-3.2-1B" \\
         --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)

     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)

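
With this change the finetune entrypoint pops an optional custom_dataset_config path from its kwargs and forwards it to generate_dataset_config, which presumably overlays that JSON on top of the selected dataset's dataclass defaults. A hedged sketch of driving this from Python; only model_name, dataset="custom_dataset", and custom_dataset_config are visible in this diff, and the config path shown is a placeholder.

    # Hypothetical invocation; the JSON path is illustrative, not a file from this commit.
    from QEfficient.cloud.finetune import main

    main(
        model_name="meta-llama/Llama-3.2-1B",
        dataset="custom_dataset",
        custom_dataset_config="path/to/my_dataset_config.json",
    )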

QEfficient/compile/qnn_compiler.py

Lines changed: 1 addition & 1 deletion

@@ -12,12 +12,12 @@
 from typing import Dict, List, Optional

 from QEfficient.utils._utils import create_json, execute_command, load_json
-from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.constants import QnnConstants
 from QEfficient.utils.generate_qnn_network_specialization_config import (
     generate_data_format_config,
     generate_qnn_specialization,
 )
+from QEfficient.utils.hash_utils import to_hashable
 from QEfficient.utils.logging_utils import logger


QEfficient/finetune/configs/dataset_config.py

Lines changed: 0 additions & 2 deletions

@@ -41,7 +41,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+{
+  "r": 32,
+  "lora_alpha": 64,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj",
+    "gate_proj"
+  ],
+  "bias": "none",
+  "task_type": "CAUSAL_LM",
+  "lora_dropout": 0.05,
+  "inference_mode": false
+}
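
This new JSON mirrors the constructor arguments of peft's LoraConfig (r, lora_alpha, target_modules, bias, task_type, lora_dropout, inference_mode), so the finetuning stack can plausibly hydrate it directly. A small sketch of that idea; the loader function and path handling here are assumptions, not code from this commit.

    # Hypothetical loader for a LoRA config JSON shaped like the file above.
    import json

    from peft import LoraConfig

    def load_lora_config(path: str) -> LoraConfig:
        with open(path) as f:
            params = json.load(f)  # keys map one-to-one onto LoraConfig's arguments
        return LoraConfig(**params)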

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 45 additions & 14 deletions

@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------

 import importlib
+import logging
 from pathlib import Path

 from QEfficient.finetune.utils.logging_utils import logger
@@ -26,51 +27,81 @@ def load_module_from_py_file(py_file: str) -> object:


 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
-    else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+    if not hasattr(dataset_config, "preproc_file"):
+        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
+
+    if ":" not in dataset_config.preproc_file:
+        logger.raise_error(
+            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
+            RuntimeError,
+        )
+
+    module_path, func_name = dataset_config.preproc_file.split(":")
+    logger.log_rank_zero(
+        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
+        logging.DEBUG,
+    )

     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)

     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )

     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
     except AttributeError:
         logger.raise_error(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()}).",
             AttributeError,
         )


 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    if not hasattr(dataset_config, "collate_file"):
+        logger.log_rank_zero(
+            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
+            logging.WARNING,
+        )
+        return None
+
+    if ":" not in dataset_config.collate_file:
+        logger.log_rank_zero(
+            "Can not find function name in 'collate_file' key in dataset_config "
+            "file. Using the default data collator function instead. If this is "
+            "not intended then change the format of the 'collate_file' key in "
+            "dataset_config file to follow the format: python_file_path:function_name",
+            logging.WARNING,
+        )
+        return None
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        module_path, func_name = dataset_config.collate_file.split(":")
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
+            logging.DEBUG,
+        )

     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)

     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )

     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
         logger.log_rank_zero(
-            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead."
         )
-        logger.log_rank_zero("Using the default data_collator instead.")
         return None
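
The new contract is that dataset_config carries a preproc_file (and optionally a collate_file) in the form python_file_path:function_name, and the named function is invoked as func(dataset_config, tokenizer, split, context_length), exactly as getattr(module, func_name)(...) does above. A minimal sketch of a conforming preprocessing module; the file name, function name, and dataset used here are placeholders, not part of this commit.

    # my_preproc.py -- referenced from a dataset_config as "path/to/my_preproc.py:get_my_dataset"
    import datasets

    def get_my_dataset(dataset_config, tokenizer, split, context_length=None):
        # Loaded and called by get_custom_dataset() via the preproc_file entry.
        ds = datasets.load_dataset("imdb", split="test" if split == "test" else "train")

        def tokenize(sample):
            return tokenizer(sample["text"], truncation=True, max_length=context_length)

        return ds.map(tokenize, remove_columns=list(ds.features))
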
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+{
+  "train_split": "train",
+  "test_split": "test",
+  "test_split_ratio": 0.15,
+  "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/disc_preproc.py:get_preprocessed_disc",
+  "disc_style": "sarcasm_more"
+}

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import datasets
+from transformers.data import DataCollatorForSeq2Seq
+
+
+def get_data_collator(tokenizer):
+    return DataCollatorForSeq2Seq(tokenizer)
+
+
+def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
+    dataset = datasets.load_dataset("hallisky/DiSC")
+
+    # Considering 'train' split as this dataset has only one split.
+    dataset = dataset["train"]
+
+    test_split_ratio = dataset_config.test_split_ratio
+    disc_style = dataset_config.disc_style
+
+    # Only collect the samples for a given style.
+    available_styles = set(dataset["category"])
+    if disc_style not in available_styles:
+        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")
+
+    dataset = dataset.filter(lambda example: example["category"] == disc_style)
+
+    # Shuffle the dataset before splitting
+    dataset = dataset.shuffle(seed=42)
+
+    # Split the data in train and test split.
+    total_samples = len(dataset)
+    test_size = int(total_samples * test_split_ratio)
+    train_size = total_samples - test_size
+
+    if split == "test":
+        indices = range(train_size, total_samples)
+    else:
+        indices = range(0, train_size)
+
+    dataset = dataset.select(indices)
+
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # Below is the template of the DiSC dataset.
+    # <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
+    template = "### Original:{original} \n ### Rewrite: "
+
+    def apply_prompt_template(sample):
+        return {
+            "input": template.format(original=sample["original"]),
+            "label": sample["generation"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    def tokenize_add_label(sample):
+        input = tokenizer.encode(
+            tokenizer.bos_token + sample["input"],
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+        label = tokenizer.encode(
+            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
+            add_special_tokens=False,
+            max_length=context_length,
+            pad_to_max_length=True,
+        )
+
+        sample = {
+            "input_ids": (input + label),
+            "attention_mask": [1] * (len(input) + len(label)),
+            "labels": [-100] * len(input) + label,
+        }
+
+        return sample
+
+    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))
+
+    return dataset
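
In tokenize_add_label the labels mask out every prompt token with -100 so the loss is computed only on the rewrite; a toy illustration of that masking (token ids are made up):

    # Toy example of the masking pattern used above; ids are illustrative only.
    input = [1, 10, 11, 12]  # <bos> + "### Original: ... ### Rewrite: "
    label = [20, 21, 2]      # rewrite tokens + pad/eos

    sample = {
        "input_ids": input + label,                         # full sequence fed to the model
        "attention_mask": [1] * (len(input) + len(label)),  # attend to every position
        "labels": [-100] * len(input) + label,              # -100 = ignored by the loss
    }
    assert len(sample["input_ids"]) == len(sample["labels"]) == len(sample["attention_mask"])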

QEfficient/finetune/dataset/dataset_config.py

Lines changed: 1 addition & 2 deletions

@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------

-from functools import partial

 from QEfficient.finetune.dataset.alpaca_dataset import (
     InstructionDataset as get_alpaca_dataset,
@@ -23,7 +22,7 @@
 )

 DATASET_PREPROC = {
-    "alpaca_dataset": partial(get_alpaca_dataset),
+    "alpaca_dataset": get_alpaca_dataset,
     "grammar_dataset": get_grammar_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,
