quic
diff --git a/‎QEfficient/base/modeling_qeff.py
Lines changed: 29 additions & 26 deletions b/‎QEfficient/base/modeling_qeff.py
Lines changed: 29 additions & 26 deletions
diff --git a/‎QEfficient/cloud/finetune.py
Lines changed: 2 additions & 3 deletions b/‎QEfficient/cloud/finetune.py
Lines changed: 2 additions & 3 deletions
diff --git a/‎QEfficient/compile/qnn_compiler.py
Lines changed: 1 addition & 1 deletion b/‎QEfficient/compile/qnn_compiler.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎QEfficient/finetune/configs/dataset_config.py
Lines changed: 0 additions & 2 deletions b/‎QEfficient/finetune/configs/dataset_config.py
Lines changed: 0 additions & 2 deletions
diff --git a/‎QEfficient/finetune/configs/sample_peft_config.json
Lines changed: 17 additions & 0 deletions b/‎QEfficient/finetune/configs/sample_peft_config.json
Lines changed: 17 additions & 0 deletions
diff --git a/‎QEfficient/finetune/dataset/custom_dataset.py
Lines changed: 45 additions & 14 deletions b/‎QEfficient/finetune/dataset/custom_dataset.py
Lines changed: 45 additions & 14 deletions
diff --git a/‎QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
Lines changed: 7 additions & 0 deletions b/‎QEfficient/finetune/dataset/custom_dataset/sample_dataset_config.json
Lines changed: 7 additions & 0 deletions
diff --git a/‎QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
Lines changed: 87 additions & 0 deletions b/‎QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py
Lines changed: 87 additions & 0 deletions
diff --git a/‎QEfficient/finetune/dataset/dataset_config.py
Lines changed: 1 addition & 2 deletions b/‎QEfficient/finetune/dataset/dataset_config.py
Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,6 @@
 #
 # ----------------------------------------------------------------------------
 
-import hashlib
 import inspect
 import logging
 import shutil
@@ -22,8 +21,16 @@
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import constants, create_json, dump_qconfig, generate_mdp_partition_config, load_json
-from QEfficient.utils.cache import QEFF_HOME, to_hashable
+from QEfficient.utils import (
+    constants,
+    create_json,
+    create_model_params,
+    dump_qconfig,
+    export_wrapper,
+    generate_mdp_partition_config,
+    hash_dict_params,
+    load_json,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -45,12 +52,16 @@ class QEFFBaseModel(ABC):
     def _transform_names(cls) -> List[str]:
         return [x.__name__ for x in cls._pytorch_transforms + cls._onnx_transforms]
 
-    def __init__(self, model: torch.nn.Module) -> None:
+    def __init__(self, model: torch.nn.Module, **kwargs) -> None:
         super().__init__()
         self.model = model
+        self.hash_params = create_model_params(self, **kwargs)
         self.onnx_path: Optional[str] = None
         self.qpc_path: Optional[str] = None
         self.qpc_session: Optional[QAICInferenceSession] = None
+        self.model_architecture = (
+            (arch := getattr(self.model.config, "architectures", None)) and len(arch) > 0 and arch[0]
+        ) or None
 
         # Apply the transformations
         any_transformed = False
@@ -67,10 +78,6 @@ def __init__(self, model: torch.nn.Module) -> None:
     @abstractmethod
     def model_name(self) -> str: ...
 
-    @property
-    @abstractmethod
-    def model_hash(self) -> str: ...
-
     @abstractmethod
     def export(self, export_dir: Optional[str] = None) -> Path:
         """
@@ -114,6 +121,7 @@ def compile(self, *args, **kwargs) -> Path:
             :str: Path of the compiled ``qpc`` package.
         """
 
+    @export_wrapper
     def _export(
         self,
         example_inputs: Dict[str, torch.Tensor],
@@ -134,8 +142,6 @@ def _export(
             :onnx_transform_kwargs (dict): Additional arguments to be passed to `Transform.apply` for this class.
             :export_dir (str): Specify the export directory. The export_dir will be suffixed with a hash corresponding to current model.
         """
-        export_dir = Path(export_dir or (QEFF_HOME / self.model_name))
-        export_dir = export_dir.with_name(export_dir.name + "-" + self.model_hash)
         onnx_path = export_dir / f"{self.model_name}.onnx"
         if onnx_path.is_file():
             self.onnx_path = onnx_path
@@ -304,23 +310,16 @@ def _compile(
         else:
             mdp_ts_json = None
 
-        compile_hash = hashlib.sha256(to_hashable(command))
-
-        if specializations is not None:
-            compile_hash.update(to_hashable(specializations))
-
-        if custom_io is not None:
-            compile_hash.update(to_hashable(custom_io))
-
-        if num_speculative_tokens:
-            compile_hash.update(to_hashable({"num_speculative_tokens": num_speculative_tokens}))
-
-        # Hash the MDP partition config and the number of devices.
-        compile_hash.update(to_hashable(mdp_ts_json))
-        compile_hash.update(to_hashable({"mdp_ts_num_devices": mdp_ts_num_devices}))
+        compile_hash_params = {
+            "command": command,
+            "specializations": specializations,
+            "custom_io": custom_io,
+            "mdp_ts_num_devices": mdp_ts_num_devices,
+            "mdp_ts_json": mdp_ts_json,
+            "num_speculative_tokens": num_speculative_tokens,
+        }
+        compile_hash = hash_dict_params(compile_hash_params)
 
-        # Check if already compiled
-        compile_hash = compile_hash.hexdigest()[:16]
         compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
         qpc_path = compile_dir / "qpc"
         qpc_path.mkdir(parents=True, exist_ok=True)
@@ -371,6 +370,10 @@ def _compile(
                     ]
                 )
             )
+        # Dump JSON file with hashed parameters
+        hashed_compile_params_path = compile_dir / "hashed_compile_params.json"
+        create_json(hashed_compile_params_path, compile_hash_params)
+        logger.info("Hashed parameters exported successfully.")
 
         self.qpc_path = qpc_path
 
 
@@ -288,11 +288,10 @@ def main(**kwargs) -> None:
                 --model_name "meta-llama/Llama-3.2-1B" \\
                 --lr 5e-4
     """
-    # TODO:Remove TrainConfig() and update_config() as all params are passed in kwargs by parser
     train_config = TrainConfig()
     update_config(train_config, **kwargs)
-    dataset_config = generate_dataset_config(train_config.dataset)
-    update_config(dataset_config, **kwargs)
+    custom_dataset_config_file = kwargs.pop("custom_dataset_config", None)
+    dataset_config = generate_dataset_config(train_config.dataset, custom_dataset_config_file)
 
     logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
 
 
@@ -12,12 +12,12 @@
 from typing import Dict, List, Optional
 
 from QEfficient.utils._utils import create_json, execute_command, load_json
-from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.constants import QnnConstants
 from QEfficient.utils.generate_qnn_network_specialization_config import (
     generate_data_format_config,
     generate_qnn_specialization,
 )
+from QEfficient.utils.hash_utils import to_hashable
 from QEfficient.utils.logging_utils import logger
 
 
 
@@ -41,7 +41,5 @@ class imdb_dataset:
 @dataclass
 class custom_dataset:
+    # Default settings for the "custom_dataset" entry: the dataset name and the
+    # names of the train/test splits. The `file` and `data_path` fields are
+    # removed by this change.
     dataset: str = "custom_dataset"
-    file: str = "dataset/custom_dataset.py"
     train_split: str = "train"
     test_split: str = "validation"
-    data_path: str = ""
@@ -0,0 +1,17 @@
+{
+    "r": 32,
+    "lora_alpha": 64,
+    "target_modules": [
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "o_proj",
+        "up_proj",
+        "down_proj",
+        "gate_proj"
+    ],
+    "bias": "none",
+    "task_type": "CAUSAL_LM",
+    "lora_dropout": 0.05,
+    "inference_mode": false
+}
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import importlib
+import logging
 from pathlib import Path
 
 from QEfficient.finetune.utils.logging_utils import logger
@@ -26,51 +27,81 @@ def load_module_from_py_file(py_file: str) -> object:
 
 
 def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
-    else:
-        module_path, func_name = dataset_config.file, "get_custom_dataset"
+    """Load a user-supplied preprocessing function and run it.
+
+    ``dataset_config.preproc_file`` must have the form
+    ``python_file_path:function_name``. The file is loaded as a module and the
+    named function is called with
+    ``(dataset_config, tokenizer, split, context_length)``.
+
+    Args:
+        dataset_config: Config object exposing a ``preproc_file`` attribute.
+        tokenizer: Passed through to the preprocessing function.
+        split: Passed through to the preprocessing function.
+        context_length: Passed through to the preprocessing function.
+
+    Returns:
+        Whatever the preprocessing function returns.
+
+    Errors are reported through ``logger.raise_error`` with these types:
+    ``RuntimeError`` (missing or malformed ``preproc_file``), ``ValueError``
+    (path is not a ``.py`` file), ``FileNotFoundError`` (file does not exist)
+    and ``AttributeError`` (function not defined in the file).
+    """
+    if not hasattr(dataset_config, "preproc_file"):
+        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)
+
+    if ":" not in dataset_config.preproc_file:
+        logger.raise_error(
+            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
+            RuntimeError,
+        )
+
+    # Split on the LAST colon only, so a path that itself contains ':'
+    # (for example a Windows drive letter) still parses.
+    module_path, func_name = dataset_config.preproc_file.rsplit(":", 1)
+    logger.log_rank_zero(
+        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
+        logging.DEBUG,
+    )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
-    try:
-        return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
-    except AttributeError:
+    # Look the function up separately from calling it, so an AttributeError
+    # raised inside the user's function is not misreported as a missing function.
+    preproc_fn = getattr(module, func_name, None)
+    if preproc_fn is None:
         logger.raise_error(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            f"For custom dataset preprocessing, the method ({func_name}) is not "
+            f"present in the file ({module_path.as_posix()}).",
             AttributeError,
         )
+    return preproc_fn(dataset_config, tokenizer, split, context_length)
 
 
 def get_data_collator(dataset_processer, dataset_config):
-    if ":" in dataset_config.file:
-        module_path, func_name = dataset_config.file.split(":")
+    """Load a user-supplied data-collator factory and call it.
+
+    ``dataset_config.collate_file`` is expected in the form
+    ``python_file_path:function_name``. The named function is called with
+    ``dataset_processer`` and its result is returned.
+
+    Returns ``None`` (after logging a warning saying the default data collator
+    is used instead) when ``collate_file`` is absent, has no ``:function_name``
+    part, or the lookup/call raises ``AttributeError``.
+
+    A path that is not a ``.py`` file or does not exist is reported through
+    ``logger.raise_error`` with ``ValueError`` / ``FileNotFoundError``.
+    """
+    if not hasattr(dataset_config, "collate_file"):
+        logger.log_rank_zero(
+            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
+            logging.WARNING,
+        )
+        return None
+
+    if ":" not in dataset_config.collate_file:
+        logger.log_rank_zero(
+            "Can not find function name in 'collate_file' key in dataset_config "
+            "file. Using the default data collator function instead. If this is "
+            "not intended then change the format of the 'collate_file' key in "
+            "dataset_config file to follow the format: python_file_path:function_name",
+            logging.WARNING,
+        )
+        return None
     else:
-        module_path, func_name = dataset_config.file, "get_data_collator"
+        # Split on the LAST colon only, so a path that itself contains ':'
+        # (for example a Windows drive letter) still parses.
+        module_path, func_name = dataset_config.collate_file.rsplit(":", 1)
+        logger.log_rank_zero(
+            f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
+            logging.DEBUG,
+        )
 
     if not module_path.endswith(".py"):
-        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
+        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
         logger.raise_error(
-            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
         )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
         logger.log_rank_zero(
-            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+            f"Can not find the function {func_name} in file "
+            f"({module_path.as_posix()}). Using the default data collator "
+            "function instead.",
+            # Same level as the other fallback messages in this function.
+            logging.WARNING,
         )
-        logger.log_rank_zero("Using the default data_collator instead.")
         return None
@@ -0,0 +1,7 @@
+{
+    "train_split": "train",
+    "test_split": "test",
+    "test_split_ratio": 0.15,
+    "preproc_file": "./QEfficient/finetune/dataset/custom_dataset/sample_dataset_preproc.py:get_preprocessed_disc",
+    "disc_style": "sarcasm_more"
+}
@@ -0,0 +1,87 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+
+import datasets
+from transformers.data import DataCollatorForSeq2Seq
+
+
+def get_data_collator(tokenizer):
+    """Return a ``DataCollatorForSeq2Seq`` built around ``tokenizer``."""
+    collator = DataCollatorForSeq2Seq(tokenizer)
+    return collator
+
+
+def get_preprocessed_disc(dataset_config, tokenizer, split, context_length=None):
+    """Load the DiSC dataset, keep one style, and tokenize it into prompt/target pairs.
+
+    Args:
+        dataset_config: Config object; its ``test_split_ratio`` and
+            ``disc_style`` attributes are read.
+        tokenizer: Tokenizer used to encode prompts and targets. A ``[PAD]``
+            token is registered on it when it has no pad token.
+        split: ``"test"`` selects the held-out tail of the shuffled samples;
+            any other value selects the head used for training.
+        context_length: When given, prompt and target are EACH padded and
+            truncated to this many tokens; when ``None`` they keep their
+            natural length.
+
+    Returns:
+        The mapped dataset with ``input_ids``, ``attention_mask`` and
+        ``labels`` columns.
+
+    Raises:
+        RuntimeError: If ``disc_style`` is not a category present in the dataset.
+        ValueError: If ``test_split_ratio`` is outside ``[0, 1]``.
+    """
+    dataset = datasets.load_dataset("hallisky/DiSC")
+
+    # Considering 'train' split as this dataset has only one split.
+    dataset = dataset["train"]
+
+    test_split_ratio = dataset_config.test_split_ratio
+    disc_style = dataset_config.disc_style
+
+    # A ratio outside [0, 1] would give a negative train size and invalid index ranges.
+    if not 0 <= test_split_ratio <= 1:
+        raise ValueError(f"test_split_ratio must be between 0 and 1, got {test_split_ratio}.")
+
+    # Only collect the samples for a given style.
+    available_styles = set(dataset["category"])
+    if disc_style not in available_styles:
+        raise RuntimeError(f"For DiSC dataset the provided disc_style '{disc_style}' is not supported.")
+
+    dataset = dataset.filter(lambda example: example["category"] == disc_style)
+
+    # Shuffle with a fixed seed so the train/test partition is reproducible.
+    dataset = dataset.shuffle(seed=42)
+
+    # Split the data in train and test split.
+    total_samples = len(dataset)
+    test_size = int(total_samples * test_split_ratio)
+    train_size = total_samples - test_size
+
+    if split == "test":
+        indices = range(train_size, total_samples)
+    else:
+        indices = range(0, train_size)
+
+    dataset = dataset.select(indices)
+
+    if tokenizer.pad_token is None:
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+    # Below is the template of the DiSC dataset.
+    # <bos>### Original:{original} \n ### Rewrite: {rewrite} <eos>
+    template = "### Original:{original} \n ### Rewrite: "
+
+    def apply_prompt_template(sample):
+        return {
+            "input": template.format(original=sample["original"]),
+            "label": sample["generation"],
+        }
+
+    dataset = dataset.map(apply_prompt_template, remove_columns=list(dataset.features))
+
+    # Explicit replacement for the deprecated ``pad_to_max_length=True``: pad and
+    # truncate to ``context_length`` when one is given, otherwise leave the
+    # sequence at its natural length.
+    encode_kwargs = {"add_special_tokens": False}
+    if context_length is not None:
+        encode_kwargs.update(max_length=context_length, padding="max_length", truncation=True)
+
+    def tokenize_add_label(sample):
+        prompt_ids = tokenizer.encode(tokenizer.bos_token + sample["input"], **encode_kwargs)
+        label_ids = tokenizer.encode(
+            sample["label"] + tokenizer.pad_token + tokenizer.eos_token,
+            **encode_kwargs,
+        )
+
+        # NOTE(review): with context_length set, any pad tokens added above are
+        # still marked as attended (mask is all ones) and the padded target ids
+        # are kept in ``labels`` - confirm this is intended.
+        return {
+            "input_ids": prompt_ids + label_ids,
+            "attention_mask": [1] * (len(prompt_ids) + len(label_ids)),
+            # -100 masks the prompt positions out of the loss.
+            "labels": [-100] * len(prompt_ids) + label_ids,
+        }
+
+    dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset.features))
+
+    return dataset
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-from functools import partial
 
 from QEfficient.finetune.dataset.alpaca_dataset import (
     InstructionDataset as get_alpaca_dataset,
@@ -23,7 +22,7 @@
 )
 
 DATASET_PREPROC = {
-    "alpaca_dataset": partial(get_alpaca_dataset),
+    "alpaca_dataset": get_alpaca_dataset,
     "grammar_dataset": get_grammar_dataset,
     "gsm8k_dataset": get_gsm8k_dataset,
     "custom_dataset": get_custom_dataset,
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,6 @@`
`5`	`5`	`#`
`6`	`6`	`# -----------------------------------------------------------------------------`
`7`	`7`
`8`		`-from functools import partial`
`9`	`8`
`10`	`9`	`from QEfficient.finetune.dataset.alpaca_dataset import (`
`11`	`10`	`InstructionDataset as get_alpaca_dataset,`
`@@ -23,7 +22,7 @@`
`23`	`22`	`)`
`24`	`23`
`25`	`24`	`DATASET_PREPROC = {`
`26`		`- "alpaca_dataset": partial(get_alpaca_dataset),`
	`25`	`+ "alpaca_dataset": get_alpaca_dataset,`
`27`	`26`	`"grammar_dataset": get_grammar_dataset,`
`28`	`27`	`"gsm8k_dataset": get_gsm8k_dataset,`
`29`	`28`	`"custom_dataset": get_custom_dataset,`