From 104f1fbd2955d4ce7ec541aa24ba7e3e156e67d2 Mon Sep 17 00:00:00 2001 From: "Ahmed Hussein (amahussein)" Date: Mon, 4 Aug 2025 14:05:54 -0500 Subject: [PATCH 1/4] Use tools-API in qualx predict Signed-off-by: Ahmed Hussein (amahussein) Contributes to #1836 - uses tools-API in the predict code - introduces a new CSV Combiner class that automatically combine the CSV reports generated by multiple-apps and combine them into a single DF. --- .../rapids/qualification.py | 6 +- .../rapids/qualx/prediction.py | 22 ++-- .../spark_rapids_tools/api_v1/app_handler.py | 25 +++- .../src/spark_rapids_tools/api_v1/builder.py | 117 ++++++++++++++++++ .../api_v1/core/qual_handler.py | 5 + .../api_v1/result_handler.py | 22 ++++ .../tools/qualx/preprocess.py | 12 +- .../tools/qualx/qualx_main.py | 81 ++++++------ .../spark_rapids_tools/tools/qualx/util.py | 12 +- 9 files changed, 234 insertions(+), 68 deletions(-) diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py index 78c7adb26..bb3e7a9f0 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py @@ -29,6 +29,7 @@ from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.common.utilities import Utils, TemplateGenerator from spark_rapids_pytools.rapids.qualification_core import QualificationCore +from spark_rapids_tools.api_v1.builder import APIResultHandler from spark_rapids_tools.enums import QualFilterApp, QualEstimationModel, SubmissionMode from spark_rapids_tools.tools.additional_heuristics import AdditionalHeuristics from spark_rapids_tools.tools.cluster_config_recommender import ClusterConfigRecommender @@ -537,12 +538,13 @@ def __update_apps_with_prediction_info(self, model_name = self.ctxt.platform.get_prediction_model_name() qual_output_dir = self.ctxt.get_csp_output_path() output_info = self.__build_prediction_output_files_info() - qual_handler = self.ctxt.get_ctxt('qualHandler') try: + # Build the QualCore handler object to handle the prediction model output + q_core_handler = APIResultHandler().qual_core().with_path(qual_output_dir).build() predictions_df = predict(platform=model_name, qual=qual_output_dir, output_info=output_info, model=estimation_model_args['customModelFile'], - qual_handlers=[qual_handler]) + qual_handlers=[q_core_handler]) except Exception as e: # pylint: disable=broad-except predictions_df = pd.DataFrame() self.logger.error( diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualx/prediction.py b/user_tools/src/spark_rapids_pytools/rapids/qualx/prediction.py index 1b66412b9..4a83d2265 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualx/prediction.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualx/prediction.py @@ -18,7 +18,8 @@ from spark_rapids_pytools.common.sys_storage import FSUtil from spark_rapids_pytools.rapids.qualx.qualx_tool import QualXTool -from spark_rapids_tools.tools.core.qual_handler import QualCoreHandler +from spark_rapids_tools.api_v1 import QualCoreResultHandler +from spark_rapids_tools.api_v1.builder import APIResultHandler from spark_rapids_tools.tools.qualx.qualx_main import predict from spark_rapids_tools.tools.qualx.util import print_summary, print_speedup_summary @@ -26,25 +27,16 @@ @dataclass class Prediction(QualXTool): """ - Wrapper layer around Prediction Tool. - - Attributes - ---------- - qual_output: str - Path to a directory containing qualification tool output. 
-    qual_handler: QualCoreHandler
-        Handler for reading qualification core tool results.
+    A wrapper that runs the QualX prediction stage on existing qualification tool output.
 
+    :param qual_output: Path to the directory containing the qualification tool output.
     """
     qual_output: str = None
-    qual_handler: QualCoreHandler = None
 
     name = 'prediction'
 
-    def __post_init__(self):
-        """Initialize the QualCoreHandler from qual_output."""
-        super().__post_init__()
-        if self.qual_output is not None:
-            self.qual_handler = QualCoreHandler(result_path=self.qual_output)
+    @property
+    def qual_handler(self) -> QualCoreResultHandler:
+        return APIResultHandler().qual_core().with_path(self.qual_output).build()
 
     def __prepare_prediction_output_info(self) -> dict:
         """
diff --git a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py
index bc9200490..7e12412e5 100644
--- a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py
+++ b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py
@@ -16,7 +16,7 @@
 
 from dataclasses import dataclass, field
 from functools import cached_property
-from typing import Optional
+from typing import Optional, List
 
 import pandas as pd
 
@@ -57,17 +57,32 @@ def uuid(self) -> str:
         """
         return self._app_id
 
-    def patch_into_df(self, df: pd.DataFrame) -> pd.DataFrame:
+    def patch_into_df(self,
+                      df: pd.DataFrame,
+                      col_names: Optional[List[str]] = None) -> pd.DataFrame:
         """
         Given a dataframe, this method will stitch the app_id and app-name to the dataframe.
         This can be useful in automatically adding the app-id/app-name to the data-frame
         :param df: the dataframe that we want to modify.
+        :param col_names: optional list of column names under which the app_id/app_name values are
+                          inserted into the dataframe. The list is assumed to be in the same order
+                          in which the columns should be inserted.
        :return: the resulting dataframe from adding the columns.
         """
+        # TODO: We should consider adding a UUID as well, and use that for the joins instead.
+        # values to inject; currently only the app_id (the attempt_id can be added later)
+        col_values = [self.app_id]
+        if col_names is None:
+            # append attemptId to support multi-attempts
+            col_names = ['appId']
         if not df.empty:
-            # TODO: We should consider add UUID as well, and use that for the joins instead.
- df.insert(0, 'attemptId', self._attempt_id) - df.insert(0, 'appId', self._app_id) + for col_k, col_v in zip(reversed(col_names), reversed(col_values)): + if col_k not in df.columns: + df.insert(0, col_k, col_v) + else: + # if the column already exists, we should not overwrite it + # this is useful when we want to patch the app_id/app_name to an existing dataframe + df[col_k] = col_v return df @property diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 77d1e0d50..59768b7ba 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -92,6 +92,11 @@ def rep_reader(self) -> 'ToolReportReaderT': raise ValueError(f'No reader found for table: {self._tbl}') return reader + @property + def tbl(self) -> str: + """Get the table id.""" + return self._tbl + @property def is_per_app_tbl(self) -> bool: if self._tbl is None: @@ -206,6 +211,118 @@ def _load_single_app(self) -> LoadDFResult: ) +@dataclass +class CSVCombiner(object): + """A class that combines multiple CSV reports into a single report.""" + rep_builder: CSVReport + _failed_app_processor: Optional[Callable[[str, LoadDFResult], None]] = field(default=None, init=False) + _success_app_processor: Optional[Callable[[ToolResultHandlerT, str, pd.DataFrame, dict], pd.DataFrame]] = ( + field(default=None, init=False)) + _combine_args: Optional[dict] = field(default=None, init=False) + + @property + def result_handler(self) -> ToolResultHandlerT: + """Get the result handler associated with this combiner.""" + return self.rep_builder.handler + + @staticmethod + def default_success_app_processor(result_handler: ToolResultHandlerT, + app_id: str, + df: pd.DataFrame, + combine_args: dict) -> pd.DataFrame: + """Default processor for successful applications.""" + col_names = None + app_entry = result_handler.app_handlers.get(app_id) + if not app_entry: + raise ValueError(f'App entry not found for ID: {app_id}') + if combine_args: + # check if the col_names are provided to stitch the app_ids + col_names = combine_args.get('col_names', None) + if col_names: + # patch the app_uuid and if the columns are defined. + return app_entry.patch_into_df(df, col_names=col_names) + return df + + def _evaluate_args(self) -> None: + """Evaluate the arguments to ensure they are set correctly.""" + + if self._success_app_processor is None: + # set the default processor for successful applications + self._success_app_processor = self.default_success_app_processor + # TODO: we should fail if the the combiner is built for AppIds but columns are not defined. 
+ + ################################ + # Setters/Getters for processors + ################################ + + def process_failed(self, + processor: Callable[[str, LoadDFResult], None]) -> 'CSVCombiner': + """Set the processor for failed applications.""" + self._failed_app_processor = processor + return self + + def process_success(self, + cb_fn: Callable[[ToolResultHandlerT, str, pd.DataFrame, dict], pd.DataFrame]) -> 'CSVCombiner': + """Set the processor for successful applications.""" + self._success_app_processor = cb_fn + return self + + def combine_args(self, args: dict) -> 'CSVCombiner': + """Set the arguments for combining the reports.""" + self._combine_args = args + return self + + def on_apps(self) -> 'CSVCombiner': + """specify that the combiner should append IDs to the individual results before the concatenation.""" + self.process_success(self.default_success_app_processor) + return self + + ######################### + # Public Interfaces + ######################### + + def build(self) -> LoadDFResult: + """Build the combined CSV report.""" + # process teh arguments to ensure they are set correctly + self._evaluate_args() + + load_error = None + final_df = None + success = False + try: + per_app_res = self.rep_builder.load() + # this is a dictionary and we should loop on it one by one to combine it + combined_dfs = [] + for app_id, app_res in per_app_res.items(): + # we need to patch the app_id to the dataframe + if app_res.load_error or app_res.data.empty: + # process entry with failed results or skip them if no handlder is defined. + if self._failed_app_processor: + self._failed_app_processor(app_id, app_res) + else: + # default behavior is to skip the app + continue + else: + # process entry with successful results + app_df = self._success_app_processor(self.result_handler, + app_id, + app_res.data, + self._combine_args) + # Q: Should we ignore or skip the empty dataframes? 
+ combined_dfs.append(app_df) + final_df = pd.concat(combined_dfs, ignore_index=True) + success = True + except Exception as e: # pylint: disable=broad-except + # handle any exceptions that occur during the combination phase + load_error = e + return LoadDFResult( + f_path='combination of multiple path for table: ' + self.rep_builder.tbl, + data=final_df, + success=success, + fallen_back=False, + load_error=load_error) + + @dataclass class JPropsReport(APIReport[JPropsResult]): """A report that loads data in JSON properties format.""" diff --git a/user_tools/src/spark_rapids_tools/api_v1/core/qual_handler.py b/user_tools/src/spark_rapids_tools/api_v1/core/qual_handler.py index ade9dd072..c4d90ee76 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/core/qual_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/core/qual_handler.py @@ -15,10 +15,12 @@ """Module that contains the definition of the qualification Result handler for the core module.""" from dataclasses import dataclass +from typing import Optional from spark_rapids_tools import override from spark_rapids_tools.api_v1 import register_result_class, ResultHandler from spark_rapids_tools.api_v1.report_reader import ToolReportReader +from spark_rapids_tools.storagelib.cspfs import BoundedCspPath @register_result_class('qualCoreOutput') @@ -29,3 +31,6 @@ class QualCoreResultHandler(ResultHandler): @property def alpha_reader(self) -> ToolReportReader: return self.readers.get(self.report_id) + + def get_raw_metrics_path(self) -> Optional[BoundedCspPath]: + return self.get_reader_path('coreRawMetrics') diff --git a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py index 853c10e5d..31ec27c66 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/result_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/result_handler.py @@ -132,6 +132,17 @@ def get_reader_by_tbl(self, tbl: str) -> Optional[ToolReportReader]: # Public Interfaces ######################### + def get_reader_path(self, report_id: str) -> Optional[BoundedCspPath]: + """ + Get the path to the report file for the given report ID. + :param report_id: The unique identifier for the report. + :return: The path to the report file, or None if not found. + """ + reader = self.readers.get(report_id) + if reader: + return reader.out_path + return None + def create_empty_df(self, tbl: str) -> pd.DataFrame: """ Create an empty DataFrame for the given table label. @@ -153,6 +164,17 @@ def get_table_path(self, table_label: str) -> Optional[BoundedCspPath]: return reader.get_table_path(table_label) return None + def is_empty(self) -> bool: + """ + Check if the result handler has no data. + :return: True if the result handler is empty, False otherwise. 
+ """ + # first check that the output file exists + if not self.out_path.exists(): + return True + # then check that the app_handlers are empty + return not self.app_handlers + ######################### # Type Definitions ######################### diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index 9bd4618c7..8fb8fbd0f 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -460,14 +460,14 @@ def _is_ignore_no_perf(action: str) -> bool: | node_level_supp['Action'].apply(_is_ignore_no_perf) | node_level_supp['Exec Name'] .astype(str) + # TODO: revisit the need to check for 'WholeStageCodegen' in Exec Name. + # Ideally, we want to remove those execs that should be dropped from the analysis ( + # e.g. WholeStageCodegen, WholeStageCodegenExec, etc.) .apply(lambda x: x.startswith('WholeStageCodegen')) ) - node_level_supp = ( - node_level_supp[['App ID', 'SQL ID', 'SQL Node Id', 'Exec Is Supported']] - .groupby(['App ID', 'SQL ID', 'SQL Node Id']) - .agg('all') - .reset_index(level=[0, 1, 2]) - ) + # in previous version we used to group by 'App ID', 'SQL ID', 'SQL Node Id', but this is not + # needed since the 3 keys form an uuid for each row. + node_level_supp = node_level_supp[['App ID', 'SQL ID', 'SQL Node Id', 'Exec Is Supported']] return node_level_supp diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py index 20e95c4ad..cf339bc57 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py @@ -28,7 +28,9 @@ import fire from spark_rapids_tools import CspPath -from spark_rapids_tools.tools.core.qual_handler import QualCoreHandler +from spark_rapids_tools.api_v1 import QualCoreResultHandler +from spark_rapids_tools.api_v1.builder import CSVReport, CSVCombiner, APIResultHandler + from spark_rapids_tools.tools.qualx.config import ( get_cache_dir, get_config, @@ -152,7 +154,6 @@ def _get_model(platform: str, xgb_model.load_model(model_path) return xgb_model - def _get_calib_params(platform: str, model: Optional[str] = None, variant: Optional[str] = None) -> Dict[str, float]: @@ -193,34 +194,34 @@ def _get_calib_params(platform: str, logger.debug('Calib params file not found: %s', calib_path) return calib_params - -def _get_qual_data(qual_handler: QualCoreHandler) -> Tuple[ - Optional[pd.DataFrame], - Optional[pd.DataFrame], - List[str] -]: - node_level_supp = None - qualtool_output = None - qual_metrics = [] - # Get node level support from combined exec DataFrame - exec_table = qual_handler.get_table_by_label('execCSVReport') - node_level_supp = load_qtool_execs(exec_table) - # Get qualification summary from qualCoreCSVSummary - qualtool_output = qual_handler.get_table_by_label('qualCoreCSVSummary') - if qualtool_output is not None: - # Extract only the required columns - cols = ['App Name', 'App ID', 'App Duration'] - available_cols = [col for col in cols if col in qualtool_output.columns] - if available_cols: - qualtool_output = qualtool_output[available_cols] - # Get path to the raw metrics directory which has the per-app - # raw_metrics files - qual_metrics = qual_handler.get_raw_metrics_paths() - - return node_level_supp, qualtool_output, qual_metrics - - -def _get_combined_qual_data(qual_handlers: List[QualCoreHandler]) -> Tuple[ +def 
_get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]: + # extract the ['App Name', App ID', 'App Duration'] columns from the qualification summary. + q_sum_res = (CSVReport(qual_handler) + .table('qualCoreCSVSummary') + .pd_args({'usecols': ['App Name', 'App ID', 'App Duration']}) + .load()) + if not q_sum_res.success: + # That should not happen unless there is something terribly wrong because the caller + # has checked that the result is not empty. Therefore, the summary must exist + raise q_sum_res.load_error + qualtool_summary = q_sum_res.data + # Extracting the execsCSVReport should return a dictionary of results. + # Combine that dictionary into a single DF. + # The following creates a combiner that: + # 1- uses the default combination method which is dedicated to use the appHanlder to combine the results. + # 2- ignore the failed execs. i.e., apps that has no execs. Otherwise, it is possible to add a call-back to handle + # the apps that had no execs. + execs_combiner = CSVCombiner( + rep_builder=CSVReport(qual_handler).table('execCSVReport') # create the csvLoader + ).combine_args({'col_names': ['App ID']}) # tells the combiner to use those column names to insert the App uuid + q_execs_res = execs_combiner.build() + if not q_execs_res.success: + raise q_execs_res.load_error + node_level_supp = load_qtool_execs(q_execs_res.data) + + return node_level_supp, qualtool_summary + +def _get_combined_qual_data(qual_handlers: List[QualCoreResultHandler]) -> Tuple[ Optional[pd.DataFrame], Optional[pd.DataFrame], List[str] @@ -236,15 +237,22 @@ def _get_combined_qual_data(qual_handlers: List[QualCoreHandler]) -> Tuple[ combined_qual_metrics = [] for handler in qual_handlers: - node_level_supp, qualtool_output, qual_metrics = _get_qual_data(handler) + if handler.is_empty(): + # skip the handler that has no apps + continue + node_level_supp, qualtool_output = _get_qual_data(handler) if node_level_supp is not None: combined_node_level_supp.append(node_level_supp) if qualtool_output is not None: combined_qualtool_output.append(qualtool_output) - - combined_qual_metrics.extend(qual_metrics) + # Add the raw_metrics folder if it is not None + raw_metrics_path = handler.get_raw_metrics_path() + if raw_metrics_path: + # For now, append the path without the scheme to be compatible with the remaining code + # that does not expect a URI value. + combined_qual_metrics.append(raw_metrics_path.no_scheme) # Combine DataFrames final_node_level_supp = None @@ -701,7 +709,7 @@ def predict( model: Optional[str] = None, qual_tool_filter: Optional[str] = None, config: Optional[str] = None, - qual_handlers: List[QualCoreHandler] + qual_handlers: List[QualCoreResultHandler] ) -> pd.DataFrame: """Predict GPU speedup given CPU logs. @@ -720,7 +728,7 @@ def predict( config: Path to a qualx-conf.yaml file to use for configuration. qual_handlers: - List of QualCoreHandler instances for reading qualification data. + List of QualCoreResultHandler instances for reading qualification data. 
""" # load config from command line argument, or use default cfg = get_config(config) @@ -923,7 +931,8 @@ def _predict_cli( qual = output_dir else: qual = qual_output - qual_handlers.append(QualCoreHandler(result_path=qual_output)) + q_core_handler = APIResultHandler().qual_core().with_path(qual_output).build() + qual_handlers.append(q_core_handler) output_info = { 'perSql': {'path': os.path.join(output_dir, 'per_sql.csv')}, diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/util.py b/user_tools/src/spark_rapids_tools/tools/qualx/util.py index 952544b38..0072a9a70 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/util.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/util.py @@ -31,8 +31,9 @@ import pandas as pd from tabulate import tabulate +from spark_rapids_tools.api_v1 import QualCoreResultHandler +from spark_rapids_tools.api_v1.builder import APIResultHandler from spark_rapids_tools.cmdli.dev_cli import DevCLI -from spark_rapids_tools.tools.core.qual_handler import QualCoreHandler from spark_rapids_tools.tools.qualx.config import get_config, get_label @@ -366,8 +367,11 @@ def run_profiling_core(eventlog: str) -> None: run_commands(eventlogs, run_profiling_core) -def run_qualification_tool(platform: str, eventlogs: List[str], - output_dir: str, skip_run: bool = False, tools_config: str = None) -> List[QualCoreHandler]: +def run_qualification_tool(platform: str, + eventlogs: List[str], + output_dir: str, + skip_run: bool = False, + tools_config: str = None) -> List[QualCoreResultHandler]: logger.info('Running qualification on: %s', eventlogs if len(eventlogs) < 5 else f'{len(eventlogs)} eventlogs') logger.info('Saving output to: %s', output_dir) @@ -396,7 +400,7 @@ def run_qualification_core(eventlog: str) -> None: qual_handlers = [] for output_path in output_dirs: try: - handler = QualCoreHandler(result_path=output_path) + handler = APIResultHandler().qual_core().with_path(output_path).build() qual_handlers.append(handler) except Exception as e: # pylint: disable=broad-except logger.warning('Failed to create QualCoreHandler for %s: %s', output_path, e) From 3d818be322c932d0601c08f3c27c182be5c77312 Mon Sep 17 00:00:00 2001 From: "Ahmed Hussein (amahussein)" Date: Mon, 4 Aug 2025 15:52:46 -0500 Subject: [PATCH 2/4] fix flake8 Signed-off-by: Ahmed Hussein (amahussein) --- user_tools/src/spark_rapids_tools/api_v1/builder.py | 2 +- .../spark_rapids_tools/tools/qualx/qualx_main.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 59768b7ba..563f57cc4 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -273,7 +273,7 @@ def combine_args(self, args: dict) -> 'CSVCombiner': return self def on_apps(self) -> 'CSVCombiner': - """specify that the combiner should append IDs to the individual results before the concatenation.""" + """specify that the combiner inject AP UUID to the individual results before the concatenation.""" self.process_success(self.default_success_app_processor) return self diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py index cf339bc57..0deaa18d0 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py @@ -154,6 +154,7 @@ def _get_model(platform: str, 
xgb_model.load_model(model_path) return xgb_model + def _get_calib_params(platform: str, model: Optional[str] = None, variant: Optional[str] = None) -> Dict[str, float]: @@ -194,6 +195,7 @@ def _get_calib_params(platform: str, logger.debug('Calib params file not found: %s', calib_path) return calib_params + def _get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.DataFrame], Optional[pd.DataFrame]]: # extract the ['App Name', App ID', 'App Duration'] columns from the qualification summary. q_sum_res = (CSVReport(qual_handler) @@ -211,16 +213,19 @@ def _get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.Dat # 1- uses the default combination method which is dedicated to use the appHanlder to combine the results. # 2- ignore the failed execs. i.e., apps that has no execs. Otherwise, it is possible to add a call-back to handle # the apps that had no execs. - execs_combiner = CSVCombiner( - rep_builder=CSVReport(qual_handler).table('execCSVReport') # create the csvLoader - ).combine_args({'col_names': ['App ID']}) # tells the combiner to use those column names to insert the App uuid - q_execs_res = execs_combiner.build() + q_execs_res = ( + CSVCombiner(rep_builder=CSVReport(qual_handler).table('execCSVReport')) # use ExecsCSV report + .on_apps() # combine DFs on apps + .combine_args({'col_names': ['App ID']}) # inject cols when combining on apps + .build() # combine the results + ) if not q_execs_res.success: raise q_execs_res.load_error node_level_supp = load_qtool_execs(q_execs_res.data) return node_level_supp, qualtool_summary + def _get_combined_qual_data(qual_handlers: List[QualCoreResultHandler]) -> Tuple[ Optional[pd.DataFrame], Optional[pd.DataFrame], From 2a1b7490a15d6f6d667d3e28cb09191131b6f6ca Mon Sep 17 00:00:00 2001 From: "Ahmed Hussein (amahussein)" Date: Tue, 5 Aug 2025 11:20:01 -0500 Subject: [PATCH 3/4] handle special cases with concatenation Signed-off-by: Ahmed Hussein (amahussein) --- .../spark_rapids_tools/api_v1/app_handler.py | 5 +++++ .../src/spark_rapids_tools/api_v1/builder.py | 19 +++++++++++++++++- .../tools/qualx/qualx_main.py | 20 +++++++++---------- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py index 7e12412e5..02ffbf074 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py @@ -75,6 +75,11 @@ def patch_into_df(self, if col_names is None: # append attemptId to support multi-attempts col_names = ['appId'] + # Ensure col_values matches col_names in length + if len(col_values) == 1 and len(col_names) > 1: + col_values = col_values * len(col_names) + elif len(col_values) != len(col_names): + raise ValueError('Length of col_values must be 1 or match length of col_names') if not df.empty: for col_k, col_v in zip(reversed(col_names), reversed(col_values)): if col_k not in df.columns: diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 563f57cc4..7740845d9 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -251,6 +251,18 @@ def _evaluate_args(self) -> None: self._success_app_processor = self.default_success_app_processor # TODO: we should fail if the the combiner is built for AppIds but columns are not defined. 
+ def _create_empty_df(self) -> pd.DataFrame: + """ + creates an empty DataFrame with the columns defined in the report builder. + :return: an empty dataframe. + """ + empty_df = self.result_handler.create_empty_df(self.rep_builder.tbl) + if self._combine_args and 'use_cols' in self._combine_args: + # make sure that we insert the columns to the empty dataframe + injected_cols = pd.DataFrame(columns=self._combine_args['use_cols']) + return pd.concat([injected_cols, empty_df], axis=1) + return empty_df + ################################ # Setters/Getters for processors ################################ @@ -310,7 +322,12 @@ def build(self) -> LoadDFResult: self._combine_args) # Q: Should we ignore or skip the empty dataframes? combined_dfs.append(app_df) - final_df = pd.concat(combined_dfs, ignore_index=True) + if combined_dfs: + # only concatenate if we have any dataframes to combine + final_df = pd.concat(combined_dfs, ignore_index=True) + else: + # create an empty DataFrame if no data was collected. uses the table schema. + final_df = self._create_empty_df() success = True except Exception as e: # pylint: disable=broad-except # handle any exceptions that occur during the combination phase diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py index 0deaa18d0..6f192892f 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py @@ -205,7 +205,7 @@ def _get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.Dat if not q_sum_res.success: # That should not happen unless there is something terribly wrong because the caller # has checked that the result is not empty. Therefore, the summary must exist - raise q_sum_res.load_error + raise RuntimeError(f'Failed to load qualCoreCSVSummary {q_sum_res.load_error}') from q_sum_res.load_error qualtool_summary = q_sum_res.data # Extracting the execsCSVReport should return a dictionary of results. # Combine that dictionary into a single DF. 
@@ -220,7 +220,7 @@ def _get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.Dat .build() # combine the results ) if not q_execs_res.success: - raise q_execs_res.load_error + raise RuntimeError(f'Failed to load execution CSV report: {q_execs_res.load_error}') from q_execs_res.load_error node_level_supp = load_qtool_execs(q_execs_res.data) return node_level_supp, qualtool_summary @@ -238,20 +238,20 @@ def _get_combined_qual_data(qual_handlers: List[QualCoreResultHandler]) -> Tuple return None, None, [] combined_node_level_supp = [] - combined_qualtool_output = [] + combined_q_core_sum = [] # summary of the Qualification core (Id, Name, Duration) combined_qual_metrics = [] for handler in qual_handlers: if handler.is_empty(): # skip the handler that has no apps continue - node_level_supp, qualtool_output = _get_qual_data(handler) + node_level_supp, q_core_summary = _get_qual_data(handler) if node_level_supp is not None: combined_node_level_supp.append(node_level_supp) - if qualtool_output is not None: - combined_qualtool_output.append(qualtool_output) + if q_core_summary is not None: + combined_q_core_sum.append(q_core_summary) # Add the raw_metrics folder if it is not None raw_metrics_path = handler.get_raw_metrics_path() if raw_metrics_path: @@ -264,11 +264,11 @@ def _get_combined_qual_data(qual_handlers: List[QualCoreResultHandler]) -> Tuple if combined_node_level_supp: final_node_level_supp = pd.concat(combined_node_level_supp, ignore_index=True) - final_qualtool_output = None - if combined_qualtool_output: - final_qualtool_output = pd.concat(combined_qualtool_output, ignore_index=True) + final_q_core_sum = None + if combined_q_core_sum: + final_q_core_sum = pd.concat(combined_q_core_sum, ignore_index=True) - return final_node_level_supp, final_qualtool_output, combined_qual_metrics + return final_node_level_supp, final_q_core_sum, combined_qual_metrics def _get_split_fn(split_fn: Union[str, dict]) -> Callable[[pd.DataFrame], pd.DataFrame]: From 59714c7cd91afba9929fa6c073156198f14df036 Mon Sep 17 00:00:00 2001 From: "Ahmed Hussein (amahussein)" Date: Tue, 12 Aug 2025 12:04:32 -0500 Subject: [PATCH 4/4] Improve the CSVCombiner API and add shadow implementation for QualX Signed-off-by: Ahmed Hussein (amahussein) --- .../rapids/qualification.py | 16 +- .../src/spark_rapids_tools/api_v1/__init__.py | 12 +- .../spark_rapids_tools/api_v1/app_handler.py | 126 +++- .../src/spark_rapids_tools/api_v1/builder.py | 501 +++++++++++--- .../tools/qualx/featurizers/default.py | 14 +- .../tools/qualx/preprocess.py | 11 + .../tools/qualx/qualx_main.py | 30 +- .../tools/qualx/revamp/__init__.py | 15 + .../tools/qualx/revamp/featurizer.py | 655 ++++++++++++++++++ .../tools/qualx/revamp/preprocess.py | 81 +++ .../tools/qualx/revamp/x_main.py | 74 ++ .../spark_rapids_tools/utils/data_utils.py | 22 +- .../src/spark_rapids_tools/utils/util.py | 15 + .../spark_rapids_tools_ut/api/__init__.py | 15 + .../api/test_app_handlers.py | 91 +++ 15 files changed, 1560 insertions(+), 118 deletions(-) create mode 100644 user_tools/src/spark_rapids_tools/tools/qualx/revamp/__init__.py create mode 100644 user_tools/src/spark_rapids_tools/tools/qualx/revamp/featurizer.py create mode 100644 user_tools/src/spark_rapids_tools/tools/qualx/revamp/preprocess.py create mode 100644 user_tools/src/spark_rapids_tools/tools/qualx/revamp/x_main.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/api/__init__.py create mode 100644 user_tools/tests/spark_rapids_tools_ut/api/test_app_handlers.py diff --git 
a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py index bb3e7a9f0..78d910f62 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py +++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py @@ -36,6 +36,7 @@ from spark_rapids_tools.tools.core.qual_handler import QualCoreHandler from spark_rapids_tools.tools.qualx.qualx_main import predict from spark_rapids_tools.tools.qualification_stats_report import SparkQualificationStats +from spark_rapids_tools.tools.qualx.revamp.x_main import predict_x from spark_rapids_tools.tools.speedup_category import SpeedupCategory from spark_rapids_tools.tools.top_candidates import TopCandidates from spark_rapids_tools.tools.unsupported_ops_stage_duration import UnsupportedOpsStageDuration @@ -541,10 +542,17 @@ def __update_apps_with_prediction_info(self, try: # Build the QualCore handler object to handle the prediction model output q_core_handler = APIResultHandler().qual_core().with_path(qual_output_dir).build() - predictions_df = predict(platform=model_name, qual=qual_output_dir, - output_info=output_info, - model=estimation_model_args['customModelFile'], - qual_handlers=[q_core_handler]) + if Utils.get_rapids_tools_env('QUALX_REVAMP'): + predictions_df = predict_x(platform=model_name, + qual=qual_output_dir, + output_info=output_info, + model=estimation_model_args['customModelFile'], + qual_handlers=[q_core_handler]) + else: + predictions_df = predict(platform=model_name, qual=qual_output_dir, + output_info=output_info, + model=estimation_model_args['customModelFile'], + qual_handlers=[q_core_handler]) except Exception as e: # pylint: disable=broad-except predictions_df = pd.DataFrame() self.logger.error( diff --git a/user_tools/src/spark_rapids_tools/api_v1/__init__.py b/user_tools/src/spark_rapids_tools/api_v1/__init__.py index 515181b81..a0021494a 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/__init__.py +++ b/user_tools/src/spark_rapids_tools/api_v1/__init__.py @@ -32,6 +32,12 @@ QualCoreResultHandler, ProfCoreResultHandler ) +from .builder import ( + LoadRawFilesResult, + APIUtils, + CSVReportCombiner, + CSVReport +) __all__ = [ 'AppHandler', @@ -42,5 +48,9 @@ 'QualWrapperResultHandler', 'QualCoreResultHandler', 'ProfWrapperResultHandler', - 'ProfCoreResultHandler' + 'ProfCoreResultHandler', + 'LoadRawFilesResult', + 'APIUtils', + 'CSVReportCombiner', + 'CSVReport' ] diff --git a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py index 02ffbf074..895829dee 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/app_handler.py +++ b/user_tools/src/spark_rapids_tools/api_v1/app_handler.py @@ -13,12 +13,15 @@ # limitations under the License. """module that defines the app descriptor for the results loaded by the tools.""" - +import re from dataclasses import dataclass, field from functools import cached_property -from typing import Optional, List +from typing import Optional, List, Dict import pandas as pd +from pydantic.alias_generators import to_camel + +from spark_rapids_tools.utils import Utilities @dataclass @@ -32,6 +35,56 @@ class AppHandler(object): # this will be loaded from the core-status csv report eventlog_path: Optional[str] = None + @staticmethod + def get_pd_dtypes() -> Dict[str, str]: + """ + Get the pandas data types for the AppHandler attributes. + :return: Dictionary mapping attribute names to pandas data types. 
+ """ + return { + 'app_id': Utilities.scala_to_pandas_type('String'), + 'attempt_id': Utilities.scala_to_pandas_type('Int'), + 'app_name': Utilities.scala_to_pandas_type('String'), + 'eventlog_path': Utilities.scala_to_pandas_type('String') + } + + @staticmethod + def normalize_attribute(arg_value: str) -> str: + """ + Normalize the attribute name to a plain format. + It uses re.sub to replace any '-' or '_' with a space using the regexp 'r"(_|-)+"'. + Finally, it uses str.replace() to remove any spaces. + :param arg_value: the attribute name to normalize. + :return: the actual field name that is used in the AppHandler. + """ + processed_value = re.sub(r'([_\-])+', ' ', arg_value.strip().lower()).replace(' ', '') + lookup_map = { + 'appname': 'app_name', + 'appid': 'app_id', + 'attemptid': 'attempt_id', + 'eventlogpath': 'eventlog_path' + } + return lookup_map.get(processed_value, arg_value) + + @classmethod + def get_key_attributes(cls) -> List[str]: + """ + Get the key attributes that define an AppHandler. + :return: List of key attributes. + """ + return ['app_id'] + + @classmethod + def get_default_key_columns(cls) -> Dict[str, str]: + """ + Get the default key columns for the AppHandler. + :return: Dictionary mapping attribute names to column names. + """ + res = {} + for attr in cls.get_key_attributes(): + res[attr] = to_camel(attr) + return res + def is_name_defined(self) -> bool: """ Check if the app name is defined. @@ -141,3 +194,72 @@ def merge(self, other: 'AppHandler') -> 'AppHandler': if self.eventlog_path is None and other.eventlog_path is not None: self.eventlog_path = other.eventlog_path return self + + ################################ + # Public Methods + ################################ + + def convert_to_df(self) -> pd.DataFrame: + """ + Convert the AppHandler attributes to a DataFrame. + :return: DataFrame with app_id, app_name, and attempt_id as columns. + """ + data = { + 'app_id': [self.app_id], + 'attempt_id': [self.attempt_id], + 'app_name': [self.app_name], + 'eventlog_path': [self.eventlog_path] + } + data_types = AppHandler.get_pd_dtypes() + return pd.DataFrame({ + col: pd.Series(data[col], dtype=dtype) for col, dtype in data_types.items() + }) + + def add_fields_to_dataframe(self, + df: pd.DataFrame, + field_to_col_map: Dict[str, str]) -> pd.DataFrame: + """ + Insert fields/properties from AppHandler into the DataFrame, with user-specified column names. + :param df: Existing DataFrame to append to. + :type df: pd.DataFrame + :param field_to_col_map: Dictionary mapping AppHandler attributes (keys) to DataFrame column names (values). + :type field_to_col_map: Dict[str, str] + default: Value to use if attribute/property not found (raises if None). + """ + converted_df = self.convert_to_df() + row_data = [] + for attr, col in field_to_col_map.items(): + # Normalize the attribute name + norm_attr = AppHandler.normalize_attribute(attr) + try: + value = getattr(self, norm_attr) + row_data.append((col, norm_attr, value)) + except AttributeError as exc: + raise AttributeError(f"Attribute '{attr}' not found in AppHandler.") from exc + for col, norm_attr, value in reversed(row_data): + # Check if the column already exists in the DataFrame + if col in df.columns: + # If it exists, we should not overwrite it, skip + continue + # create a new column with the correct type. We do this because we do not want to + # to add values to an empty dataframe. 
+ df.insert(loc=0, column=col, value=pd.Series(dtype=converted_df[norm_attr].dtype)) + # set the values in case the dataframe was non-empty. + df[col] = pd.Series([value] * len(df), dtype=converted_df[norm_attr].dtype) + return df + + @classmethod + def inject_into_df(cls, + df: pd.DataFrame, + field_to_col_map: Dict[str, str], + app_h: Optional['AppHandler'] = None) -> pd.DataFrame: + """ + Inject AppHandler fields into a DataFrame using a mapping of field names to column names. + :param df: + :param field_to_col_map: + :param app_h: + :return: + """ + if app_h is None: + app_h = AppHandler(_app_id='UNKNOWN_APP', _app_name='UNKNOWN_APP', _attempt_id=1) + return app_h.add_fields_to_dataframe(df, field_to_col_map) diff --git a/user_tools/src/spark_rapids_tools/api_v1/builder.py b/user_tools/src/spark_rapids_tools/api_v1/builder.py index 7740845d9..1409fede7 100644 --- a/user_tools/src/spark_rapids_tools/api_v1/builder.py +++ b/user_tools/src/spark_rapids_tools/api_v1/builder.py @@ -15,7 +15,8 @@ """Module that contains the entry for the API v1 builder.""" from dataclasses import dataclass, field -from typing import Union, Optional, TypeVar, Generic, List, Dict, Callable +from logging import Logger +from typing import Union, Optional, TypeVar, Generic, List, Dict, Callable, Any, Set, Tuple import pandas as pd @@ -25,7 +26,7 @@ from spark_rapids_tools.api_v1.report_loader import ReportLoader from spark_rapids_tools.api_v1.report_reader import ToolReportReaderT from spark_rapids_tools.storagelib.cspfs import BoundedCspPath -from spark_rapids_tools.utils.data_utils import LoadDFResult, JPropsResult, TXTResult +from spark_rapids_tools.utils.data_utils import LoadDFResult, JPropsResult, TXTResult, DataUtils @dataclass @@ -103,6 +104,22 @@ def is_per_app_tbl(self) -> bool: return False return self.rep_reader.is_per_app + @property + def app_handlers(self) -> List[AppHandler]: + """ + Return the list of application handlers associated with this report. + + If no specific apps are set, returns all app handlers from the handler. + Otherwise, returns handlers for the specified apps. + """ + if not self._apps: + return list(self.handler.app_handlers.values()) + app_handlers = [ + self.handler.app_handlers.get(app_e) if isinstance(app_e, str) else app_e + for app_e in self._apps + ] + return [app_h for app_h in app_handlers if app_h is not None] + def _check_apps(self) -> None: """Check if applications are properly configured.""" if not self.is_per_app_tbl: @@ -164,6 +181,12 @@ class CSVReport(APIReport[LoadDFResult]): _fall_cb: Optional[Callable[[], pd.DataFrame]] = field(default=None, init=False) _map_cols: Optional[dict] = field(default=None, init=False) _pd_args: Optional[dict] = field(default=None, init=False) + _col_mapper_cb: Optional[Callable[[List[str]], Dict[str, str]]] = field(default=None, init=False) + + def map_cols_cb(self, cb: Callable[[List[str]], Dict[str, str]]) -> 'CSVReport': + """Set the callback for mapping columns when loading data.""" + self._col_mapper_cb = cb + return self def fall_cb(self, cb: Callable[[], pd.DataFrame]) -> 'CSVReport': """Set the fallback callback for loading data.""" @@ -180,6 +203,39 @@ def pd_args(self, args: dict) -> 'CSVReport': self._pd_args = args return self + def _check_map_col_args(self) -> None: + """ + build the final column mapping of the report + """ + temp_map_cols = {} + if self._col_mapper_cb: + # if the column mapper callback is defined, we need to build the static column. 
+ actual_tbl_cols = [col.name for col in self.rep_reader.get_table(self.tbl).columns] + # check if the use_cols is defined + if self._pd_args and 'usecols' in self._pd_args: + # only pick those columns if defined + selected_cols = self._pd_args['usecols'] + else: + # if usecols is not defined, we will use all columns + selected_cols = actual_tbl_cols + # apply the map callback to the columns of the table + temp_map_cols = self._col_mapper_cb(selected_cols) + if self._map_cols is not None: + # if the static map_cols is defined, then it is possible that we want to apply a + # dynamic method to rename the columns, but for some reason there is a couple of fields + # mapped statically. + # overwrite the temp_map_cols keys with the static map_cols, or insert new ones. + temp_map_cols.update(self._map_cols) + if temp_map_cols: + # finally re-assign the _map_cols field with the final column-mapping. + self.map_cols(temp_map_cols) + + @override + def _check_args(self) -> None: + """Check if the required arguments are set.""" + super()._check_args() + self._check_map_col_args() + @override def _load_global(self) -> LoadDFResult: return self.rep_reader.load_df( @@ -210,133 +266,263 @@ def _load_single_app(self) -> LoadDFResult: pd_args=self._pd_args ) + def create_empty_df(self) -> pd.DataFrame: + """ + Create an empty DataFrame with the columns defined in the report. + :return: An empty DataFrame with the columns defined in the report. + """ + tbl_df = self.handler.create_empty_df(self.tbl) + # we need to handle if the columns were renamed, or selected + if self._pd_args and 'usecols' in self._pd_args: + # if usecols is defined, we need to filter the columns + use_cols = self._pd_args['usecols'] + tbl_df = tbl_df[use_cols] + if self._map_cols: + tbl_df = tbl_df.rename(columns=self._map_cols) + return tbl_df + @dataclass -class CSVCombiner(object): - """A class that combines multiple CSV reports into a single report.""" - rep_builder: CSVReport - _failed_app_processor: Optional[Callable[[str, LoadDFResult], None]] = field(default=None, init=False) - _success_app_processor: Optional[Callable[[ToolResultHandlerT, str, pd.DataFrame, dict], pd.DataFrame]] = ( +class CSVReportCombiner(object): + """ + A class for combining multiple CSVReport instances into a single DataFrame. This implementation + assumes that: + 1. It must be called on a Per-App report (aka, a report that has per app-data). + 2. All CSVReport instances are associated with the same Table. + 3. The reports can be loaded independently and combined based on application IDs. + 4. The reports may have different sets of applications + + This class manages the process of loading, combining, and post-processing data from + multiple report builders, handling both successful and failed application loads. + It supports injecting application IDs into the resulting DataFrame, custom callbacks + for success and failure cases, and fallback to empty DataFrames if needed. + + :param: rep_builders: List[CSVReport]. A list of CSVReport instances to combine. + :param: _inject_app_ids_enabled: Bool, default True. Flag to enable/disable injection of + application IDs into the combined DataFrame. + :param: _app_fields: Optional[Dict[str, str]], default None. A dictionary that specifies the + fields of the AppHandlers and how to inject them into the per-app DataFrame before they + get combined. + The expected structure is [field_name: str, column_name: str] where the normalized + value of field_name is a valid field in AppHandler instance. 
+ Example of acceptable keys for AppHandler.app_id are: 'app_id', 'App ID', 'appId', 'app ID'. + :param: _success_cb: Optional[Callable[[ToolResultHandlerT, str, pd.DataFrame], pd.DataFrame]], + default None. + A callback function that is called to provide any extra custom processing for each + successful application. This cb is applied after injecting the app columns and right-before + concatenating the app-dataframe to the combined dataFrame. + This is useful in case the caller wants to apply some dataframe operations on the DataFrame. + Note, it might be tricky if this cb is changing the shape of the DataFrame, as it might + conflict with the expected Dataframe DTypes extracted from the original Table. + :param: _failure_cb: Optional[Callable[[ToolResultHandlerT, str, LoadDFResult], Any]], default None. + Provides custom handling for failed application loads. If the caller needs more processing + than just appending the failures into the failed_apps dictionary, this callback can become + handy. + """ + rep_builders: List[CSVReport] + _inject_app_ids_enabled: bool = field(default=True, init=False) + _app_fields: Optional[Dict[str, str]] = field(default=None, init=False) + _success_cb: Optional[Callable[[ToolResultHandlerT, str, pd.DataFrame], pd.DataFrame]] = ( field(default=None, init=False)) - _combine_args: Optional[dict] = field(default=None, init=False) + _failure_cb: Optional[Callable[[ToolResultHandlerT, str, LoadDFResult], Any]] = ( + field(default=None, init=False)) + _fall_back_to_empty_df: bool = field(default=False, init=False) + _failed_apps: Dict[str, Exception] = field(default_factory=dict, init=False) + _successful_apps: Set[str] = field(default_factory=set, init=False) @property - def result_handler(self) -> ToolResultHandlerT: - """Get the result handler associated with this combiner.""" - return self.rep_builder.handler + def failed_apps(self) -> Dict[str, Exception]: + """Get the dictionary of failed applications.""" + return self._failed_apps - @staticmethod - def default_success_app_processor(result_handler: ToolResultHandlerT, - app_id: str, - df: pd.DataFrame, - combine_args: dict) -> pd.DataFrame: - """Default processor for successful applications.""" - col_names = None - app_entry = result_handler.app_handlers.get(app_id) - if not app_entry: - raise ValueError(f'App entry not found for ID: {app_id}') - if combine_args: - # check if the col_names are provided to stitch the app_ids - col_names = combine_args.get('col_names', None) - if col_names: - # patch the app_uuid and if the columns are defined. - return app_entry.patch_into_df(df, col_names=col_names) - return df - - def _evaluate_args(self) -> None: - """Evaluate the arguments to ensure they are set correctly.""" - - if self._success_app_processor is None: - # set the default processor for successful applications - self._success_app_processor = self.default_success_app_processor - # TODO: we should fail if the the combiner is built for AppIds but columns are not defined. + @property + def default_rep_builder(self) -> CSVReport: + """Get the first report builder.""" + if not self.rep_builders: + raise ValueError('No report builders provided for combination.') + return self.rep_builders[0] - def _create_empty_df(self) -> pd.DataFrame: - """ - creates an empty DataFrame with the columns defined in the report builder. - :return: an empty dataframe. 
+ @property + def tbl(self) -> str: + """Get the table label for the first report builder.""" + return self.default_rep_builder.tbl + + @property + def app_handlers(self) -> List[AppHandler]: """ - empty_df = self.result_handler.create_empty_df(self.rep_builder.tbl) - if self._combine_args and 'use_cols' in self._combine_args: - # make sure that we insert the columns to the empty dataframe - injected_cols = pd.DataFrame(columns=self._combine_args['use_cols']) - return pd.concat([injected_cols, empty_df], axis=1) - return empty_df + Return the list of application handlers associated with the report builder. + If no specific apps are set, returns all app handlers from the handler. + Otherwise, returns handlers for the specified apps. - ################################ - # Setters/Getters for processors - ################################ + Note that, it is possible to have duplicate apps if the same app is present in multiple reports. + """ + res = [] + for rep_b in self.rep_builders: + res.extend(rep_b.app_handlers) + return res + + def disable_apps_injection(self) -> 'CSVReportCombiner': + """Disable the injection of application IDs into the combined DataFrame.""" + self._inject_app_ids_enabled = False + return self - def process_failed(self, - processor: Callable[[str, LoadDFResult], None]) -> 'CSVCombiner': - """Set the processor for failed applications.""" - self._failed_app_processor = processor + def on_app_fields(self, combine_on: Dict[str, str]) -> 'CSVReportCombiner': + """Set the columns to combine on.""" + self._app_fields = combine_on return self - def process_success(self, - cb_fn: Callable[[ToolResultHandlerT, str, pd.DataFrame, dict], pd.DataFrame]) -> 'CSVCombiner': - """Set the processor for successful applications.""" - self._success_app_processor = cb_fn + def entry_success_cb(self, + cb: Callable[[ToolResultHandlerT, str, pd.DataFrame], pd.DataFrame]) -> 'CSVReportCombiner': + """Set the callback for successful applications.""" + self._success_cb = cb return self - def combine_args(self, args: dict) -> 'CSVCombiner': - """Set the arguments for combining the reports.""" - self._combine_args = args + def entry_failure_cb(self, cb: Callable[[ToolResultHandlerT, str, LoadDFResult], Any]) -> 'CSVReportCombiner': + """Set the callback for failed applications""" + self._failure_cb = cb return self - def on_apps(self) -> 'CSVCombiner': - """specify that the combiner inject AP UUID to the individual results before the concatenation.""" - self.process_success(self.default_success_app_processor) + def empty_df_on_error(self, fall_to_empty: bool = True) -> 'CSVReportCombiner': + """Set whether to fall back to an empty DataFrame if no data is available.""" + self._fall_back_to_empty_df = fall_to_empty return self - ######################### - # Public Interfaces - ######################### + @staticmethod + def logger(csv_rep: CSVReport) -> Logger: + return csv_rep.handler.logger + + @staticmethod + def log_failed_app(app_id: str, csv_rep: CSVReport, failed_load: LoadDFResult) -> None: + """Log a failed application.""" + CSVReportCombiner.logger(csv_rep).debug( + f'Failed to load {csv_rep.tbl} for app {app_id}: {failed_load.get_fail_cause()}') + + def _process_args(self) -> None: + """Process the arguments to ensure they are set correctly.""" + if not self.rep_builders: + raise ValueError('No report builders provided for combination.') + if self._app_fields is None: + # by default, we will use app_id column and it will be inserted as 'appId' + # Later we can add more fields + 
self.on_app_fields(AppHandler.get_default_key_columns()) + + def _inject_app_into_df( + self, + res_h: ToolResultHandlerT, + df: pd.DataFrame, app_id: str) -> pd.DataFrame: + """ + Inject the application ID into the DataFrame. + :param df: The DataFrame to inject the application ID into. + :param app_id: The application ID to inject. + :return: The DataFrame with the application ID injected. + """ + # TODO: Should we check if app_obj is not found? + app_obj = res_h.app_handlers.get(app_id) + return app_obj.add_fields_to_dataframe(df, self._app_fields) + + def _build_single_report(self, csv_rep: CSVReport) -> List[pd.DataFrame]: + combined_dfs = [] + # this is a dictionary and we should loop on it one by one to combine it + per_app_res = csv_rep.load() + for app_id, app_res in per_app_res.items(): + try: + # Set a generic try-except block to handle unexpected errors for each entry to avoid + # failing the entire combination process. + if not app_res.success: # what is the correct way to check for success? + # Process entry with failed results. + # 1. log debug message (We do not want error message because it will confuse the users) + # 2. Add it to the dictionary of failed apps + # 3. Call the failure callback if defined. + CSVReportCombiner.log_failed_app(app_id, csv_rep, app_res) + self._failed_apps[app_id] = app_res.get_fail_cause() + if self._failure_cb: + try: + self._failure_cb(csv_rep.handler, app_id, app_res) + except Exception as failure_cb_ex: # pylint: disable=broad-except + # if the failure callback fails, we log it but do not raise an error. + CSVReportCombiner.logger(csv_rep).error( + f'Failed to apply failure_cb for app {app_id} on {csv_rep.tbl}: {failure_cb_ex}') + else: + # This is a successful result, we need to process it. + # 1. Append it to the list of successful apps. + # 2. Inject the app key columns into the dataframe if enabled. + # 3. Call the success callback if defined. + self._successful_apps.add(app_id) + processed_df = app_res.data + if self._inject_app_ids_enabled: + # inject the app_id into the dataframe + processed_df = self._inject_app_into_df(csv_rep.handler, app_res.data, app_id) + if self._success_cb: + # apply the success_callback defined by the caller + try: + processed_df = self._success_cb(csv_rep.handler, app_id, processed_df) + except Exception as success_cb_ex: # pylint: disable=broad-except + # if the success callback fails, we log it but do not raise an error. + CSVReportCombiner.logger(csv_rep).error( + f'Failed to apply success_cb for app {app_id} on {csv_rep.tbl}: {success_cb_ex}') + # Q: Should we ignore or skip the empty dataframes? + combined_dfs.append(processed_df) + except Exception as single_entry_ex: # pylint: disable=broad-except + # if any exception occurs during the processing of the app, we log it and continue. + # add it to the failed_apps dictionary + CSVReportCombiner.logger(csv_rep).error( + f'Failed to process app entry {app_id} for {csv_rep.tbl}: {single_entry_ex}') + self._failed_apps[app_id] = single_entry_ex + return combined_dfs + + def _create_empty_df(self) -> pd.DataFrame: + """ + creates an empty DataFrame with the columns defined in the report builder. + :return: an empty dataframe. + """ + # get the dataframe based on user arguments. 
+ empty_df = self.default_rep_builder.create_empty_df() + # if apps injection is enabled, we need to inject the app_id columns + if self._inject_app_ids_enabled and self._app_fields: + # make sure that we inject the app_id columns + return AppHandler.inject_into_df(empty_df, self._app_fields) + return empty_df def build(self) -> LoadDFResult: - """Build the combined CSV report.""" + """Build the combined report.""" # process teh arguments to ensure they are set correctly - self._evaluate_args() - + self._process_args() load_error = None final_df = None success = False + fallen_back = False + # loop on all the reports and combine their results try: - per_app_res = self.rep_builder.load() - # this is a dictionary and we should loop on it one by one to combine it combined_dfs = [] - for app_id, app_res in per_app_res.items(): - # we need to patch the app_id to the dataframe - if app_res.load_error or app_res.data.empty: - # process entry with failed results or skip them if no handlder is defined. - if self._failed_app_processor: - self._failed_app_processor(app_id, app_res) - else: - # default behavior is to skip the app - continue - else: - # process entry with successful results - app_df = self._success_app_processor(self.result_handler, - app_id, - app_res.data, - self._combine_args) - # Q: Should we ignore or skip the empty dataframes? - combined_dfs.append(app_df) + for csv_rep in self.rep_builders: + # load the report and combine the results. + # if an exception is thrown in a single iteration, then their must be something completely wrong and we + # need to fail. + combined_dfs.extend(self._build_single_report(csv_rep)) if combined_dfs: # only concatenate if we have any dataframes to combine final_df = pd.concat(combined_dfs, ignore_index=True) else: - # create an empty DataFrame if no data was collected. uses the table schema. + # create an empty DataFrame if no data was collected. final_df = self._create_empty_df() success = True except Exception as e: # pylint: disable=broad-except - # handle any exceptions that occur during the combination phase + # handle any exceptions that occur during combination phase or loading sub-reports load_error = e + if self._fall_back_to_empty_df: + # if we are falling back to an empty DataFrame, we create it here. + try: + final_df = self._create_empty_df() + success = True + fallen_back = True + except Exception as fall_back_ex: # pylint: disable=broad-except + self.logger(self.default_rep_builder).error( + f'could not fall back to empty df {self.tbl}: {fall_back_ex}') return LoadDFResult( - f_path='combination of multiple path for table: ' + self.rep_builder.tbl, + f_path='combination of multiple path for table: ' + self.tbl, data=final_df, success=success, - fallen_back=False, + fallen_back=fallen_back, load_error=load_error) @@ -381,3 +567,134 @@ def _load_single_app(self) -> TXTResult: return self.rep_reader.load_app_txt( self._tbl, app=self._apps[0]) + + +@dataclass +class LoadRawFilesResult(object): + """ + A dataclass to hold the result of loading raw files. + :param ds_name: The name of the dataset used to identify the loading process. + :param _reports: A Private field mapping [report_id: str, dataFrame: pd.DataFrame]. Each dataframe is + the combined dataframe of all per-app tools output. + :param _failed_loads: A dictionary mapping the appIDs to the failed reports. 
Each entry is of the structure + [app_id: str, tuple(report_id: str, error: str)] in order to give details on the root + cause of each failure in-case the caller needs to process those failures. + """ + ds_name: str + _reports: Dict[str, pd.DataFrame] = field(init=False, default_factory=dict) + _failed_loads: Dict[str, List[Tuple[str, str]]] = field(init=False, default_factory=dict) + + @property + def reports(self) -> Dict[str, pd.DataFrame]: + """ + Get the reports loaded from the CSV files. + :return: A dictionary mapping report IDs to their DataFrames. + """ + return self._reports + + @property + def failed_loads(self) -> Dict[str, List[Tuple[str, str]]]: + """ + Get the failed loads. + :return: A dictionary mapping app IDs to a list of tuples containing report ID and error message. + """ + return self._failed_loads + + def append_success(self, report_id: str, df_res: LoadDFResult) -> None: + """ + Append a successful report to the csv_files. + :param report_id: The unique identifier for the report. + :param df_res: The result of loading a DataFrame. + """ + self._reports[report_id] = df_res.data + + def append_failure(self, app_id: str, report_id: str, load_excep: Optional[Exception]) -> None: + """ + Append a report to the csv_files or failed_loads based on the success of the load. + :param app_id: The unique identifier for the application. + :param report_id: The unique identifier for the report. + :param load_excep: The exception raised during the loading of the report. + """ + self._failed_loads.setdefault(app_id, []).append((report_id, str(load_excep))) + + +@dataclass +class APIUtils(object): + """ + A utility class for API v1 components. + This class provides static methods to process results from CSVReportCombiner and handle exceptions. + """ + @staticmethod + def normalize_app_id_col(col_names: List[str]) -> Dict[str, str]: + """ + Normalize the appId column name to 'appId' if it exists in the column names. + :param col_names: List of column names. + :return: A dictionary mapping the original column name to 'appId' if it exists. + """ + for col_name in col_names: + if AppHandler.normalize_attribute(col_name) == 'app_id': + return {col_name: 'appId'} + return {} + + @staticmethod + def process_res( + raw_res: LoadRawFilesResult, + combiner: CSVReportCombiner, + convert_to_camel: bool = False, + raise_on_failure: bool = True, + raise_on_empty: bool = True + ) -> Optional[pd.DataFrame]: + """ + A utility function that wraps the creation of a combined per-app-report. + It processes the result of a CSVReportCombiner and handles exceptions. + :param raw_res: The LoadRawFilesResult instance to append the results to. + :param combiner: The CSVReportCombiner to process. + :param convert_to_camel: If True, convert the column names to camelCase. + This is useful to normalize the column-names to a common format. + :param raise_on_failure: If True, raise an exception if the combiner fails to + build the combined dataframe. Note that this is not intended to handle individual app-level + failures. For the latter, visit the arguments provided by the CSVReportCombiner. + :param raise_on_empty: If True, raise an exception if the resulting DataFrame is + empty or None. This is useful to ensure that mandatory reports are always valid DataFrames + while allow other optional reports to be empty without throwing exceptions. + :return: The resulting DataFrame. The DataFrame can be None if there was an error while loading the report + and the raise_on_empty is False. 
This can be avoided if the combiner is configured to
+                     fall_back to an empty DataFrame in case of failure.
+        :raises RuntimeError: If the combiner fails to build the final result and raise_on_failure is True.
+        :raises ValueError: If the resulting DataFrame is empty and raise_on_empty is True.
+        """
+        try:
+            if convert_to_camel:
+                # Update the rep_builders by setting the col_map_cb to the staticmethod to_camel.
+                for rep in combiner.rep_builders:
+                    rep.map_cols_cb(DataUtils.cols_to_camel_case)
+            # final_res is a LoadDFResult that carries the combined dataframe and the success flag.
+            final_res = combiner.build()
+            # If the combiner failed to load the report, we should raise an exception.
+            if not final_res.success and raise_on_failure:
+                if final_res.load_error:
+                    raise RuntimeError(
+                        f'Loading report {combiner.tbl} failed with error: {final_res.get_fail_cause()}'
+                    ) from final_res.get_fail_cause()  # use get_fail_cause to get the original exception.
+                raise RuntimeError(f'Loading report {combiner.tbl} failed with unexpected error.')
+            # If the dataframe is None, or the dataframe is empty and raise_on_empty is True, raise an exception.
+            if raise_on_empty and (final_res.data is None or final_res.data.empty):
+                detail_reason = 'a None DataFrame' if final_res.data is None else 'an empty DataFrame'
+                raise ValueError(
+                    f'Loading report {combiner.tbl} on dataset {raw_res.ds_name} returned {detail_reason}.')
+            # If we reach this point, no exception needs to be raised; just proceed with wrapping up the report.
+            # 1. Append failed apps: loop on the combiner's failed apps and append them to the raw_res.
+            if combiner.failed_apps:
+                for app_id, app_error in combiner.failed_apps.items():
+                    raw_res.append_failure(app_id, combiner.tbl, app_error)
+            # 2. Append the resulting dataframe to the reports if it is successful. Else return None.
+            if final_res.success:
+                raw_res.append_success(combiner.tbl, final_res)
+                return final_res.data
+            # If the combiner failed to build the final result, we should return None.
+            return None
+        except Exception as e:  # pylint: disable=broad-except
+            # If we reach here, it means that the combiner failed to build the final result.
+            # We should raise an exception to inform the caller.
+            raise RuntimeError(f'Failed to load report {combiner.tbl} on dataset {raw_res.ds_name}: {e}') from e
diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/featurizers/default.py b/user_tools/src/spark_rapids_tools/tools/qualx/featurizers/default.py
index e605539d9..c55ca9fbd 100644
--- a/user_tools/src/spark_rapids_tools/tools/qualx/featurizers/default.py
+++ b/user_tools/src/spark_rapids_tools/tools/qualx/featurizers/default.py
@@ -174,7 +174,7 @@ def extract_raw_features(
     Parameters
     ----------
     toc: pd.DataFrame
-        Table of contents of CSV files for the dataset.
+        Table of contents of CSV files for the dataset dF
     node_level_supp: pd.DataFrame
         Node-level support information used to filter out metrics associated with unsupported operators.
     qualtool_filter: str
@@ -536,15 +536,17 @@ def load_csv_files(
     Parameters
     ----------
     toc: pd.DataFrame
-        Table of contents of CSV files for the dataset.
+        Table of contents of CSV files for the dataset dF
     app_id: str
-        Application ID.
+        Application ID to target the specific profiler CSV files.
     node_level_supp: pd.DataFrame
         Node-level support information used to filter out metrics associated with unsupported operators.
+ dF qualtool_filter: str Type of filter to apply to the qualification tool output, either 'stage' or None. qualtool_output: pd.DataFrame - Qualification tool output. + Qualification tool output. This is the Dataframe loaded for the Qualification core summary. + dF remove_failed_sql: bool Remove sqlIDs with high failure rates, default: True. @@ -646,6 +648,8 @@ def scan_tbl( sql_to_stage = scan_tbl('sql_to_stage_information') if not sql_to_stage.empty: # try: + # exclude the rows that do not show nodeIds. + # creates dF sqls_with_execs = ( sql_to_stage.loc[sql_to_stage['SQL Nodes(IDs)'].notna()][['sqlID', 'jobID']] .groupby(['sqlID']) @@ -656,6 +660,7 @@ def scan_tbl( sqls_with_execs = pd.DataFrame() if not sql_app_metrics.empty and not sqls_with_execs.empty: + # TODO: Not sure why we are doing this sql_app_metrics = ( sql_app_metrics.merge(sqls_with_execs, on='sqlID') .drop(columns=['jobID']) @@ -665,6 +670,7 @@ def scan_tbl( # Job to stageIds/sqlID mapping: job_map_tbl = scan_tbl('job_information') if not job_map_tbl.empty: + # dF job_map_tbl = job_map_tbl.rename(columns={'startTime': 'jobStartTime_min'}) job_map_tbl['sqlID'] = job_map_tbl['sqlID'].fillna(-1).astype(int) job_map_tbl['jobID'] = 'job_' + job_map_tbl['jobID'].astype(str) diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index 8fb8fbd0f..10717f0dd 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -288,6 +288,8 @@ def load_profiles( profile_files.extend(csv_files + json_files) # filter files by app_ids in app_meta + # toc_list is a list of dataframes of the following schema + # toc_list = [] app_meta_list = [] for app_id, meta in app_meta.items(): @@ -297,12 +299,19 @@ def load_profiles( # filter profiler files by app_id and attach ds_name, appId, table_name # convert glob pattern to regex pattern if app_id == 'default': + # Create a list of all the raw metrics files. app_id_files = profile_files else: app_id = app_id.replace('*', '.*') app_id_files = [f for f in profile_files if re.search(app_id, f)] if app_id_files: + # creates a dF + # where: + # * filepath is the full path to the CSV/JSON file, + # * ds_name is the dataset name, + # * appId is the appId extracted from the filepath, + # * table_name is the file extracted from the filepath (without extension). 
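+            # e.g. an entry like '.../<appId>/sql_to_stage_information.csv' would yield table_name
+            # 'sql_to_stage_information' (illustrative path layout; the actual split is done below).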
tmp = pd.DataFrame({'filepath': app_id_files})
             fp_split = tmp['filepath'].str.split(r'/')
             tmp['ds_name'] = f'{ds_name}:{job_name}' if job_name else ds_name
@@ -310,6 +319,8 @@ def load_profiles(
             tmp['table_name'] = fp_split.str[-1].str.split('.').str[0]
 
             # collect mapping of appId (after globbing) to meta
+            # this eventually creates the list of appIds
+            # creates a dF with one row per unique appId, annotated with the app_meta key/values
             tmp_app_meta = tmp[['appId']].drop_duplicates()
             for key, value in meta.items():
                 tmp_app_meta[key] = value
diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py
index 6f192892f..09f2218cd 100644
--- a/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py
+++ b/user_tools/src/spark_rapids_tools/tools/qualx/qualx_main.py
@@ -29,7 +29,8 @@
 
 from spark_rapids_tools import CspPath
 from spark_rapids_tools.api_v1 import QualCoreResultHandler
-from spark_rapids_tools.api_v1.builder import CSVReport, CSVCombiner, APIResultHandler
+from spark_rapids_tools.api_v1.builder import CSVReport, APIResultHandler, APIUtils, LoadRawFilesResult, \
+    CSVReportCombiner
 
 from spark_rapids_tools.tools.qualx.config import (
     get_cache_dir,
@@ -213,15 +214,16 @@ def _get_qual_data(qual_handler: QualCoreResultHandler) -> Tuple[Optional[pd.Dat
     # 1- uses the default combination method which is dedicated to use the appHanlder to combine the results.
     # 2- ignore the failed execs. i.e., apps that has no execs. Otherwise, it is possible to add a call-back to handle
     # the apps that had no execs.
-    q_execs_res = (
-        CSVCombiner(rep_builder=CSVReport(qual_handler).table('execCSVReport'))  # use ExecsCSV report
-        .on_apps()  # combine DFs on apps
-        .combine_args({'col_names': ['App ID']})  # inject cols when combining on apps
-        .build()  # combine the results
+    execs_process_res = LoadRawFilesResult(ds_name='QualTool execs')
+    APIUtils.process_res(
+        raw_res=execs_process_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[
+                CSVReport(qual_handler)
+                .table('execCSVReport')]
+        ).on_app_fields({'app_id': 'App ID'})  # use "App ID" to be consistent with the remaining qualx code.
     )
-    if not q_execs_res.success:
-        raise RuntimeError(f'Failed to load execution CSV report: {q_execs_res.load_error}') from q_execs_res.load_error
-    node_level_supp = load_qtool_execs(q_execs_res.data)
+    node_level_supp = load_qtool_execs(execs_process_res.reports.get('execCSVReport'))
 
     return node_level_supp, qualtool_summary
 
@@ -743,15 +745,19 @@ def predict(
     if not qual_handlers:
         raise ValueError('qual_handlers list is empty - no qualification data available for prediction')
+    if all(q_handler.is_empty() for q_handler in qual_handlers):
+        logger.warning('All qualification handlers are empty - no apps to predict')
+        return pd.DataFrame()
 
     node_level_supp, qual_tool_output, qual_metrics = _get_combined_qual_data(qual_handlers)
     # create a DataFrame with default predictions for all app IDs.
     # this will be used for apps without predictions.
     default_preds_df = qual_tool_output.apply(create_row_with_default_speedup, axis=1)
 
-    if len(qual_metrics) == 0:
-        logger.warning('Qualification tool metrics are missing. Speedup predictions will be skipped.')
-        return pd.DataFrame()
+    # it is too early at this point to decide whether there are raw metrics or not
+    # if any(qual_metrics) == 0:
+    #     logger.warning('Qualification tool metrics are missing.
Speedup predictions will be skipped.')
+    #     return pd.DataFrame()
 
     # construct mapping of appIds to original appNames
     app_id_name_map = default_preds_df.set_index('appId')['appName'].to_dict()
diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/revamp/__init__.py b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/__init__.py
new file mode 100644
index 000000000..b1d500c1e
--- /dev/null
+++ b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Module that defines the rewrite of QualX using the new Tools API."""
diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/revamp/featurizer.py b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/featurizer.py
new file mode 100644
index 000000000..d926b0c97
--- /dev/null
+++ b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/featurizer.py
@@ -0,0 +1,655 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions for the QualX featurizer"""
+
+from typing import List, Dict, Optional
+
+import pandas as pd
+
+from spark_rapids_tools.api_v1 import LoadRawFilesResult, ToolResultHandlerT, CSVReportCombiner, CSVReport, \
+    APIUtils
+from spark_rapids_tools.tools.qualx.preprocess import load_qtool_execs
+
+from spark_rapids_tools.utils import Utilities
+
+
+def load_csv_files(
+        ds_name: str,
+        res_hs: List[ToolResultHandlerT]
+) -> LoadRawFilesResult:
+    """
+    Load CSV files from the result handlers into memory. This method should apply as little logic
+    to the dataframes as possible in order to give a clear separation between loading the
+    raw data and processing it to create features.
+    The implementation uses CSVReportCombiner to combine the reports across all the apps.
+    By default, the combiner injects the appId column if it does not exist in the report.
+    :param ds_name: The name of the dataset.
+    :param res_hs: List of tools result handlers. The assumption is that each handler is valid and
+                   has a non-zero number of apps.
+    :return: A LoadRawFilesResult result object.
+    :raises RuntimeError: If an unexpected error is triggered while loading a mandatory report.
+    """
+    raw_res = LoadRawFilesResult(ds_name=ds_name)
+
+    # Get the app_info report.
+    # Raise when empty/failed to load.
+    # The API will not inject appId into the result because the report has the appId column already.
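+    # Note: process_res defaults raise_on_failure and raise_on_empty to True, so a missing or
+    # empty application-information report aborts the whole load with an exception.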
+ APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawApplicationInformationCSV') for r_h in res_hs] + ) + ) + + # Get the jars table. + # Do not fail if it is empty/failed. This is empty for CPU applications. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawRapidsJarsCSV') for r_h in res_hs] + ), + raise_on_empty=False, + raise_on_failure=False, + ) + + # Get the spark properties table (this should not be empty) + # Raise when empty/failed to load. + # TODO: It does not seem that this table is actually used anywhere + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawSparkPropertiesCSV') for r_h in res_hs] + ) + ) + + # Get the executor time-percent table (this should not be empty) + # Raise if empty or failed to load. + # This table already has "App ID" column for some reason. Rename the column to appId and disable + # app injection. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[ + CSVReport(r_h) + .map_cols_cb(APIUtils.normalize_app_id_col) # normalize the "App ID" column to 'appId' + .table('coreRawSqlDurationAndExecutorCpuTimePercentCSV') for r_h in res_hs + ] + ).disable_apps_injection() # disable app-injection + ) + + # Get the executor information table + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawExecutorInformationCSV') for r_h in res_hs] + ) + ) + + # Get the sql level aggregated metrics (this might be empty if no SQLs/metrics are defined) + # Raise if empty or failed to load + # This table already has "appID". Rename the column to appId and avoid injecting the appId column. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[ + CSVReport(r_h) + .map_cols_cb(APIUtils.normalize_app_id_col) + .table('coreRawSqlLevelAggregatedTaskMetricsCSV') for r_h in res_hs + ] + ).disable_apps_injection() + ) + + # Get sql-to-stage info (this can be empty if stages are not defined or sqls are not defined) + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawSqlToStageInformationCSV') for r_h in res_hs] + ), + raise_on_empty=False + ) + + # Get job info + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawJobInformationCSV') for r_h in res_hs] + ) + ) + + # Get the sql-plan-metrics. This cannot be empty. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawSqlPlanMetricsForApplicationCSV') for r_h in res_hs] + ) + ) + + # Get the job-agg-metrics. This cannot be empty. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawJobLevelAggregatedTaskMetricsCSV') for r_h in res_hs] + ) + ) + + # Get the stage-agg-metrics. This cannot be empty. + APIUtils.process_res( + raw_res=raw_res, + combiner=CSVReportCombiner( + rep_builders=[CSVReport(r_h).table('coreRawStageLevelAggregatedTaskMetricsCSV') for r_h in res_hs] + ) + ) + + # Get whole stage operator info. It can be empty; especially for the GPU jobs. 
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[CSVReport(r_h).table('coreRawWholeStageCSV') for r_h in res_hs]
+        ),
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+
+    # Get failed-tasks info. It can be empty.
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[CSVReport(r_h).table('coreRawFailedTasksCSV') for r_h in res_hs]
+        ),
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+
+    # Get failed-stages info. It can be empty.
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[CSVReport(r_h).table('coreRawFailedStagesCSV') for r_h in res_hs]
+        ),
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+
+    # Get failed-jobs info. It can be empty.
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[CSVReport(r_h).table('coreRawFailedJobsCSV') for r_h in res_hs]
+        ),
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+
+    # Get data-source info. It can be empty.
+    # Note that this report is error-prone due to parsing the read-operators.
+    # P.S.: It is wise not to fail if this report fails; this table may show a lot of non-deterministic
+    # behavior due to handling new read operators.
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[CSVReport(r_h).table('coreRawDataSourceInformationCSV') for r_h in res_hs]
+        ),
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+
+    #########################################
+    # Load Qualification reports
+    #########################################
+
+    # Get the execs report. The loader is told not to fail here because the report is
+    # missing when the result_handlers are not Qualification handlers.
+    # Note that there is a side effect: we do not filter the rows immediately here.
+    # Instead, we combine all rows across all the apps before any filtering, which can be huge.
+
+    APIUtils.process_res(
+        raw_res=raw_res,
+        combiner=CSVReportCombiner(
+            rep_builders=[
+                CSVReport(r_h)
+                .table('execCSVReport')
+                for r_h in res_hs
+            ]
+        ).on_app_fields({'app_id': 'App ID'}),  # use "App ID" to be consistent with the remaining qualx code.
+        raise_on_empty=False,
+        raise_on_failure=False
+    )
+    # TODO: there is no need to load app_summary to get the app_durations.
+    # App duration should be available in the app_info_df.
+
+    return raw_res
+
+
+def process_raw_features(
+        raw_tbls: LoadRawFilesResult,
+        *,
+        qualtool_filter: Optional[str],
+        remove_failed_sql: bool = True) -> Dict[str, pd.DataFrame]:
+    """
+    Process the raw features from the loaded CSV files.
+    :param raw_tbls: The result of loading raw files.
+    :param qualtool_filter: The type of filter to apply to the qualification tool output, either 'stage' or None.
+    :param remove_failed_sql: If True, remove SQLs with high failure rates.
+    :return: A dictionary mapping table names to the processed DataFrames.
+    """
+    def valid_df(arg: Optional[pd.DataFrame], allow_empty: bool = False) -> bool:
+        """
+        Check if the DataFrame is not None and not empty.
+        :param arg: The DataFrame to check.
+        :param allow_empty: If True, allow the DataFrame to be empty.
+        :return: True if the DataFrame is valid, False otherwise.
+        """
+        if arg is None:
+            return False
+        if arg.empty:
+            # If the DataFrame is empty, we check if allow_empty is True.
+ return allow_empty + return True + + # Step-1: process the failed_apps to decide if we need to drop some of them based on the failures + # Step-2: process the successful apps to extract the features + # Step-a: combine some of the columns. + # Step-b: return the processed DataFrame with the features. + + app_info_df = raw_tbls.reports.get('coreRawApplicationInformationCSV') + + if valid_df(app_info_df): + # Get jar versions: This is only valid for GPU applications. + raw_jars_df = raw_tbls.reports.get('coreRawRapidsJarsCSV') + # Merge jar versions into app_info_df based on appId + if valid_df(raw_jars_df): + # Extract version info per appId + def extract_versions(jars): + cudf_version = '-' + rapids4spark_version = '-' + bm_runner_version = '-' + for jar in jars.split(','): + jar = jar.strip().split('/')[-1].replace('.jar', '') + if jar.startswith('cudf'): + cudf_version = jar + elif jar.startswith('rapids-4-spark_'): + rapids4spark_version = jar + elif jar.startswith('rapids-4-spark-benchmarks_'): + bm_runner_version = jar + return pd.Series([cudf_version, rapids4spark_version, bm_runner_version]) + + jars_versions = raw_jars_df[['appId', 'Rapids4Spark jars']].copy() + jars_versions[['cudfVersion', 'rapids4sparkVersion', 'bmRunnerVersion']] = ( + jars_versions['Rapids4Spark jars'].apply(extract_versions) + ) + jars_versions = jars_versions[['appId', 'cudfVersion', 'rapids4sparkVersion', 'bmRunnerVersion']] + + # Merge with app_info_df + app_info_df = app_info_df.merge( + jars_versions, + on='appId', + how='left' + ) + # Fill missing version info with '-' + app_info_df[['cudfVersion', 'rapids4sparkVersion', 'bmRunnerVersion']] = ( + app_info_df[['cudfVersion', 'rapids4sparkVersion', 'bmRunnerVersion']].fillna('-') + ) + else: + # If no jars info, set all to '-' + app_info_df['cudfVersion'] = '-' + app_info_df['rapids4sparkVersion'] = '-' + app_info_df['bmRunnerVersion'] = '-' + + # Allow user-provided 'ds_name' as 'appName' + # TODO: Do we still need the line below? + app_info_df['appName'] = raw_tbls.ds_name + # Enforce the value of SparkVersion + app_info_df.fillna({'sparkVersion': 'Unknown'}, inplace=True) + + ################################################ + # coreRawSqlDurationAndExecutorCpuTimePercentCSV + ################################################ + # TODO: legacy code overwrite the SQLDuration with the AppDuration. It is not clear why we do so? + raw_exec_dur_df = raw_tbls.reports.get('coreRawSqlDurationAndExecutorCpuTimePercentCSV') + if valid_df(raw_exec_dur_df): + raw_exec_dur_df = raw_exec_dur_df.rename( + { + 'App Duration': 'appDuration', + 'Contains Dataset or RDD Op': 'containsDatasetOrRDDOp', + }, + axis=1, + ) + # create a column potentialProblems to indicate whether the "Potential Problems" column is equal to UDF or not. 
+        raw_exec_dur_df['potentialProblems'] = (
+            raw_exec_dur_df['Potential Problems'] == 'UDF'
+        ).fillna(False).astype(bool)
+        # assign the result of drop() so that the column is actually removed
+        raw_exec_dur_df = raw_exec_dur_df.drop(columns=['Potential Problems'])
+
+    ###############################################
+    # coreRawSqlToStageInformationCSV
+    # coreRawSqlLevelAggregatedTaskMetricsCSV
+    ###############################################
+    # filter out sql ids that have no execs associated with them
+    # this should remove root sql ids in 3.4.1+
+    raw_sql_to_stage_df = raw_tbls.reports.get('coreRawSqlToStageInformationCSV')
+    raw_sql_agg_metrics_df = raw_tbls.reports.get('coreRawSqlLevelAggregatedTaskMetricsCSV')
+    if valid_df(raw_sql_to_stage_df):
+        # Filters the DataFrame raw_sql_to_stage_df to only include rows where the column SQL Nodes(IDs)
+        # is not null (i.e., contains data). It then selects only the [appId, sqlID, jobID] columns from those
+        # filtered rows. The result is a new DataFrame containing just the appId, sqlID and jobID
+        # for entries that have associated SQL node IDs.
+        sqls_with_execs = (
+            raw_sql_to_stage_df.loc[raw_sql_to_stage_df['SQL Nodes(IDs)'].notna()][['appId', 'sqlID', 'jobID']]
+            .groupby(['appId', 'sqlID'])  # groups the filtered rows by appId and sqlID.
+            .first()  # keeps the first occurrence (i.e., 1st jobID for each unique pair).
+            .reset_index()  # Resets the index to turn the groupby result back into a regular DataFrame.
+        )
+        # filter the sqls with execs
+        if valid_df(raw_sql_agg_metrics_df) and valid_df(sqls_with_execs):
+            raw_sql_agg_metrics_df = (
+                raw_sql_agg_metrics_df
+                .merge(sqls_with_execs, left_on=['appId', 'sqlID'], right_on=['appId', 'sqlID'])
+                .drop(columns=['jobID'])
+                .reset_index(drop=True)
+            )
+
+    ##################################
+    # execCSVReport
+    ##################################
+    node_level_supp = None
+    raw_execs_df = raw_tbls.reports.get('execCSVReport')
+    if valid_df(raw_execs_df):
+        node_level_supp = load_qtool_execs(raw_execs_df)
+
+    ###############################################
+    # coreRawSqlPlanMetricsForApplicationCSV
+    ###############################################
+    raw_sql_plan_metrics_df = raw_tbls.reports.get('coreRawSqlPlanMetricsForApplicationCSV')
+    stages_supp = pd.DataFrame(columns=['appId', 'sqlID', 'stageIds']).astype({
+        'appId': Utilities.scala_to_pandas_type('String'),
+        'sqlID': Utilities.scala_to_pandas_type('Long'),
+        'stageIds': Utilities.scala_to_pandas_type('Int')
+    })
+    if valid_df(raw_sql_plan_metrics_df):
+        if node_level_supp is not None:
+            if qualtool_filter == 'stage':
+                # Filter out rows that do not have matching keys in both tables.
+                raw_sql_plan_metrics_df = raw_sql_plan_metrics_df.merge(
+                    node_level_supp,
+                    left_on=['appId', 'sqlID', 'nodeID'],
+                    right_on=['App ID', 'SQL ID', 'SQL Node Id'],
+                    how='inner',
+                )
+                raw_sql_plan_metrics_df = raw_sql_plan_metrics_df.drop(
+                    columns=['App ID', 'SQL ID', 'SQL Node Id']
+                )
+                raw_sql_plan_metrics_df['stageIds'] = raw_sql_plan_metrics_df['stageIds'].apply(
+                    lambda x: str(x).split(',')
+                )
+                raw_sql_plan_metrics_df = raw_sql_plan_metrics_df.explode('stageIds')
+                # compute supported stages for use below. TBD.
+                # rename column from 'Exec Is Supported' to 'Stage Is Supported' and change elsewhere
+                stages_supp = (
+                    raw_sql_plan_metrics_df.groupby(['appId', 'sqlID', 'stageIds'])[
+                        'Exec Is Supported'
+                    ]
+                    .agg('all')
+                    .reset_index()
+                )
+                # the stageIds column resulting from the explode is of type object. So, we need to
+                # convert it back to Int64. Then we drop the rows with N/A values.
+ stages_supp['stageIds'] = pd.to_numeric(stages_supp['stageIds'], errors='coerce').astype('Int64') + # Filter out N/A: make sure to use `notna()` to handle all sort of N/A values. + stages_supp = stages_supp.loc[ + stages_supp['stageIds'].notna() + ].reset_index(drop=True) + # filter sql ops to have only supported ones for processing in calling fn + raw_sql_plan_metrics_df = raw_sql_plan_metrics_df.loc[ + raw_sql_plan_metrics_df['Exec Is Supported'] + ].drop(columns=['Exec Is Supported']) + elif qualtool_filter == 'sqlId': + sql_level_supp = ( + node_level_supp.groupby(['App ID', 'SQL ID'])['Exec Is Supported'] + .agg('all') + .reset_index() + ) + sql_level_supp = sql_level_supp.loc[sql_level_supp['Exec Is Supported']] + raw_sql_agg_metrics_df = raw_tbls.reports.get('coreRawSqlLevelAggregatedTaskMetricsCSV') + if valid_df(raw_sql_agg_metrics_df): + raw_sql_agg_metrics_df = raw_sql_agg_metrics_df.merge( + sql_level_supp, + left_on=['appId', 'sqlID'], + right_on=['App ID', 'SQL ID'], + how='inner' + ) + raw_sql_agg_metrics_df = raw_sql_agg_metrics_df.drop( + columns=['Exec Is Supported', 'App ID', 'SQL ID'] + ) + else: + # Don't filter out unsupported ops + pass + + # sql ids to drop due to failures meeting below criteria + # mark for removal from modeling sql ids that have failed stage time > 10% total stage time + allowed_failed_duration_fraction = 0.10 + sqls_to_drop = pd.DataFrame(columns=['appId', 'sqlID']).astype({ + 'appId': Utilities.scala_to_pandas_type('String'), + 'sqlID': Utilities.scala_to_pandas_type('Long') + }) + + ############################################### + # coreRawStageLevelAggregatedTaskMetricsCSV + ############################################### + raw_stage_agg_metrics_df = raw_tbls.reports.get('coreRawStageLevelAggregatedTaskMetricsCSV') + if valid_df(raw_stage_agg_metrics_df): + # get the failed stage_df + raw_failed_stage_df = raw_tbls.reports.get('coreRawFailedStagesCSV') + if valid_df(raw_failed_stage_df) and valid_df(raw_sql_to_stage_df): + stage_agg_tbl = raw_stage_agg_metrics_df[['appId', 'stageId', 'Duration']].merge( + raw_sql_to_stage_df, left_on=['appId', 'stageId'], right_on=['appId', 'stageId'] + ) + total_stage_time = ( + stage_agg_tbl[['appId', 'sqlID', 'Duration']] + .groupby(['appId', 'sqlID']) + .agg('sum') + .reset_index() + ) + failed_stage_time = ( + stage_agg_tbl[['appId', 'sqlID', 'stageId', 'Duration']] + .merge(raw_failed_stage_df, left_on=['appId', 'stageId'], right_on=['appId', 'stageId'], how='inner')[ + ['appId', 'sqlID', 'Duration'] + ] + .groupby(['appId', 'sqlID']) + .agg('sum') + .reset_index() + ) + stage_times = total_stage_time.merge( + failed_stage_time, on=['appId', 'sqlID'], how='inner' + ) + # append the failed sqls to the sqls_to_drop DataFrame + rows_to_drop = stage_times.loc[ + stage_times.Duration_y + > stage_times.Duration_x * allowed_failed_duration_fraction + ][['appId', 'sqlID']] + sqls_to_drop = pd.concat([sqls_to_drop, rows_to_drop], ignore_index=True) + # we can debug here to see the failed sqls + + if valid_df(stages_supp, allow_empty=True) and (qualtool_filter == 'stage'): + raw_stage_agg_metrics_df = raw_stage_agg_metrics_df.merge( + stages_supp, + left_on=['appId', 'stageId'], + right_on=['appId', 'stageIds'], + how='inner' + ).drop(columns=['stageIds']) # drop the stageIds column from the stages_supp dataframe + + # add a boolean column to indicate if the stage is associated with a SQL ID or not. 
+ raw_stage_agg_metrics_df['hasSqlID'] = raw_stage_agg_metrics_df['sqlID'].notna() + + ############################################### + # coreRawJobInformationCSV + ############################################### + raw_job_info_df = raw_tbls.reports.get('coreRawJobInformationCSV') + if valid_df(raw_job_info_df): + # dF + raw_job_info_df = raw_job_info_df.rename(columns={'startTime': 'jobStartTime_min'}) + raw_job_info_df['sqlID'] = raw_job_info_df['sqlID'].fillna(-1).astype(Utilities.scala_to_pandas_type('Int')) + # TODO: Maybe we should not need this line, but it is used in the legacy code. + # raw_job_info_df['jobID'] = 'job_' + raw_job_info_df['jobID'].astype(Utilities.scala_to_pandas_type('String')) + + ############################################### + # coreRawJobLevelAggregatedTaskMetricsCSV + ############################################### + # TODO: raw_job_agg_metrics_df does not seem to be used after that. Maybe we do not need it? + raw_job_agg_metrics_df = raw_tbls.reports.get('coreRawJobLevelAggregatedTaskMetricsCSV') + if valid_df(raw_job_agg_metrics_df, allow_empty=True): + # # rename the column jobId to jobID + # raw_job_agg_metrics_df = raw_job_info_df.rename(columns={'jobId': 'jobID'}) + # get the information from jobInfo tbl + if valid_df(raw_job_info_df, allow_empty=True): + raw_job_agg_metrics_df = raw_job_agg_metrics_df.merge( + raw_job_info_df[['appId', 'jobID', 'sqlID', 'jobStartTime_min']], + left_on=['appId', 'jobId'], + right_on=['appId', 'jobID'], + how='left' + ) + # Add a boolean column to indicate if the job is associated with a SQL ID or not. + raw_job_agg_metrics_df['hasSqlID'] = raw_job_agg_metrics_df['sqlID'] != -1 + # drop redundant columns jobId + raw_job_agg_metrics_df = raw_job_agg_metrics_df.drop(columns=['jobId']) + + raw_spark_props_df = raw_tbls.reports.get('coreRawSparkPropertiesCSV') + if valid_df(raw_spark_props_df): + raw_spark_props_df = raw_spark_props_df.set_index('propertyName') + + ############################################### + # coreRawFailedTasksCSV + ############################################### + raw_failed_tasks_df = raw_tbls.reports.get('coreRawFailedTasksCSV') + if valid_df(raw_failed_tasks_df): + # aggregate failed tasks per appName, appId, sqlID + raw_failed_tasks_df = raw_failed_tasks_df.groupby(['appId', 'stageId'])['attempt'].count().reset_index() + raw_failed_tasks_df = raw_failed_tasks_df.merge( + raw_sql_to_stage_df[['appId', 'stageId', 'sqlID']], + on=['appId', 'stageId'] + ) + raw_failed_tasks_df = raw_failed_tasks_df.groupby( + ['appId', 'sqlID'] + )['attempt'].sum().reset_index() + raw_failed_tasks_df = raw_failed_tasks_df.rename(columns={'attempt': 'failed_tasks'}) + else: + raw_failed_tasks_df = pd.DataFrame(columns=['appId', 'sqlID', 'failed_tasks']).astype({ + 'appId': Utilities.scala_to_pandas_type('String'), + 'sqlID': Utilities.scala_to_pandas_type('Long'), + 'failed_tasks': Utilities.scala_to_pandas_type('Int') + }) + + ############################################### + # Merging tables together + ############################################### + + # merged app_info_tbl + raw_exec_info_df = raw_tbls.reports.get('coreRawExecutorInformationCSV') + if all(valid_df(t_df) for t_df in [ + app_info_df, raw_exec_dur_df, raw_sql_agg_metrics_df, raw_exec_info_df + ]): + # merge all the tables together + app_info_mg = app_info_df.merge( + raw_exec_info_df, + left_on='appId', + right_on='appId' + ) + app_info_mg = app_info_mg.merge( + raw_sql_agg_metrics_df, left_on='appId', right_on='appId' + ) + app_info_mg = 
app_info_mg.merge( + raw_exec_dur_df[ + [ + 'appId', + 'sqlID', + 'appDuration', + ] + ], + left_on=['appId', 'sqlID'], + right_on=['appId', 'sqlID'], + ) + + # filter out sqlIDs with aborted jobs (these are jobs failed due to sufficiently many (configurable) failed + # attempts of a stage due to error conditions). These are failed sqlIDs that we shouldn't model, + # but are still included in profiler output. + raw_failed_jobs_df = raw_tbls.reports.get('coreRawFailedJobsCSV') + if valid_df(raw_failed_jobs_df) and valid_df(raw_job_info_df): + aborted_jobs = raw_failed_jobs_df.loc[ + raw_failed_jobs_df.failureReason.str.contains('aborted') + ][['appId', 'jobID']] + # TODO: Is this always empty? jobInfo contains failed jobs? + aborted_jobs_sql_id = raw_job_info_df.merge( + aborted_jobs, how='inner', on=['appId', 'jobID'] + ) + aborted_sql_ids = aborted_jobs_sql_id[['appId', 'sqlID']].drop_duplicates() + sqls_to_drop = pd.concat([sqls_to_drop, aborted_sql_ids], ignore_index=True).drop_duplicates() + + else: + app_info_mg = pd.DataFrame() + + if remove_failed_sql and valid_df(app_info_mg) and valid_df(sqls_to_drop): + # removes rows from the app_info_mg DataFrame that have a matching appId and sqlID in + # the sqls_to_drop DataFrame. + # 1. performs a left merge of app_info_mg with sqls_to_drop on appId and sqlID, adding + # a column _merge that indicates whether each row was matched (both) or only present + # in app_info_mg (left_only). + # 2. It filters the merged DataFrame to keep only rows where _merge is left_only, i.e., + # those not present in sqls_to_drop. + # 3. It drops the _merge column, returning a DataFrame with the unwanted rows removed. + app_info_mg = app_info_mg.merge( + sqls_to_drop, + on=['appId', 'sqlID'], + how='left', + indicator=True + ) + app_info_mg = app_info_mg[app_info_mg['_merge'] == 'left_only'].drop(columns=['_merge']) + + return { + 'app_tbl': app_info_mg, + 'ops_tbl': raw_sql_plan_metrics_df, + 'spark_props_tbl': raw_spark_props_df, + 'job_map_tbl': raw_job_info_df, + 'job_stage_agg_tbl': raw_stage_agg_metrics_df, + 'wholestage_tbl': raw_tbls.reports.get('coreRawWholeStageCSV'), + 'ds_tbl': raw_tbls.reports.get('coreRawDataSourceInformationCSV'), + 'failed_tasks_tbl': raw_failed_tasks_df + } + + +def extract_raw_features( + ds_name: str, + res_hs: List[ToolResultHandlerT], + *, + qualtool_filter: Optional[str], + remove_failed_sql: bool = True) -> pd.DataFrame: + """ + Extract raw features from the result handlers for a given dataset. + :param ds_name: + :param res_hs: + :param qualtool_filter: Type of filter to apply to the qualification tool output, either 'stage' or None. + :param remove_failed_sql: + :return: + """ + raw_tbls = load_csv_files(ds_name, res_hs) + # if no reports were loaded, return an empty DataFrame + if not raw_tbls.reports: + return pd.DataFrame() + # process the raw features from the loaded CSV files + process_raw_features( + raw_tbls, + qualtool_filter=qualtool_filter, + remove_failed_sql=remove_failed_sql + ) + print(f'those are the successful loaded reports: {raw_tbls.reports.keys()}') + return pd.DataFrame() diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/revamp/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/preprocess.py new file mode 100644 index 000000000..4b1f6a106 --- /dev/null +++ b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/preprocess.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for preprocessing for QualX""" + +from typing import Mapping, Optional + +import pandas as pd + + +from spark_rapids_tools.tools.qualx.config import get_config +from spark_rapids_tools.tools.qualx.preprocess import infer_app_meta, get_alignment +from spark_rapids_tools.tools.qualx.revamp.featurizer import extract_raw_features + + +def load_profiles( + datasets: Mapping[str, Mapping], + *, + qual_tool_filter: Optional[str] = None, + remove_failed_sql: bool = True) -> pd.DataFrame: + """ + Load and preprocess the profiles from the datasets. + :param datasets: A mapping of dataset names to their profiles. + :param qual_tool_filter: Optional filter for the qualification tool output, either 'stage' or None. + :param remove_failed_sql: If True, remove profiles with failed SQL. + :raises KeyError: If dataSet does not have 'app_meta' or 'eventlogs' defined. + :raises ValueError: If the default app_meta is not defined or if it has multiple entries. + :return: A DataFrame containing the loaded profiles. + """ + # define a dict to load all the plugins + plugins = {} + config = get_config() + alignment_df = get_alignment() + # the line below is modified to enforce using the revamp methods + # featurizers: List[Callable[[str, List[ToolResultHandlerT]], pd.DataFrame]] = [extract_raw_features] + for ds_name, ds_meta in datasets.items(): + # get platform from dataset metadata, or use onprem if not provided + platform = ds_meta.get('platform', 'onprem') + if 'load_profiles_hook' in ds_meta: + plugins[ds_name] = ds_meta['load_profiles_hook'] + if 'eventlogs' in ds_meta: + # dataset has an entry for eventlogs, this implies that we need to get the app_meta, + # or infer from directory structure of eventlogs. + app_meta = ds_meta.get('app_meta', infer_app_meta(ds_meta['eventlogs'])) + else: + # dataset does not have an entry for eventlogs, so we can use the app_meta directly. + # This will raise a keyError if app_meta is not defined. + app_meta = ds_meta['app_meta'] + # get the array of resultHandlers from the dataset metadata. + core_res_handlers = ds_meta['resultHandlers'] + + # get default app_meta + app_meta_default = app_meta['default'] + if len(app_meta) != 1: + raise ValueError(f'Default app_meta for {ds_name} cannot be used with additional entries.') + app_meta_default = {'runType': 'CPU', 'scaleFactor': 1} + + # TODO: Should we merge the result handlers or just keep it that way? 
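+        # core_res_handlers is the list of ToolResultHandlerT objects supplied under the
+        # 'resultHandlers' key of the dataset metadata (see predict_x in revamp/x_main.py).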
+        # invoke featurizers to extract raw features from profiler output
+        features = extract_raw_features(
+            ds_name=ds_name,
+            res_hs=core_res_handlers,
+            qualtool_filter=qual_tool_filter,
+            remove_failed_sql=remove_failed_sql
+        )
+        if features is not None:
+            # return the features extracted for this dataset
+            return features
+
+    return pd.DataFrame()
diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/revamp/x_main.py b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/x_main.py
new file mode 100644
index 000000000..a07547e54
--- /dev/null
+++ b/user_tools/src/spark_rapids_tools/tools/qualx/revamp/x_main.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Main module for QualX related commands """
+
+from pathlib import Path
+from typing import Optional, List
+
+import pandas as pd
+
+from spark_rapids_tools.api_v1 import QualCoreResultHandler
+from spark_rapids_tools.tools.qualx.config import get_config
+from spark_rapids_tools.tools.qualx.revamp.preprocess import load_profiles
+from spark_rapids_tools.tools.qualx.util import get_logger
+
+logger = get_logger(__name__)
+
+
+def predict_x(
+        platform: str,
+        qual: str,
+        output_info: dict,
+        *,
+        model: Optional[str] = None,
+        qual_tool_filter: Optional[str] = None,
+        config: Optional[str] = None,
+        qual_handlers: List[QualCoreResultHandler]
+) -> pd.DataFrame:
+    """
+    Entry point of the revamped QualX prediction flow built on the new Tools API.
+    :param platform: Name of the platform used to select the prediction model; stored in the dataset metadata.
+    :param qual: Path to the qualification tool output directory; its name is used as the dataset name.
+    :param output_info: Dictionary describing where the prediction output files should be written.
+    :param model: Optional path to a custom model file.
+    :param qual_tool_filter: Optional filter for the qualification tool output, either 'stage' or None.
+    :param config: Optional path to a QualX configuration file; falls back to the default config.
+    :param qual_handlers: List of qualification core result handlers to load the data from.
+    :return: A DataFrame with the prediction results (currently an empty placeholder while the revamp is WIP).
+    """
+    # load config from command line argument, or use default
+    cfg = get_config(config)
+    model_type = cfg.model_type
+    model_config = cfg.__dict__.get(model_type, {})
+    qual_filter = qual_tool_filter if qual_tool_filter else model_config.get('qual_tool_filter', 'stage')
+
+    if not qual_handlers:
+        raise ValueError('qual_handlers list is empty - no qualification data available for prediction')
+    if all(q_handler.is_empty() for q_handler in qual_handlers):
+        logger.warning('All qualification handlers are empty - no apps to predict')
+        return pd.DataFrame()
+    # if qualification metrics are provided, load metrics and apply filtering
+    datasets = {}  # create a dummy dataset
+    dataset_name = Path(qual).name  # use qual directory name as dataset name
+    datasets[dataset_name] = {
+        'resultHandlers': qual_handlers,
+        'app_meta': {'default': {'runType': 'CPU', 'scaleFactor': 1}},
+        'platform': platform,
+    }
+    logger.debug('Loading dataset: %s', dataset_name)
+    # load the raw files from the qualification handlers
+    load_profiles(datasets=datasets, qual_tool_filter=qual_filter, remove_failed_sql=False)
+
+    return pd.DataFrame()
diff --git a/user_tools/src/spark_rapids_tools/utils/data_utils.py b/user_tools/src/spark_rapids_tools/utils/data_utils.py
index 831e480c8..3db850f50 100644
--- a/user_tools/src/spark_rapids_tools/utils/data_utils.py
+++ b/user_tools/src/spark_rapids_tools/utils/data_utils.py
@@ -24,6 +24,7 @@
 from jproperties import Properties
 
 from
spark_rapids_tools import CspPathT +from spark_rapids_tools.utils import Utilities ResDataT = TypeVar('ResDataT') @@ -44,9 +45,15 @@ class AbstractReportResult(Generic[ResDataT]): load_error: Optional[Exception] = None def get_fail_cause(self) -> Optional[Exception]: - if self.load_error: - return self.load_error.__cause__ - return None + """ + Get the root cause of the failure if any. If the exception was raised from another, + return the original exception; otherwise, return the direct exception. + :return: the root exception if any + """ + exc = self.load_error + while exc and exc.__cause__ is not None: + exc = exc.__cause__ + return exc @dataclass @@ -118,6 +125,15 @@ class DataUtils: Utility functions to use common data handling such as reading an opening CSV files. """ + @staticmethod + def cols_to_camel_case(col_names: List[str]) -> Dict[str, str]: + """ + Map the column names to camelCase. + :param col_names: The list of column names to map. + :return: A dictionary mapping the original column names to their camelCase equivalents. + """ + return {col: Utilities.str_to_camel(col) for col in col_names} + @staticmethod def convert_df_to_dict(df: pd.DataFrame) -> List[dict[str, str]]: """ diff --git a/user_tools/src/spark_rapids_tools/utils/util.py b/user_tools/src/spark_rapids_tools/utils/util.py index 11062dcf9..c0847215f 100644 --- a/user_tools/src/spark_rapids_tools/utils/util.py +++ b/user_tools/src/spark_rapids_tools/utils/util.py @@ -443,3 +443,18 @@ def scala_to_pandas_type(scala_type: str) -> str: # Add more mappings for other Scala types as needed } return scala_to_pandas_map.get(scala_type, 'object') # Default to object for unknown types. + + @staticmethod + def str_to_camel(s: str) -> str: + """ + Convert a string to camel case. + Adopted from + https://www.30secondsofcode.org/python/s/string-capitalize-camel-snake-kebab/#camel-case-string + > To convert a string to camel case, you can use re.sub() to replace any - or _ with a space, + > using the regexp r"(_|-)+". Then, use str.title() to capitalize every word and convert the + > rest to lowercase. Finally, use str.replace() to remove any spaces between words. + :param s: The input string. + :return: The camel case version of the input string. + """ + s = re.sub(r'([_\-])+', ' ', s).title().replace(' ', '') + return ''.join([s[0].lower(), s[1:]]) diff --git a/user_tools/tests/spark_rapids_tools_ut/api/__init__.py b/user_tools/tests/spark_rapids_tools_ut/api/__init__.py new file mode 100644 index 000000000..094d723ae --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/api/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Module to test the API Tools""" diff --git a/user_tools/tests/spark_rapids_tools_ut/api/test_app_handlers.py b/user_tools/tests/spark_rapids_tools_ut/api/test_app_handlers.py new file mode 100644 index 000000000..1ddccfcee --- /dev/null +++ b/user_tools/tests/spark_rapids_tools_ut/api/test_app_handlers.py @@ -0,0 +1,91 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Module to test functionality of the app_handlers""" + +import unittest +from io import StringIO + +import pandas as pd + +from spark_rapids_tools.api_v1 import AppHandler + + +class TestAppHandlers(unittest.TestCase): + """ + A unit test for the app-handlers + """ + + +def test_add_fields_to_dataframe_basic(): + csv_data = 'a,b\n1,2\n3,4' + original_df = pd.read_csv(StringIO(csv_data)) + handler = AppHandler(_app_id='app-00', + _attempt_id=1, + _app_name='app_name-00', + eventlog_path='/path/to/file') + # User wants to map 'appID' and 'name' to columns 'APP ID' and 'App Name' + mapping = {'app_id': 'App ID', 'app_name': 'App Name'} + result_df = handler.add_fields_to_dataframe(original_df, mapping) + + assert result_df.shape == (2, 4) + assert result_df.columns[0] == 'App ID' + assert result_df.columns[1] == 'App Name' + expected_data = 'App ID,App Name,a,b\napp-00,app_name-00,1,2\napp-00,app_name-00,3,4' + expected_df = pd.read_csv(StringIO(expected_data)).astype({'App ID': 'string', 'App Name': 'string'}) + # assert the dataTypes are correct (string and not object) + assert result_df.dtypes[0] == 'string' + assert result_df.dtypes[1] == 'string' + # verify the entire dataframes are equal + pd.testing.assert_frame_equal(expected_df, result_df) + + +def test_add_fields_to_dataframe_empty(): + """ + Test adding fields to an empty DataFrame. + """ + # Define the schema using a dictionary where keys are column names and values are data types + schema = { + 'col01': 'string', + 'col02': 'int64', + 'col03': 'string', + 'col04': 'float32' + } + + # Create an empty DataFrame with the defined schema + empty_df = pd.DataFrame({ + col: pd.Series(dtype=dtype) + for col, dtype in schema.items() + }) + + handler = AppHandler(_app_id='app-00', + _attempt_id=1, + _app_name='app_name-00', + eventlog_path='/path/to/file') + # User wants to map 'appID' and 'name' to columns 'APP ID' and 'App Name' + mapping = {'app_id': 'App ID', 'attempt_id': 'Attempt ID', 'app_name': 'App Name'} + result_df = handler.add_fields_to_dataframe(empty_df, mapping) + assert result_df.shape == (0, 7) # Should still be empty with 6 columns + expected_schema = { + 'App ID': 'string', + 'Attempt ID': 'Int64', + 'App Name': 'string', + 'col01': 'string', + 'col02': 'int64', + 'col03': 'string', + 'col04': 'float32' + } + + for col, dtype in expected_schema.items(): + assert result_df[col].dtype.name == dtype
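
Editor's note: the snippet below is an illustrative sketch only, not part of the patch. It mirrors the calls added in revamp/featurizer.py to show how CSVReportCombiner and APIUtils.process_res are meant to be driven; the function name combine_app_info and the dataset name are made up for the example.

from typing import List, Optional

import pandas as pd

from spark_rapids_tools.api_v1 import QualCoreResultHandler
from spark_rapids_tools.api_v1.builder import (
    APIUtils, CSVReport, CSVReportCombiner, LoadRawFilesResult)


def combine_app_info(handlers: List[QualCoreResultHandler]) -> Optional[pd.DataFrame]:
    # Combine one per-app CSV table across all apps and handlers; the combiner injects
    # the appId column into each row by default.
    raw_res = LoadRawFilesResult(ds_name='example-dataset')
    combined_df = APIUtils.process_res(
        raw_res=raw_res,
        combiner=CSVReportCombiner(
            rep_builders=[CSVReport(r_h).table('coreRawApplicationInformationCSV') for r_h in handlers]),
        raise_on_failure=False,  # tolerate a failed load; the result may be None
        raise_on_empty=False)    # tolerate an empty report
    # Successful loads are also recorded in raw_res.reports; per-app failures in raw_res.failed_loads.
    return combined_df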