ELELAB · mtiberti · Jan 13, 2025 · Nov 24, 2025 · Nov 30, 2025 · Dec 14, 2025
diff --git a/mavisp/methods.py b/mavisp/methods.py
@@ -91,14 +91,17 @@ def parse(self, dir_path):
 
         return averages_df, stds_df, warnings
 
-class MutateXBinding(Method):
-
+class MutateXBinding(Method): 
+    #a parsing class; produces binding ΔΔG data
+
     unit = "kcal/mol"
     type = "Local Int."
     heterodimer_chains = set(['A'])
     homodimer_chains   = set(['AB'])
     target_chain       = 'A'
     measure = "Binding with"
+    averages_filename = 'energies.csv'
+    stds_filename = 'energies_std.csv'
     complex_status = "heterodimer"
 
     def __init__(self, version, complex_status=None):
@@ -110,82 +113,115 @@ def __init__(self, version, complex_status=None):
 
         self.interactors = []
 
-    def parse(self, dir_path):
-
-        warnings = []
-
-        interactors = os.listdir(dir_path)
-        self.interactors = interactors
-
-        if len(interactors) == 0:
-            raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
-                                      warning=warnings)
-
-        all_data = None
+    # data_type is either '' or 'st. dev.'
+    def _parse_mutatex_binding_energy_file(self, fname, interactor, data_type):
+        """Parse a single MutateX binding energy file (average or std)."""
 
-        for interactor in interactors:
-
-            interactor_dir = os.path.join(dir_path, interactor)
-
-            mutatex_files = os.listdir(interactor_dir)
+        try:
+            df = pd.read_csv(fname)
+        except Exception as e:
+            this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}"
+            raise MAVISpMultipleError(warning=warnings,
+                                      critical=[MAVISpCriticalError(this_error)])
+
+        # Create residue column
+        df['residue'] = df['WT residue type'] + df['Residue #'].astype(str)
 
-            if len(mutatex_files) != 1:
-                raise MAVISpMultipleError(critical=[MAVISpCriticalError(f"zero or multiple files found in {interactor_dir}; exactly one expected")],
-                                        warning=warnings)
+        # Detect and handle homodimer case
+        chains = set(df['chain ID'].unique())
+
+        # If chain A exists, KEEP ONLY rows where chain ID == 'A'.  
+        if self.target_chain in chains:
+            df = df[ df['chain ID'] == self.target_chain ]
+        # If chain A does NOT exist, the ONLY valid alternative is homodimer AB.
+        elif set(df['chain ID'].unique()) != self.homodimer_chains:
+            message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)"
+            raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)],
+                                      warning=[])
+
+        # Drop unnecessary columns
+        df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1)
 
-            mutatex_file = mutatex_files[0]
+        # Stack remaining columns
+        df = df.set_index('residue') # set 'residue' column as index
+        df = df.stack() # rotates columns downward and makes the dataframe long-format (level_1 contains the original column names and 0 contains the values)
+        df = df.reset_index() # reset index to turn the index into a column
 
-            try:
-                df = pd.read_csv(os.path.join(interactor_dir, mutatex_file))
-            except Exception as e:
-                this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}"
-                raise MAVISpMultipleError(warning=warnings,
-                                        critical=[MAVISpCriticalError(this_error)])
+        # Create mutation column
+        df['mutations'] = df['residue'] + df['level_1'] # concatenate 'residue' and 'level_1' columns to create 'mutations' column
+        df = df.set_index('mutations') # set 'mutations' column as index
 
-            # create residue column
-            df['residue'] = df['WT residue type'] + df['Residue #'].astype(str)
+        # Drop now useless columns, rename
+        df = df.drop(['residue', 'level_1'], axis=1)
 
-            # detect and handle homodimer case
-            chains = set(df['chain ID'].unique())
+        # handle space around measure
+        if self.measure == "":
+            measure = ""
+        else:
+            measure = f"{self.measure} "
 
-            if self.target_chain in chains:
-                df = df[ df['chain ID'] == self.target_chain ]
+        # rename column Local Int. (Binding with B, heterodimer, FoldX5, kcal/mol)
+        if data_type is None or data_type == '':
+            data_type = ""
+        else:
+            data_type = f", {data_type}"
 
-            elif set(df['chain ID'].unique()) != self.homodimer_chains:
-                message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)"
-                raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)],
-                                          warning=[])
+        colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}{data_type})"
+
+        return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name
 
-            df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1)
+    def parse(self, dir_path):
+        """
+        reads the MutateX output files (energies.csv + energies_std.csv) for each interactor,
+        converts them into mutation-indexed dataframes, and returns them to the Local interaction module.
+        """
+
+        warnings = []
+        all_data = None
 
-            # stack remaining columns
-            df = df.set_index('residue')
-            df = df.stack()
-            df = df.reset_index()
+        interactors = os.listdir(dir_path) # list of subfolders in dir_path
+        self.interactors = interactors # store interactors in the instance variable
 
-            # create mutation column
-            df['mutations'] = df['residue'] + df['level_1']
-            df = df.set_index('mutations')
+        if len(interactors) == 0:
+            raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
+                                      warning=warnings)
 
-            # drop now useless columns, rename
-            df = df.drop(['residue', 'level_1'], axis=1)
+        for interactor in interactors:
+
+            interactor_dir = os.path.join(dir_path, interactor)
+            mutatex_files = os.listdir(interactor_dir)
 
-            # handle space around measure
-            if self.measure == "":
-                measure = ""
+            # expect energies.csv file per interactor
+            if self.averages_filename not in mutatex_files:
+                this_error = f"energies.csv file not found in {interactor_dir}"
+                raise MAVISpMultipleError(warning=warnings,
+                                          critical=[MAVISpCriticalError(this_error)])   
+
+            # Parse averages file
+            averages_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.averages_filename), interactor, '')
+
+            # Parse stds file if it exists
+            if self.stds_filename in mutatex_files:
+                stds_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.')
             else:
-                measure = f"{self.measure} "
-
-            df = df.rename(columns={0 : f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})"})
-
+                warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding energy data"))
+                stds_df = None
+
+            # Combine averages and stds data
+            if stds_df is not None:
+                interactor_data = averages_df.join(stds_df, how='outer')
+            else:
+                interactor_data = averages_df
+
+            # Combine data across interactors
             if all_data is None:
-                all_data = df
+                all_data = interactor_data
             else:
-                all_data = all_data.join(df, how='outer')
+                all_data = all_data.join(interactor_data, how='outer')
 
-        return all_data, warnings
+        return all_data, warnings  
 
-class MutateXDNABinding(Method):
+class MutateXDNABinding(MutateXBinding):
 
     unit = "kcal/mol"
     type = "Local Int. With DNA"
@@ -195,8 +231,6 @@ class MutateXDNABinding(Method):
     measure = ""
     complex_status = "heterodimer"
 
-    parse = MutateXBinding.parse
-
 class RosettaDDGPredictionStability(Method):
 
     unit = "kcal/mol"
@@ -310,14 +344,14 @@ def parse(self, dir_path):
 
         return avg_mutation_data, std_mutation_data, warnings
 
-class RosettaDDGPredictionBinding(Method):
+class RosettaDDGPredictionBinding(RosettaDDGPredictionStability):
 
     unit = "kcal/mol"
     type = "Local Int."
     chain = 'A'
     complex_status = 'heterodimer'
-
-    _parse_aggregate_csv = RosettaDDGPredictionStability._parse_aggregate_csv
+    aggregate_fname = 'ddg_mutations_aggregate.csv'
+    structures_fname = 'ddg_mutations_structures.csv'
 
     def __init__(self, version, complex_status=None):
 
@@ -328,53 +362,99 @@ def __init__(self, version, complex_status=None):
 
         self.interactors = []
 
+    def _parse_structure_csv(self, csvf, warnings):
+        """Parse the RosettaDDGPrediction binding structure CSV file."""
+        try:
+            df = pd.read_csv(csvf)
+        except Exception as e:
+            this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}"
+            raise MAVISpMultipleError(
+                warning=warnings,
+                critical=[MAVISpCriticalError(this_error)]
+            )
+
+        #keep only ddg rows
+        df = df[df["state"] == "ddg"]
+
+        #group by mutation and compute stdev of total_score
+        std_series = df.groupby("mutation_label")["total_score"].std()
+        average_series = df.groupby("mutation_label")["total_score"].mean()
+
+        #turn into DataFrame
+        std_df = std_series.to_frame(name ="total_score")
+        average_df = average_series.to_frame(name ="total_score")
+
+        return average_df, std_df
+
     def parse(self, dir_path):
 
         warnings = []
 
-        interactors = os.listdir(dir_path)
-        self.interactors = interactors
+        interactors = os.listdir(dir_path) 
+        self.interactors = interactors 
 
         if len(interactors) == 0:
             raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
-                                        warning=[])
+                                      warning=[])
 
         all_data = None
 
         for interactor in interactors:
 
-            interactor_dir = os.path.join(dir_path, interactor)
-            rosetta_files = os.listdir(interactor_dir)
-
-            if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])):
-
+            interactor_dir = os.path.join(dir_path, interactor) 
+            rosetta_files = os.listdir(interactor_dir) 
+
+            # Identify the correct files
+            agg_file = None
+            struct_file = None
+
+            # Expect either a single file or multiple directories containing one file each
+            if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and rosetta_files[0] == self.aggregate_fname:
                 rosetta_file = rosetta_files[0]
+                # Parse single aggregate CSV file
                 mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings)
 
+            # or structures file (which we can use to calculate average and stdev)
+            elif (len(rosetta_files) == 1 and\
+                    os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and \
+                    rosetta_files[0] == self.structures_fname)    or\
+                 (set(rosetta_files) == set([self.aggregate_fname, self.structures_fname]) and\
+                    os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and\
+                    os.path.isfile(os.path.join(interactor_dir, rosetta_files[1]))):
+
+                if len(rosetta_files) == 2:
+                    warnings.append(MAVISpWarningError(f"for {interactor}, both Rosetta aggregate and structures file were found; the aggregate file will be ignored"))
+
+                mutation_data, std_df = self._parse_structure_csv(os.path.join(interactor_dir, self.structures_fname), warnings)
+
+                std_df = std_df.rename(columns={'total_score' : f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)"})
+                mutation_data = mutation_data.join(std_df, how="outer")
+
+            # Multiple directories containing one file each
             elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ):
                 mutation_data = None
-                for c, conformer_dir in enumerate(rosetta_files):
+                for c, conformer_dir in enumerate(rosetta_files): 
 
-                    conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir))
+                    conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir)) 
 
                     if len(conformer_files) != 1:
                         text = "only one file per conformer is supported for RosettaDDGPrediction"
                         raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)],
                                                   warning=warnings)
-
-                    conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings)
-
+
+                    conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings) 
                     conformer_data = conformer_data.rename(columns={'total_score' : f'total_score_{c}'})
 
                     if mutation_data is None:
                         mutation_data = conformer_data
                     else:
                         mutation_data = mutation_data.join(conformer_data)
-
+
+                # Average total_score across conformers
                 mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score'])
 
             else:
-                text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file"
+                text = f"dataset {interactor_dir} did not contain an expected folder structure"
                 raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)],
                                             warning=warnings)
 
@@ -384,7 +464,8 @@ def parse(self, dir_path):
                 all_data = mutation_data
             else:
                 all_data = all_data.join(mutation_data, how='outer')
-
+
+        # return the combined data for all interactors
         return all_data, warnings
 
 class AlloSigma(Method):

diff --git a/mavisp/modules.py b/mavisp/modules.py
@@ -526,8 +526,7 @@ def ingest(self, mutations):
         except MAVISpMultipleError as e:
             if len(e.critical) > 0:
                 raise
-        else:
-            e = None
+            warnings += e.warning
 
         module_dir_files = os.listdir(os.path.join(self.data_dir, self.module_dir))
 
@@ -546,12 +545,9 @@ def ingest(self, mutations):
 
         self.data = self.data.drop(columns=['res_num', 'sas_sc_rel'])
 
-        if e is None and len(warnings) > 0:
+        if len(warnings) > 0:
             raise MAVISpMultipleError(warning=warnings,
                                       critical=[])
-        elif len(warnings) > 0:
-            e.warning.extend(warnings)
-            raise e
 
     def _generate_local_interactions_classification(self, row, ci, stab_co=1.0):
 
@@ -628,8 +624,7 @@ def ingest(self, mutations):
         except MAVISpMultipleError as e:
             if len(e.critical) > 0:
                 raise
-        else:
-            e = None
+            warnings += e.warning
 
         rsa = self._parse_sas(os.path.join(self.data_dir, self.module_dir, self.sas_filename), warnings)
 
@@ -646,12 +641,9 @@ def ingest(self, mutations):
 
         self.data = self.data.drop(columns=['res_num', 'sas_sc_rel'])
 
-        if e is None and len(warnings) > 0:
+        if len(warnings) > 0:
             raise MAVISpMultipleError(warning=warnings,
                                       critical=[])
-        elif len(warnings) > 0:
-            e.warning.extend(warnings)
-            raise e
 
     def _generate_local_interactions_DNA_classification(self, row, ci, stab_co=1.0):