Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 162 additions & 81 deletions mavisp/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,17 @@ def parse(self, dir_path):

return averages_df, stds_df, warnings

class MutateXBinding(Method):

class MutateXBinding(Method):
#a parsing class; produces binding ΔΔG data

unit = "kcal/mol"
type = "Local Int."
heterodimer_chains = set(['A'])
homodimer_chains = set(['AB'])
target_chain = 'A'
measure = "Binding with"
averages_filename = 'energies.csv'
stds_filename = 'energies_std.csv'
complex_status = "heterodimer"

def __init__(self, version, complex_status=None):
Expand All @@ -110,82 +113,115 @@ def __init__(self, version, complex_status=None):

self.interactors = []

def parse(self, dir_path):

warnings = []

interactors = os.listdir(dir_path)
self.interactors = interactors

if len(interactors) == 0:
raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
warning=warnings)

all_data = None
# data_type is either '' or 'st. dev.'
def _parse_mutatex_binding_energy_file(self, fname, interactor, data_type):
"""Parse a single MutateX binding energy file (average or std)."""

for interactor in interactors:

interactor_dir = os.path.join(dir_path, interactor)

mutatex_files = os.listdir(interactor_dir)
try:
df = pd.read_csv(fname)
except Exception as e:
this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}"
raise MAVISpMultipleError(warning=warnings,
critical=[MAVISpCriticalError(this_error)])

# Create residue column
df['residue'] = df['WT residue type'] + df['Residue #'].astype(str)

if len(mutatex_files) != 1:
raise MAVISpMultipleError(critical=[MAVISpCriticalError(f"zero or multiple files found in {interactor_dir}; exactly one expected")],
warning=warnings)
# Detect and handle homodimer case
chains = set(df['chain ID'].unique())

# If chain A exists, KEEP ONLY rows where chain ID == 'A'.
if self.target_chain in chains:
df = df[ df['chain ID'] == self.target_chain ]
# If chain A does NOT exist, the ONLY valid alternative is homodimer AB.
elif set(df['chain ID'].unique()) != self.homodimer_chains:
message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)"
raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)],
warning=[])

# Drop unnecessary columns
df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1)

mutatex_file = mutatex_files[0]
# Stack remaining columns
df = df.set_index('residue') # set 'residue' column as index
df = df.stack() # rotates columns downward and makes the dataframe long-format (level_1 contains the original column names and 0 contains the values)
df = df.reset_index() # reset index to turn the index into a column

try:
df = pd.read_csv(os.path.join(interactor_dir, mutatex_file))
except Exception as e:
this_error = f"Exception {type(e).__name__} occurred when parsing the MutateX csv file. Arguments:{e.args}"
raise MAVISpMultipleError(warning=warnings,
critical=[MAVISpCriticalError(this_error)])
# Create mutation column
df['mutations'] = df['residue'] + df['level_1'] # concatenate 'residue' and 'level_1' columns to create 'mutations' column
df = df.set_index('mutations') # set 'mutations' column as index

# create residue column
df['residue'] = df['WT residue type'] + df['Residue #'].astype(str)
# Drop now useless columns, rename
df = df.drop(['residue', 'level_1'], axis=1)

# detect and handle homodimer case
chains = set(df['chain ID'].unique())
# handle space around measure
if self.measure == "":
measure = ""
else:
measure = f"{self.measure} "

if self.target_chain in chains:
df = df[ df['chain ID'] == self.target_chain ]
# rename column Local Int. (Binding with B, heterodimer, FoldX5, kcal/mol)
if data_type is None or data_type == '':
data_type = ""
else:
data_type = f", {data_type}"

elif set(df['chain ID'].unique()) != self.homodimer_chains:
message = "chain ID in FoldX energy file must be either A or B (heterodimer case) or AB (homodimer case)"
raise MAVISpMultipleError(critical=[MAVISpCriticalError(message)],
warning=[])
colname = f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit}{data_type})"

return df.rename(columns={0 : colname}) # rename the sinle column named 0 to the formatted name

df = df.drop(['WT residue type', 'Residue #', 'chain ID'], axis=1)
def parse(self, dir_path):
"""
reads the MutateX output files (energies.csv + energies_std.csv) for each interactor,
converts them into mutation-indexed dataframes, and returns them to the Local interaction module.
"""

warnings = []
all_data = None

# stack remaining columns
df = df.set_index('residue')
df = df.stack()
df = df.reset_index()
interactors = os.listdir(dir_path) # list of subfolders in dir_path
self.interactors = interactors # store interactors in the instance variable

# create mutation column
df['mutations'] = df['residue'] + df['level_1']
df = df.set_index('mutations')
if len(interactors) == 0:
raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
warning=warnings)

# drop now useless columns, rename
df = df.drop(['residue', 'level_1'], axis=1)
for interactor in interactors:

interactor_dir = os.path.join(dir_path, interactor)
mutatex_files = os.listdir(interactor_dir)

# handle space around measure
if self.measure == "":
measure = ""
# expect energies.csv file per interactor
if self.averages_filename not in mutatex_files:
this_error = f"energies.csv file not found in {interactor_dir}"
raise MAVISpMultipleError(warning=warnings,
critical=[MAVISpCriticalError(this_error)])

# Parse averages file
averages_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.averages_filename), interactor, '')

# Parse stds file if it exists
if self.stds_filename in mutatex_files:
stds_df = self._parse_mutatex_binding_energy_file(os.path.join(interactor_dir, self.stds_filename), interactor, 'st. dev.')
else:
measure = f"{self.measure} "

df = df.rename(columns={0 : f"{self.type} ({measure}{interactor}, {self.complex_status}, {self.version}, {self.unit})"})

warnings.append(MAVISpWarningError("standard deviation file not found for MutateX binding energy data"))
stds_df = None

# Combine averages and stds data
if stds_df is not None:
interactor_data = averages_df.join(stds_df, how='outer')
else:
interactor_data = averages_df

# Combine data across interactors
if all_data is None:
all_data = df
all_data = interactor_data
else:
all_data = all_data.join(df, how='outer')
all_data = all_data.join(interactor_data, how='outer')

return all_data, warnings
return all_data, warnings

class MutateXDNABinding(Method):
class MutateXDNABinding(MutateXBinding):

unit = "kcal/mol"
type = "Local Int. With DNA"
Expand All @@ -195,8 +231,6 @@ class MutateXDNABinding(Method):
measure = ""
complex_status = "heterodimer"

parse = MutateXBinding.parse

class RosettaDDGPredictionStability(Method):

unit = "kcal/mol"
Expand Down Expand Up @@ -310,14 +344,14 @@ def parse(self, dir_path):

return avg_mutation_data, std_mutation_data, warnings

class RosettaDDGPredictionBinding(Method):
class RosettaDDGPredictionBinding(RosettaDDGPredictionStability):

unit = "kcal/mol"
type = "Local Int."
chain = 'A'
complex_status = 'heterodimer'

_parse_aggregate_csv = RosettaDDGPredictionStability._parse_aggregate_csv
aggregate_fname = 'ddg_mutations_aggregate.csv'
structures_fname = 'ddg_mutations_structures.csv'

def __init__(self, version, complex_status=None):

Expand All @@ -328,53 +362,99 @@ def __init__(self, version, complex_status=None):

self.interactors = []

def _parse_structure_csv(self, csvf, warnings):
"""Parse the RosettaDDGPrediction binding structure CSV file."""
try:
df = pd.read_csv(csvf)
except Exception as e:
this_error = f"Exception {type(e).__name__} while reading structure CSV: {e.args}"
raise MAVISpMultipleError(
warning=warnings,
critical=[MAVISpCriticalError(this_error)]
)

#keep only ddg rows
df = df[df["state"] == "ddg"]

#group by mutation and compute stdev of total_score
std_series = df.groupby("mutation_label")["total_score"].std()
average_series = df.groupby("mutation_label")["total_score"].mean()

#turn into DataFrame
std_df = std_series.to_frame(name ="total_score")
average_df = average_series.to_frame(name ="total_score")

return average_df, std_df

def parse(self, dir_path):

warnings = []

interactors = os.listdir(dir_path)
self.interactors = interactors
interactors = os.listdir(dir_path)
self.interactors = interactors

if len(interactors) == 0:
raise MAVISpMultipleError(critical=[MAVISpCriticalError("no interactor folders found")],
warning=[])
warning=[])

all_data = None

for interactor in interactors:

interactor_dir = os.path.join(dir_path, interactor)
rosetta_files = os.listdir(interactor_dir)

if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])):

interactor_dir = os.path.join(dir_path, interactor)
rosetta_files = os.listdir(interactor_dir)

# Identify the correct files
agg_file = None
struct_file = None

# Expect either a single file or multiple directories containing one file each
if len(rosetta_files) == 1 and os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and rosetta_files[0] == self.aggregate_fname:
rosetta_file = rosetta_files[0]
# Parse single aggregate CSV file
mutation_data = self._parse_aggregate_csv(os.path.join(interactor_dir, rosetta_file), warnings)

# or structures file (which we can use to calculate average and stdev)
elif (len(rosetta_files) == 1 and\
os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and \
rosetta_files[0] == self.structures_fname) or\
(set(rosetta_files) == set([self.aggregate_fname, self.structures_fname]) and\
os.path.isfile(os.path.join(interactor_dir, rosetta_files[0])) and\
os.path.isfile(os.path.join(interactor_dir, rosetta_files[1]))):

if len(rosetta_files) == 2:
warnings.append(MAVISpWarningError(f"for {interactor}, both Rosetta aggregate and structures file were found; the aggregate file will be ignored"))

mutation_data, std_df = self._parse_structure_csv(os.path.join(interactor_dir, self.structures_fname), warnings)

std_df = std_df.rename(columns={'total_score' : f"{self.type} (Binding with {interactor}, {self.complex_status}, {self.version}, {self.unit}, st. dev.)"})
mutation_data = mutation_data.join(std_df, how="outer")

# Multiple directories containing one file each
elif len(rosetta_files) > 1 and all( [ os.path.isdir(os.path.join(interactor_dir, f)) for f in rosetta_files] ):
mutation_data = None
for c, conformer_dir in enumerate(rosetta_files):
for c, conformer_dir in enumerate(rosetta_files):

conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir))
conformer_files = os.listdir(os.path.join(interactor_dir, conformer_dir))

if len(conformer_files) != 1:
text = "only one file per conformer is supported for RosettaDDGPrediction"
raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)],
warning=warnings)

conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings)


conformer_data = self._parse_aggregate_csv(os.path.join(interactor_dir, conformer_dir, conformer_files[0]), warnings)
conformer_data = conformer_data.rename(columns={'total_score' : f'total_score_{c}'})

if mutation_data is None:
mutation_data = conformer_data
else:
mutation_data = mutation_data.join(conformer_data)


# Average total_score across conformers
mutation_data = pd.DataFrame(mutation_data.mean(axis=1), columns=['total_score'])

else:
text = f"dataset {interactor_dir} was not either a single files, or multiple directories containing one file"
text = f"dataset {interactor_dir} did not contain an expected folder structure"
raise MAVISpMultipleError(critical=[MAVISpCriticalError(text)],
warning=warnings)

Expand All @@ -384,7 +464,8 @@ def parse(self, dir_path):
all_data = mutation_data
else:
all_data = all_data.join(mutation_data, how='outer')


# return the combined data for all interactors
return all_data, warnings

class AlloSigma(Method):
Expand Down
16 changes: 4 additions & 12 deletions mavisp/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,8 +526,7 @@ def ingest(self, mutations):
except MAVISpMultipleError as e:
if len(e.critical) > 0:
raise
else:
e = None
warnings += e.warning

module_dir_files = os.listdir(os.path.join(self.data_dir, self.module_dir))

Expand All @@ -546,12 +545,9 @@ def ingest(self, mutations):

self.data = self.data.drop(columns=['res_num', 'sas_sc_rel'])

if e is None and len(warnings) > 0:
if len(warnings) > 0:
raise MAVISpMultipleError(warning=warnings,
critical=[])
elif len(warnings) > 0:
e.warning.extend(warnings)
raise e

def _generate_local_interactions_classification(self, row, ci, stab_co=1.0):

Expand Down Expand Up @@ -628,8 +624,7 @@ def ingest(self, mutations):
except MAVISpMultipleError as e:
if len(e.critical) > 0:
raise
else:
e = None
warnings += e.warning

rsa = self._parse_sas(os.path.join(self.data_dir, self.module_dir, self.sas_filename), warnings)

Expand All @@ -646,12 +641,9 @@ def ingest(self, mutations):

self.data = self.data.drop(columns=['res_num', 'sas_sc_rel'])

if e is None and len(warnings) > 0:
if len(warnings) > 0:
raise MAVISpMultipleError(warning=warnings,
critical=[])
elif len(warnings) > 0:
e.warning.extend(warnings)
raise e

def _generate_local_interactions_DNA_classification(self, row, ci, stab_co=1.0):

Expand Down