diff --git a/CHANGELOG.md b/CHANGELOG.md index fc4cd993c7..f10befd4fe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,3 +5,4 @@ - 2025-06-06: Added selection of Nter, Cter and 5'end states at topology generation - Issue #1269 - 2025-12-06: Added new restrain_ligand sub-command in the haddock3-restraints CLI - Issue #1299 - 2025-07-23: Added printing of covalent energies to the PDB headers - Issue #1323 +- 2025-07-24: Added computation of standard deviation in alascan cluster analyses - Issue #1332 \ No newline at end of file diff --git a/integration_tests/test_alascan.py b/integration_tests/test_alascan.py index 9d6c112316..82bb05ff46 100644 --- a/integration_tests/test_alascan.py +++ b/integration_tests/test_alascan.py @@ -77,7 +77,7 @@ def test_alascan_default(alascan_module, mocker): expected_csv1 = Path(alascan_module.path, "scan_protprot_complex_1.tsv") expected_csv2 = Path(alascan_module.path, "scan_protprot_complex_2.tsv") - expected_clt_csv = Path(alascan_module.path, "scan_clt_-.tsv") + expected_clt_csv = Path(alascan_module.path, "scan_clt_unclustered.tsv") assert expected_csv1.exists(), f"{expected_csv1} does not exist" assert expected_csv2.exists(), f"{expected_csv2} does not exist" @@ -89,9 +89,9 @@ def test_alascan_default(alascan_module, mocker): # ARG 17 B should have a negative delta_score assert df.loc[df["ori_resname"] == "ARG"].iloc[0, :]["delta_score"] < 0.0 - # check clt csv + # check cluster csv df_clt = pd.read_csv(expected_clt_csv, sep="\t", comment="#") - assert df_clt.shape == (18, 11), f"{expected_clt_csv} has wrong shape" + assert df_clt.shape == (18, 16), f"{expected_clt_csv} has wrong shape" # average delta score of A-38-ASP should be negative assert ( df_clt.loc[df_clt["full_resname"] == "A-38-ASP"].iloc[0, :]["delta_score"] < 0.0 @@ -106,7 +106,7 @@ def test_alascan_single_model(alascan_module, mocker): alascan_module.run() expected_csv = Path(alascan_module.path, "scan_2oob.tsv") - expected_clt_csv = Path(alascan_module.path, "scan_clt_-.tsv") + expected_clt_csv = Path(alascan_module.path, "scan_clt_unclustered.tsv") assert expected_csv.exists(), f"{expected_csv} does not exist" assert expected_clt_csv.exists(), f"{expected_clt_csv} does not exist" diff --git a/src/haddock/libs/libplots.py b/src/haddock/libs/libplots.py index 3ef8ff51ee..c09324237b 100644 --- a/src/haddock/libs/libplots.py +++ b/src/haddock/libs/libplots.py @@ -1414,7 +1414,7 @@ def make_alascan_plot( clt_id: int, scan_res: str = "ALA", offline: bool = False, - ) -> None: + ) -> str: """ Make a plotly interactive plot. @@ -1429,6 +1429,11 @@ def make_alascan_plot( Cluster ID. scan_res : str, optional Residue name used for the scan, by default "ALA" + + Returns + ------- + html_output_filename : str + Name of the plot generated """ plot_name = f"scan_clt_{clt_id}" log.info(f"Generating {scan_res} scanning plot {plot_name}") @@ -1437,18 +1442,21 @@ def make_alascan_plot( width, height = 2000, 1000 fig = go.Figure(layout={"width": width, "height": height}) # add traces + # Delta HADDOCK score fig.add_trace( go.Bar( x=df["full_resname"], y=df["delta_score"], + error_y={"type": "data", "array": df["delta_score_std"]}, name="delta_score", ) ) - + # Delta VdW fig.add_trace( go.Bar( x=df["full_resname"], y=df["delta_vdw"], + error_y={"type": "data", "array": df["delta_vdw_std"]}, name="delta_vdw", ) ) @@ -1457,37 +1465,46 @@ def make_alascan_plot( go.Bar( x=df["full_resname"], y=0.2 * df["delta_elec"], + error_y={"type": "data", "array": df["delta_elec_std"]}, name="delta_elec", ) ) - + # Delta desolvation score fig.add_trace( go.Bar( x=df["full_resname"], y=df["delta_desolv"], + error_y={"type": "data", "array": df["delta_desolv_std"]}, name="delta_desolv", ) ) # prettifying layout fig.update_layout( title=f"{scan_res} scanning cluster {clt_id}", - xaxis=dict( - title=dict(text="Residue Name", font=dict(size=16)), - tickfont_size=14, - tick0=df["full_resname"], + xaxis={ + "title": {"text": "Residue Name", "font": {"size": 16}}, + "tickfont_size": 14, + "tick0": df["full_resname"], # in case we want to show less residues - # dtick=10, - ), - yaxis=dict( - title=dict(text="Average Delta (WT - mutant)", font=dict(size=16)), - tickfont_size=14, - ), - legend=dict(x=1.01, y=1.0, font_family="Helvetica", font_size=16), + # "dtick": 10, + }, + yaxis={ + "title": { + "text": "Average Delta (WT - mutant)", + "font": {"size": 16} + }, + "tickfont_size": 14, + }, + legend={ + "x": 1.01, "y": 1.0, + "font_family": "Helvetica", + "font_size": 16 + }, barmode="group", bargap=0.05, bargroupgap=0.05, hovermode="x unified", - hoverlabel=dict(font_size=16, font_family="Helvetica"), + hoverlabel={"font_size": 16, "font_family": "Helvetica"}, ) for n in range(df.shape[0] - 1): fig.add_vline(x=0.5 + n, line_color="gray", opacity=0.2) @@ -1501,6 +1518,7 @@ def make_alascan_plot( figure_width=width, offline=offline, ) + return html_output_filename def fig_to_html( diff --git a/src/haddock/modules/analysis/alascan/scan.py b/src/haddock/modules/analysis/alascan/scan.py index 9b7f2e6d57..d7d2e4a250 100644 --- a/src/haddock/modules/analysis/alascan/scan.py +++ b/src/haddock/modules/analysis/alascan/scan.py @@ -271,12 +271,13 @@ def alascan_cluster_analysis(models): cl_id = native.clt_id # unclustered models have cl_id = None if cl_id is None: - cl_id = "-" + cl_id = "unclustered" + # Initiate key if this cluster id is encountered for the first time if cl_id not in clt_scan: clt_scan[cl_id] = {} - cl_pops[cl_id] = 1 - else: - cl_pops[cl_id] += 1 + cl_pops[cl_id] = 0 + # Increase the population of that cluster + cl_pops[cl_id] += 1 # read the scan file alascan_fname = f"scan_{native.file_name.removesuffix('.pdb')}.tsv" df_scan = pd.read_csv(alascan_fname, sep="\t", comment="#") @@ -293,44 +294,60 @@ def alascan_cluster_analysis(models): delta_bsa = row['delta_bsa'] # add to the cluster data with the ident logic ident = f"{chain}-{res}-{ori_resname}" - if ident not in clt_scan[cl_id]: + # Create variable with appropriate key + if ident not in clt_scan[cl_id].keys(): clt_scan[cl_id][ident] = { - 'delta_score': delta_score, - 'delta_vdw': delta_vdw, - 'delta_elec': delta_elec, - 'delta_desolv': delta_desolv, - 'delta_bsa': delta_bsa, - 'frac_pr': 1 + 'delta_score': [], + 'delta_vdw': [], + 'delta_elec': [], + 'delta_desolv': [], + 'delta_bsa': [], + 'frac_pr': 0, } - else: - clt_scan[cl_id][ident]['delta_score'] += delta_score - clt_scan[cl_id][ident]['delta_vdw'] += delta_vdw - clt_scan[cl_id][ident]['delta_elec'] += delta_elec - clt_scan[cl_id][ident]['delta_desolv'] += delta_desolv - clt_scan[cl_id][ident]['delta_bsa'] += delta_bsa - clt_scan[cl_id][ident]['frac_pr'] += 1 - # now average the data + # Add data + clt_scan[cl_id][ident]['delta_score'].append(delta_score) + clt_scan[cl_id][ident]['delta_vdw'].append(delta_vdw) + clt_scan[cl_id][ident]['delta_elec'].append(delta_elec) + clt_scan[cl_id][ident]['delta_desolv'].append(delta_desolv) + clt_scan[cl_id][ident]['delta_bsa'].append(delta_bsa) + clt_scan[cl_id][ident]['frac_pr'] += 1 + # now average the data for every cluster for cl_id in clt_scan: scan_clt_filename = f"scan_clt_{cl_id}.tsv" log.info(f"Writing {scan_clt_filename}") clt_data = [] + # Loop over residues for ident in clt_scan[cl_id]: + # Split identifyer to retrieve residue data chain = ident.split("-")[0] resnum = int(ident.split("-")[1]) resname = ident.split("-")[2] - frac_pr = clt_scan[cl_id][ident]["frac_pr"] - clt_data.append([chain, resnum, resname, ident, - clt_scan[cl_id][ident]['delta_score'] / frac_pr, - clt_scan[cl_id][ident]['delta_vdw'] / frac_pr, - clt_scan[cl_id][ident]['delta_elec'] / frac_pr, - clt_scan[cl_id][ident]['delta_desolv'] / frac_pr, - clt_scan[cl_id][ident]['delta_bsa'] / frac_pr, - clt_scan[cl_id][ident]['frac_pr'] / cl_pops[cl_id] - ] - ) - df_cols = ['chain', 'resnum', 'resname', 'full_resname', 'delta_score', - 'delta_vdw', 'delta_elec', 'delta_desolv', 'delta_bsa', - 'frac_pres'] + # Point data for this specific residue + clt_res_dt = clt_scan[cl_id][ident] + # Compute averages and stddev and hold data. + clt_data.append([ + chain, + resnum, + resname, + ident, + np.mean(clt_res_dt['delta_score']), + np.std(clt_res_dt['delta_score']), + np.mean(clt_res_dt['delta_vdw']), + np.std(clt_res_dt['delta_vdw']), + np.mean(clt_res_dt['delta_elec']), + np.std(clt_res_dt['delta_elec']), + np.mean(clt_res_dt['delta_desolv']), + np.std(clt_res_dt['delta_desolv']), + np.mean(clt_res_dt['delta_bsa']), + np.std(clt_res_dt['delta_bsa']), + clt_res_dt['frac_pr'] / cl_pops[cl_id], + ]) + df_cols = [ + 'chain', 'resnum', 'resname', 'full_resname', + 'delta_score', 'delta_score_std', 'delta_vdw', 'delta_vdw_std', + 'delta_elec', 'delta_elec_std', 'delta_desolv', 'delta_desolv_std', + 'delta_bsa', 'delta_bsa_std', 'frac_pres', + ] df_scan_clt = pd.DataFrame(clt_data, columns=df_cols) # adding clt-based Z score df_scan_clt = add_zscores(df_scan_clt, 'delta_score') @@ -347,6 +364,7 @@ def alascan_cluster_analysis(models): with open(scan_clt_filename, 'w') as f: f.write(f"{'#' * 80}{os.linesep}") # noqa E501 f.write(f"# `alascan` cluster results for cluster {cl_id}{os.linesep}") # noqa E501 + f.write(f"# reported values are the average for the cluster{os.linesep}") # noqa E501 f.write(f"#{os.linesep}") f.write(f"# z_score is calculated with respect to the mean values of all residues{os.linesep}") # noqa E501 f.write(f"{'#' * 80}{os.linesep}") # noqa E501 diff --git a/tests/test_libplots.py b/tests/test_libplots.py index 7fb081389b..d3427a6d21 100644 --- a/tests/test_libplots.py +++ b/tests/test_libplots.py @@ -232,8 +232,14 @@ def example_capri_ss_dashcluster(): def example_df_scan_clt(): """Return example alascan clt DataFrame.""" example_clt_data = [ - ["A", 38, "ASP", "A-38-ASP", -2.0, -1.0, -0.4, -2.3, -0.5, -7.2, 1.0], - ["A", 69, "LYS", "A-38-ASP", -0.0, 1.0, -0.4, 0.8, 0.5, -7.2, 1.0], + [ + "A", 38, "ASP", "A-38-ASP", -2.0, -1.0, 0.1, -0.4, 0.04, -2.3, + 0.23, -0.5, 0.05, -7.2, 0.72, 1.0, + ], + [ + "A", 69, "LYS", "A-38-ASP", -0.0, 1.0, 0.1, -0.4, 0.04, -2.3, + 0.23, -0.5, 0.05, -7.2, 0.72, 1.0, + ], ] columns = [ "chain", @@ -242,10 +248,15 @@ def example_df_scan_clt(): "full_resname", "score", "delta_score", + "delta_score_std", "delta_vdw", + "delta_vdw_std", "delta_elec", + "delta_elec_std", "delta_desolv", + "delta_desolv_std", "delta_bsa", + "delta_bsa_std", "frac_pres", ] diff --git a/tests/test_module_alascan.py b/tests/test_module_alascan.py index 607fe99bbb..9ade1faf27 100644 --- a/tests/test_module_alascan.py +++ b/tests/test_module_alascan.py @@ -270,8 +270,8 @@ def test_run( ) alascan.run() - assert Path(alascan.path, "scan_clt_-.tsv").exists() - assert Path(alascan.path, "scan_clt_-.html").exists() + assert Path(alascan.path, "scan_clt_unclustered.tsv").exists() + assert Path(alascan.path, "scan_clt_unclustered.html").exists() def test_scanjob_run(scanjob_obj, mocker): @@ -378,13 +378,13 @@ def test_alascan_cluster_analysis(protprot_input_list, scan_file, monkeypatch): shutil.copy(scan_file, Path("scan_protprot_complex_2.tsv")) alascan_cluster_analysis(protprot_input_list) - assert Path("scan_clt_-.tsv").exists() + assert Path("scan_clt_unclustered.tsv").exists() protprot_input_list[1].clt_id = 1 alascan_cluster_analysis(protprot_input_list) assert Path("scan_clt_1.tsv").exists() - assert Path("scan_clt_-.tsv").exists() + assert Path("scan_clt_unclustered.tsv").exists() def test_create_alascan_plots(mocker, caplog):