"""Fetch the canonical UniProt sequence that a PDB entry maps to.

The function defined here is re-exported as
``pyaptamer.utils.pdb_to_seq_uniprot``.
"""

import io

import pandas as pd

# Accepted values of the ``return_type`` argument.
_RETURN_TYPES = ("list", "pd.df")

# Upper bound (seconds) for each HTTP request; without it ``requests`` waits
# forever on an unresponsive server.
_REQUEST_TIMEOUT = 30


def pdb_to_seq_uniprot(pdb_id, return_type="list"):
    """
    Retrieve the canonical UniProt amino-acid sequence for a given PDB ID.

    The PDB ID is first resolved to UniProt accessions through the PDBe
    mappings endpoint; the FASTA record of the first accession listed in that
    response is then downloaded from the UniProt REST service. If the entry
    maps to several accessions, only the first one is used.

    Parameters
    ----------
    pdb_id : str
        PDB ID (e.g., '1a3n'). Case-insensitive; surrounding whitespace is
        ignored.
    return_type : {'list', 'pd.df'}, optional, default='list'
        Format of returned value:

        - ``'list'`` : list with one amino-acid sequence
        - ``'pd.df'`` : pandas.DataFrame with a single column ['sequence']

    Returns
    -------
    list of str or pandas.DataFrame
        Depending on ``return_type``. Both forms hold exactly one sequence.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of the accepted values (checked before
        any request is sent), if no UniProt mapping exists for ``pdb_id``, or
        if the UniProt response contains no FASTA record.
    requests.HTTPError
        If the mapping request fails with a status other than 404, or the
        FASTA request fails with any error status.
    requests.Timeout
        If a server does not answer within the request timeout.

    Examples
    --------
    >>> seqs = pdb_to_seq_uniprot("1a3n")  # doctest: +SKIP
    >>> len(seqs)  # doctest: +SKIP
    1
    >>> df = pdb_to_seq_uniprot("1a3n", return_type="pd.df")  # doctest: +SKIP
    >>> list(df.columns)  # doctest: +SKIP
    ['sequence']
    """
    # Fail fast: reject a bad argument before spending two network round-trips.
    if return_type not in _RETURN_TYPES:
        raise ValueError("`return_type` must be either 'list' or 'pd.df'")

    # Imported here rather than at module level so that argument validation
    # (and importing this module) does not require these packages.
    import requests
    from Bio import SeqIO

    pdb_id = pdb_id.strip().lower()

    mapping_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    mapping_resp = requests.get(mapping_url, timeout=_REQUEST_TIMEOUT)

    # Treat 404 as "no mapping" so an unknown ID yields the ValueError below
    # rather than an HTTPError; any other error status is a genuine failure.
    if mapping_resp.status_code == 404:
        uniprot_ids = []
    else:
        mapping_resp.raise_for_status()
        mapping_data = mapping_resp.json()
        uniprot_ids = list(mapping_data.get(pdb_id, {}).get("UniProt", {}))

    if not uniprot_ids:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")

    uniprot_id = uniprot_ids[0]

    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    fasta_resp = requests.get(fasta_url, timeout=_REQUEST_TIMEOUT)
    fasta_resp.raise_for_status()

    # ``next`` with a default avoids leaking a bare StopIteration when the
    # body holds no FASTA record.
    record = next(SeqIO.parse(io.StringIO(fasta_resp.text), "fasta"), None)
    if record is None:
        raise ValueError(
            f"UniProt returned no FASTA record for accession '{uniprot_id}'"
        )
    sequence = str(record.seq)

    if return_type == "list":
        return [sequence]
    return pd.DataFrame({"sequence": [sequence]})