Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyaptamer/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
"rna2vec",
"pdb_to_struct",
"struct_to_aaseq",
"pdb_to_seq_uniprot",
"pdb_to_aaseq",
]

from pyaptamer.utils._aa_str_to_letter import aa_str_to_letter
from pyaptamer.utils._pdb_to_aaseq import pdb_to_aaseq
from pyaptamer.utils._pdb_to_seq_uniprot import pdb_to_seq_uniprot
from pyaptamer.utils._pdb_to_struct import pdb_to_struct
from pyaptamer.utils._rna import (
dna2rna,
Expand Down
53 changes: 53 additions & 0 deletions pyaptamer/utils/_pdb_to_seq_uniprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import io

import pandas as pd
import requests
from Bio import SeqIO


def pdb_to_seq_uniprot(pdb_id, return_type="list", timeout=30):
    """
    Retrieve the canonical UniProt amino-acid sequence for a given PDB ID.

    The PDB ID is first mapped to UniProt accessions through the PDBe
    mappings API; the FASTA record of the first mapped accession is then
    downloaded from the UniProt REST API and parsed.

    Parameters
    ----------
    pdb_id : str
        PDB ID (e.g., '1a3n'). It is lower-cased before being used.
    return_type : {'list', 'pd.df'}, optional, default='list'
        Format of returned value:

        - ``'list'`` : list with one amino-acid sequence

        - ``'pd.df'`` : pandas.DataFrame with a single column ['sequence']
    timeout : float, optional, default=30
        Timeout in seconds passed to ``requests.get`` for each of the two
        HTTP requests.

    Returns
    -------
    list of str or pandas.DataFrame
        Depending on ``return_type``.

    Raises
    ------
    ValueError
        If ``return_type`` is not one of the supported values, if no UniProt
        mapping is found for ``pdb_id``, or if the downloaded FASTA contains
        no sequence record.
    requests.HTTPError
        If either web service answers with an error status (other than a
        404 on the mapping request, which is reported as a missing mapping).
    """
    # Validate the cheap argument first so a typo does not cost two
    # network round-trips before failing.
    if return_type not in ("list", "pd.df"):
        raise ValueError("`return_type` must be either 'list' or 'pd.df'")

    pdb_id = pdb_id.lower()

    mapping_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    mapping_resp = requests.get(mapping_url, timeout=timeout)
    # A 404 is treated as "no mapping" so that unknown IDs keep raising
    # ValueError; any other error status is surfaced as an HTTP error.
    if mapping_resp.status_code == 404:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")
    mapping_resp.raise_for_status()
    mapping_data = mapping_resp.json()

    uniprot_ids = list(mapping_data.get(pdb_id, {}).get("UniProt", {}))
    if not uniprot_ids:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")

    # Only the first mapped accession is used, even if several are returned.
    uniprot_id = uniprot_ids[0]

    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    fasta_resp = requests.get(fasta_url, timeout=timeout)
    fasta_resp.raise_for_status()

    # Use a default with next() so an empty body does not leak StopIteration.
    record = next(SeqIO.parse(io.StringIO(fasta_resp.text), "fasta"), None)
    if record is None:
        raise ValueError(
            f"No FASTA record returned for UniProt ID '{uniprot_id}' "
            f"(PDB ID '{pdb_id}')"
        )
    sequence = str(record.seq)

    if return_type == "list":
        return [sequence]
    return pd.DataFrame({"sequence": [sequence]})
18 changes: 18 additions & 0 deletions pyaptamer/utils/tests/test_pdb_to_seq_uniprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

from pyaptamer.utils import pdb_to_seq_uniprot


def test_pdb_to_seq_uniprot():
    """Check both return formats of `pdb_to_seq_uniprot` for one PDB entry."""
    entry = "1a3n"

    # DataFrame flavour: a 'sequence' column whose first value is non-empty.
    frame = pdb_to_seq_uniprot(entry, return_type="pd.df")
    assert isinstance(frame, pd.DataFrame)
    assert "sequence" in frame.columns
    first_row = frame.iloc[0]
    assert len(first_row["sequence"]) > 0

    # List flavour: exactly one non-empty sequence.
    sequences = pdb_to_seq_uniprot(entry, return_type="list")
    assert isinstance(sequences, list)
    assert len(sequences) == 1
    only_sequence = sequences[0]
    assert len(only_sequence) > 0