Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyaptamer/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@
"rna2vec",
"pdb_to_struct",
"struct_to_aaseq",
"pdb_to_seq_uniprot",
"pdb_to_aaseq",
]

from pyaptamer.utils._aa_str_to_letter import aa_str_to_letter
from pyaptamer.utils._pdb_to_aaseq import pdb_to_aaseq
from pyaptamer.utils._pdb_to_seq_uniprot import pdb_to_seq_uniprot
from pyaptamer.utils._pdb_to_struct import pdb_to_struct
from pyaptamer.utils._rna import (
dna2rna,
Expand Down
52 changes: 52 additions & 0 deletions pyaptamer/utils/_pdb_to_seq_uniprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import io

import pandas as pd
import requests
from Bio import SeqIO


def pdb_to_seq_uniprot(pdb_id, return_type="list", timeout=30):
    """
    Retrieve the canonical UniProt amino-acid sequence for a given PDB ID.

    The PDB ID is resolved to UniProt accessions through the PDBe mappings
    API. The FASTA record of the first accession listed in that response is
    then downloaded from the UniProt REST API and its sequence is returned.

    Parameters
    ----------
    pdb_id : str
        PDB ID (e.g., '1a3n'). Case-insensitive; surrounding whitespace is
        ignored.
    return_type : {'list', 'pd.df'}, optional, default='list'
        Format of returned value:

        - ``'list'`` : list with one amino-acid sequence
        - ``'pd.df'`` : pandas.DataFrame with a single column ['sequence']

    timeout : float or tuple, optional, default=30
        Timeout, in seconds, forwarded to every ``requests.get`` call so that
        an unresponsive server cannot block the caller forever.

    Returns
    -------
    list of str or pandas.DataFrame
        Depending on ``return_type``.

    Raises
    ------
    ValueError
        If ``return_type`` is not supported, if no UniProt mapping is found
        for ``pdb_id``, or if the UniProt response holds no FASTA record.
    requests.HTTPError
        If one of the web services answers with an error status. A 404 from
        the mapping service is reported as a missing mapping (``ValueError``)
        instead.
    """
    # Validate the cheap argument first so that a typo fails immediately
    # instead of after two network round trips.
    if return_type not in ("list", "pd.df"):
        raise ValueError("`return_type` must be either 'list' or 'pd.df'")

    pdb_id = pdb_id.strip().lower()

    mapping_url = f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}"
    mapping_resp = requests.get(mapping_url, timeout=timeout)
    # A 404 is treated as "no mapping" so callers keep getting a ValueError
    # for unknown IDs; any other error status is surfaced as an HTTPError.
    if mapping_resp.status_code == 404:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")
    mapping_resp.raise_for_status()
    mapping_data = mapping_resp.json()

    uniprot_ids = list(mapping_data.get(pdb_id, {}).get("UniProt", {}))
    if not uniprot_ids:
        raise ValueError(f"No UniProt mapping found for PDB ID '{pdb_id}'")

    # Only the first mapped accession is used.
    uniprot_id = uniprot_ids[0]

    fasta_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    fasta_resp = requests.get(fasta_url, timeout=timeout)
    fasta_resp.raise_for_status()

    # Passing a default to ``next`` avoids leaking a bare StopIteration when
    # the response body contains no FASTA record.
    record = next(SeqIO.parse(io.StringIO(fasta_resp.text), "fasta"), None)
    if record is None:
        raise ValueError(
            f"No FASTA record returned for UniProt ID '{uniprot_id}' "
            f"(PDB ID '{pdb_id}')"
        )
    sequence = str(record.seq)

    if return_type == "list":
        return [sequence]
    return pd.DataFrame({"sequence": [sequence]})
18 changes: 18 additions & 0 deletions pyaptamer/utils/tests/test_pdb_to_seq_uniprot.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

from pyaptamer.utils import pdb_to_seq_uniprot


def test_pdb_to_seq_uniprot():
    """Check both return formats of `pdb_to_seq_uniprot` for PDB ID '1a3n'."""
    pdb_id = "1a3n"

    # DataFrame output: a 'sequence' column whose first entry is non-empty.
    frame = pdb_to_seq_uniprot(pdb_id, return_type="pd.df")
    assert isinstance(frame, pd.DataFrame)
    assert "sequence" in frame.columns
    first_row = frame.iloc[0]
    assert len(first_row["sequence"]) > 0

    # List output: exactly one non-empty sequence.
    sequences = pdb_to_seq_uniprot(pdb_id, return_type="list")
    assert isinstance(sequences, list)
    assert len(sequences) == 1
    only_sequence = sequences[0]
    assert len(only_sequence) > 0