Skip to content

Commit 1fe11ed

Browse files
rtlortega and gcroci2 authored
Local strain mappings (#310)
* '[ADD] functions and test for local strain mapping creation' * '[ADD] also added test for antushmash loading' * '[FIX] errores and testing with pytest' * '[ADD] ignore strain_mappings.jon' --------- Co-authored-by: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Co-authored-by: gcroci2 <crocioni.giulia@gmail.com>
1 parent b66fb72 commit 1fe11ed

File tree

5 files changed

+323
-1
lines changed

5 files changed

+323
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@ webapp/npapp/static/css/bokeh*.css
6363
webapp/npapp/static/js/bokeh*.js
6464
src/nplinker/scoring/iokr/data/SPEC/
6565
tests/integration/data/nplinker_local_mode_example.zip
66+
strain_mappings.json

src/nplinker/genomics/antismash/antismash_loader.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,17 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
5757
bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
5858
}
5959

60+
def get_genome_bgcs_mapping(self) -> dict[str, list]:
    """Get the mapping from genome ID to its BGC names.

    Returns:
        Dict where the key is a genome ID and the value is the list of BGC
        names (gbk file names) that belong to that genome.
    """
    genome_to_bgcs: dict[str, list] = {}
    # Invert the BGC -> genome mapping, grouping BGC ids per genome.
    for bgc_id, genome_id in self.get_bgc_genome_mapping().items():
        genome_to_bgcs.setdefault(genome_id, []).append(bgc_id)
    return genome_to_bgcs
70+
6071
def get_files(self) -> dict[str, str]:
6172
"""Get BGC gbk files.
6273

src/nplinker/strain/utils.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import json
33
import logging
44
from os import PathLike
5+
from typing import Any
56
from jsonschema import validate
7+
from nplinker.genomics.antismash import AntismashBGCLoader
68
from nplinker.schemas import USER_STRAINS_SCHEMA
79
from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id
810
from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id
@@ -17,6 +19,105 @@
1719

1820
logger = logging.getLogger(__name__)
1921

22+
def create_strain_mappings(
    strain_bgcs: dict[str, list[str]],
    strain_features: dict[str, list[str]],
    version: str,
    filename: str | PathLike,
) -> dict[str, Any]:
    """Create the strain mappings structure and save it to a JSON file.

    Args:
        strain_bgcs: mapping from strain ID to BGC names, as produced by
            `extract_bgcs_genome_id`.
        strain_features: mapping from strain ID to feature IDs, as produced
            by `extract_features_metabolome_id`.
        version: version string stored in the output.
        filename: path of the JSON file to write (str or PathLike).

    Returns:
        A dict with the version and a list of strain mappings, where each
        entry holds a strain ID and its aliases (BGC names + feature IDs).
    """
    merged = merge_bgcs_features(strain_bgcs, strain_features)

    strain_mappings: dict[str, Any] = {"version": version, "strain_mappings": []}
    for strain_id, aliases in merged.items():
        # Defensive: aliases should already be a list, but coerce other
        # iterables (tuples, sets) so the JSON output is always a list.
        if not isinstance(aliases, list):
            aliases = list(aliases)
        strain_mappings["strain_mappings"].append(
            {"strain_id": strain_id, "strain_alias": aliases}
        )
    with open(filename, "w") as json_file:
        json.dump(strain_mappings, json_file, indent=4)
    return strain_mappings
51+
52+
def extract_bgcs_genome_id(strain_genome: dict[str, list[str]], bgc_path: str | PathLike) -> dict:
    """Extract BGCs per strain based on the strain-to-genome mapping.

    Args:
        strain_genome: mapping from strain ID to genome IDs, as produced by
            `extract_strain_metadata`.
        bgc_path: path of the folder with the antiSMASH results.

    Returns:
        A dict mapping strain IDs to lists of BGC names. For each strain only
        the first of its genomes found in the antiSMASH results is used.
    """
    genome_to_bgcs = AntismashBGCLoader(bgc_path).get_genome_bgcs_mapping()

    strain_bgcs = {}
    for strain_id, genome_ids in strain_genome.items():
        # Take the first genome of this strain that has antiSMASH results.
        match = next((gid for gid in genome_ids if gid in genome_to_bgcs), None)
        if match is not None:
            strain_bgcs[strain_id] = genome_to_bgcs[match]
    return strain_bgcs
74+
75+
def extract_features_metabolome_id(strain_spectra: dict, features_file: str | PathLike) -> dict:
    """Extract features per strain based on the strain-to-spectra mapping.

    Args:
        strain_spectra: mapping from strain ID to spectra, as produced by
            `extract_strain_metadata`.
        features_file: path of the file with the GNPS results.

    Returns:
        A dict mapping strain IDs to sorted lists of feature IDs.
    """
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(features_file)

    strain_features = {}
    for strain_id, spectra in strain_spectra.items():
        # The literal "StrainID" entry is a leftover header row — skip it.
        if strain_id == "StrainID":
            continue
        # Accept a single spectrum name as well as a list of them.
        spectra_list = [spectra] if isinstance(spectra, str) else spectra

        collected: set = set()
        for spectrum in spectra_list:
            if spectrum in spectrum_to_features:
                collected.update(spectrum_to_features[spectrum])

        strain_features[strain_id] = sorted(collected)

    return strain_features
101+
102+
def extract_strain_metadata(
103+
strain_path: str | PathLike) -> dict[str, list[str]]:
104+
"""Generate dict based on strain - genome, strain - spectra metadata info.
105+
106+
Args:
107+
strain_path: The path to the txt file
108+
109+
Returns:
110+
A dict with the strains and the genome_id or spectra_id
111+
"""
112+
dictionary: dict[str, list[str]] = {}
113+
with open(strain_path, "r") as file:
114+
next(file)
115+
for line in file:
116+
key, value = map(str.strip, line.strip().split("\t"))
117+
if key not in dictionary:
118+
dictionary[key] = []
119+
dictionary[key].append(value)
120+
return dictionary
20121

21122
def load_user_strains(json_file: str | PathLike) -> set[Strain]:
22123
"""Load user specified strains from a JSON file.
@@ -47,6 +148,31 @@ def load_user_strains(json_file: str | PathLike) -> set[Strain]:
47148

48149
return strains
49150

151+
def merge_bgcs_features(
    strain_bgcs: dict[str, list[str]], strain_features: dict[str, list[str]]
) -> dict[str, list[str]]:
    """Merge the strain->BGCs and strain->features dicts into one dict.

    Args:
        strain_bgcs: dict that comes from extract_bgcs_genome_id
        strain_features: dict that comes from extract_features_metabolome_id

    Returns:
        A dict mapping each strain to its BGC names followed by its feature
        IDs. Strains present in only one of the inputs are kept as well.
    """
    merged: dict[str, list[str]] = {}

    # Always build fresh lists: the previous implementation stored the input
    # lists themselves in some branches, so mutating the result would have
    # corrupted the caller's strain_bgcs / strain_features dicts.
    for key, bgcs in strain_bgcs.items():
        merged[key] = bgcs + strain_features.get(key, [])

    for key, features in strain_features.items():
        if key not in merged:
            merged[key] = list(features)

    return merged
50176

51177
def podp_generate_strain_mappings(
52178
podp_project_json_file: str | PathLike,

tests/unit/genomics/test_antismash_loader.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ def test_get_bgc_genome_mapping(self, loader):
2828
assert mapping["NZ_KI911412.1.region001"] == "GCF_000514515.1"
2929
assert mapping["NZ_AZWS01000001.region001"] == "GCF_000514855.1"
3030
assert mapping["NZ_KI911483.1.region001"] == "GCF_000514855.1"
31-
31+
def test_get_genome_bgcs_mapping(self, loader):
32+
mapping = loader.get_genome_bgcs_mapping()
33+
assert isinstance(mapping, dict)
34+
assert len(mapping) == 2
35+
assert len(mapping["GCF_000514515.1"]) == 20
3236
def test_get_files(self, loader):
3337
bgc_files = loader.get_files()
3438
assert isinstance(bgc_files, dict)

tests/unit/strain/test_utils.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import json
2+
import os
3+
from pathlib import Path
24
import pytest
35
from nplinker.strain import Strain
46
from nplinker.strain import StrainCollection
7+
from nplinker.strain.utils import create_strain_mappings
8+
from nplinker.strain.utils import extract_features_metabolome_id
9+
from nplinker.strain.utils import extract_strain_metadata
510
from nplinker.strain.utils import load_user_strains
11+
from nplinker.strain.utils import merge_bgcs_features
612
from nplinker.strain.utils import podp_generate_strain_mappings
713

814

@@ -88,3 +94,177 @@ def test_podp_generate_strain_mappings(monkeypatch, tmp_path):
8894
# check output file
8995
sc = StrainCollection.read_json(output_file)
9096
assert sc == expected_sc
97+
98+
class FakeAntismashBGCLoader:
    """Stand-in for AntismashBGCLoader returning a fixed genome->BGCs mapping."""

    def __init__(self, bgc_path):
        # The path is irrelevant for the fake; accept and ignore it.
        pass

    def get_genome_bgcs_mapping(self):
        # Fixed bgc_dict used by the tests.
        return {
            "genome_1": ["bgc_1", "bgc_2"],
            "genome_2": ["bgc_3"],
            "genome_3": ["bgc_4", "bgc_5"],
        }
108+
109+
# redefine function to use FakeAntismashBGCLoader instead
110+
def extract_bgcs_genome_id_test(strain_genome, bgc_path):
    """Copy of extract_bgcs_genome_id wired to FakeAntismashBGCLoader.

    Args:
        strain_genome: dict that comes from extract_strain_metadata function
        bgc_path: ignored by the fake loader; kept to mirror the real signature
    """
    genome_to_bgcs = FakeAntismashBGCLoader(bgc_path).get_genome_bgcs_mapping()

    # Keep only strains whose genome appears in the (fake) antiSMASH results.
    return {
        strain_id: genome_to_bgcs[genome_id]
        for strain_id, genome_id in strain_genome.items()
        if genome_id in genome_to_bgcs
    }
127+
128+
def test_extract_bgcs_genome_id():
    """Strains with a known genome get its BGCs; unknown genomes are dropped."""
    strain_genome = {
        "strain_1": "genome_1",
        "strain_2": "genome_2",
        "strain_3": "genome_3",
        "strain_4": "genome_4",
    }

    # bgc_path is None: the fake loader never touches the filesystem.
    actual = extract_bgcs_genome_id_test(strain_genome, None)

    assert actual == {
        "strain_1": ["bgc_1", "bgc_2"],
        "strain_2": ["bgc_3"],
        "strain_3": ["bgc_4", "bgc_5"],
    }
144+
145+
def test_extract_strain_metadata(tmp_path):
    """extract_strain_metadata parses both genome and spectra metadata files."""
    # Genome metadata: header line followed by strain<TAB>genome rows.
    genome_file = tmp_path / "strain_metadata_genomics.txt"
    assert genome_file.suffix == ".txt", "Input file should have a .txt extension"
    genome_file.write_text("StrainID\tGenomeID\nstrain1\tgenome1\nstrain2\tgenome2")

    actual_genome = extract_strain_metadata(genome_file)
    assert isinstance(actual_genome, dict), "The output should be a dictionary"
    assert all(
        isinstance(k, str) for k in actual_genome
    ), "All keys in the output dictionary should be StrainIDs (strings)"
    assert actual_genome == {"strain1": ["genome1"], "strain2": ["genome2"]}

    # Spectra metadata: same layout, mzML file names as values.
    spectra_file = tmp_path / "strain_metadata_metabolomics.txt"
    assert spectra_file.suffix == ".txt", "Input file should have a .txt extension"
    spectra_file.write_text(
        "StrainID\tSpectraID\nstrain1\tspectra1.mzML\nstrain2\tspectra2.mzML"
    )

    actual_spectra = extract_strain_metadata(spectra_file)
    assert isinstance(actual_spectra, dict), "The output should be a dictionary"
    assert all(
        isinstance(k, str) for k in actual_spectra
    ), "All keys in the output dictionary should be StrainIDs (strings)"
    assert actual_spectra == {"strain1": ["spectra1.mzML"], "strain2": ["spectra2.mzML"]}
186+
187+
def extract_mappings_ms_filename_spectrum_id(_):
    """Fake of the real mapping function: fixed spectrum -> features dict."""
    # The argument (a features file path) is ignored by this simulation.
    return {
        "spectrum1": ["featureA", "featureB"],
        "spectrum2": ["featureC"],
        "spectrum3": ["featureA", "featureD"],
    }
194+
195+
196+
def extract_features_metabolome_id_test(strain_spectra, _):
    """Copy of extract_features_metabolome_id wired to the fake mapping above."""
    # The file path argument is unused; the fake mapping is hard-coded.
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(None)

    result = {}
    for strain_id, spectra in strain_spectra.items():
        # Skip the leftover header row entry.
        if strain_id == "StrainID":
            continue
        if isinstance(spectra, str):
            spectra = [spectra]

        found = set()
        for spectrum in spectra:
            if spectrum in spectrum_to_features:
                found.update(spectrum_to_features[spectrum])

        result[strain_id] = sorted(found)
    return result
215+
216+
217+
def test_extract_features_metabolome_id():
    """The real function drops the header row and keeps one entry per strain."""
    strain_spectra = {
        "StrainID": "ExtractID",  # header entry, skipped by the function
        "Strain1": ["15b.mzXML", "12c.mzXML"],
        "Strain2": "15a.mzXML",
    }

    # Resolve the GNPS test data file relative to this test module.
    test_file = Path(__file__).parent.parent / "data/gnps/nodes.tsv"

    result = extract_features_metabolome_id(strain_spectra, str(test_file))

    # Only the two real strains survive; the header row is dropped.
    assert len(result) == 2
233+
234+
235+
236+
def test_merge_bgcs_features():
    """Merging concatenates a strain's BGC names and feature IDs."""
    bgcs = {"strain_1": ["bgc1", "bgc2", "bgc3", "bgc4"]}
    features = {"strain_1": ["feature1", "feature2", "feature3", "feature4"]}

    merged = merge_bgcs_features(bgcs, features)

    expected = {
        "strain_1": [
            "bgc1",
            "bgc2",
            "bgc3",
            "bgc4",
            "feature1",
            "feature2",
            "feature3",
            "feature4",
        ]
    }
    assert merged == expected, f"Test failed! Expected {expected}, but got {merged}"
242+
243+
def test_create_strain_mappings(tmp_path):
    """create_strain_mappings returns the mapping dict and writes it as JSON.

    Writes into pytest's tmp_path fixture so the test no longer leaves a
    stray strain_mappings.json in the current working directory.
    """
    strain_bgcs = {
        "Strain1": ["BGC1", "BGC2"],
        "Strain2": ["BGC3"],
    }
    strain_features = {"Strain1": ["featureA", "featureB"], "Strain2": ["featureC"]}

    expected_output = {
        "version": "1.0",
        "strain_mappings": [
            {"strain_id": "Strain1", "strain_alias": ["BGC1", "BGC2", "featureA", "featureB"]},
            {"strain_id": "Strain2", "strain_alias": ["BGC3", "featureC"]},
        ],
    }

    file_path = tmp_path / "strain_mappings.json"
    result = create_strain_mappings(
        strain_bgcs, strain_features, version="1.0", filename=str(file_path)
    )
    assert result == expected_output

    # check if the json file was created and round-trips to the same structure
    assert os.path.exists(file_path), f"File {file_path} was not created."
    with open(file_path, "r") as json_file:
        file_content = json.load(json_file)
    assert (
        file_content == expected_output
    ), f"File content {file_content} does not match expected output {expected_output}"

0 commit comments

Comments
 (0)