Skip to content

Commit 1fe11ed

Browse files
rtlortega and gcroci2 authored
Local strain mappings (#310)
* '[ADD] functions and test for local strain mapping creation' * '[ADD] also added test for antushmash loading' * '[FIX] errores and testing with pytest' * '[ADD] ignore strain_mappings.jon' --------- Co-authored-by: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Co-authored-by: gcroci2 <crocioni.giulia@gmail.com>
1 parent b66fb72 commit 1fe11ed

File tree

5 files changed

+323
-1
lines changed

5 files changed

+323
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,4 @@ webapp/npapp/static/css/bokeh*.css
6363
webapp/npapp/static/js/bokeh*.js
6464
src/nplinker/scoring/iokr/data/SPEC/
6565
tests/integration/data/nplinker_local_mode_example.zip
66+
strain_mappings.json

src/nplinker/genomics/antismash/antismash_loader.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,17 @@ def get_bgc_genome_mapping(self) -> dict[str, str]:
5757
bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
5858
}
5959

60+
def get_genome_bgcs_mapping(self) -> dict[str, list]:
    """Get the mapping from genome ID to its BGC names.

    Returns:
        Dict where the key is a genome ID and the value is the list of BGC
        names (gbk file names) that belong to that genome.
    """
    genome_to_bgcs: dict[str, list] = {}
    # Invert the BGC -> genome mapping, grouping BGC ids per genome.
    for bgc_id, genome_id in self.get_bgc_genome_mapping().items():
        genome_to_bgcs.setdefault(genome_id, []).append(bgc_id)
    return genome_to_bgcs
70+
6071
def get_files(self) -> dict[str, str]:
6172
"""Get BGC gbk files.
6273

src/nplinker/strain/utils.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
import json
33
import logging
44
from os import PathLike
5+
from typing import Any
56
from jsonschema import validate
7+
from nplinker.genomics.antismash import AntismashBGCLoader
68
from nplinker.schemas import USER_STRAINS_SCHEMA
79
from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id
810
from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id
@@ -17,6 +19,105 @@
1719

1820
logger = logging.getLogger(__name__)
1921

22+
def create_strain_mappings(
    strain_bgcs: dict[str, list[str]],
    strain_features: dict[str, list[str]],
    version: str,
    filename: str | PathLike,
) -> dict[str, Any]:
    """Create the strain mappings structure and save it to a JSON file.

    Args:
        strain_bgcs: mapping from strain ID to BGC names, as produced by
            `extract_bgcs_genome_id`.
        strain_features: mapping from strain ID to feature IDs, as produced
            by `extract_features_metabolome_id`.
        version: version string stored in the output.
        filename: path of the JSON file to write (str or PathLike).

    Returns:
        A dict with the version and a list of strain mappings, where each
        entry holds a strain ID and its aliases (BGC names + feature IDs).
    """
    merged = merge_bgcs_features(strain_bgcs, strain_features)

    strain_mappings: dict[str, Any] = {"version": version, "strain_mappings": []}
    for strain_id, aliases in merged.items():
        # Defensive: aliases should already be a list, but coerce other
        # iterables (tuples, sets) so the JSON output is always a list.
        if not isinstance(aliases, list):
            aliases = list(aliases)
        strain_mappings["strain_mappings"].append(
            {"strain_id": strain_id, "strain_alias": aliases}
        )
    with open(filename, "w") as json_file:
        json.dump(strain_mappings, json_file, indent=4)
    return strain_mappings
51+
52+
def extract_bgcs_genome_id(strain_genome: dict[str, list[str]], bgc_path: str | PathLike) -> dict:
    """Extract BGCs per strain based on the strain-to-genome mapping.

    Args:
        strain_genome: mapping from strain ID to genome IDs, as produced by
            `extract_strain_metadata`.
        bgc_path: path of the folder with the antiSMASH results.

    Returns:
        A dict mapping strain IDs to lists of BGC names. For each strain only
        the first of its genomes found in the antiSMASH results is used.
    """
    genome_to_bgcs = AntismashBGCLoader(bgc_path).get_genome_bgcs_mapping()

    strain_bgcs = {}
    for strain_id, genome_ids in strain_genome.items():
        # Take the first genome of this strain that has antiSMASH results.
        match = next((gid for gid in genome_ids if gid in genome_to_bgcs), None)
        if match is not None:
            strain_bgcs[strain_id] = genome_to_bgcs[match]
    return strain_bgcs
74+
75+
def extract_features_metabolome_id(strain_spectra: dict, features_file: str | PathLike) -> dict:
    """Extract features per strain based on the strain-to-spectra mapping.

    Args:
        strain_spectra: mapping from strain ID to spectra, as produced by
            `extract_strain_metadata`.
        features_file: path of the file with the GNPS results.

    Returns:
        A dict mapping strain IDs to sorted lists of feature IDs.
    """
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(features_file)

    strain_features = {}
    for strain_id, spectra in strain_spectra.items():
        # The literal "StrainID" entry is a leftover header row — skip it.
        if strain_id == "StrainID":
            continue
        # Accept a single spectrum name as well as a list of them.
        spectra_list = [spectra] if isinstance(spectra, str) else spectra

        collected: set = set()
        for spectrum in spectra_list:
            if spectrum in spectrum_to_features:
                collected.update(spectrum_to_features[spectrum])

        strain_features[strain_id] = sorted(collected)

    return strain_features
101+
102+
def extract_strain_metadata(
103+
strain_path: str | PathLike) -> dict[str, list[str]]:
104+
"""Generate dict based on strain - genome, strain - spectra metadata info.
105+
106+
Args:
107+
strain_path: The path to the txt file
108+
109+
Returns:
110+
A dict with the strains and the genome_id or spectra_id
111+
"""
112+
dictionary: dict[str, list[str]] = {}
113+
with open(strain_path, "r") as file:
114+
next(file)
115+
for line in file:
116+
key, value = map(str.strip, line.strip().split("\t"))
117+
if key not in dictionary:
118+
dictionary[key] = []
119+
dictionary[key].append(value)
120+
return dictionary
20121

21122
def load_user_strains(json_file: str | PathLike) -> set[Strain]:
22123
"""Load user specified strains from a JSON file.
@@ -47,6 +148,31 @@ def load_user_strains(json_file: str | PathLike) -> set[Strain]:
47148

48149
return strains
49150

151+
def merge_bgcs_features(
    strain_bgcs: dict[str, list[str]], strain_features: dict[str, list[str]]
) -> dict[str, list[str]]:
    """Merge the strain->BGCs and strain->features dicts into one dict.

    Args:
        strain_bgcs: dict that comes from extract_bgcs_genome_id
        strain_features: dict that comes from extract_features_metabolome_id

    Returns:
        A dict mapping each strain to its BGC names followed by its feature
        IDs. Strains present in only one of the inputs are kept as well.
    """
    merged: dict[str, list[str]] = {}

    # Always build fresh lists: the previous implementation stored the input
    # lists themselves in some branches, so mutating the result would have
    # corrupted the caller's strain_bgcs / strain_features dicts.
    for key, bgcs in strain_bgcs.items():
        merged[key] = bgcs + strain_features.get(key, [])

    for key, features in strain_features.items():
        if key not in merged:
            merged[key] = list(features)

    return merged
50176

51177
def podp_generate_strain_mappings(
52178
podp_project_json_file: str | PathLike,

tests/unit/genomics/test_antismash_loader.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ def test_get_bgc_genome_mapping(self, loader):
2828
assert mapping["NZ_KI911412.1.region001"] == "GCF_000514515.1"
2929
assert mapping["NZ_AZWS01000001.region001"] == "GCF_000514855.1"
3030
assert mapping["NZ_KI911483.1.region001"] == "GCF_000514855.1"
31-
31+
def test_get_genome_bgcs_mapping(self, loader):
32+
mapping = loader.get_genome_bgcs_mapping()
33+
assert isinstance(mapping, dict)
34+
assert len(mapping) == 2
35+
assert len(mapping["GCF_000514515.1"]) == 20
3236
def test_get_files(self, loader):
3337
bgc_files = loader.get_files()
3438
assert isinstance(bgc_files, dict)

tests/unit/strain/test_utils.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
import json
2+
import os
3+
from pathlib import Path
24
import pytest
35
from nplinker.strain import Strain
46
from nplinker.strain import StrainCollection
7+
from nplinker.strain.utils import create_strain_mappings
8+
from nplinker.strain.utils import extract_features_metabolome_id
9+
from nplinker.strain.utils import extract_strain_metadata
510
from nplinker.strain.utils import load_user_strains
11+
from nplinker.strain.utils import merge_bgcs_features
612
from nplinker.strain.utils import podp_generate_strain_mappings
713

814

@@ -88,3 +94,177 @@ def test_podp_generate_strain_mappings(monkeypatch, tmp_path):
8894
# check output file
8995
sc = StrainCollection.read_json(output_file)
9096
assert sc == expected_sc
97+
98+
class FakeAntismashBGCLoader:
    """Stand-in for AntismashBGCLoader returning a fixed genome->BGCs mapping."""

    def __init__(self, bgc_path):
        # The path is irrelevant for the fake; accept and ignore it.
        pass

    def get_genome_bgcs_mapping(self):
        # Fixed bgc_dict used by the tests.
        return {
            "genome_1": ["bgc_1", "bgc_2"],
            "genome_2": ["bgc_3"],
            "genome_3": ["bgc_4", "bgc_5"],
        }
108+
109+
# redefine function to use FakeAntismashBGCLoader instead
110+
def extract_bgcs_genome_id_test(strain_genome, bgc_path):
    """Copy of extract_bgcs_genome_id wired to FakeAntismashBGCLoader.

    Args:
        strain_genome: dict that comes from extract_strain_metadata function
        bgc_path: ignored by the fake loader; kept to mirror the real signature
    """
    genome_to_bgcs = FakeAntismashBGCLoader(bgc_path).get_genome_bgcs_mapping()

    # Keep only strains whose genome appears in the (fake) antiSMASH results.
    return {
        strain_id: genome_to_bgcs[genome_id]
        for strain_id, genome_id in strain_genome.items()
        if genome_id in genome_to_bgcs
    }
127+
128+
def test_extract_bgcs_genome_id():
    """Strains with a known genome get its BGCs; unknown genomes are dropped."""
    strain_genome = {
        "strain_1": "genome_1",
        "strain_2": "genome_2",
        "strain_3": "genome_3",
        "strain_4": "genome_4",
    }

    # bgc_path is None: the fake loader never touches the filesystem.
    actual = extract_bgcs_genome_id_test(strain_genome, None)

    assert actual == {
        "strain_1": ["bgc_1", "bgc_2"],
        "strain_2": ["bgc_3"],
        "strain_3": ["bgc_4", "bgc_5"],
    }
144+
145+
def test_extract_strain_metadata(tmp_path):
    """extract_strain_metadata parses both genome and spectra metadata files."""
    # Genome metadata: header line followed by strain<TAB>genome rows.
    genome_file = tmp_path / "strain_metadata_genomics.txt"
    assert genome_file.suffix == ".txt", "Input file should have a .txt extension"
    genome_file.write_text("StrainID\tGenomeID\nstrain1\tgenome1\nstrain2\tgenome2")

    actual_genome = extract_strain_metadata(genome_file)
    assert isinstance(actual_genome, dict), "The output should be a dictionary"
    assert all(
        isinstance(k, str) for k in actual_genome
    ), "All keys in the output dictionary should be StrainIDs (strings)"
    assert actual_genome == {"strain1": ["genome1"], "strain2": ["genome2"]}

    # Spectra metadata: same layout, mzML file names as values.
    spectra_file = tmp_path / "strain_metadata_metabolomics.txt"
    assert spectra_file.suffix == ".txt", "Input file should have a .txt extension"
    spectra_file.write_text(
        "StrainID\tSpectraID\nstrain1\tspectra1.mzML\nstrain2\tspectra2.mzML"
    )

    actual_spectra = extract_strain_metadata(spectra_file)
    assert isinstance(actual_spectra, dict), "The output should be a dictionary"
    assert all(
        isinstance(k, str) for k in actual_spectra
    ), "All keys in the output dictionary should be StrainIDs (strings)"
    assert actual_spectra == {"strain1": ["spectra1.mzML"], "strain2": ["spectra2.mzML"]}
186+
187+
def extract_mappings_ms_filename_spectrum_id(_):
    """Fake of the real mapping function: fixed spectrum -> features dict."""
    # The argument (a features file path) is ignored by this simulation.
    return {
        "spectrum1": ["featureA", "featureB"],
        "spectrum2": ["featureC"],
        "spectrum3": ["featureA", "featureD"],
    }
194+
195+
196+
def extract_features_metabolome_id_test(strain_spectra, _):
    """Copy of extract_features_metabolome_id wired to the fake mapping above."""
    # The file path argument is unused; the fake mapping is hard-coded.
    spectrum_to_features = extract_mappings_ms_filename_spectrum_id(None)

    result = {}
    for strain_id, spectra in strain_spectra.items():
        # Skip the leftover header row entry.
        if strain_id == "StrainID":
            continue
        if isinstance(spectra, str):
            spectra = [spectra]

        found = set()
        for spectrum in spectra:
            if spectrum in spectrum_to_features:
                found.update(spectrum_to_features[spectrum])

        result[strain_id] = sorted(found)
    return result
215+
216+
217+
def test_extract_features_metabolome_id():
    """The real function drops the header row and keeps one entry per strain."""
    strain_spectra = {
        "StrainID": "ExtractID",  # header entry, skipped by the function
        "Strain1": ["15b.mzXML", "12c.mzXML"],
        "Strain2": "15a.mzXML",
    }

    # Resolve the GNPS test data file relative to this test module.
    test_file = Path(__file__).parent.parent / "data/gnps/nodes.tsv"

    result = extract_features_metabolome_id(strain_spectra, str(test_file))

    # Only the two real strains survive; the header row is dropped.
    assert len(result) == 2
233+
234+
235+
236+
def test_merge_bgcs_features():
    """Merging concatenates a strain's BGC names and feature IDs."""
    bgcs = {"strain_1": ["bgc1", "bgc2", "bgc3", "bgc4"]}
    features = {"strain_1": ["feature1", "feature2", "feature3", "feature4"]}

    merged = merge_bgcs_features(bgcs, features)

    expected = {
        "strain_1": [
            "bgc1",
            "bgc2",
            "bgc3",
            "bgc4",
            "feature1",
            "feature2",
            "feature3",
            "feature4",
        ]
    }
    assert merged == expected, f"Test failed! Expected {expected}, but got {merged}"
242+
243+
def test_create_strain_mappings(tmp_path):
    """create_strain_mappings returns the mapping dict and writes it as JSON.

    Writes into pytest's tmp_path fixture so the test no longer leaves a
    stray strain_mappings.json in the current working directory.
    """
    strain_bgcs = {
        "Strain1": ["BGC1", "BGC2"],
        "Strain2": ["BGC3"],
    }
    strain_features = {"Strain1": ["featureA", "featureB"], "Strain2": ["featureC"]}

    expected_output = {
        "version": "1.0",
        "strain_mappings": [
            {"strain_id": "Strain1", "strain_alias": ["BGC1", "BGC2", "featureA", "featureB"]},
            {"strain_id": "Strain2", "strain_alias": ["BGC3", "featureC"]},
        ],
    }

    file_path = tmp_path / "strain_mappings.json"
    result = create_strain_mappings(
        strain_bgcs, strain_features, version="1.0", filename=str(file_path)
    )
    assert result == expected_output

    # check if the json file was created and round-trips to the same structure
    assert os.path.exists(file_path), f"File {file_path} was not created."
    with open(file_path, "r") as json_file:
        file_content = json.load(json_file)
    assert (
        file_content == expected_output
    ), f"File content {file_content} does not match expected output {expected_output}"

0 commit comments

Comments
 (0)