nmdp-bioinformatics · Regev32 · Feb 20, 2025 · Feb 20, 2025 · May 4, 2025 · Aug 18, 2025
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@ py-graph-match
 
 Matching with Graph
 
-`grma`` is a package for finding HLA matches using graphs approach.
+`grma` is a package for finding HLA matches using graphs approach.
 The matching is based on [grim's](https://github.com/nmdp-bioinformatics/py-graph-imputation) imputation.
 
 

diff --git a/data/donors_dir/donors.txt b/data/donors_dir/donors.txt
diff --git a/grma/__init__.py b/grma/__init__.py
@@ -1,2 +1,2 @@
 from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
-from grma.match import matching, find_matches
+from grma.match.match import matching, find_matches
diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py
@@ -12,8 +12,6 @@
 from grma.utilities.geno_representation import HashableArray
 from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
 
-CLASS_I_END = 6
-
 
 class BuildMatchingGraph:
     """
@@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                     geno = gl_string_to_integers(geno)
 
                     # sort alleles for each HLA-X
-                    for x in range(0, 10, 2):
+                    for x in range(0, len(geno), 2):
                         geno[x : x + 2] = sorted(geno[x : x + 2])
                     geno = HashableArray(geno)
 
@@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                     # continue creation of classes and subclasses
                     if geno not in layers["GENOTYPE"]:
                         layers["GENOTYPE"].add(geno)
+                        CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
                         geno_class1 = tuple(geno[:CLASS_I_END])
                         geno_class2 = tuple(geno[CLASS_I_END:])
                         self._create_classes_edges(geno, geno_class1, layers)

diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py
@@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
         arrays_start = free
         # map lol-ids to arrays
         # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
+        geno = layers['GENOTYPE'].pop()
+        layers['GENOTYPE'].add(geno)
         map_number_to_arr_node = np.zeros(
-            (len(layers["GENOTYPE"]), 10), dtype=np.uint16
+            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
         )
         for i, geno in tqdm(
             enumerate(layers["GENOTYPE"]),

diff --git a/grma/match/donors_matching.py b/grma/match/donors_matching.py
@@ -20,8 +20,6 @@
 
 DONORS_DB: pd.DataFrame = pd.DataFrame()
 ZEROS: HashableArray = HashableArray([0])
-ALLELES_IN_CLASS_I: int = 6
-ALLELES_IN_CLASS_II: int = 4
 
 
 def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
@@ -39,24 +37,13 @@ def _init_results_df(donors_info):
     fields_in_results = {
         "Patient_ID": [],
         "Donor_ID": [],
+        "GvH_Mismatches": [],
+        "HvG_Mismatches": [],
         "Number_Of_Mismatches": [],
         "Matching_Probability": [],
-        "Match_Probability_A_1": [],
-        "Match_Probability_A_2": [],
-        "Match_Probability_B_1": [],
-        "Match_Probability_B_2": [],
-        "Match_Probability_C_1": [],
-        "Match_Probability_C_2": [],
-        "Match_Probability_DQB1_1": [],
-        "Match_Probability_DQB1_2": [],
-        "Match_Probability_DRB1_1": [],
-        "Match_Probability_DRB1_2": [],
+        "Match_Probability": [],
         "Permissive/Non-Permissive": [],
-        "Match_Between_Most_Commons_A": [],
-        "Match_Between_Most_Commons_B": [],
-        "Match_Between_Most_Commons_C": [],
-        "Match_Between_Most_Commons_DQB": [],
-        "Match_Between_Most_Commons_DRB": [],
+        "Match_Between_Most_Commons": [],
     }
 
     donors_db_fields = DONORS_DB.columns.values.tolist()
@@ -66,17 +53,26 @@ def _init_results_df(donors_info):
     return pd.DataFrame(fields_in_results)
 
 
-def locuses_match_between_genos(geno1, geno2):
+def locuses_match_between_genos(geno_pat, geno_don):
     matches = []
-    for i in range(5):
-        a1, b1 = geno1[2 * i], geno1[2 * i + 1]
-        a2, b2 = geno2[2 * i], geno2[2 * i + 1]
+    total_gvh = 0
+    total_hvg = 0
+
+    for i in range(0, len(geno_pat), 2):
+        a1, b1 = geno_pat[i],   geno_pat[i + 1]
+        a2, b2 = geno_don[i],   geno_don[i + 1]
 
         s1 = int(a1 == a2) + int(b1 == b2)
         s2 = int(a1 == b2) + int(b1 == a2)
         matches.append(max(s1, s2))
 
-    return matches
+        p_set = {x for x in (a1, b1) if x not in (None, 0)}
+        d_set = {x for x in (a2, b2) if x not in (None, 0)}
+
+        total_gvh += len(p_set - d_set)  # patient has, donor lacks
+        total_hvg += len(d_set - p_set)  # donor has, patient lacks
+
+    return matches, total_gvh, total_hvg
 
 
 class DonorsMatching(object):
@@ -127,7 +123,7 @@ def probability_to_allele(
     ) -> List[float]:
         """Takes a donor ID and a genotype.
         Returns the probability of match for each allele"""
-        probs = [0 for _ in range(10)]
+        probs = [0 for _ in range(len(pat_geno))]
 
         for i, allele in enumerate(pat_geno):
             p = 0
@@ -148,7 +144,7 @@ def __find_genotype_candidates_from_class(
     ) -> Tuple[np.ndarray, np.ndarray]:
         """Takes an integer subclass.
         Returns the genotypes (ids and values) which are connected to it in the graph"""
-        return self._graph.class_neighbors(clss)
+        return self._graph.class_neighbors(clss, Len = len(list(self.patients.values())[0]))
 
     def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
         """Gets the LOL ID of a genotype.
@@ -216,12 +212,14 @@ def __add_matched_genos_to_graph(
 
     def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
         subclasses = []
-        classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]]
+        ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
+        ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
+        classes = [(genotype[:ALLELES_IN_CLASS_I], 0), (genotype[ALLELES_IN_CLASS_I:], 1)]
         num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]
 
-        int_classes = [tuple_geno_to_int(tuple(clss)) for clss in classes]
+        int_classes = [(tuple_geno_to_int(tuple(clss[0])), clss[1]) for clss in classes]
         for clss in int_classes:
-            self._patients_graph.add_edge(clss, genotype)
+            self._patients_graph.add_edge(clss[0], genotype)
 
         # class one is considered as 0.
         # class two is considered as 1.
@@ -231,14 +229,14 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
                 # set the missing allele to always be the second allele in the locus
                 if k % 2 == 0:
                     sub = tuple_geno_to_int(
-                        classes[class_num][0:k] + ZEROS + classes[class_num][k + 1 :]
+                        classes[class_num][0][0:k] + ZEROS + classes[class_num][0][k + 1 :]
                     )
                 else:
                     sub = tuple_geno_to_int(
-                        classes[class_num][0 : k - 1]
+                        classes[class_num][0][0 : k - 1]
                         + ZEROS
-                        + classes[class_num][k - 1 : k]
-                        + classes[class_num][k + 1 :]
+                        + classes[class_num][0][k - 1 : k]
+                        + classes[class_num][0][k + 1 :]
                     )
 
                 # missing allele number is the index of the first allele of the locus the missing allele belongs to.
@@ -255,6 +253,8 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
 
         return int_classes, subclasses
 
+
+
     def create_patients_graph(self, f_patients: str):
         """
         create patients graph. \n
@@ -300,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
                 classes_by_patient[patient_id] = set()
 
             # sort alleles for each HLA-X
-            for x in range(0, 10, 2):
+            l = len(geno)
+            for x in range(0, l, 2):
                 geno[x : x + 2] = sorted(geno[x : x + 2])
 
             geno = HashableArray(geno)
@@ -351,15 +352,14 @@ def find_geno_candidates_by_subclasses(self, subclasses):
                     genotypes_value,
                 ) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
 
+                geno = genotypes_value[0]
+                ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
+                ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
                 # Checks only the locuses that are not certain to match
                 if subclass.class_num == 0:
-                    allele_range_to_check = np.array(
-                        [6, 8, subclass.allele_num], dtype=np.uint8
-                    )
+                    allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)] + [subclass.allele_num], dtype=np.uint8)
                 else:
-                    allele_range_to_check = np.array(
-                        [0, 2, 4, subclass.allele_num], dtype=np.uint8
-                    )
+                    allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)] + [subclass.allele_num], dtype=np.uint8)
 
                 # number of alleles that already match due to match in subclass
                 matched_alleles: int = (
@@ -383,9 +383,9 @@ def find_geno_candidates_by_classes(self, classes):
             desc="finding classes matching candidates",
             disable=not self.verbose,
         ):
-            if self._graph.in_nodes(clss):
+            if self._graph.in_nodes(clss[0]):
                 patient_genos = self._patients_graph.neighbors(
-                    clss
+                    clss[0]
                 )  # The patient's genotypes which might be match
                 (
                     genotypes_ids,
@@ -395,12 +395,14 @@ def find_geno_candidates_by_classes(self, classes):
                 # Checks only the locuses that are not certain to match (the locuses of the other class)
                 # Class I appearances: 3 locuses = 6 alleles = 23/24 digits
                 # Class II appearances: 2 locuses = 4 alleles = 15/16 digits
-                if len(str(clss)) > 20:
-                    allele_range_to_check = np.array([6, 8], dtype=np.uint8)
-                    matched_alleles: int = 6
+                geno = list(self.patients.values())[0]
+                if clss[1] == 0:
+                    allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)], dtype=np.uint8)
+                    matched_alleles: int = len(geno)//2 + (len(geno)//2 & 1)
+
                 else:
-                    allele_range_to_check = np.array([0, 2, 4], dtype=np.uint8)
-                    matched_alleles: int = 4
+                    allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)], dtype=np.uint8)
+                    matched_alleles: int = len(geno)//2 - (len(geno)//2 & 1)
 
                 # Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph.
                 self.__add_matched_genos_to_graph(
@@ -441,7 +443,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
             # and each patient connects only to their own genos, so we wouldn't override the weight dict.
             # self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE
             self._genotype_candidates[patient_id][geno_id] = {
-                geno_num: (probability, 10)
+                geno_num: (probability, len(geno))
             }  # AMIT ADD
             # else:
             #     print(f"Missing 'geno_num' for patient_id: {patient_id}")
@@ -507,7 +509,7 @@ def score_matches(
         ].items():  # AMIT ADD
             for prob, matches in genotype_matches.values():  # AMIT CHANGE
                 # match_info = (probability of patient's genotype, number of matches to patient's genotype)
-                if matches != 10 - mismatch:
+                if matches != len(list(self.patients.values())[0]) - mismatch:
                     continue
 
                 # add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -568,38 +570,32 @@ def __append_matching_donor(
         mm_number: int,
     ) -> None:
         """add a donor to the matches dictionary"""
-
-        compare_commons = locuses_match_between_genos(
-            self.patients[patient], self.get_most_common_genotype(donor)
+        pat = self.patients[patient]
+        don = self.get_most_common_genotype(donor)
+        compare_commons, gvh, hvg = locuses_match_between_genos(
+            pat, don
         )
 
         add_donors["Patient_ID"].append(patient)
         add_donors["Donor_ID"].append(donor)
         allele_prob = self.probability_to_allele(
             don_id=donor, pat_geno=self.patients[patient]
         )
-        add_donors["Match_Probability_A_1"].append(allele_prob[0])
-        add_donors["Match_Probability_A_2"].append(allele_prob[1])
-        add_donors["Match_Probability_B_1"].append(allele_prob[2])
-        add_donors["Match_Probability_B_2"].append(allele_prob[3])
-        add_donors["Match_Probability_C_1"].append(allele_prob[4])
-        add_donors["Match_Probability_C_2"].append(allele_prob[5])
-        add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
-        add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
-        add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
-        add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])
-
-        add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
-        add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
-        add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
-        add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
-        add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
+        add_donors["Match_Probability"].append(allele_prob)
+        add_donors["Match_Between_Most_Commons"].append(compare_commons)
 
         add_donors["Matching_Probability"].append(match_prob)
-        add_donors["Number_Of_Mismatches"].append(mm_number)
-        add_donors["Permissive/Non-Permissive"].append(
-            "-"
-        )  # TODO: add permissiveness algorithm
+        actual_mismatches = 0
+        for match_score in compare_commons:
+            if match_score != 2:
+                actual_mismatches += (2 - match_score)
+
+        add_donors["Number_Of_Mismatches"].append(actual_mismatches)
+        add_donors["GvH_Mismatches"].append(gvh)
+        add_donors["HvG_Mismatches"].append(hvg)
+
+        add_donors["Permissive/Non-Permissive"].append("-")
+        # TODO: add permissiveness algorithm
 
         # add the other given fields to the results
         for field in donors_info:

diff --git a/grma/match/graph_wrapper.py b/grma/match/graph_wrapper.py
@@ -5,7 +5,6 @@
 from typing import Union
 
 import numpy as np
-
 from grma.utilities.geno_representation import HashableArray
 from grma.match.lol_graph import LolGraph
 
@@ -58,11 +57,10 @@ def get_edge_data(
         ret = self._graph.get_edge_data(node1_num, node2_num)
         return default if ret == exception_val else ret
 
-    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
-        node_num = self._map_node_to_number[node] if not search_lol_id else node
+    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
+        node_num = self._map_node_to_number[node[0]] if not search_lol_id else node
         neighbors_list = self._graph.neighbors_unweighted(node_num)
-
-        neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
+        neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
         for i, neighbor in enumerate(neighbors_list):
             neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)
 
@@ -112,7 +110,7 @@ def neighbors(
     def neighbors_2nd(self, node):
         node_num = self._map_node_to_number[node]
         r0, r1 = self._graph.neighbors_2nd(node_num)
-        return r0[:-1], r1
+        return r0, r1
 
     def node_value_from_id(self, node_id: int) -> NODES_TYPES:
         """convert lol ID to node value"""

diff --git a/grma/match/lol_graph.pyx b/grma/match/lol_graph.pyx
@@ -155,14 +155,14 @@ cdef class LolGraph:
         cdef UINT16[:] arr
         cdef np.ndarray[UINT16, ndim=2] neighbors_value
         cdef UINT num_of_neighbors_2nd
+        cdef UINT loci_len
 
         idx = self._index_list[node]
         idx_end = self._index_list[node + 1]
 
         neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
         for i in range(idx, idx_end):
             neighbors_list_id[i - idx] = self._neighbors_list[i]
-
         num_of_neighbors_2nd = <UINT>self._weights_list[idx]
 
         neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
@@ -177,11 +177,12 @@ cdef class LolGraph:
                 neighbors_id[pointer] = self._neighbors_list[j]
                 pointer += 1
 
-        neighbors_value = np.zeros((num_of_neighbors_2nd - 1, 10), dtype=np.uint16)
-        for i in range(len(neighbors_id) - 1):
+        loci_len = <UINT> self._map_number_to_arr_node.shape[1]
+        neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
+        for i in range(len(neighbors_id)):
             neighbor_id = neighbors_id[i]
             arr = self.arr_node_value_from_id(neighbor_id)
-            for j in range(10):
+            for j in range(loci_len):
                 neighbors_value[i, j] = arr[j]
 
         return neighbors_id, neighbors_value