Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ py-graph-match

Matching with Graph

`grma`` is a package for finding HLA matches using graphs approach.
`grma` is a package for finding HLA matches using graphs approach.
The matching is based on [grim's](https://github.com/nmdp-bioinformatics/py-graph-imputation) imputation.


Expand Down
Empty file modified data/donors_dir/donors.txt
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion grma/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
from grma.match import matching, find_matches
from grma.match.match import matching, find_matches
5 changes: 2 additions & 3 deletions grma/donorsgraph/build_donors_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
from grma.utilities.geno_representation import HashableArray
from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time

CLASS_I_END = 6


class BuildMatchingGraph:
"""
Expand Down Expand Up @@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
geno = gl_string_to_integers(geno)

# sort alleles for each HLA-X
for x in range(0, 10, 2):
for x in range(0, len(geno), 2):
geno[x : x + 2] = sorted(geno[x : x + 2])
geno = HashableArray(geno)

Expand All @@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
# continue creation of classes and subclasses
if geno not in layers["GENOTYPE"]:
layers["GENOTYPE"].add(geno)
CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
geno_class1 = tuple(geno[:CLASS_I_END])
geno_class2 = tuple(geno[CLASS_I_END:])
self._create_classes_edges(geno, geno_class1, layers)
Expand Down
4 changes: 3 additions & 1 deletion grma/donorsgraph/create_lol.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
arrays_start = free
# map lol-ids to arrays
# given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
geno = layers['GENOTYPE'].pop()
layers['GENOTYPE'].add(geno)
map_number_to_arr_node = np.zeros(
(len(layers["GENOTYPE"]), 10), dtype=np.uint16
(len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
)
for i, geno in tqdm(
enumerate(layers["GENOTYPE"]),
Expand Down
136 changes: 66 additions & 70 deletions grma/match/donors_matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@

DONORS_DB: pd.DataFrame = pd.DataFrame()
ZEROS: HashableArray = HashableArray([0])
ALLELES_IN_CLASS_I: int = 6
ALLELES_IN_CLASS_II: int = 4


def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
Expand All @@ -39,24 +37,13 @@ def _init_results_df(donors_info):
fields_in_results = {
"Patient_ID": [],
"Donor_ID": [],
"GvH_Mismatches": [],
"HvG_Mismatches": [],
"Number_Of_Mismatches": [],
"Matching_Probability": [],
"Match_Probability_A_1": [],
"Match_Probability_A_2": [],
"Match_Probability_B_1": [],
"Match_Probability_B_2": [],
"Match_Probability_C_1": [],
"Match_Probability_C_2": [],
"Match_Probability_DQB1_1": [],
"Match_Probability_DQB1_2": [],
"Match_Probability_DRB1_1": [],
"Match_Probability_DRB1_2": [],
"Match_Probability": [],
"Permissive/Non-Permissive": [],
"Match_Between_Most_Commons_A": [],
"Match_Between_Most_Commons_B": [],
"Match_Between_Most_Commons_C": [],
"Match_Between_Most_Commons_DQB": [],
"Match_Between_Most_Commons_DRB": [],
"Match_Between_Most_Commons": [],
}

donors_db_fields = DONORS_DB.columns.values.tolist()
Expand All @@ -66,17 +53,26 @@ def _init_results_df(donors_info):
return pd.DataFrame(fields_in_results)


def locuses_match_between_genos(geno1, geno2):
def locuses_match_between_genos(geno_pat, geno_don):
matches = []
for i in range(5):
a1, b1 = geno1[2 * i], geno1[2 * i + 1]
a2, b2 = geno2[2 * i], geno2[2 * i + 1]
total_gvh = 0
total_hvg = 0

for i in range(0, len(geno_pat), 2):
a1, b1 = geno_pat[i], geno_pat[i + 1]
a2, b2 = geno_don[i], geno_don[i + 1]

s1 = int(a1 == a2) + int(b1 == b2)
s2 = int(a1 == b2) + int(b1 == a2)
matches.append(max(s1, s2))

return matches
p_set = {x for x in (a1, b1) if x not in (None, 0)}
d_set = {x for x in (a2, b2) if x not in (None, 0)}

total_gvh += len(p_set - d_set) # patient has, donor lacks
total_hvg += len(d_set - p_set) # donor has, patient lacks

return matches, total_gvh, total_hvg


class DonorsMatching(object):
Expand Down Expand Up @@ -127,7 +123,7 @@ def probability_to_allele(
) -> List[float]:
"""Takes a donor ID and a genotype.
Returns the probability of match for each allele"""
probs = [0 for _ in range(10)]
probs = [0 for _ in range(len(pat_geno))]

for i, allele in enumerate(pat_geno):
p = 0
Expand All @@ -148,7 +144,7 @@ def __find_genotype_candidates_from_class(
) -> Tuple[np.ndarray, np.ndarray]:
"""Takes an integer subclass.
Returns the genotypes (ids and values) which are connected to it in the graph"""
return self._graph.class_neighbors(clss)
return self._graph.class_neighbors(clss, Len = len(list(self.patients.values())[0]))

def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
"""Gets the LOL ID of a genotype.
Expand Down Expand Up @@ -216,12 +212,14 @@ def __add_matched_genos_to_graph(

def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
subclasses = []
classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]]
ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
classes = [(genotype[:ALLELES_IN_CLASS_I], 0), (genotype[ALLELES_IN_CLASS_I:], 1)]
num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]

int_classes = [tuple_geno_to_int(tuple(clss)) for clss in classes]
int_classes = [(tuple_geno_to_int(tuple(clss[0])), clss[1]) for clss in classes]
for clss in int_classes:
self._patients_graph.add_edge(clss, genotype)
self._patients_graph.add_edge(clss[0], genotype)

# class one is considered as 0.
# class two is considered as 1.
Expand All @@ -231,14 +229,14 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
# set the missing allele to always be the second allele in the locus
if k % 2 == 0:
sub = tuple_geno_to_int(
classes[class_num][0:k] + ZEROS + classes[class_num][k + 1 :]
classes[class_num][0][0:k] + ZEROS + classes[class_num][0][k + 1 :]
)
else:
sub = tuple_geno_to_int(
classes[class_num][0 : k - 1]
classes[class_num][0][0 : k - 1]
+ ZEROS
+ classes[class_num][k - 1 : k]
+ classes[class_num][k + 1 :]
+ classes[class_num][0][k - 1 : k]
+ classes[class_num][0][k + 1 :]
)

# missing allele number is the index of the first allele of the locus the missing allele belongs to.
Expand All @@ -255,6 +253,8 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):

return int_classes, subclasses



def create_patients_graph(self, f_patients: str):
"""
create patients graph. \n
Expand Down Expand Up @@ -300,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
classes_by_patient[patient_id] = set()

# sort alleles for each HLA-X
for x in range(0, 10, 2):
l = len(geno)
for x in range(0, l, 2):
geno[x : x + 2] = sorted(geno[x : x + 2])

geno = HashableArray(geno)
Expand Down Expand Up @@ -351,15 +352,14 @@ def find_geno_candidates_by_subclasses(self, subclasses):
genotypes_value,
) = self.__find_genotype_candidates_from_subclass(subclass.subclass)

geno = genotypes_value[0]
ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
# Checks only the locuses that are not certain to match
if subclass.class_num == 0:
allele_range_to_check = np.array(
[6, 8, subclass.allele_num], dtype=np.uint8
)
allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)] + [subclass.allele_num], dtype=np.uint8)
else:
allele_range_to_check = np.array(
[0, 2, 4, subclass.allele_num], dtype=np.uint8
)
allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)] + [subclass.allele_num], dtype=np.uint8)

# number of alleles that already match due to match in subclass
matched_alleles: int = (
Expand All @@ -383,9 +383,9 @@ def find_geno_candidates_by_classes(self, classes):
desc="finding classes matching candidates",
disable=not self.verbose,
):
if self._graph.in_nodes(clss):
if self._graph.in_nodes(clss[0]):
patient_genos = self._patients_graph.neighbors(
clss
clss[0]
) # The patient's genotypes which might be match
(
genotypes_ids,
Expand All @@ -395,12 +395,14 @@ def find_geno_candidates_by_classes(self, classes):
# Checks only the locuses that are not certain to match (the locuses of the other class)
# Class I appearances: 3 locuses = 6 alleles = 23/24 digits
# Class II appearances: 2 locuses = 4 alleles = 15/16 digits
if len(str(clss)) > 20:
allele_range_to_check = np.array([6, 8], dtype=np.uint8)
matched_alleles: int = 6
geno = list(self.patients.values())[0]
if clss[1] == 0:
allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)], dtype=np.uint8)
matched_alleles: int = len(geno)//2 + (len(geno)//2 & 1)

else:
allele_range_to_check = np.array([0, 2, 4], dtype=np.uint8)
matched_alleles: int = 4
allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)], dtype=np.uint8)
matched_alleles: int = len(geno)//2 - (len(geno)//2 & 1)

# Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph.
self.__add_matched_genos_to_graph(
Expand Down Expand Up @@ -441,7 +443,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
# and each patient connects only to their own genos, so we wouldn't override the weight dict.
# self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE
self._genotype_candidates[patient_id][geno_id] = {
geno_num: (probability, 10)
geno_num: (probability, len(geno))
} # AMIT ADD
# else:
# print(f"Missing 'geno_num' for patient_id: {patient_id}")
Expand Down Expand Up @@ -507,7 +509,7 @@ def score_matches(
].items(): # AMIT ADD
for prob, matches in genotype_matches.values(): # AMIT CHANGE
# match_info = (probability of patient's genotype, number of matches to patient's genotype)
if matches != 10 - mismatch:
if matches != len(list(self.patients.values())[0]) - mismatch:
continue

# add the probabilities multiplication of the patient and all the donors that has this genotype
Expand Down Expand Up @@ -568,38 +570,32 @@ def __append_matching_donor(
mm_number: int,
) -> None:
"""add a donor to the matches dictionary"""

compare_commons = locuses_match_between_genos(
self.patients[patient], self.get_most_common_genotype(donor)
pat = self.patients[patient]
don = self.get_most_common_genotype(donor)
compare_commons, gvh, hvg = locuses_match_between_genos(
pat, don
)

add_donors["Patient_ID"].append(patient)
add_donors["Donor_ID"].append(donor)
allele_prob = self.probability_to_allele(
don_id=donor, pat_geno=self.patients[patient]
)
add_donors["Match_Probability_A_1"].append(allele_prob[0])
add_donors["Match_Probability_A_2"].append(allele_prob[1])
add_donors["Match_Probability_B_1"].append(allele_prob[2])
add_donors["Match_Probability_B_2"].append(allele_prob[3])
add_donors["Match_Probability_C_1"].append(allele_prob[4])
add_donors["Match_Probability_C_2"].append(allele_prob[5])
add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])

add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
add_donors["Match_Probability"].append(allele_prob)
add_donors["Match_Between_Most_Commons"].append(compare_commons)

add_donors["Matching_Probability"].append(match_prob)
add_donors["Number_Of_Mismatches"].append(mm_number)
add_donors["Permissive/Non-Permissive"].append(
"-"
) # TODO: add permissiveness algorithm
actual_mismatches = 0
for match_score in compare_commons:
if match_score != 2:
actual_mismatches += (2 - match_score)

add_donors["Number_Of_Mismatches"].append(actual_mismatches)
add_donors["GvH_Mismatches"].append(gvh)
add_donors["HvG_Mismatches"].append(hvg)

add_donors["Permissive/Non-Permissive"].append("-")
# TODO: add permissiveness algorithm

# add the other given fields to the results
for field in donors_info:
Expand Down
10 changes: 4 additions & 6 deletions grma/match/graph_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from typing import Union

import numpy as np

from grma.utilities.geno_representation import HashableArray
from grma.match.lol_graph import LolGraph

Expand Down Expand Up @@ -58,11 +57,10 @@ def get_edge_data(
ret = self._graph.get_edge_data(node1_num, node2_num)
return default if ret == exception_val else ret

def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
node_num = self._map_node_to_number[node] if not search_lol_id else node
def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
node_num = self._map_node_to_number[node[0]] if not search_lol_id else node
neighbors_list = self._graph.neighbors_unweighted(node_num)

neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
for i, neighbor in enumerate(neighbors_list):
neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)

Expand Down Expand Up @@ -112,7 +110,7 @@ def neighbors(
def neighbors_2nd(self, node):
node_num = self._map_node_to_number[node]
r0, r1 = self._graph.neighbors_2nd(node_num)
return r0[:-1], r1
return r0, r1

def node_value_from_id(self, node_id: int) -> NODES_TYPES:
"""convert lol ID to node value"""
Expand Down
9 changes: 5 additions & 4 deletions grma/match/lol_graph.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,14 @@ cdef class LolGraph:
cdef UINT16[:] arr
cdef np.ndarray[UINT16, ndim=2] neighbors_value
cdef UINT num_of_neighbors_2nd
cdef UINT loci_len

idx = self._index_list[node]
idx_end = self._index_list[node + 1]

neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
for i in range(idx, idx_end):
neighbors_list_id[i - idx] = self._neighbors_list[i]

num_of_neighbors_2nd = <UINT>self._weights_list[idx]

neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
Expand All @@ -177,11 +177,12 @@ cdef class LolGraph:
neighbors_id[pointer] = self._neighbors_list[j]
pointer += 1

neighbors_value = np.zeros((num_of_neighbors_2nd - 1, 10), dtype=np.uint16)
for i in range(len(neighbors_id) - 1):
loci_len = <UINT> self._map_number_to_arr_node.shape[1]
neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
for i in range(len(neighbors_id)):
neighbor_id = neighbors_id[i]
arr = self.arr_node_value_from_id(neighbor_id)
for j in range(10):
for j in range(loci_len):
neighbors_value[i, j] = arr[j]

return neighbors_id, neighbors_value
Loading