diff --git a/src/trim_primer_quality.cpp b/src/trim_primer_quality.cpp index ecea3161..257582d4 100755 --- a/src/trim_primer_quality.cpp +++ b/src/trim_primer_quality.cpp @@ -1,5 +1,5 @@ #include "trim_primer_quality.h" - +#include #define round_int(x, total) \ ((int)(0.5 + ((float)x / float(total)) * 10000)) / (float)100 @@ -388,90 +388,6 @@ std::vector insertionSort(std::vector primers, uint32_t n) { return primers; } -// For paired reads -void get_overlapping_primers(bam1_t *r, std::vector &primers, - std::vector &overlapped_primers) { - overlapped_primers.clear(); - uint32_t start_pos = -1; - char strand = '+'; - if (bam_is_rev(r)) { - start_pos = bam_endpos(r) - 1; - strand = '-'; - } else { - start_pos = r->core.pos; - } - - int low = 0; - int high = primers.size(); - // then we iterate and push what fits - int loc_exact = binary_search(primers, start_pos, low, - high); - primer possible_match; - if(loc_exact >= low && loc_exact < high){ - possible_match = primers[loc_exact]; - if(start_pos >= possible_match.get_start() && start_pos <= possible_match.get_end() && - (strand == possible_match.get_strand() || possible_match.get_strand() == 0)){ - overlapped_primers.push_back(possible_match); - } - } - int loc = 0; - bool done_right = false; - bool done_left = false; - int i = 1; - while(!done_left && !done_right){ - loc = loc_exact + i; - if(loc >= low && loc < high){ - possible_match = primers[loc]; - - if(start_pos >= possible_match.get_start() && start_pos <= possible_match.get_end() && - (strand == possible_match.get_strand() || possible_match.get_strand() == 0)){ - overlapped_primers.push_back(possible_match); - } - if(start_pos < possible_match.get_start()){ - done_right = true; - } - } else{ - done_right = true; - } - - loc = loc_exact - i; - if(loc >= low && loc < high){ - possible_match = primers[loc]; - if(start_pos >= possible_match.get_start() && start_pos <= possible_match.get_end() && - (strand == possible_match.get_strand() || possible_match.get_strand() == 0)){ - overlapped_primers.push_back(possible_match); - } - if(start_pos > possible_match.get_end()){ - done_left = true; - } - } else{ - done_left = true; - } - i++; - } -} - -// For unpaired reads -void get_overlapping_primers(bam1_t *r, std::vector primers, - std::vector &overlapped_primers, - bool unpaired_rev) { - overlapped_primers.clear(); - uint32_t start_pos = -1; - char strand = '+'; - if (unpaired_rev) { - start_pos = bam_endpos(r) - 1; - strand = '-'; - } else { - start_pos = r->core.pos; - } - for (std::vector::iterator it = primers.begin(); it != primers.end(); - ++it) { - if (start_pos >= it->get_start() && start_pos <= it->get_end() && - (strand == it->get_strand() || it->get_strand() == 0)) - overlapped_primers.push_back(*it); - } -} - void condense_cigar(cigar_ *t) { uint32_t i = 0, len = 0, cig, next_cig; while (i < t->nlength - 1) { @@ -542,6 +458,36 @@ int iterate_aln(std::vector::iterator &aln_itr, return iterate_reads; } +std::vector>> find_primer_per_position(std::vector primers){ + //end pos of last primer + uint32_t last_pos = 0; + for (uint32_t i = 0; i < primers.size(); i++){ + if (primers[i].get_end() > last_pos){ + last_pos = primers[i].get_end(); + } + } + std::map> primer_map_forward; + std::map> primer_map_reverse; + for(uint32_t j = 0; j < last_pos; j++){ + for (uint32_t i = 0; i < primers.size(); i++){ + uint32_t start = primers[i].get_start(); + uint32_t end = primers[i].get_end(); + char strand = primers[i].get_strand(); + if(start <= j && j <= end){ + if(strand == '+'){ + primer_map_forward[j].push_back(primers[i]); + }else{ + primer_map_reverse[j].push_back(primers[i]); + } + } + } + } + std::vector>> hash; + hash.push_back(primer_map_forward); + hash.push_back(primer_map_reverse); + return hash; +} + int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, uint8_t min_qual, uint8_t sliding_window, std::string cmd, bool write_no_primer_reads, @@ -567,6 +513,11 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, amplicons.inOrder(); } + // calculate the primers that should cover each position + std::vector>> hash = find_primer_per_position(primers); + std::map> primer_map_forward = hash[0]; + std::map> primer_map_reverse = hash[1]; + // Read in input file samFile *in; @@ -642,12 +593,31 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, std::vector sorted_primers = insertionSort(primers, primers.size()); std::vector::iterator aln_itr = alns.begin(); + uint32_t start_pos = -1; + char strand = '+'; // Iterate through reads while (iterate_aln(aln_itr, alns, header, in, aln) >= 0) { unmapped_flag = false; primer_trimmed = false; - get_overlapping_primers(aln, sorted_primers, overlapping_primers); + strand = '+'; + if (bam_is_rev(aln)) { + start_pos = bam_endpos(aln) - 1; + strand = '-'; + } else { + start_pos = aln->core.pos; + } + overlapping_primers.clear(); + if(strand == '+'){ + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers = primer_map_forward[start_pos]; + } + }else{ + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()){ + overlapping_primers = primer_map_reverse[start_pos]; + } + } + if ((aln->core.flag & BAM_FUNMAP) == 0) { // If mapped // if primer pair info provided, check if read correctly overlaps with // atleast one amplicon @@ -672,7 +642,16 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, (abs(aln->core.isize) - max_primer_len) > abs(aln->core.l_qseq); // if reverse strand if ((aln->core.flag & BAM_FPAIRED) != 0 && isize_flag) { // If paired - get_overlapping_primers(aln, sorted_primers, overlapping_primers); + overlapping_primers.clear(); + if(strand == '+'){ + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers = primer_map_forward[start_pos]; + } + }else{ //FIX THIS DUMMY + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()) { + overlapping_primers = primer_map_reverse[start_pos]; + } + } if (overlapping_primers.size() > 0) { // If read starts before overlapping regions (?) primer_trimmed = true; @@ -707,7 +686,11 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, // handle the unpaired reads // forward primer unpaired read - get_overlapping_primers(aln, primers, overlapping_primers, false); + start_pos = aln->core.pos; + overlapping_primers.clear(); + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers = primer_map_forward[start_pos]; + } if (overlapping_primers.size() > 0) { primer_trimmed = true; cand_primer = get_max_end(overlapping_primers); @@ -721,8 +704,12 @@ int trim_bam_qual_primer(std::string bam, std::string bed, std::string bam_out, } // reverse primer unpaired read - get_overlapping_primers(aln, primers, overlapping_primers, true); - + overlapping_primers.clear(); + start_pos = bam_endpos(aln) - 1; + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()) { + overlapping_primers = primer_map_reverse[start_pos]; + } + if (overlapping_primers.size() > 0) { primer_trimmed = true; cand_primer = get_min_start(overlapping_primers); diff --git a/src/trim_primer_quality.h b/src/trim_primer_quality.h index 91ec6c52..69358a89 100755 --- a/src/trim_primer_quality.h +++ b/src/trim_primer_quality.h @@ -1,6 +1,6 @@ #include #include - +#include #include #include #include @@ -50,17 +50,13 @@ int32_t get_pos_on_reference(uint32_t *cigar, uint32_t ncigar, uint32_t pos, void reverse_qual(uint8_t *q, int l); void reverse_cigar(uint32_t *cigar, int l); double mean_quality(uint8_t *a, int s, int e); +std::vector>> find_primer_per_position(std::vector primers); cigar_ quality_trim(bam1_t *r, uint8_t qual_threshold, uint8_t sliding_window); void print_cigar(uint32_t *cigar, int nlength); cigar_ primer_trim(bam1_t *r, bool &isize_flag, int32_t new_pos, bool unpaired_rev); void replace_cigar(bam1_t *b, uint32_t n, uint32_t *cigar); void condense_cigar(cigar_ *t); -void get_overlapping_primers(bam1_t *r, std::vector &primers, - std::vector &overlapping_primers); -void get_overlapping_primers(bam1_t *r, std::vector primers, - std::vector &overlapping_primers, - bool unpaired_rev); int get_bigger_primer(std::vector primers); bool amplicon_filter(IntervalTree amplicons, bam1_t *r); std::vector insertionSort(std::vector primers, uint32_t n); diff --git a/tests/test_primer_trim.cpp b/tests/test_primer_trim.cpp index dbc4a18c..c16e3988 100755 --- a/tests/test_primer_trim.cpp +++ b/tests/test_primer_trim.cpp @@ -80,7 +80,30 @@ int main() { isize_flag = (abs(aln->core.isize) - max_primer_len) > abs(aln->core.l_qseq); std::cout << bam_get_qname(aln) << std::endl; - get_overlapping_primers(aln, sorted_primers, overlapping_primers); + std::vector>> hash = find_primer_per_position(primers); + std::map> primer_map_forward = hash[0]; + std::map> primer_map_reverse = hash[1]; + + uint32_t start_pos = -1; + char strand = '+'; + if (bam_is_rev(aln)) { + start_pos = bam_endpos(aln) - 1; + strand = '-'; + } else { + start_pos = aln->core.pos; + } + overlapping_primers.clear(); + if(strand == '+'){ + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers = primer_map_forward[start_pos]; + } + }else{ + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()){ + overlapping_primers = primer_map_reverse[start_pos]; + } + } + //get_overlapping_primers(aln, sorted_primers, overlapping_primers); + if (overlapping_primers.size() != overlapping_primer_sizes[ctr]) { success = -1; std::cout << "Overlapping primer sizes for " << bam_get_qname(aln) diff --git a/tests/test_primer_trim_edge_cases.cpp b/tests/test_primer_trim_edge_cases.cpp index 2e4704fb..e71c1114 100755 --- a/tests/test_primer_trim_edge_cases.cpp +++ b/tests/test_primer_trim_edge_cases.cpp @@ -25,6 +25,10 @@ int main() { } } } + // calculate the primers that should cover each position + std::vector>> hash = find_primer_per_position(primers); + std::map> primer_map_forward = hash[0]; + std::map> primer_map_reverse = hash[1]; bam_hdr_t *header = sam_hdr_read(in); region_.assign(header->target_name[0]); std::string temp(header->text); @@ -39,16 +43,23 @@ int main() { std::vector overlapping_primers; primer cand_primer; bool isize_flag = false; + uint32_t start_pos = -1; while (sam_itr_next(in, iter, aln) >= 0) { if ((aln->core.flag & BAM_FUNMAP) != 0) { continue; } + + start_pos = aln->core.pos; isize_flag = (abs(aln->core.isize) - max_primer_len) > abs(aln->core.l_qseq); len = bam_cigar2qlen(aln->core.n_cigar, bam_get_cigar(aln)); std::cout << bam_get_qname(aln) << std::endl; print_cigar(bam_get_cigar(aln), aln->core.n_cigar); - get_overlapping_primers(aln, primers, overlapping_primers, false); + overlapping_primers.clear(); + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers = primer_map_forward[start_pos]; + } + //get_overlapping_primers(aln, primers, overlapping_primers, false); if (overlapping_primers.size() > 0) { // Forward trim cand_primer = get_max_end(overlapping_primers); @@ -61,7 +72,13 @@ int main() { cigar = bam_get_cigar(aln); print_cigar(cigar, aln->core.n_cigar); // Reverse trim - get_overlapping_primers(aln, primers, overlapping_primers, true); + start_pos = bam_endpos(aln) - 1; + overlapping_primers.clear(); + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()){ + overlapping_primers = primer_map_reverse[start_pos]; + } +; + //get_overlapping_primers(aln, primers, overlapping_primers, true); if (overlapping_primers.size() > 0) { cand_primer = get_min_start(overlapping_primers); t = primer_trim(aln, isize_flag, cand_primer.get_start() - 1, true); @@ -82,7 +99,7 @@ int main() { success = -1; std::cout << "Cigar length and read length don't match after trimming" << std::endl; - std::cout << "Expected" << len << ". Got " + std::cout << "Expected " << len << ". Got " << bam_cigar2qlen(aln->core.n_cigar, bam_get_cigar(aln)) << std::endl; } diff --git a/tests/test_unpaired_trim.cpp b/tests/test_unpaired_trim.cpp index 8c64bcb7..85e045ce 100755 --- a/tests/test_unpaired_trim.cpp +++ b/tests/test_unpaired_trim.cpp @@ -77,6 +77,12 @@ int main() { std::vector overlapping_primers; primer cand_primer; bool isize_flag = false; + uint32_t start_pos = -1; + // calculate the primers that should cover each position + std::vector>> hash = find_primer_per_position(primers); + std::map> primer_map_forward = hash[0]; + std::map> primer_map_reverse = hash[1]; + while (sam_itr_next(in, iter, aln) >= 0) { if ((aln->core.flag & BAM_FUNMAP) != 0) { continue; @@ -85,10 +91,17 @@ int main() { (abs(aln->core.isize) - max_primer_len) > abs(aln->core.l_qseq); std::cout << bam_get_qname(aln) << std::endl; std::cout << std::endl << "Forward" << std::endl; - get_overlapping_primers(aln, primers, overlapping_primers, false); + overlapping_primers.clear(); + start_pos = aln->core.pos; + if (primer_map_forward.find(start_pos) != primer_map_forward.end()) { + overlapping_primers.clear(); + overlapping_primers = primer_map_forward[start_pos]; + } + start_pos = bam_endpos(aln) - 1; // Forward primer if (overlapping_primers.size() > 0) { cand_primer = get_max_end(overlapping_primers); + std::cout << cand_primer.get_start() << " " << cand_primer.get_end() << std::endl; t = primer_trim(aln, isize_flag, cand_primer.get_end() + 1, false); aln->core.pos += t.start_pos; replace_cigar(aln, t.nlength, t.cigar); @@ -108,7 +121,12 @@ int main() { } // Reverse primer std::cout << std::endl << "Reverse" << std::endl; - get_overlapping_primers(aln, primers, overlapping_primers, true); + overlapping_primers.clear(); + if (primer_map_reverse.find(start_pos) != primer_map_reverse.end()){ + overlapping_primers = primer_map_reverse[start_pos]; + } + + //get_overlapping_primers(aln, primers, overlapping_primers, true); if (overlapping_primers.size() > 0) { cand_primer = get_min_start(overlapping_primers); free_cigar(t);