3434import bisect
3535import argparse as ap
3636import inspect
37+ import hashlib
3738
3839
3940logger = logging .getLogger ('CLAM.Preprocessor' )
@@ -118,17 +119,23 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth
118119
119120 tagged_read = pysam .AlignedSegment ()
120121 tagged_read .query_name = read .query_name
121- tagged_read .query_sequence = read . query_sequence
122+ tagged_read .query_sequence = 'N'
122123 tagged_read .flag = read .flag
123124 tagged_read .reference_id = read .reference_id
124125 tagged_read .reference_start = read_tag - 1 # 0-based leftmost coordinate
125126 tagged_read .mapping_quality = read .mapping_quality
126- tagged_read .cigar = read . cigar
127+ tagged_read .cigar = (( 0 , 1 ),)
127128 tagged_read .template_length = read .template_length
128- tagged_read .query_qualities = read . query_qualities # pysam.qualitystring_to_array("<")
129+ tagged_read .query_qualities = pysam .qualitystring_to_array ("<" )
129130 tagged_read .tags = read .tags
130- read_len = sum ([i [1 ] for i in read .cigar if i [0 ]== 0 ])
131+ read_len = sum ([i [1 ] for i in read .cigar if i [0 ] == 0 ])
131132 tagged_read .tags += [('RL' , read_len )]
133+ if len (read .query_sequence ) >= 32 :
134+ tagged_read .tags += [('SQ' ,
135+ hashlib .md5 (read .query_sequence .encode ('utf-8' )).hexdigest ())]
136+ else :
137+ tagged_read .tags += [('SQ' , read .query_sequence )]
138+
132139
133140 # add lib_type check
134141 if lib_type != "unstranded" :
@@ -186,7 +193,7 @@ def collapse_stack(stack, collapse_dict, max_tags):
186193 new_alignment_list = []
187194 new_alignment_dict = defaultdict (list )
188195 for aln in stack :
189- new_alignment_dict [aln .query_sequence ].append (aln )
196+ new_alignment_dict [aln .tags [ - 1 ][ 1 ] ].append (aln )
190197
191198 # TODO 2017.10.21:
192199 # further collapse `new_alignment_dict`
@@ -206,7 +213,9 @@ def collapse_stack(stack, collapse_dict, max_tags):
206213 target_alignment = new_alignment_dict [seq ][0 :max_tags ]
207214 for aln_qname in this_alignment_qname_list :
208215 collapse_dict [aln_qname ] = [x .qname for x in target_alignment ]
209- new_alignment_list .extend ( target_alignment )
216+ for read in target_alignment :
217+ read .tags = read .tags [:- 1 ]
218+ new_alignment_list .append ( read )
210219 return new_alignment_list , collapse_dict
211220
212221
0 commit comments