Skip to content

Commit aed16a4

Browse files
authored
Merge pull request #26 from Xinglab/dev-3
Dev 3 Reduce mem usage while solving assertion error
2 parents 7bbd8db + 4038e78 commit aed16a4

File tree

2 files changed

+16
-7
lines changed

2 files changed

+16
-7
lines changed

CLAM/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
""" General Version and other info
44
"""
55

6-
__version__ = '1.2.2'
6+
__version__ = '1.2.3'
77
__author__ = 'Zijun Zhang'
88
__email__ = 'zj.z@ucla.edu'

CLAM/preprocessor.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import bisect
3535
import argparse as ap
3636
import inspect
37+
import hashlib
3738

3839

3940
logger = logging.getLogger('CLAM.Preprocessor')
@@ -118,17 +119,23 @@ def filter_bam_multihits(filename, max_tags, max_hits, out_dir, read_tagger_meth
118119

119120
tagged_read = pysam.AlignedSegment()
120121
tagged_read.query_name = read.query_name
121-
tagged_read.query_sequence = read.query_sequence
122+
tagged_read.query_sequence = 'N'
122123
tagged_read.flag = read.flag
123124
tagged_read.reference_id = read.reference_id
124125
tagged_read.reference_start = read_tag - 1 # 0-based leftmost coordinate
125126
tagged_read.mapping_quality = read.mapping_quality
126-
tagged_read.cigar = read.cigar
127+
tagged_read.cigar = ((0, 1),)
127128
tagged_read.template_length = read.template_length
128-
tagged_read.query_qualities = read.query_qualities#pysam.qualitystring_to_array("<")
129+
tagged_read.query_qualities = pysam.qualitystring_to_array("<")
129130
tagged_read.tags = read.tags
130-
read_len = sum([i[1] for i in read.cigar if i[0]==0])
131+
read_len = sum([i[1] for i in read.cigar if i[0] == 0])
131132
tagged_read.tags += [('RL', read_len)]
133+
if len(read.query_sequence) >= 32:
134+
tagged_read.tags += [('SQ',
135+
hashlib.md5(read.query_sequence.encode('utf-8')).hexdigest())]
136+
else:
137+
tagged_read.tags += [('SQ', read.query_sequence)]
138+
132139

133140
# add lib_type check
134141
if lib_type != "unstranded":
@@ -186,7 +193,7 @@ def collapse_stack(stack, collapse_dict, max_tags):
186193
new_alignment_list = []
187194
new_alignment_dict = defaultdict(list)
188195
for aln in stack:
189-
new_alignment_dict[aln.query_sequence].append(aln)
196+
new_alignment_dict[aln.tags[-1][1]].append(aln)
190197

191198
# TODO 2017.10.21:
192199
# further collapse `new_alignment_dict`
@@ -206,7 +213,9 @@ def collapse_stack(stack, collapse_dict, max_tags):
206213
target_alignment = new_alignment_dict[seq][0:max_tags]
207214
for aln_qname in this_alignment_qname_list:
208215
collapse_dict[aln_qname] = [x.qname for x in target_alignment]
209-
new_alignment_list.extend( target_alignment )
216+
for read in target_alignment:
217+
read.tags=read.tags[:-1]
218+
new_alignment_list.append( read )
210219
return new_alignment_list, collapse_dict
211220

212221

0 commit comments

Comments
 (0)