-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfragmentplot.py
More file actions
86 lines (61 loc) · 2.24 KB
/
fragmentplot.py
File metadata and controls
86 lines (61 loc) · 2.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env python3
"""
# By Jana Alghoraibi
# This script parse alignment output in sam tab-delimited file format and generates a fragment recruitment-
plot using matplotlib graphing module. The script accept 3 arguments including: sam input file, reference genome name,
and sample id.
# Example on how to run the script:
python3 fragmentplot.py SRX001355bowtie27.md.sam 'Parabacteroides distasonis' 'Sample: SRX001355'
"""
from matplotlib import pyplot as plt
import sys
import re
# Takes input sam file with aligned reads
input = sys.argv[1]
reference = sys.argv[2]
sampleid = sys.argv[3]
# Set lists for %identity and bp positions
y_match = []
x_match = []
genome = []
# Open file, read each line, extracts the alignment positions, the CIGAR and MD:Z columns to calculate % identity
with open(input) as input_file:
# Parse each line in the file after skip headers
for lines in input_file:
if lines.startswith("@SQ"):
l = re.search('(LN:)(\d+)', lines)
genome_length = int(l.group(2))
genome.append(genome_length)
genome.sort()
genome_size = genome[-1]
if lines.startswith("@"):
continue
else:
line = lines.rstrip("\n").split()
# Extract mapping position
position = int(line[3])
x_match.append(position)
# find CIGAR line and extract value of match/mismatch M
CIGAR = line[5]
m = re.search('([0-9]+)([M])', CIGAR)
CIGAR_M = int(m.group(1))
# Find MD:Z line and calculate the number of matches
MD = line[13]
n = re.search('[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', MD)
MDZ = n.group()
matches = 0
for entry in MDZ:
if entry.isdigit():
matches += int(entry)
identity = round((matches / CIGAR_M) * 100)
# print(identity)
y_match.append(identity)
# Create fragment recruitment plot using bp positions 'x-axis' and %identity 'y-axis'
plt.scatter(x_match, y_match)
plt.title(sampleid)
plt.suptitle(reference)
plt.xlabel('Genome Position (bp)')
plt.xlim(0, genome_size + 1)
plt.ylabel('% Identity')
plt.ylim([0, 101])
plt.show()