-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdna_analysis.py
More file actions
85 lines (74 loc) · 3.14 KB
/
dna_analysis.py
File metadata and controls
85 lines (74 loc) · 3.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import collections
from typing import Dict
class InvalidNucleotideError(Exception):
    """Error raised when a DNA sequence contains an invalid nucleotide."""
def _validate_dna_sequence(sequence:str)->str:
""""
Ensures only A,T,G,C nucleotides are present in the DNA sequence.
"""
valid_nucleotides= {'A','T','G','C'}
invalid_characters = set(sequence) - set(valid_nucleotides)
if invalid_characters:
raise InvalidNucleotideError(
f"Invalid nucleotides found: {invalid_characters}"
f"Only {valid_nucleotides} are allowed in the DNA sequence"
)
return sequence
def count_nucleotides(sequence: str) -> Dict[str, int]:
    """Count how often each nucleotide occurs in the DNA sequence.

    The sequence is validated first, so an invalid character raises
    InvalidNucleotideError. The result always contains the keys A, T, G
    and C (in that order); nucleotides that do not occur map to 0.
    """
    validated = _validate_dna_sequence(sequence)
    tally = collections.Counter(validated)
    # A Counter yields 0 for missing keys, so absent nucleotides are reported as 0.
    return {base: tally[base] for base in ("A", "T", "G", "C")}
def calculate_dna_percentage(sequence: str) -> Dict[str, float]:
    """Compute each nucleotide's share of the DNA sequence as a percentage.

    The sequence is validated first, so an invalid character raises
    InvalidNucleotideError. Percentages are rounded to two decimal places.
    An empty sequence yields 0.0 for every nucleotide.
    """
    validated = _validate_dna_sequence(sequence)
    total = len(validated)
    if not total:
        # Nothing to divide by: report zero for all four nucleotides.
        return dict.fromkeys("ATGC", 0.0)

    percentages = {}
    for nucleotide, occurrences in count_nucleotides(validated).items():
        percentages[nucleotide] = round((occurrences / total) * 100, 2)
    return percentages
def generate_analysis_report(sequence: str, name: str = "Sequence") -> str:
    """Generate a text report describing the composition of a DNA sequence.

    The report lists the sequence, its length, a table with the count and
    percentage of each nucleotide, and the combined GC and AT content.

    Args:
        sequence: The DNA sequence to analyse.
        name: Label used for the sequence in the report title.

    Returns:
        The formatted multi-line report.

    Raises:
        InvalidNucleotideError: If the sequence contains a character other
            than A, T, G or C.
    """
    sequence = _validate_dna_sequence(sequence)
    length = len(sequence)
    count = count_nucleotides(sequence)
    frequency = calculate_dna_percentage(sequence)

    # Compute GC/AT content from the raw counts. Adding two percentages that
    # were each already rounded to 2 decimals compounds the rounding error
    # (e.g. "GCA" would report 66.66% GC instead of 66.67%).
    if length:
        gc_content = (count["G"] + count["C"]) / length * 100
        at_content = (count["A"] + count["T"]) / length * 100
    else:
        gc_content = at_content = 0.0

    # The first column is 12 wide: "Nucleotide" is exactly 10 characters, so
    # a 10-wide column would leave no gap before the next column.
    lines = [
        f"\n Nucleotide analysis report for {name}\n",
        f"Sequence: {sequence}\n",
        f"Length: {length} base pairs\n",
        "{:<12}{:<10}{:<12}\n".format("Nucleotide", "Count", "Frequency(%)"),
        "-" * 34 + "\n",
    ]
    lines.extend(
        "{:<12}{:<10}{:<12.2f}\n".format(n, count[n], frequency[n])
        for n in sorted(count)
    )
    lines.append(
        f"\nGC Content: {gc_content:.2f}%\nAT Content: {at_content:.2f}%\n"
    )
    return "".join(lines)
def compare_nucleotide_composition(seq1: str, name1: str, seq2: str, name2: str) -> str:
    """Compare the nucleotide composition of two DNA sequences.

    Produces a side-by-side table of the percentage of each nucleotide in
    both sequences, followed by their GC and AT content.

    Args:
        seq1: The first DNA sequence.
        name1: Column label for the first sequence.
        seq2: The second DNA sequence.
        name2: Column label for the second sequence.

    Returns:
        The formatted multi-line comparison.

    Raises:
        InvalidNucleotideError: If either sequence contains a character
            other than A, T, G or C.
    """

    def _combined_percentage(counts, first, second):
        # Percentage of the sequence made up of the two given nucleotides,
        # computed from raw counts so no rounding error is compounded.
        total = sum(counts.values())
        if not total:
            return 0.0
        return (counts[first] + counts[second]) / total * 100

    frequency_1 = calculate_dna_percentage(seq1)
    frequency_2 = calculate_dna_percentage(seq2)
    counts_1 = count_nucleotides(seq1)
    counts_2 = count_nucleotides(seq2)

    gc_1 = _combined_percentage(counts_1, "G", "C")
    gc_2 = _combined_percentage(counts_2, "G", "C")
    at_1 = _combined_percentage(counts_1, "A", "T")
    at_2 = _combined_percentage(counts_2, "A", "T")

    # The label column is 12 wide: "Nucleotide", "GC-Content" and
    # "AT-Content" are all exactly 10 characters, so a 10-wide column would
    # run straight into the first value.
    row = "{:<12}{:<15.2f}{:<15.2f}\n"
    lines = [
        f"\nComparison of two DNA sequences- {name1} and {name2}\n",
        "{:<12}{:<15}{:<15}\n".format("Nucleotide", name1, name2),
        "-" * 42 + "\n",
    ]
    lines.extend(
        row.format(n, frequency_1[n], frequency_2[n])
        for n in sorted(frequency_1)
    )
    lines.append("\n")
    lines.append(row.format("GC-Content", gc_1, gc_2))
    lines.append(row.format("AT-Content", at_1, at_2))
    return "".join(lines)