diff --git a/.gitignore b/.gitignore index a00a6c0..df93901 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,26 @@ -*profile* \ No newline at end of file +# Files generated by invoking Julia with --code-coverage +*.jl.cov +*.jl.*.cov + +# Files generated by invoking Julia with --track-allocation +*.jl.mem + +# System-specific files and directories generated by the BinaryProvider and BinDeps packages +# They contain absolute paths specific to the host computer, and so should not be committed +deps/deps.jl +deps/build.log +deps/downloads/ +deps/usr/ +deps/src/ + +# Build artifacts for creating documentation generated by the Documenter package +docs/build/ +docs/site/ + +# File generated by Pkg, the package manager, based on a corresponding Project.toml +# It records a fixed state of all packages used by the project. As such, it should not be +# committed for packages, but should be committed for applications that require a static +# environment. +Manifest.toml + +*profile* diff --git a/src/Evolve/Evolve.jl b/src/Evolve/Evolve.jl index fb3ba97..2afdbd3 100644 --- a/src/Evolve/Evolve.jl +++ b/src/Evolve/Evolve.jl @@ -4,9 +4,12 @@ using BioSequences using StatsBase using SubstitutionModels using TreeTools +using FASTX +using Dates export evolve! export compute_mutations! +export write_seq2fasta include("main.jl") diff --git a/src/Evolve/main.jl b/src/Evolve/main.jl index ce7e4ca..57bd224 100644 --- a/src/Evolve/main.jl +++ b/src/Evolve/main.jl @@ -1,22 +1,59 @@ """ evolve!(t::Tree, L::Int, μ=1.; model = JC69(1.), seqkey=:seq) + + initialize a random sequence of nucleotides of length `L` at the root, then simulate evolution + by changing nucleotides at random according to a probability distribution specified + by the chosen `model`, which takes the mutation rate μ and branch lengths of nodes as parameters. + + The sequence at a node `n` can be accessed by `n.data.dat[seqkey]`, if a mutation has occured + on the branch between node `n` and its ancestor is given as a boolean in `n.data.dat["evolved"]`. """ function evolve!(t::Tree, L::Int, μ=1.; model = JC69(1.), seqkey=:seq) t.root.data.dat[seqkey] = randseq(DNAAlphabet{4}(), L) + t.root.data.dat["evolved"]= false for c in t.root.child evolve!(c, model, μ, seqkey) end end function evolve!(n::TreeNode, model, μ, seqkey=:seq) + n.data.dat["evolved"]= false w = ProbabilityWeights(SubstitutionModels.P(model, μ * n.tau)[:,1], 1.) n.data.dat[seqkey] = deepcopy(n.anc.data.dat[seqkey]) - evolve!(n.data.dat[seqkey], w) + evolve_seq!(n, w, seqkey) for c in n.child evolve!(c, model, μ, seqkey) end end +function evolve_seq!(n::TreeNode, w, seqkey=:seq) + seq = n.data.dat[seqkey] + for (i,nt) in enumerate(seq) + newnt = sample(1:4, w) + if newnt != 1 + n.data.dat["evolved"] = true + if nt == DNA_A + if newnt == 2; seq[i] = DNA_C + elseif newnt == 3; seq[i] = DNA_G + elseif newnt == 4; seq[i] = DNA_T; end + elseif nt == DNA_C + if newnt == 2; seq[i] = DNA_G + elseif newnt == 3; seq[i] = DNA_T + elseif newnt == 4; seq[i] = DNA_A; end + elseif nt == DNA_G + if newnt == 2; seq[i] = DNA_T + elseif newnt == 3; seq[i] = DNA_A + elseif newnt == 4; seq[i] = DNA_C; end + elseif nt == DNA_T + if newnt == 2; seq[i] = DNA_A + elseif newnt == 3; seq[i] = DNA_C + elseif newnt == 4; seq[i] = DNA_G; end + end + end + end + return nothing +end + function evolve!(seq::NucSeq{4, DNAAlphabet{4}}, Q) Qt = Q' for (i, nt) in enumerate(seq) @@ -49,3 +86,38 @@ function onehot(c::DNA) end return s end + +""" + write_seq2fasta(t::Tree, fasta_name::String, output_dir::String, seqkey=:seq; only_terminals=false, remove_0_mutations=true) + + write evolved sequences to a fasta file with name `fasta_name`, `only_terminals` specifies if only terminal sequences should be + written to the file and `remove_0_mutations` if only sequences with mutations on their branches should be written to the file. +""" +function write_seq2fasta(t::Tree, fasta_name::String, output_dir::String, seqkey=:seq; + only_terminals=false, remove_0_mutations=true, write_date=false, year_rate=4.73e-3) + mkpath(output_dir) + open(FASTA.Writer, output_dir *"/"*fasta_name* ".fasta") do w + if only_terminals + iter = POTleaves(t) + else + iter = POT(t) + end + for node in iter + if remove_0_mutations && !node.data.dat["evolved"] + continue + else + x = node.data.dat[seqkey] + if write_date + start_date = Date(2000,1,1) + dist = TreeTools.distance(node, t.root)/year_rate + date = start_date+Dates.Year(round(dist)) + Dates.Day(round((dist-round(dist))/365)) + rec = FASTA.Record(node.label*"|"*string(date), x) + else + rec = FASTA.Record(node.label, x) + end + write(w, rec) + end + end + end + +end