Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
fb0b0c5
dirty impl of tkf92, not tested in this repo.
MattesMrzik Sep 17, 2025
50995e8
added reassignment and fixed almost all warnings
MattesMrzik Sep 18, 2025
84af102
added toy msa tree tkf92 test
MattesMrzik Sep 18, 2025
6b80f73
test substitutions dont depend on indel points
MattesMrzik Sep 19, 2025
6fceced
test reassignment parallel, dp prob not same as calculated from
MattesMrzik Sep 22, 2025
b64319e
forgot to commit test file
MattesMrzik Sep 22, 2025
d888403
fixed error mentioned in last commit, parallel
MattesMrzik Sep 23, 2025
320dff9
compare runtimes of pip vs tkf cost (including reestimation)
MattesMrzik Oct 8, 2025
872dfb0
removed reassignment
MattesMrzik Oct 8, 2025
fbd2db2
TKFModelSearchCost & tests, still unused warning
MattesMrzik Oct 15, 2025
435b42f
Merge remote-tracking branch 'upstream' into 58-TKF92
MattesMrzik Oct 15, 2025
55f6a82
missing added file change from last commit
MattesMrzik Oct 15, 2025
09deae4
added pub to TKFCostBuilder
MattesMrzik Oct 15, 2025
be0f69e
removed unnecessary import
MattesMrzik Oct 15, 2025
f1d5fdf
sanity tests for model parameter change
MattesMrzik Oct 16, 2025
27c80a6
params&freqs test, dont make indel model info invalid on substparam c…
MattesMrzik Oct 16, 2025
fd72ea4
fixed old bug
MattesMrzik Oct 16, 2025
57e4b42
clean up and removing duplicate code
MattesMrzik Oct 16, 2025
d09e977
more tests for coverage
MattesMrzik Oct 16, 2025
e93addb
added param valid checks to builder and set_param
MattesMrzik Oct 16, 2025
d71f35c
tkf trait and tkf91 impl, no tkf91 tests yet
MattesMrzik Oct 17, 2025
067ae57
tiny clean up
MattesMrzik Oct 17, 2025
83ba149
more coverage; tk92 r cannot be zero fix
MattesMrzik Oct 20, 2025
e6c2143
fix set param bug, strip fasta in info builder,
MattesMrzik Oct 28, 2025
431ea6c
Merge remote-tracking branch 'upstream' into 58-TKF92
MattesMrzik Oct 28, 2025
ec30d91
param ranges,
MattesMrzik Oct 29, 2025
40a1333
clean up,fixed compatible: had spr instead of nni
MattesMrzik Oct 29, 2025
2a19d81
tkf91 now without r, tkf92 r not zero in check
MattesMrzik Oct 29, 2025
644d635
Update phylo/src/phylo_info/phyloinfo_builder.rs
MattesMrzik Oct 29, 2025
6c54ff2
fix copilot review bug
MattesMrzik Oct 29, 2025
41344ad
Update phylo/src/tkf_model/mod.rs
MattesMrzik Oct 29, 2025
b483408
Update phylo/src/tkf_model/mod.rs
MattesMrzik Oct 29, 2025
be968d0
implemented most requested changes
MattesMrzik Oct 30, 2025
5f7451f
separated tkf into multiple files/modules
MattesMrzik Oct 30, 2025
a4dced0
reworked intermediate value caching
MattesMrzik Nov 3, 2025
006962e
split tests into multiple fns
MattesMrzik Nov 11, 2025
233d7a1
param range test
MattesMrzik Nov 11, 2025
3eac852
before removing caching
MattesMrzik Nov 12, 2025
817efe3
implemented requested changes
MattesMrzik Nov 13, 2025
c2e7b59
moved tkf indel stuff to separate file
MattesMrzik Nov 13, 2025
11f6393
more polish
MattesMrzik Nov 13, 2025
59bbf38
fixed borders of param range in tkf and tests
MattesMrzik Nov 14, 2025
be6f230
implemented requested changes
MattesMrzik Nov 18, 2025
53f16b3
Merge remote-tracking branch 'upstream/develop' into 58-TKF92
MattesMrzik Nov 18, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion phylo/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ log = "0.4.19"
# through the matrixmultiply peer-dep
nalgebra = "0.32.3"
ntimestamp = "1.0.0"
num_enum = "0.7.5"
ordered-float = "3.7.0"
pest = "2.7.2"
pest_derive = "2.7.2"
Expand All @@ -78,7 +79,6 @@ unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] }
criterion = "0.5.1"
pprof = { version = "0.14.0", features = ["criterion", "flamegraph"] }


[[bench]]
name = "tree_from_msa"
harness = false
Expand Down
2 changes: 1 addition & 1 deletion phylo/benches/helpers.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![allow(dead_code)]
/// this file is essentially a workaround for #[cfg(test)] like behaviour for the benchmarks
/// The dev-depencies are only available in benchmarks or tests
/// The dev-dependencies are only available in benchmarks or tests
use std::{collections::HashMap, hint::black_box, path::PathBuf, time::Duration};

use criterion::Criterion;
Expand Down
2 changes: 1 addition & 1 deletion phylo/data/tree_multiple.newick
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
((A:1.0,B:2.0):3.0,(C:3.5,D:0.5):5.25);
((E:1.0,D:2.0):3.0,(C:3.5,F:0.5):5.25);
(K:1,L:1);
(K:1,L:1);
18 changes: 17 additions & 1 deletion phylo/src/alignment/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ pub trait Alignment: Display + Clone + Debug {
pub trait AncestralAlignment: Alignment {
fn ancestral_seqs(&self) -> &Sequences;
fn ancestral_map(&self, node_idx: &NodeIdx) -> &Mapping;
fn ancestral_maps(&self) -> &SeqMaps;
fn update_ancestral_map(&mut self, node_idx: &NodeIdx, map: Mapping);
/// Checks if inputs are compatible and calls [`Self::from_aligned_with_ancestral_unchecked`].
/// Checks:
/// - if sequences are aligned
Expand Down Expand Up @@ -412,7 +414,7 @@ impl Alignment for MASA {
/// assert_eq!(i1_seq, "XXX");
/// let i1_map = phylo_info.msa.ancestral_map(&phylo_info.tree.by_id("I1").idx);
/// assert_eq!(i1_map, &vec![Some(0), Some(1), None, Some(2)]);
/// /// or use the align_seq marco to test seq and map at the same time
/// // or use the align_seq macro to test seq and map at the same time
/// # Ok(()) }
/// ```
fn from_aligned(sequences: Sequences, tree: &Tree) -> Result<Self> {
Expand Down Expand Up @@ -446,6 +448,20 @@ impl AncestralAlignment for MASA {
self.ancestral_maps.get(node).unwrap()
}

fn ancestral_maps(&self) -> &SeqMaps {
&self.ancestral_maps
}

// This is needed because with the TKF models we need to re-estimate the ancestral maps after
// a tree move is applied.
fn update_ancestral_map(&mut self, node_idx: &NodeIdx, map: Mapping) {
if let Some(anc_map) = self.ancestral_maps.get_mut(node_idx) {
*anc_map = map;
} else {
panic!("NodeIdx {node_idx} is not an internal node");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering -- should this really panic? Or could you print a warning/error instead.
The reason why I'm asking is whether this would be something that breaks the reconstruction or could it just be safely ignored.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like panic is fitting since if this method is called, there is something wrong with the logic of the calling code. When people use update_ancestral_map and then have code where a leaf is passed, sth is flawed with their logic and its nice to point that out directly as one might miss the warning in the log.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair, I agree with this explanation!

}
}

/// # Example
/// ```
/// # use bio::io::fasta::Record;
Expand Down
1 change: 1 addition & 0 deletions phylo/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub mod phylo_info;
pub mod pip_model;
pub mod random;
pub mod substitution_models;
pub mod tkf_model;
pub mod tree;

pub(crate) mod macros;
Expand Down
61 changes: 61 additions & 0 deletions phylo/src/optimisers/model_optimiser_tests.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::fmt::Display;
use std::path::Path;

use approx::assert_relative_eq;
Expand All @@ -12,6 +13,7 @@ use crate::substitution_models::{
dna_models::*, protein_models::*, FreqVector, QMatrix, QMatrixMaker, SubstModel,
SubstitutionCostBuilder as SCB,
};
use crate::tkf_model::{TKF91CostBuilder, TKF92CostBuilder};

#[test]
fn likelihood_improves_k80() {
Expand Down Expand Up @@ -574,3 +576,62 @@ fn stop_condition_epsilon() {
let mut costs = result.costs;
assert!(costs.pop().unwrap() - costs.pop().unwrap() < epsilon);
}

#[cfg(test)]
fn tkf_model_opti_template<C: ModelSearchCost + Clone + Display>(c: C) {
let initial_llik = c.cost();
let o = ModelOptimiser::new(c.clone(), FrequencyOptimisation::Fixed)
.run()
.unwrap();
let intermediate_cost = o.final_cost;
assert_eq!(initial_llik, o.initial_cost);
assert_eq!(o.final_cost, o.cost.cost());
assert!(o.initial_cost < o.final_cost);
assert_eq!(c.freqs(), o.cost.freqs());
for param in 0..c.param_count() {
assert_ne!(c.param(param), o.cost.param(param));
let valid_range = o.cost.param_range(param);
assert!(valid_range.0 <= o.cost.param(param) && o.cost.param(param) <= valid_range.1);
}

let o = ModelOptimiser::new(o.cost.clone(), FrequencyOptimisation::Empirical)
.run()
.unwrap();
assert_eq!(intermediate_cost, o.initial_cost);
assert_eq!(o.final_cost, o.cost.cost());
assert!(o.initial_cost < o.final_cost);
assert_eq!(o.cost.freqs(), &o.cost.empirical_freqs());
for param in 0..c.param_count() {
assert_ne!(c.param(param), o.cost.param(param));
let valid_range = o.cost.param_range(param);
assert!(valid_range.0 <= o.cost.param(param) && o.cost.param(param) <= valid_range.1);
}
}

#[test]
#[cfg_attr(feature = "ci_coverage", ignore)]
fn tkf91_model_opti() {
let fldr = Path::new("./data/pip/arpip/");
let info = PIB::with_attrs(fldr.join("msa.fasta"), fldr.join("tree.nwk"))
.build_with_ancestors()
.unwrap();
let subst_model = SubstModel::<HKY>::new(&[], &[2.0]);
let tkf91 = TKF91CostBuilder::new(0.8, 1.0, subst_model.clone(), info.clone())
.build()
.unwrap();
tkf_model_opti_template(tkf91);
}

#[test]
#[cfg_attr(feature = "ci_coverage", ignore)]
fn tkf92_model_opti() {
let fldr = Path::new("./data/pip/arpip/");
let info = PIB::with_attrs(fldr.join("msa.fasta"), fldr.join("tree.nwk"))
.build_with_ancestors()
.unwrap();
let subst_model = SubstModel::<HKY>::new(&[], &[2.0]);
let tkf92 = TKF92CostBuilder::new(0.8, 1.0, 0.2, subst_model, info)
.build()
.unwrap();
tkf_model_opti_template(tkf92);
}
2 changes: 2 additions & 0 deletions phylo/src/optimisers/spr_optimiser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ fn calc_spr_cost_with_blen_opt<C: TreeSearchCost + Clone + Display>(
cost_fn.update_tree(new_tree.clone());

let mut move_cost = cost_fn.cost();
// Branch length optimisation is skipped if the model does not support branch lengths
// or if the move_cost is already better since we apply the move as is.
if cost_fn.blen_optimisation() && move_cost <= base_cost {
// reoptimise branch length at the regraft location
let blen_opt = optimise_branch(&cost_fn, &regraft)?;
Expand Down
5 changes: 4 additions & 1 deletion phylo/src/optimisers/topo_optimiser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use itertools::Itertools;
use log::{debug, info};
use rand::{Rng, SeedableRng};

use crate::alignment::Alignment;
use crate::alignment::{Alignment, AncestralAlignment};
use crate::likelihood::TreeSearchCost;
use crate::optimisers::{
BranchOptimiser, MoveCostInfo, MoveOptimiser, NniOptimiser, SprOptimiser, StopCondition,
Expand All @@ -15,6 +15,7 @@ use crate::parsimony::{BasicParsimonyCost, DolloParsimonyCost};
use crate::pip_model::PIPCost;
use crate::random::RandomGenerator;
use crate::substitution_models::{QMatrix, SubstitutionCost};
use crate::tkf_model::{TKFCost, TKFIndelCost, TKFModel as TKFM};
use crate::tree::NodeIdx;
use crate::Result;

Expand All @@ -31,6 +32,8 @@ impl<S: ParsimonyScoring, A: Alignment> Compatible<SprOptimiser> for DolloParsim
impl<S: ParsimonyScoring, A: Alignment> Compatible<NniOptimiser> for DolloParsimonyCost<S, A> {}
impl<A: Alignment> Compatible<SprOptimiser> for BasicParsimonyCost<A> {}
impl<A: Alignment> Compatible<NniOptimiser> for BasicParsimonyCost<A> {}
impl<T: TKFM, AA: AncestralAlignment> Compatible<NniOptimiser> for TKFIndelCost<T, AA> {}
impl<Q: QMatrix, T: TKFM, AA: AncestralAlignment> Compatible<NniOptimiser> for TKFCost<Q, T, AA> {}

pub struct TopologyOptimiser<'a, MO, C, R>
where
Expand Down
6 changes: 4 additions & 2 deletions phylo/src/phylo_info/phyloinfo_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ impl<A: Alignment, AA: AncestralAlignment> PhyloInfoBuilder<A, AA> {
bail!("Building an ancestral alignment from unaligned sequences (including ancestral_sequencess) is not supported");
}
} else {
bail!("The number of sequences does not match the number of leaves nor the number of nodes in the tree");
bail!("The number of sequences ({}) does not match the number of leaves ({}) nor the number of nodes ({}) in the tree", sequences.len(), tree.n, tree.len());
}?;

Ok(PhyloInfo { tree, msa })
Expand Down Expand Up @@ -294,6 +294,7 @@ impl<A: Alignment, AA: AncestralAlignment> PhyloInfoBuilder<A, AA> {

/// Sets missing ids and bails if there are duplicates among the node ids that were already set.
pub(crate) fn set_missing_tree_node_ids(tree: &Tree) -> Result<Tree> {
info!("Setting missing tree node ids");
let mut tree_with_all_ids = tree.clone();
let mut seen_user_set_ids = HashSet::new();
let mut count = 0;
Expand All @@ -305,7 +306,8 @@ pub(crate) fn set_missing_tree_node_ids(tree: &Tree) -> Result<Tree> {
count += 1;
new_id = format!("I{count}");
}
tree_with_all_ids.nodes[usize::from(node_idx)].id = new_id;
tree_with_all_ids.nodes[usize::from(node_idx)].id = new_id.clone();
info!("Set missing id of node {node_idx} to {new_id}");
} else if !seen_user_set_ids.insert(id.to_string()) {
bail!("Duplicate id ({id}) found in the leaves of the tree");
}
Expand Down
2 changes: 1 addition & 1 deletion phylo/src/phylo_info/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ fn build_ancestral_alignment_from_aligned_leaf_seqs_missing_record() {
// assert
let error_msg = res_info.unwrap_err().to_string();
assert!(
error_msg.contains("The number of sequences does not match the number of leaves nor the number of nodes in the tree")
error_msg.contains("The number of sequences (3) does not match the number of leaves (4) nor the number of nodes (7) in the tree")
);
}

Expand Down
86 changes: 86 additions & 0 deletions phylo/src/tkf_model/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use std::fmt::Display;

use crate::alignment::AncestralAlignment;
use crate::likelihood::{ModelSearchCost, ParamRange};
use crate::substitution_models::{FreqVector, QMatrix, SubstitutionCost};

pub mod tkf91;
pub use tkf91::*;
pub mod tkf92;
pub use tkf92::*;
pub mod tkf_indel;
pub use tkf_indel::*;

#[derive(Clone, Debug)]
pub struct TKFCost<Q: QMatrix + Display, T: TKFModel, AA: AncestralAlignment> {
// TODO: if we have just the sum of the two costs like this, we need to keep track of the
// phylo (which is tree and alignment) twice, which might be too big of a downside, since the
// cost is copied often. Alternatively we could implement the substitution cost inside the
// tkf92 cost, which would duplicate some code.
indel_cost: TKFIndelCost<T, AA>,
subst_cost: SubstitutionCost<Q, AA>,
}

impl<Q: QMatrix, T: TKFModel, AA: AncestralAlignment> Display for TKFCost<Q, T, AA> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{} and {}",
self.indel_cost.model, self.subst_cost.model.qmatrix
)
}
}

impl<Q: QMatrix, T: TKFModel, AA: AncestralAlignment> ModelSearchCost for TKFCost<Q, T, AA> {
fn cost(&self) -> f64 {
self.indel_cost.cost() + self.subst_cost.cost()
}

fn param_count(&self) -> usize {
self.indel_cost.model.params().len() + self.subst_cost.model.qmatrix.params().len()
}

fn param(&self, idx: usize) -> f64 {
let num_params_indel_model = self.indel_cost.param_count();
if idx < num_params_indel_model {
return self.indel_cost.param(idx);
}
let idx = idx - num_params_indel_model;
self.subst_cost.param(idx)
}

fn set_param(&mut self, idx: usize, value: f64) {
let num_params_indel_model = self.indel_cost.param_count();
if idx < num_params_indel_model {
self.indel_cost.set_param(idx, value);
return;
}
let idx = idx - num_params_indel_model;
self.subst_cost.set_param(idx, value);
}

fn param_range(&self, idx: usize) -> ParamRange {
let num_params_indel_model = self.indel_cost.param_count();
if idx < num_params_indel_model {
return self.indel_cost.param_range(idx);
}
let idx = idx - num_params_indel_model;
self.subst_cost.param_range(idx)
}

fn set_freqs(&mut self, freqs: FreqVector) {
self.subst_cost.set_freqs(freqs);
}

fn empirical_freqs(&self) -> FreqVector {
self.subst_cost.info.freqs()
}

fn freqs(&self) -> &FreqVector {
self.subst_cost.freqs()
}
}

#[cfg(test)]
#[cfg_attr(coverage, coverage(off))]
mod tests;
Loading
Loading