acg-team · MattesMrzik · Nov 18, 2025 · Sep 17, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/phylo/Cargo.toml b/phylo/Cargo.toml
@@ -61,6 +61,7 @@ log = "0.4.19"
 # through the matrixmultiply peer-dep
 nalgebra = "0.32.3"
 ntimestamp = "1.0.0"
+num_enum = "0.7.5"
 ordered-float = "3.7.0"
 pest = "2.7.2"
 pest_derive = "2.7.2"
@@ -78,7 +79,6 @@ unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] }
 criterion = "0.5.1"
 pprof = { version = "0.14.0", features = ["criterion", "flamegraph"] }
 
-
 [[bench]]
 name = "tree_from_msa"
 harness = false

diff --git a/phylo/benches/helpers.rs b/phylo/benches/helpers.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 /// this file is essentially a workaround for #[cfg(test)] like behaviour for the benchmarks
-/// The dev-depencies are only available in benchmarks or tests
+/// The dev-dependencies are only available in benchmarks or tests
 use std::{collections::HashMap, hint::black_box, path::PathBuf, time::Duration};
 
 use criterion::Criterion;

diff --git a/phylo/data/tree_multiple.newick b/phylo/data/tree_multiple.newick
@@ -1,3 +1,3 @@
 ((A:1.0,B:2.0):3.0,(C:3.5,D:0.5):5.25);
 ((E:1.0,D:2.0):3.0,(C:3.5,F:0.5):5.25);
-(K:1,L:1);
+(K:1,L:1);
diff --git a/phylo/src/alignment/mod.rs b/phylo/src/alignment/mod.rs
@@ -91,6 +91,8 @@ pub trait Alignment: Display + Clone + Debug {
 pub trait AncestralAlignment: Alignment {
     fn ancestral_seqs(&self) -> &Sequences;
     fn ancestral_map(&self, node_idx: &NodeIdx) -> &Mapping;
+    fn ancestral_maps(&self) -> &SeqMaps;
+    fn update_ancestral_map(&mut self, node_idx: &NodeIdx, map: Mapping);
     /// Checks if inputs are compatible and calls [`Self::from_aligned_with_ancestral_unchecked`].  
     /// Checks:
     /// - if sequences are aligned
@@ -412,7 +414,7 @@ impl Alignment for MASA {
     /// assert_eq!(i1_seq, "XXX");
     /// let i1_map = phylo_info.msa.ancestral_map(&phylo_info.tree.by_id("I1").idx);
     /// assert_eq!(i1_map, &vec![Some(0), Some(1), None, Some(2)]);
-    /// /// or use the align_seq marco to test seq and map at the same time
+    /// // or use the align_seq macro to test seq and map at the same time
     /// # Ok(()) }
     /// ```
     fn from_aligned(sequences: Sequences, tree: &Tree) -> Result<Self> {
@@ -446,6 +448,20 @@ impl AncestralAlignment for MASA {
         self.ancestral_maps.get(node).unwrap()
     }
 
+    fn ancestral_maps(&self) -> &SeqMaps {
+        &self.ancestral_maps
+    }
+
+    // This is needed because with the TKF models we need to re-estimate the ancestral maps after
+    // a tree move is applied.
+    fn update_ancestral_map(&mut self, node_idx: &NodeIdx, map: Mapping) {
+        if let Some(anc_map) = self.ancestral_maps.get_mut(node_idx) {
+            *anc_map = map;
+        } else {
+            panic!("NodeIdx {node_idx} is not an internal node");
+        }
+    }
+
     /// # Example
     /// ```
     /// # use bio::io::fasta::Record;

diff --git a/phylo/src/lib.rs b/phylo/src/lib.rs
@@ -19,6 +19,7 @@ pub mod phylo_info;
 pub mod pip_model;
 pub mod random;
 pub mod substitution_models;
+pub mod tkf_model;
 pub mod tree;
 
 pub(crate) mod macros;

diff --git a/phylo/src/optimisers/model_optimiser_tests.rs b/phylo/src/optimisers/model_optimiser_tests.rs
@@ -1,3 +1,4 @@
+use std::fmt::Display;
 use std::path::Path;
 
 use approx::assert_relative_eq;
@@ -12,6 +13,7 @@ use crate::substitution_models::{
     dna_models::*, protein_models::*, FreqVector, QMatrix, QMatrixMaker, SubstModel,
     SubstitutionCostBuilder as SCB,
 };
+use crate::tkf_model::{TKF91CostBuilder, TKF92CostBuilder};
 
 #[test]
 fn likelihood_improves_k80() {
@@ -574,3 +576,62 @@ fn stop_condition_epsilon() {
     let mut costs = result.costs;
     assert!(costs.pop().unwrap() - costs.pop().unwrap() < epsilon);
 }
+
+#[cfg(test)]
+fn tkf_model_opti_template<C: ModelSearchCost + Clone + Display>(c: C) {
+    let initial_llik = c.cost();
+    let o = ModelOptimiser::new(c.clone(), FrequencyOptimisation::Fixed)
+        .run()
+        .unwrap();
+    let intermediate_cost = o.final_cost;
+    assert_eq!(initial_llik, o.initial_cost);
+    assert_eq!(o.final_cost, o.cost.cost());
+    assert!(o.initial_cost < o.final_cost);
+    assert_eq!(c.freqs(), o.cost.freqs());
+    for param in 0..c.param_count() {
+        assert_ne!(c.param(param), o.cost.param(param));
+        let valid_range = o.cost.param_range(param);
+        assert!(valid_range.0 <= o.cost.param(param) && o.cost.param(param) <= valid_range.1);
+    }
+
+    let o = ModelOptimiser::new(o.cost.clone(), FrequencyOptimisation::Empirical)
+        .run()
+        .unwrap();
+    assert_eq!(intermediate_cost, o.initial_cost);
+    assert_eq!(o.final_cost, o.cost.cost());
+    assert!(o.initial_cost < o.final_cost);
+    assert_eq!(o.cost.freqs(), &o.cost.empirical_freqs());
+    for param in 0..c.param_count() {
+        assert_ne!(c.param(param), o.cost.param(param));
+        let valid_range = o.cost.param_range(param);
+        assert!(valid_range.0 <= o.cost.param(param) && o.cost.param(param) <= valid_range.1);
+    }
+}
+
+#[test]
+#[cfg_attr(feature = "ci_coverage", ignore)]
+fn tkf91_model_opti() {
+    let fldr = Path::new("./data/pip/arpip/");
+    let info = PIB::with_attrs(fldr.join("msa.fasta"), fldr.join("tree.nwk"))
+        .build_with_ancestors()
+        .unwrap();
+    let subst_model = SubstModel::<HKY>::new(&[], &[2.0]);
+    let tkf91 = TKF91CostBuilder::new(0.8, 1.0, subst_model.clone(), info.clone())
+        .build()
+        .unwrap();
+    tkf_model_opti_template(tkf91);
+}
+
+#[test]
+#[cfg_attr(feature = "ci_coverage", ignore)]
+fn tkf92_model_opti() {
+    let fldr = Path::new("./data/pip/arpip/");
+    let info = PIB::with_attrs(fldr.join("msa.fasta"), fldr.join("tree.nwk"))
+        .build_with_ancestors()
+        .unwrap();
+    let subst_model = SubstModel::<HKY>::new(&[], &[2.0]);
+    let tkf92 = TKF92CostBuilder::new(0.8, 1.0, 0.2, subst_model, info)
+        .build()
+        .unwrap();
+    tkf_model_opti_template(tkf92);
+}
diff --git a/phylo/src/optimisers/spr_optimiser.rs b/phylo/src/optimisers/spr_optimiser.rs
@@ -218,6 +218,8 @@ fn calc_spr_cost_with_blen_opt<C: TreeSearchCost + Clone + Display>(
     cost_fn.update_tree(new_tree.clone());
 
     let mut move_cost = cost_fn.cost();
+    // Branch length optimisation is skipped if the model does not support branch lengths
+    // or if the move_cost is already better since we apply the move as is.
     if cost_fn.blen_optimisation() && move_cost <= base_cost {
         // reoptimise branch length at the regraft location
         let blen_opt = optimise_branch(&cost_fn, &regraft)?;

diff --git a/phylo/src/optimisers/topo_optimiser.rs b/phylo/src/optimisers/topo_optimiser.rs
@@ -4,7 +4,7 @@ use itertools::Itertools;
 use log::{debug, info};
 use rand::{Rng, SeedableRng};
 
-use crate::alignment::Alignment;
+use crate::alignment::{Alignment, AncestralAlignment};
 use crate::likelihood::TreeSearchCost;
 use crate::optimisers::{
     BranchOptimiser, MoveCostInfo, MoveOptimiser, NniOptimiser, SprOptimiser, StopCondition,
@@ -15,6 +15,7 @@ use crate::parsimony::{BasicParsimonyCost, DolloParsimonyCost};
 use crate::pip_model::PIPCost;
 use crate::random::RandomGenerator;
 use crate::substitution_models::{QMatrix, SubstitutionCost};
+use crate::tkf_model::{TKFCost, TKFIndelCost, TKFModel as TKFM};
 use crate::tree::NodeIdx;
 use crate::Result;
 
@@ -31,6 +32,8 @@ impl<S: ParsimonyScoring, A: Alignment> Compatible<SprOptimiser> for DolloParsim
 impl<S: ParsimonyScoring, A: Alignment> Compatible<NniOptimiser> for DolloParsimonyCost<S, A> {}
 impl<A: Alignment> Compatible<SprOptimiser> for BasicParsimonyCost<A> {}
 impl<A: Alignment> Compatible<NniOptimiser> for BasicParsimonyCost<A> {}
+impl<T: TKFM, AA: AncestralAlignment> Compatible<NniOptimiser> for TKFIndelCost<T, AA> {}
+impl<Q: QMatrix, T: TKFM, AA: AncestralAlignment> Compatible<NniOptimiser> for TKFCost<Q, T, AA> {}
 
 pub struct TopologyOptimiser<'a, MO, C, R>
 where

diff --git a/phylo/src/phylo_info/phyloinfo_builder.rs b/phylo/src/phylo_info/phyloinfo_builder.rs
@@ -199,7 +199,7 @@ impl<A: Alignment, AA: AncestralAlignment> PhyloInfoBuilder<A, AA> {
                 bail!("Building an ancestral alignment from unaligned sequences (including ancestral_sequencess) is not supported");
             }
         } else {
-            bail!("The number of sequences does not match the number of leaves nor the number of nodes in the tree");
+            bail!("The number of sequences ({}) does not match the number of leaves ({}) nor the number of nodes ({}) in the tree", sequences.len(), tree.n, tree.len());
         }?;
 
         Ok(PhyloInfo { tree, msa })
@@ -294,6 +294,7 @@ impl<A: Alignment, AA: AncestralAlignment> PhyloInfoBuilder<A, AA> {
 
 /// Sets missing ids and bails if there are duplicates among the node ids that were already set.
 pub(crate) fn set_missing_tree_node_ids(tree: &Tree) -> Result<Tree> {
+    info!("Setting missing tree node ids");
     let mut tree_with_all_ids = tree.clone();
     let mut seen_user_set_ids = HashSet::new();
     let mut count = 0;
@@ -305,7 +306,8 @@ pub(crate) fn set_missing_tree_node_ids(tree: &Tree) -> Result<Tree> {
                 count += 1;
                 new_id = format!("I{count}");
             }
-            tree_with_all_ids.nodes[usize::from(node_idx)].id = new_id;
+            tree_with_all_ids.nodes[usize::from(node_idx)].id = new_id.clone();
+            info!("Set missing id of node {node_idx} to {new_id}");
         } else if !seen_user_set_ids.insert(id.to_string()) {
             bail!("Duplicate id ({id}) found in the leaves of the tree");
         }

diff --git a/phylo/src/phylo_info/tests.rs b/phylo/src/phylo_info/tests.rs
@@ -380,7 +380,7 @@ fn build_ancestral_alignment_from_aligned_leaf_seqs_missing_record() {
     // assert
     let error_msg = res_info.unwrap_err().to_string();
     assert!(
-        error_msg.contains("The number of sequences does not match the number of leaves nor the number of nodes in the tree")
+        error_msg.contains("The number of sequences (3) does not match the number of leaves (4) nor the number of nodes (7) in the tree")
     );
 }
 

diff --git a/phylo/src/tkf_model/mod.rs b/phylo/src/tkf_model/mod.rs
@@ -0,0 +1,86 @@
+use std::fmt::Display;
+
+use crate::alignment::AncestralAlignment;
+use crate::likelihood::{ModelSearchCost, ParamRange};
+use crate::substitution_models::{FreqVector, QMatrix, SubstitutionCost};
+
+pub mod tkf91;
+pub use tkf91::*;
+pub mod tkf92;
+pub use tkf92::*;
+pub mod tkf_indel;
+pub use tkf_indel::*;
+
+#[derive(Clone, Debug)]
+pub struct TKFCost<Q: QMatrix + Display, T: TKFModel, AA: AncestralAlignment> {
+    // TODO: if we have just the sum of the two costs like this, we need to keep track of the
+    // phylo (which is tree and alignment) twice, which might be too big of a downside, since the
+    // cost is copied often. Alternatively we could implement the substitution cost inside the
+    // tkf92 cost, which would duplicate some code.
+    indel_cost: TKFIndelCost<T, AA>,
+    subst_cost: SubstitutionCost<Q, AA>,
+}
+
+impl<Q: QMatrix, T: TKFModel, AA: AncestralAlignment> Display for TKFCost<Q, T, AA> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "{} and {}",
+            self.indel_cost.model, self.subst_cost.model.qmatrix
+        )
+    }
+}
+
+impl<Q: QMatrix, T: TKFModel, AA: AncestralAlignment> ModelSearchCost for TKFCost<Q, T, AA> {
+    fn cost(&self) -> f64 {
+        self.indel_cost.cost() + self.subst_cost.cost()
+    }
+
+    fn param_count(&self) -> usize {
+        self.indel_cost.model.params().len() + self.subst_cost.model.qmatrix.params().len()
+    }
+
+    fn param(&self, idx: usize) -> f64 {
+        let num_params_indel_model = self.indel_cost.param_count();
+        if idx < num_params_indel_model {
+            return self.indel_cost.param(idx);
+        }
+        let idx = idx - num_params_indel_model;
+        self.subst_cost.param(idx)
+    }
+
+    fn set_param(&mut self, idx: usize, value: f64) {
+        let num_params_indel_model = self.indel_cost.param_count();
+        if idx < num_params_indel_model {
+            self.indel_cost.set_param(idx, value);
+            return;
+        }
+        let idx = idx - num_params_indel_model;
+        self.subst_cost.set_param(idx, value);
+    }
+
+    fn param_range(&self, idx: usize) -> ParamRange {
+        let num_params_indel_model = self.indel_cost.param_count();
+        if idx < num_params_indel_model {
+            return self.indel_cost.param_range(idx);
+        }
+        let idx = idx - num_params_indel_model;
+        self.subst_cost.param_range(idx)
+    }
+
+    fn set_freqs(&mut self, freqs: FreqVector) {
+        self.subst_cost.set_freqs(freqs);
+    }
+
+    fn empirical_freqs(&self) -> FreqVector {
+        self.subst_cost.info.freqs()
+    }
+
+    fn freqs(&self) -> &FreqVector {
+        self.subst_cost.freqs()
+    }
+}
+
+#[cfg(test)]
+#[cfg_attr(coverage, coverage(off))]
+mod tests;