diff --git a/phylo/data/phyml_protein_example/optimisation_pip_nj_start.newick b/phylo/data/phyml_protein_example/optimisation_pip_nj_start.newick new file mode 100644 index 0000000..e0be979 --- /dev/null +++ b/phylo/data/phyml_protein_example/optimisation_pip_nj_start.newick @@ -0,0 +1 @@ +((Baboon:0.0028331378161231516,(Rhes_cDNA:0.010200173725342486,((((DLangur:0.005187934607208908,Colobus:0.006744877884078086):0.002991818596992287,(((Orangutan:0.02425757797579703,Gibbon:0.050333222484462474):0.00349324814218369,(Gorilla:0.0031112135547833836,(Chimp:0.005196405317244636,Human:0.013661102534226385):0.003794699786919942):0.007165702458849481):0.02944066124362487,(((Tamarin:0.037556885677381205,PMarmoset:0.01702895311693103):0.030028435125174696,Squirrel:0.07201131924471216):0.007156514183315316,(((Woolly:0.034418086513793425,Spider:0.02149518903513873):0.011309226332883714,Howler:0.06122186419988194):0.031165940180073442,(Saki:0.04168710939376183,Titi:0.023512300435974267):0.02081749712142146):0.006198375436005013):0.1917050548350012):0.038848677493702605):0.009117045612009347,Patas:0.02060922344024596):0.006367394359273745,(Tant_cDNA:0.001975023706313592,AGM_cDNA:0.004967765570187601):0.008190743679022382):0.0006802670634388848):0.0005627185167904883):0); \ No newline at end of file diff --git a/phylo/data/phyml_protein_example/optimisation_pip_nj_start_gaps.newick b/phylo/data/phyml_protein_example/optimisation_pip_nj_start_gaps.newick new file mode 100644 index 0000000..6cacd04 --- /dev/null +++ b/phylo/data/phyml_protein_example/optimisation_pip_nj_start_gaps.newick @@ -0,0 +1 @@ +((Rhes_cDNA:0.005270697439151051,((((DLangur:0.0067324631560195265,Colobus:0.00521530524332267):0.002427192075502467,(((((Chimp:0.004062536454896269,Human:0.01066970288442119):0.0042616661126785435,Gorilla:0.003799987371551049):0.00939396737272837,Orangutan:0.020925588919919923):0.0042656572453531995,Gibbon:0.04549271228044225):0.025773788804886694,(((Woolly:0.02982911938015733,Spider:0.022092190400064618):0.011455489526599595,Howler:0.07603516089516084):0.10366013657195779,(((Tamarin:0.03164671167745268,PMarmoset:0.014916060359693298):0.026753727600658195,Squirrel:0.06705296485480587):0.007093494673525226,(Saki:0.03143866735106935,Titi:0.02058319641790653):0.01626675366805705):0.004670562059845509):0.17980011911442895):0.033267957441090075):0.007258518763365157,Patas:0.020042200482060055):0.01042259615650974,(Baboon:0.005263494182438327,(Tant_cDNA:0.0037526472256948387,AGM_cDNA:0.006482057922501004):0.03573474143751188):0.000016573653357281707):0.002646554869484882):0); \ No newline at end of file diff --git a/phylo/data/phyml_protein_example/optimisation_pip_nj_start_model_optim_gaps.newick b/phylo/data/phyml_protein_example/optimisation_pip_nj_start_model_optim_gaps.newick new file mode 100644 index 0000000..cb102b0 --- /dev/null +++ b/phylo/data/phyml_protein_example/optimisation_pip_nj_start_model_optim_gaps.newick @@ -0,0 +1 @@ +(((Tant_cDNA:0.0048018833811211675,AGM_cDNA:0.008339926875757585):0.04534128950416323,((((DLangur:0.008610868011143124,Colobus:0.006650819970918761):0.0031357126228166673,(((((Chimp:0.005215728948279252,Human:0.013682329995014232):0.005375585838951918,Gorilla:0.004959685843646604):0.012277692848934589,Orangutan:0.026495869011378525):0.005123154657377931,Gibbon:0.05895426364325162):0.032567290566178554,(((Woolly:0.03832426472150371,Spider:0.028961294295631365):0.014783978988486978,Howler:0.10023618083180653):0.13718895712242302,((Saki:0.04156451712084586,Titi:0.02555269910901685):0.020523346478912786,((Tamarin:0.04001931431119561,PMarmoset:0.019981458986083438):0.03338174082271472,Squirrel:0.08828899905103033):0.009173650693663014):0.004952774943586081):0.24177792859569436):0.04399012471581886):0.009241754766755407,Patas:0.025643509869867926):0.013364539018112975,(Rhes_cDNA:0.010134761142363087,Baboon:0.006736993384623528):0.000016088731852021455):0.0007497685155102595):0); \ No newline at end of file diff --git a/phylo/src/optimisers/topo_optimiser_tests.rs b/phylo/src/optimisers/topo_optimiser_tests.rs index 300a3f9..2d5c677 100644 --- a/phylo/src/optimisers/topo_optimiser_tests.rs +++ b/phylo/src/optimisers/topo_optimiser_tests.rs @@ -4,17 +4,14 @@ use approx::assert_relative_eq; use bio::io::fasta::Record; use crate::alignment::Sequences; -use crate::evolutionary_models::ProteinModelType; +use crate::evolutionary_models::{DNAModelType::*, EvoModel, ProteinModelType::*}; use crate::likelihood::PhyloCostFunction; -use crate::optimisers::PhyloOptimiser; +// use crate::optimisers::{EvoModelOptimiser, FrequencyOptimisation::*, ModelOptimiser}; +use crate::optimisers::{PhyloOptimiser, TopologyOptimiser}; use crate::phylo_info::PhyloInfoBuilder; use crate::pip_model::PIPModel; use crate::substitution_models::{DNASubstModel, ProteinSubstModel}; use crate::tree::tree_parser::from_newick_string; -use crate::{ - evolutionary_models::{DNAModelType::*, EvoModel}, - optimisers::TopologyOptimiser, -}; #[test] fn primate_topology_from_right_tree() { @@ -119,7 +116,7 @@ fn protein_topology_optimisation_good_start() { // ) // .build() // .unwrap(); - // let model = ProteinSubstModel::new(ProteinModelType::WAG, &[]).unwrap(); + // let model = ProteinSubstModel::new(WAG, &[]).unwrap(); // let unopt_logl = model.cost(&info); // let o = TopologyOptimiser::new(&model, &info).run().unwrap(); // assert!(o.final_logl >= unopt_logl); @@ -132,7 +129,7 @@ fn protein_topology_optimisation_good_start() { ) .build() .unwrap(); - let model = ProteinSubstModel::new(ProteinModelType::WAG, &[]).unwrap(); + let model = ProteinSubstModel::new(WAG, &[]).unwrap(); let logl = model.cost(&info); // compare the tree height and logl to the output of PhyML @@ -145,7 +142,7 @@ fn protein_topology_optimisation_nj_start() { // let info = PhyloInfoBuilder::new(PathBuf::from("./data/phyml_protein_example/nogap_seqs.fasta")) // .build() // .unwrap(); - // let model = ProteinSubstModel::new(ProteinModelType::WAG, &[]).unwrap(); + // let model = ProteinSubstModel::new(WAG, &[]).unwrap(); // let unopt_logl = model.cost(&info); // let o = TopologyOptimiser::new(&model, &info).run().unwrap(); // assert!(o.final_logl >= unopt_logl); @@ -158,7 +155,7 @@ fn protein_topology_optimisation_nj_start() { ) .build() .unwrap(); - let model = ProteinSubstModel::new(ProteinModelType::WAG, &[]).unwrap(); + let model = ProteinSubstModel::new(WAG, &[]).unwrap(); let logl = model.cost(&info); // compare the tree height and logl to the output of PhyML @@ -184,3 +181,82 @@ fn pip_vs_subst_dna_tree() { assert_relative_eq!(o_k80.initial_logl, initial_logl); assert_eq!(o_pip.i.tree.robinson_foulds(&o_k80.i.tree), 0); } + +#[test] +fn pip_vs_subst_protein_tree_nogaps() { + // let info = PhyloInfoBuilder::new(PathBuf::from( + // "./data/phyml_protein_example/nogap_seqs.fasta", + // )) + // .build() + // .unwrap(); + // let model = PIPModel::::new(WAG, &[50.0, 0.1]).unwrap(); + // let unopt_logl = model.cost(&info); + // let o = TopologyOptimiser::new(&model, &info).run().unwrap(); + // assert!(o.final_logl >= unopt_logl); + // The above ran in 308.93s + + let pip_info = PhyloInfoBuilder::with_attrs( + PathBuf::from("./data/phyml_protein_example/nogap_seqs.fasta"), + PathBuf::from("./data/phyml_protein_example/optimisation_pip_nj_start.newick"), + ) + .build() + .unwrap(); + + let subst_info = PhyloInfoBuilder::with_attrs( + PathBuf::from("./data/phyml_protein_example/nogap_seqs.fasta"), + PathBuf::from("./data/phyml_protein_example/optimisation_nj_start.newick"), + ) + .build() + .unwrap(); + // compare the tree created with a substitution model to the one with PIP + // the trees are not identical, but very close + assert!(pip_info.tree.robinson_foulds(&subst_info.tree) <= 2); +} + +#[test] +fn protein_optimise_model_tree() { + // let info = PhyloInfoBuilder::new(PathBuf::from("./data/phyml_protein_example/seqs.fasta")) + // .build() + // .unwrap(); + // let model = PIPModel::::new(WAG, &[1.4, 0.5]).unwrap(); + // let unopt_logl = model.cost(&info); + // let o = ModelOptimiser::new(&model, &info, Empirical).run().unwrap(); + // let model_opt_logl = o.final_logl; + // assert!(model_opt_logl >= unopt_logl); + // let o = TopologyOptimiser::new(&o.model, &info).run().unwrap(); + // assert!(model_opt_logl >= unopt_logl); + // assert!(o.final_logl >= unopt_logl); + // // The above ran in 703.87s + + // let info = PhyloInfoBuilder::new(PathBuf::from("./data/phyml_protein_example/seqs.fasta")) + // .build() + // .unwrap(); + // let model = PIPModel::::new(WAG, &[1.4, 0.5]).unwrap(); + // let unopt_logl = model.cost(&info); + // let o = TopologyOptimiser::new(&model, &info).run().unwrap(); + // assert!(o.final_logl >= unopt_logl); + // The above ran in 497.22s + + let optim_model = PIPModel::::new(WAG, &[49.56941, 0.09352]).unwrap(); + let with_model_optim = PhyloInfoBuilder::with_attrs( + PathBuf::from("./data/phyml_protein_example/seqs.fasta"), + PathBuf::from( + "./data/phyml_protein_example/optimisation_pip_nj_start_model_optim_gaps.newick", + ), + ) + .build() + .unwrap(); + + let model = ProteinSubstModel::new(WAG, &[1.4, 0.5]).unwrap(); + let wo_model_optim = PhyloInfoBuilder::with_attrs( + PathBuf::from("./data/phyml_protein_example/seqs.fasta"), + PathBuf::from( + "./data/phyml_protein_example/optimisation_pip_nj_start_model_optim_gaps.newick", + ), + ) + .build() + .unwrap(); + + assert!(with_model_optim.tree.robinson_foulds(&wo_model_optim.tree) == 0); + assert!(optim_model.cost(&with_model_optim) > model.cost(&wo_model_optim)); +}