diff --git a/Project.toml b/Project.toml index b1a72272..7ac69ec1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PhyloNetworks" uuid = "33ad39ac-ed31-50eb-9b15-43d0656eaa72" license = "MIT" -version = "0.16.1" +version = "0.16.2" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" diff --git a/src/readData.jl b/src/readData.jl index 39ac7bc5..3c51151a 100644 --- a/src/readData.jl +++ b/src/readData.jl @@ -162,28 +162,28 @@ end function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString, kwargs...) @debug "assume the numbers for the taxon read from the observed CF table match the numbers given to the taxon when creating the object network" + taxoncolnames = [[:t1, :tx1, :tax1, :taxon1], [:t2, :tx2, :tax2, :taxon2], + [:t3, :tx3, :tax3, :taxon3], [:t4, :tx4, :tax4, :taxon4] ] + taxoncol = [findfirst(x-> x ∈ taxoncolnames[1], DataFrames.propertynames(df)), + findfirst(x-> x ∈ taxoncolnames[2], DataFrames.propertynames(df)), + findfirst(x-> x ∈ taxoncolnames[3], DataFrames.propertynames(df)), + findfirst(x-> x ∈ taxoncolnames[4], DataFrames.propertynames(df))] alternativecolnames = [ # obsCF12 is as exported by fittedQuartetCF() - [:CF12_34, Symbol("CF12.34"), :obsCF12], - [:CF13_24, Symbol("CF13.24"), :obsCF13], - [:CF14_23, Symbol("CF14.23"), :obsCF14] + [:CF12_34, Symbol("CF12.34"), :obsCF12, :CF1234], + [:CF13_24, Symbol("CF13.24"), :obsCF13, :CF1324], + [:CF14_23, Symbol("CF14.23"), :obsCF14, :CF1423] ] obsCFcol = [findfirst(x-> x ∈ alternativecolnames[1], DataFrames.propertynames(df)), findfirst(x-> x ∈ alternativecolnames[2], DataFrames.propertynames(df)), findfirst(x-> x ∈ alternativecolnames[3], DataFrames.propertynames(df))] ngenecol = findfirst(isequal(:ngenes), DataFrames.propertynames(df)) withngenes = ngenecol !== nothing - if nothing in obsCFcol # one or more col names for CFs were not found - size(df,2) == (withngenes ? 8 : 7) || - @warn """Column names for quartet concordance factors (CFs) were not recognized. - Was expecting CF12_34, CF13_24 and CF14_23 for the columns with CF values, - or CF12.34 or obsCF12, etc. - Will assume that the first 4 columns give the taxon names, and that columns 5-7 give the CFs.""" - obsCFcol = [5,6,7] # assuming CFs are in columns 5,6,7, with colname mismatch - end - minimum(obsCFcol) > 4 || - error("CFs found in columns $obsCFcol, but taxon labels expected in columns 1-4") - # fixit: what about columns giving the taxon names: always assumed to be columns 1-4? No warning if not? - columns = [[1,2,3,4]; obsCFcol] + nothing in taxoncol && error("columns for taxon names were not found") + nothing in obsCFcol && error( + """Could not identify columns with quartet concordance factors (qCFs). + Was expecting CF12_34, CF13_24 and CF14_23 for the columns with CF values, + or CF12.34 or obsCF12, etc.""") + columns = [taxoncol; obsCFcol] if withngenes push!(columns, ngenecol) end d = readTableCF!(df, columns; kwargs...) @@ -661,15 +661,16 @@ data: [1.0, 0.0, 0.0, 0.5] julia> df = writeTableCF(q,t); # to get a DataFrame that can be saved to a file later julia> show(df, allcols=true) -5×8 DataFrame - Row │ t1 t2 t3 t4 CF12_34 CF13_24 CF14_23 ngenes - │ String String String String Float64 Float64 Float64 Float64 -─────┼──────────────────────────────────────────────────────────────────── - 1 │ A B D E 0.25 0.25 0.5 2.0 - 2 │ A B D O 0.5 0.5 0.0 1.0 - 3 │ A B E O 1.0 0.0 0.0 0.5 - 4 │ A D E O 1.0 0.0 0.0 0.5 - 5 │ B D E O 0.0 0.0 0.0 0.0 +5×9 DataFrame + Row │ qind t1 t2 t3 t4 CF12_34 CF13_24 CF14_23 ngenes + │ Int64 String String String String Float64 Float64 Float64 Float64 +─────┼─────────────────────────────────────────────────────────────────────────── + 1 │ 1 A B D E 0.25 0.25 0.5 2.0 + 2 │ 2 A B D O 0.5 0.5 0.0 1.0 + 3 │ 3 A B E O 1.0 0.0 0.0 0.5 + 4 │ 4 A D E O 1.0 0.0 0.0 0.5 + 5 │ 5 B D E O 0.0 0.0 0.0 0.0 + julia> # using CSV; CSV.write(df, "filename.csv"); julia> tree2 = readTopology("((A,(B,D)),E);"); @@ -680,15 +681,15 @@ Reading in trees, looking at 5 quartets in each... ** julia> show(writeTableCF(q,t), allcols=true) -5×8 DataFrame - Row │ t1 t2 t3 t4 CF12_34 CF13_24 CF14_23 ngenes - │ String String String String Float64 Float64 Float64 Float64 -─────┼─────────────────────────────────────────────────────────────────────── - 1 │ A B D E 0.333333 0.333333 0.333333 3.0 - 2 │ A B D O 0.5 0.5 0.0 2.0 - 3 │ A B E O 1.0 0.0 0.0 1.0 - 4 │ A D E O 1.0 0.0 0.0 1.0 - 5 │ B D E O 0.0 0.0 0.0 0.0 +5×9 DataFrame + Row │ qind t1 t2 t3 t4 CF12_34 CF13_24 CF14_23 ngenes + │ Int64 String String String String Float64 Float64 Float64 Float64 +─────┼────────────────────────────────────────────────────────────────────────────── + 1 │ 1 A B D E 0.333333 0.333333 0.333333 3.0 + 2 │ 2 A B D O 0.5 0.5 0.0 2.0 + 3 │ 3 A B E O 1.0 0.0 0.0 1.0 + 4 │ 4 A D E O 1.0 0.0 0.0 1.0 + 5 │ 5 B D E O 0.0 0.0 0.0 0.0 ``` """ function countquartetsintrees(tree::Vector{HybridNetwork}, diff --git a/src/traitsLikDiscrete.jl b/src/traitsLikDiscrete.jl index fcb4ff41..92f4f830 100644 --- a/src/traitsLikDiscrete.jl +++ b/src/traitsLikDiscrete.jl @@ -791,11 +791,15 @@ end """ posterior_logtreeweight(obj::SSM, trait = 1) -Return an array A such that A[t] = log of P(tree `t` and trait `trait`) -if a single `trait` is requested, or A[i,t]= log of P(tree `t` and trait `i`) +Array A of log-posterior probabilities for each tree displayed in the network: +such that A[t] = log of P(tree `t` | trait `trait`) +if a single `trait` is requested, or A[t,i]= log of P(tree `t` | trait `i`) if `trait` is a vector or range (e.g. `trait = 1:obj.nsites`). These probabilities are conditional on the model parameters in `obj`. +Displayed trees are listed in the order in which they are stored in the fitted +model object `obj`. + **Precondition**: `_loglikcache` updated by [`discrete_corelikelihood!`](@ref) # examples @@ -835,6 +839,58 @@ function posterior_logtreeweight(obj::SSM, trait = 1) return ts end +""" + posterior_loghybridweight(obj::SSM, hybrid_name, trait = 1) + posterior_loghybridweight(obj::SSM, edge_number, trait = 1) + +Log-posterior probability for all trees displaying the minor parent edge +of hybrid node named `hybrid_name`, or displaying the edge number `edge_number`. +That is: log of P(hybrid minor parent | trait) if a single `trait` is requested, +or A[i]= log of P(hybrid minor parent | trait `i`) +if `trait` is a vector or range (e.g. `trait = 1:obj.nsites`). +These probabilities are conditional on the model parameters in `obj`. + +**Precondition**: `_loglikcache` updated by [`discrete_corelikelihood!`](@ref) + +# examples + +```jldoctest +julia> net = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);"); + +julia> m1 = BinaryTraitSubstitutionModel([0.1, 0.1], ["lo", "hi"]); # arbitrary rates + +julia> using DataFrames + +julia> dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]); + +julia> fit = fitdiscrete(net, m1, dat); # optimized rates: α=0.27 and β=0.35 + +julia> plhw = PhyloNetworks.posterior_loghybridweight(fit, "H1"); + +julia> round(exp(plhw), digits=5) # posterior probability of going through minor hybrid edge +0.08017 + +julia> hn = net.node[3]; getparentedgeminor(hn).gamma # prior probability +0.1 +``` +""" +function posterior_loghybridweight(obj::SSM, hybridname::String, trait = 1) + hn_index = findfirst(n -> n.name == hybridname, obj.net.node) + isnothing(hn_index) && error("node named $hybridname not found") + hn = obj.net.node[hn_index] + hn.hybrid || error("node named $hybridname is not a hybrid node") + me = getparentedgeminor(hn) + posterior_loghybridweight(obj, me.number, trait) +end +function posterior_loghybridweight(obj::SSM, edgenum::Integer, trait = 1) + tpp = posterior_logtreeweight(obj, trait) # size: (ntree,) or (ntree,ntraits) + hasedge = tree -> any(e.number == edgenum for e in tree.edge) + tokeep = map(hasedge, obj.displayedtree) + tppe = view(tpp, tokeep, :) # makes it a matrix + epp = dropdims(mapslices(logsumexp, tppe, dims=1); dims=2) + return (size(epp)==(1,) ? epp[1] : epp) # scalar or vector +end + """ traitlabels2indices(data, model::SubstitutionModel) diff --git a/test/test_traitLikDiscrete.jl b/test/test_traitLikDiscrete.jl index 3ec07305..11302b68 100644 --- a/test/test_traitLikDiscrete.jl +++ b/test/test_traitLikDiscrete.jl @@ -340,6 +340,7 @@ asr = ancestralStateReconstruction(fit1) pltw = [-0.08356534477069566, -2.5236181051014333] @test PhyloNetworks.posterior_logtreeweight(fit1) ≈ pltw atol=1e-5 @test PhyloNetworks.posterior_logtreeweight(fit1, 1:1) ≈ reshape(pltw, (2,1)) atol=1e-5 +@test PhyloNetworks.posterior_loghybridweight(fit1, "H1") ≈ -2.5236227134322293 end # end of testset, fixed topology