for v0.16.2 (#200)

- bug fix in writeTableCF: columns with taxon name not restricted to be 1:4 - new: posterior_loghybridweight for the posterior probability of a trait passed by "gene flow"
JuliaPhylo · May 11, 2023 · 524fcb8 · 524fcb8 · cecileane · May 11, 2023
1 parent e4cbb04
commit 524fcb8
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 36 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "PhyloNetworks"
 uuid = "33ad39ac-ed31-50eb-9b15-43d0656eaa72"
 license = "MIT"
-version = "0.16.1"
+version = "0.16.2"
 
 [deps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"

diff --git a/src/readData.jl b/src/readData.jl
@@ -162,28 +162,28 @@ end
 
 function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString, kwargs...)
     @debug "assume the numbers for the taxon read from the observed CF table match the numbers given to the taxon when creating the object network"
+    taxoncolnames = [[:t1, :tx1, :tax1, :taxon1], [:t2, :tx2, :tax2, :taxon2],
+                     [:t3, :tx3, :tax3, :taxon3], [:t4, :tx4, :tax4, :taxon4] ]
+    taxoncol = [findfirst(x-> x ∈ taxoncolnames[1], DataFrames.propertynames(df)),
+                findfirst(x-> x ∈ taxoncolnames[2], DataFrames.propertynames(df)),
+                findfirst(x-> x ∈ taxoncolnames[3], DataFrames.propertynames(df)),
+                findfirst(x-> x ∈ taxoncolnames[4], DataFrames.propertynames(df))]
     alternativecolnames = [ # obsCF12 is as exported by fittedQuartetCF()
-        [:CF12_34, Symbol("CF12.34"), :obsCF12],
-        [:CF13_24, Symbol("CF13.24"), :obsCF13],
-        [:CF14_23, Symbol("CF14.23"), :obsCF14]
+        [:CF12_34, Symbol("CF12.34"), :obsCF12, :CF1234],
+        [:CF13_24, Symbol("CF13.24"), :obsCF13, :CF1324],
+        [:CF14_23, Symbol("CF14.23"), :obsCF14, :CF1423]
     ]
     obsCFcol = [findfirst(x-> x ∈ alternativecolnames[1], DataFrames.propertynames(df)),
                 findfirst(x-> x ∈ alternativecolnames[2], DataFrames.propertynames(df)),
                 findfirst(x-> x ∈ alternativecolnames[3], DataFrames.propertynames(df))]
     ngenecol =  findfirst(isequal(:ngenes), DataFrames.propertynames(df))
     withngenes = ngenecol !== nothing
-    if nothing in obsCFcol # one or more col names for CFs were not found
-        size(df,2) == (withngenes ? 8 : 7) ||
-          @warn """Column names for quartet concordance factors (CFs) were not recognized.
-          Was expecting CF12_34, CF13_24 and CF14_23 for the columns with CF values,
-          or CF12.34 or obsCF12, etc.
-          Will assume that the first 4 columns give the taxon names, and that columns 5-7 give the CFs."""
-        obsCFcol = [5,6,7] # assuming CFs are in columns 5,6,7, with colname mismatch
-    end
-    minimum(obsCFcol) > 4 ||
-        error("CFs found in columns $obsCFcol, but taxon labels expected in columns 1-4")
-    # fixit: what about columns giving the taxon names: always assumed to be columns 1-4? No warning if not?
-    columns = [[1,2,3,4]; obsCFcol]
+    nothing in taxoncol && error("columns for taxon names were not found")
+    nothing in obsCFcol && error(
+      """Could not identify columns with quartet concordance factors (qCFs).
+      Was expecting CF12_34, CF13_24 and CF14_23 for the columns with CF values,
+      or CF12.34 or obsCF12, etc.""")
+    columns = [taxoncol; obsCFcol]
     if withngenes  push!(columns, ngenecol)  end
 
     d = readTableCF!(df, columns; kwargs...)
@@ -661,15 +661,16 @@ data: [1.0, 0.0, 0.0, 0.5]
 julia> df = writeTableCF(q,t); # to get a DataFrame that can be saved to a file later
 
 julia> show(df, allcols=true)
-5×8 DataFrame
- Row │ t1      t2      t3      t4      CF12_34  CF13_24  CF14_23  ngenes  
-     │ String  String  String  String  Float64  Float64  Float64  Float64 
-─────┼────────────────────────────────────────────────────────────────────
-   1 │ A       B       D       E          0.25     0.25      0.5      2.0
-   2 │ A       B       D       O          0.5      0.5       0.0      1.0
-   3 │ A       B       E       O          1.0      0.0       0.0      0.5
-   4 │ A       D       E       O          1.0      0.0       0.0      0.5
-   5 │ B       D       E       O          0.0      0.0       0.0      0.0
+5×9 DataFrame
+ Row │ qind   t1      t2      t3      t4      CF12_34  CF13_24  CF14_23  ngenes  
+     │ Int64  String  String  String  String  Float64  Float64  Float64  Float64 
+─────┼───────────────────────────────────────────────────────────────────────────
+   1 │     1  A       B       D       E          0.25     0.25      0.5      2.0
+   2 │     2  A       B       D       O          0.5      0.5       0.0      1.0
+   3 │     3  A       B       E       O          1.0      0.0       0.0      0.5
+   4 │     4  A       D       E       O          1.0      0.0       0.0      0.5
+   5 │     5  B       D       E       O          0.0      0.0       0.0      0.0
+
 julia> # using CSV; CSV.write(df, "filename.csv");
 
 julia> tree2 = readTopology("((A,(B,D)),E);");
@@ -680,15 +681,15 @@ Reading in trees, looking at 5 quartets in each...
   **
 
 julia> show(writeTableCF(q,t), allcols=true)
-5×8 DataFrame
- Row │ t1      t2      t3      t4      CF12_34   CF13_24   CF14_23   ngenes  
-     │ String  String  String  String  Float64   Float64   Float64   Float64 
-─────┼───────────────────────────────────────────────────────────────────────
-   1 │ A       B       D       E       0.333333  0.333333  0.333333      3.0
-   2 │ A       B       D       O       0.5       0.5       0.0           2.0
-   3 │ A       B       E       O       1.0       0.0       0.0           1.0
-   4 │ A       D       E       O       1.0       0.0       0.0           1.0
-   5 │ B       D       E       O       0.0       0.0       0.0           0.0
+5×9 DataFrame
+ Row │ qind   t1      t2      t3      t4      CF12_34   CF13_24   CF14_23   ngenes  
+     │ Int64  String  String  String  String  Float64   Float64   Float64   Float64 
+─────┼──────────────────────────────────────────────────────────────────────────────
+   1 │     1  A       B       D       E       0.333333  0.333333  0.333333      3.0
+   2 │     2  A       B       D       O       0.5       0.5       0.0           2.0
+   3 │     3  A       B       E       O       1.0       0.0       0.0           1.0
+   4 │     4  A       D       E       O       1.0       0.0       0.0           1.0
+   5 │     5  B       D       E       O       0.0       0.0       0.0           0.0
 ```
 """
 function countquartetsintrees(tree::Vector{HybridNetwork},

diff --git a/src/traitsLikDiscrete.jl b/src/traitsLikDiscrete.jl
@@ -791,11 +791,15 @@ end
 """
     posterior_logtreeweight(obj::SSM, trait = 1)
 
-Return an array A such that A[t] = log of P(tree `t` and trait `trait`)
-if a single `trait` is requested, or A[i,t]= log of P(tree `t` and trait `i`)
+Array A of log-posterior probabilities for each tree displayed in the network:
+such that A[t] = log of P(tree `t` | trait `trait`)
+if a single `trait` is requested, or A[t,i]= log of P(tree `t` | trait `i`)
 if `trait` is a vector or range (e.g. `trait = 1:obj.nsites`).
 These probabilities are conditional on the model parameters in `obj`.
 
+Displayed trees are listed in the order in which they are stored in the fitted
+model object `obj`.
+
 **Precondition**: `_loglikcache` updated by [`discrete_corelikelihood!`](@ref)
 
 # examples
@@ -835,6 +839,58 @@ function posterior_logtreeweight(obj::SSM, trait = 1)
     return ts
 end
 
+"""
+    posterior_loghybridweight(obj::SSM, hybrid_name, trait = 1)
+    posterior_loghybridweight(obj::SSM, edge_number, trait = 1)
+
+Log-posterior probability for all trees displaying the minor parent edge
+of hybrid node named `hybrid_name`, or displaying the edge number `edge_number`.
+That is: log of P(hybrid minor parent | trait) if a single `trait` is requested,
+or A[i]= log of P(hybrid minor parent | trait `i`)
+if `trait` is a vector or range (e.g. `trait = 1:obj.nsites`).
+These probabilities are conditional on the model parameters in `obj`.
+
+**Precondition**: `_loglikcache` updated by [`discrete_corelikelihood!`](@ref)
+
+# examples
+
+```jldoctest
+julia> net = readTopology("(((A:2.0,(B:1.0)#H1:0.1::0.9):1.5,(C:0.6,#H1:1.0::0.1):1.0):0.5,D:2.0);");
+
+julia> m1 = BinaryTraitSubstitutionModel([0.1, 0.1], ["lo", "hi"]); # arbitrary rates
+
+julia> using DataFrames
+
+julia> dat = DataFrame(species=["C","A","B","D"], trait=["hi","lo","lo","hi"]);
+
+julia> fit = fitdiscrete(net, m1, dat); # optimized rates: α=0.27 and β=0.35
+
+julia> plhw = PhyloNetworks.posterior_loghybridweight(fit, "H1");
+
+julia> round(exp(plhw), digits=5) # posterior probability of going through minor hybrid edge
+0.08017
+
+julia> hn = net.node[3]; getparentedgeminor(hn).gamma # prior probability
+0.1
+```
+"""
+function posterior_loghybridweight(obj::SSM, hybridname::String, trait = 1)
+    hn_index = findfirst(n -> n.name == hybridname, obj.net.node)
+    isnothing(hn_index) && error("node named $hybridname not found")
+    hn = obj.net.node[hn_index]
+    hn.hybrid || error("node named $hybridname is not a hybrid node")
+    me = getparentedgeminor(hn)
+    posterior_loghybridweight(obj, me.number, trait)
+end
+function posterior_loghybridweight(obj::SSM, edgenum::Integer, trait = 1)
+    tpp = posterior_logtreeweight(obj, trait) # size: (ntree,) or (ntree,ntraits)
+    hasedge = tree -> any(e.number == edgenum for e in tree.edge)
+    tokeep = map(hasedge, obj.displayedtree)
+    tppe = view(tpp, tokeep, :) # makes it a matrix
+    epp = dropdims(mapslices(logsumexp, tppe, dims=1); dims=2)
+    return (size(epp)==(1,) ? epp[1] : epp) # scalar or vector
+end
+
 """
     traitlabels2indices(data, model::SubstitutionModel)
 

diff --git a/test/test_traitLikDiscrete.jl b/test/test_traitLikDiscrete.jl
@@ -340,6 +340,7 @@ asr = ancestralStateReconstruction(fit1)
 pltw = [-0.08356534477069566, -2.5236181051014333]
 @test PhyloNetworks.posterior_logtreeweight(fit1) ≈ pltw atol=1e-5
 @test PhyloNetworks.posterior_logtreeweight(fit1, 1:1) ≈ reshape(pltw, (2,1)) atol=1e-5
+@test PhyloNetworks.posterior_loghybridweight(fit1, "H1") ≈ -2.5236227134322293
 
 end # end of testset, fixed topology