From 987308caa1f93cd457ea42d2a5b4ec022a5e9d4e Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 13:31:43 +0100
Subject: [PATCH 01/10] add-njt

---
 workflow/envs/AmpSeeker-python.yaml           |   3 +-
 ...lysis.ipynb => population-structure.ipynb} | 189 ++++++++++++++++--
 2 files changed, 179 insertions(+), 13 deletions(-)
 rename workflow/notebooks/{principal-component-analysis.ipynb => population-structure.ipynb} (51%)

diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml
index ab21f28..f47c406 100644
--- a/workflow/envs/AmpSeeker-python.yaml
+++ b/workflow/envs/AmpSeeker-python.yaml
@@ -17,4 +17,5 @@ dependencies:
     - igv_notebook
     - papermill
     - scikit-allel
-    - pysam
\ No newline at end of file
+    - pysam
+    - anjl
\ No newline at end of file
diff --git a/workflow/notebooks/principal-component-analysis.ipynb b/workflow/notebooks/population-structure.ipynb
similarity index 51%
rename from workflow/notebooks/principal-component-analysis.ipynb
rename to workflow/notebooks/population-structure.ipynb
index 36e2344..35a096d 100644
--- a/workflow/notebooks/principal-component-analysis.ipynb
+++ b/workflow/notebooks/population-structure.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "8c01954f",
    "metadata": {
     "tags": [
@@ -24,7 +24,7 @@
     "    sampleIDs = metadata.sampleID.to_list()\n",
     "    \n",
     "    # load vcf and get genotypes and positions\n",
-    "    vcf = allel.read_vcf(vcf_path, fields='*')\n",
+    "    vcf = allel.read_vcf(vcf_path)\n",
     "    samples = vcf['samples']\n",
     "    # keep only samples in qcpass metadata \n",
     "    sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n",
@@ -34,14 +34,17 @@
     "    geno = geno.compress(sample_mask, axis=1)\n",
     "    pos = vcf['variants/POS']\n",
     "    contig = vcf['variants/CHROM']\n",
-    "    indel = vcf['variants/INDEL']\n",
+    "#     indel = vcf['variants/INDEL']\n",
+    "    \n",
+    "#     # remove any indels \n",
+    "#     geno = geno.compress(~indel, axis=0)\n",
+    "#     pos = pos[~indel]\n",
+    "#     contig = contig[~indel]\n",
     "    \n",
-    "    # remove indels \n",
-    "    geno = geno.compress(~indel, axis=0)\n",
-    "    pos = pos[~indel]\n",
-    "    contig = contig[~indel]\n",
+    "    metadata = metadata.set_index('sampleID')\n",
+    "    samples = samples[sample_mask]\n",
     "    \n",
-    "    return geno, pos, contig, samples[sample_mask]\n",
+    "    return geno, pos, contig, metadata.loc[samples, :]\n",
     "\n",
     "def pca(metadata_path, vcf_path, n_components = 6):\n",
     "    \"\"\"\n",
@@ -114,7 +117,7 @@
    },
    "outputs": [],
    "source": [
-    "dataset = 'ampseq-vigg-01'\n",
+    "dataset = 'ampseq-vigg002'\n",
     "vcf_path = f\"../../results/vcfs/targets/{dataset}.annot.vcf\"\n",
     "metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n",
     "cohort_cols = 'taxon,location'"
@@ -151,7 +154,7 @@
    "id": "8b018edf",
    "metadata": {},
    "source": [
-    "### Variance explained\n",
+    "#### Variance explained\n",
     "\n",
     "As a general rule of thumb, when the variance explained for each PC begins to flatten out, that is when the PCs are no longer informative."
    ]
@@ -202,10 +205,172 @@
     "    fig2.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "82bbe3f6",
+   "metadata": {},
+   "source": [
+    "## NJT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e53849e",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import numba\n",
+    "from scipy.spatial.distance import squareform  # type: ignore\n",
+    "\n",
+    "@numba.njit(parallel=True)\n",
+    "def multiallelic_diplotype_pdist(X, metric):\n",
+    "    \"\"\"Optimised implementation of pairwise distance between diplotypes.\n",
+    "\n",
+    "    N.B., here we assume the array X provides diplotypes as genotype allele\n",
+    "    counts, with axes in the order (n_samples, n_sites, n_alleles).\n",
+    "\n",
+    "    Computation will be faster if X is a contiguous (C order) array.\n",
+    "\n",
+    "    The metric argument is the function to compute distance for a pair of\n",
+    "    diplotypes. This can be a numba jitted function.\n",
+    "\n",
+    "    \"\"\"\n",
+    "    n_samples = X.shape[0]\n",
+    "    n_pairs = (n_samples * (n_samples - 1)) // 2\n",
+    "    out = np.zeros(n_pairs, dtype=np.float32)\n",
+    "\n",
+    "    # Loop over samples, first in pair.\n",
+    "    for i in range(n_samples):\n",
+    "        x = X[i, :, :]\n",
+    "\n",
+    "        # Loop over observations again, second in pair.\n",
+    "        for j in numba.prange(i + 1, n_samples):\n",
+    "            y = X[j, :, :]\n",
+    "\n",
+    "            # Compute distance for the current pair.\n",
+    "            d = metric(x, y)\n",
+    "\n",
+    "            # Store result for the current pair.\n",
+    "            k = square_to_condensed(i, j, n_samples)\n",
+    "            out[k] = d\n",
+    "\n",
+    "    return out\n",
+    "\n",
+    "\n",
+    "@numba.njit\n",
+    "def square_to_condensed(i, j, n):\n",
+    "    \"\"\"Convert distance matrix coordinates from square form (i, j) to condensed form.\"\"\"\n",
+    "\n",
+    "    assert i != j, \"no diagonal elements in condensed matrix\"\n",
+    "    if i < j:\n",
+    "        i, j = j, i\n",
+    "    return n * j - j * (j + 1) // 2 + i - 1 - j\n",
+    "\n",
+    "\n",
+    "@numba.njit\n",
+    "def multiallelic_diplotype_mean_cityblock(x, y):\n",
+    "    \"\"\"Compute the mean cityblock distance between two diplotypes x and y. The\n",
+    "    diplotype vectors are expected as genotype allele counts, i.e., x and y\n",
+    "    should have the same shape (n_sites, n_alleles).\n",
+    "\n",
+    "    N.B., here we compute the mean value of the distance over sites where\n",
+    "    both individuals have a called genotype. This avoids computing distance\n",
+    "    at missing sites.\n",
+    "\n",
+    "    \"\"\"\n",
+    "    n_sites = x.shape[0]\n",
+    "    n_alleles = x.shape[1]\n",
+    "    distance = np.float32(0)\n",
+    "    n_sites_called = np.float32(0)\n",
+    "\n",
+    "    # Loop over sites.\n",
+    "    for i in range(n_sites):\n",
+    "        x_is_called = False\n",
+    "        y_is_called = False\n",
+    "        d = np.float32(0)\n",
+    "\n",
+    "        # Loop over alleles.\n",
+    "        for j in range(n_alleles):\n",
+    "            # Access allele counts.\n",
+    "            xc = np.float32(x[i, j])\n",
+    "            yc = np.float32(y[i, j])\n",
+    "\n",
+    "            # Check if any alleles observed.\n",
+    "            x_is_called = x_is_called or (xc > 0)\n",
+    "            y_is_called = y_is_called or (yc > 0)\n",
+    "\n",
+    "            # Compute cityblock distance (absolute difference).\n",
+    "            d += np.fabs(xc - yc)\n",
+    "\n",
+    "        # Accumulate distance for the current pair, but only if both samples\n",
+    "        # have a called genotype.\n",
+    "        if x_is_called and y_is_called:\n",
+    "            distance += d\n",
+    "            n_sites_called += np.float32(1)\n",
+    "\n",
+    "    # Compute the mean distance over sites with called genotypes.\n",
+    "    if n_sites_called > 0:\n",
+    "        mean_distance = distance / n_sites_called\n",
+    "    else:\n",
+    "        mean_distance = np.nan\n",
+    "\n",
+    "    return mean_distance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f0977f1",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n",
+    "\n",
+    "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "783b9b6c",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import anjl\n",
+    "\n",
+    "ac = allel.GenotypeArray(geno).to_allele_counts(max_allele=3)\n",
+    "X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))\n",
+    "\n",
+    "dists = multiallelic_diplotype_pdist(X, metric=multiallelic_diplotype_mean_cityblock)\n",
+    "dists = squareform(dists)\n",
+    "Z = anjl.rapid_nj(dists)\n",
+    "\n",
+    "anjl.plot(\n",
+    "    Z,\n",
+    "    leaf_data=df_samples.reset_index(),\n",
+    "    color=\"location\",\n",
+    "    hover_name=\"sampleID\",\n",
+    "    hover_data=cohort_cols.split(\",\"),\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "62029894",
+   "id": "7d50cbd1",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -228,7 +393,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

From 4b267b58019188901bd2694d548e30a431fcd084 Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 13:35:33 +0100
Subject: [PATCH 02/10] add-njt

---
 .test/config/config_agvampir.yaml             |  2 +-
 .test/config/config_fastqauto.yaml            |  2 +-
 .test/config/config_fastqmetadata.yaml        |  2 +-
 config/config.yaml                            |  2 +-
 workflow/notebooks/population-structure.ipynb | 14 +++++++-------
 workflow/rules/analysis.smk                   | 10 +++++-----
 workflow/rules/common.smk                     |  4 ++--
 7 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/.test/config/config_agvampir.yaml b/.test/config/config_agvampir.yaml
index 32dac14..a8f5313 100644
--- a/.test/config/config_agvampir.yaml
+++ b/.test/config/config_agvampir.yaml
@@ -40,7 +40,7 @@ quality-control:
 analysis:  
   igv: True
   sample-map: True
-  pca: True
+  population-structure: True
   genetic-diversity: True
   allele-frequencies: True
 
diff --git a/.test/config/config_fastqauto.yaml b/.test/config/config_fastqauto.yaml
index 44bcf40..d22cd04 100644
--- a/.test/config/config_fastqauto.yaml
+++ b/.test/config/config_fastqauto.yaml
@@ -42,7 +42,7 @@ quality-control:
 analysis:  
   igv: True
   sample-map: False # needs lat and longs
-  pca:  True
+  population-structure:  True
   genetic-diversity: True
   allele-frequencies: True
 
diff --git a/.test/config/config_fastqmetadata.yaml b/.test/config/config_fastqmetadata.yaml
index 4b77b52..5e66d5d 100644
--- a/.test/config/config_fastqmetadata.yaml
+++ b/.test/config/config_fastqmetadata.yaml
@@ -39,7 +39,7 @@ quality-control:
 analysis:  
   igv: True
   sample-map: True
-  pca: True
+  population-structure: True
   genetic-diversity: True
   allele-frequencies: True
 
diff --git a/config/config.yaml b/config/config.yaml
index 1a8c097..32187f9 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -40,7 +40,7 @@ quality-control:
 analysis:  
   igv: True
   sample-map: False # needs lat and longs
-  pca: True
+  population-structure: True
   genetic-diversity: True
   allele-frequencies: True
 
diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb
index 35a096d..8c96bf4 100644
--- a/workflow/notebooks/population-structure.ipynb
+++ b/workflow/notebooks/population-structure.ipynb
@@ -34,12 +34,12 @@
     "    geno = geno.compress(sample_mask, axis=1)\n",
     "    pos = vcf['variants/POS']\n",
     "    contig = vcf['variants/CHROM']\n",
-    "#     indel = vcf['variants/INDEL']\n",
+    "    indel = vcf['variants/INDEL']\n",
     "    \n",
-    "#     # remove any indels \n",
-    "#     geno = geno.compress(~indel, axis=0)\n",
-    "#     pos = pos[~indel]\n",
-    "#     contig = contig[~indel]\n",
+    "    # remove any indels \n",
+    "    geno = geno.compress(~indel, axis=0)\n",
+    "    pos = pos[~indel]\n",
+    "    contig = contig[~indel]\n",
     "    \n",
     "    metadata = metadata.set_index('sampleID')\n",
     "    samples = samples[sample_mask]\n",
@@ -128,9 +128,9 @@
    "id": "696938a0",
    "metadata": {},
    "source": [
-    "## PCA\n",
+    "## Population structure\n",
     "\n",
-    "In this notebook, we run a principal components analysis on the amplicon sequencing variant data, plotting PC1 v PC2 and PC3 v PC4, and the variance explained by the model."
+    "In this notebook, we run a principal components analysis and build a neighbour joining tree on the amplicon sequencing variant data. For the PCA, we will plot PC1 v PC2 and PC3 v PC4, and the variance explained by the model."
    ]
   },
   {
diff --git a/workflow/rules/analysis.smk b/workflow/rules/analysis.smk
index 64a009b..c79f0b6 100644
--- a/workflow/rules/analysis.smk
+++ b/workflow/rules/analysis.smk
@@ -25,20 +25,20 @@ rule igv_notebook:
         """
 
 
-rule pca:
+rule population_structure:
     input:
-        nb=f"{workflow.basedir}/notebooks/principal-component-analysis.ipynb",
+        nb=f"{workflow.basedir}/notebooks/population-structure.ipynb",
         kernel="results/.kernel.set",
         vcf=expand("results/vcfs/targets/{dataset}.annot.vcf", dataset=dataset),
         metadata="results/config/metadata.qcpass.tsv",
         taxon_complete = "results/.taxon.complete" if panel == "ag-vampir" else [],
     output:
-        nb="results/notebooks/principal-component-analysis.ipynb",
-        docs_nb="docs/ampseeker-results/notebooks/principal-component-analysis.ipynb",
+        nb="results/notebooks/population-structure.ipynb",
+        docs_nb="docs/ampseeker-results/notebooks/population-structure.ipynb",
     conda:
         "../envs/AmpSeeker-python.yaml"
     log:
-        "logs/notebooks/principal-component-analysis.log",
+        "logs/notebooks/population-structure.log",
     params:
         dataset=dataset,
         cohort_cols=cohort_cols,
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 232f176..e79c1a3 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -179,8 +179,8 @@ def AmpSeekerOutputs(wildcards):
         inputs.extend(
             expand(
                 [
-                    "results/notebooks/principal-component-analysis.ipynb",
-                    "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb",
+                    "results/notebooks/population-structure.ipynb",
+                    "docs/ampseeker-results/notebooks/population-structure.ipynb",
                 ],
             )
         )

From 061545ff96380f2110dfffec40bcc8afe8e87afd Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 13:37:43 +0100
Subject: [PATCH 03/10] fixes

---
 workflow/rules/common.smk       |  2 +-
 workflow/rules/jupyter-book.smk | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index e79c1a3..2ece668 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -175,7 +175,7 @@ def AmpSeekerOutputs(wildcards):
             )
         )
 
-    if config["analysis"]["pca"]:
+    if config["analysis"]["population-structure"]:
         inputs.extend(
             expand(
                 [
diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk
index 13ea993..649df7e 100644
--- a/workflow/rules/jupyter-book.smk
+++ b/workflow/rules/jupyter-book.smk
@@ -17,9 +17,9 @@ rule jupyterbook:
             if config["quality-control"]["coverage"]
             else []
         ),
-        pca=(
-            "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb"
-            if config["analysis"]["pca"]
+        pop_structure=(
+            "docs/ampseeker-results/notebooks/population-structure.ipynb"
+            if config["analysis"]["population-structure"]
             else []
         ),
         af=(
@@ -91,9 +91,9 @@ rule process_notebooks:
             if config["quality-control"]["coverage"]
             else []
         ),
-        pca=(
-            "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb"
-            if config["analysis"]["pca"]
+        pop_structure=(
+            "docs/ampseeker-results/notebooks/population-structure.ipynb"
+            if config["analysis"]["population-structure"]
             else []
         ),
         af=(

From 3c259f85708f8d201d6acaba33ba54f3135be176 Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 14:11:48 +0100
Subject: [PATCH 04/10] fixes

---
 .github/workflows/github-action-ampseq.yml           | 2 +-
 .github/workflows/github-action-ampseq_agvampir.yml  | 2 +-
 .github/workflows/github-action-ampseq_fastqauto.yml | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/github-action-ampseq.yml b/.github/workflows/github-action-ampseq.yml
index 176c47a..f5307a9 100644
--- a/.github/workflows/github-action-ampseq.yml
+++ b/.github/workflows/github-action-ampseq.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8']
+        python-version: ['3.10']
     steps:
       - name: Set up python3
         uses: actions/setup-python@v2
diff --git a/.github/workflows/github-action-ampseq_agvampir.yml b/.github/workflows/github-action-ampseq_agvampir.yml
index d9c5021..1ba63d3 100644
--- a/.github/workflows/github-action-ampseq_agvampir.yml
+++ b/.github/workflows/github-action-ampseq_agvampir.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.10', '3.11']
+        python-version: ['3.11', '3.12']
     steps:
       - name: Set up python3
         uses: actions/setup-python@v2
diff --git a/.github/workflows/github-action-ampseq_fastqauto.yml b/.github/workflows/github-action-ampseq_fastqauto.yml
index 077ce0a..d8a8be6 100644
--- a/.github/workflows/github-action-ampseq_fastqauto.yml
+++ b/.github/workflows/github-action-ampseq_fastqauto.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.9']
+        python-version: ['3.10']
     steps:
       - name: Set up python3
         uses: actions/setup-python@v2

From e4532ceab8bb74964c8c1ece6630b8b65a642cb8 Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 14:31:49 +0100
Subject: [PATCH 05/10] fixes

---
 workflow/envs/AmpSeeker-python.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml
index f47c406..ba880a6 100644
--- a/workflow/envs/AmpSeeker-python.yaml
+++ b/workflow/envs/AmpSeeker-python.yaml
@@ -4,7 +4,7 @@ channels:
   - conda-forge
   - anaconda
 dependencies:
-  - python=3.9
+  - python=3.10
   - numpy
   - pandas=2.1.4
   - plotly

From 49f8bdcbea279e134dae88e948fd3a63afa13e1a Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 14:33:09 +0100
Subject: [PATCH 06/10] upgrade pytthon version in pyhton yaml

---
 workflow/envs/AmpSeeker-python.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml
index ba880a6..9e22042 100644
--- a/workflow/envs/AmpSeeker-python.yaml
+++ b/workflow/envs/AmpSeeker-python.yaml
@@ -4,7 +4,7 @@ channels:
   - conda-forge
   - anaconda
 dependencies:
-  - python=3.10
+  - python>=3.10,<3.13
   - numpy
   - pandas=2.1.4
   - plotly

From d61973d4936d84083cbdf37815ad400ae11b0dc3 Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 14:46:07 +0100
Subject: [PATCH 07/10] fix

---
 workflow/notebooks/population-structure.ipynb | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb
index 8c96bf4..5a03cbe 100644
--- a/workflow/notebooks/population-structure.ipynb
+++ b/workflow/notebooks/population-structure.ipynb
@@ -20,11 +20,9 @@
     "    \"\"\"\n",
     "    Load VCF and filter poor-quality samples\n",
     "    \"\"\"\n",
-    "    \n",
-    "    sampleIDs = metadata.sampleID.to_list()\n",
-    "    \n",
+    "       \n",
     "    # load vcf and get genotypes and positions\n",
-    "    vcf = allel.read_vcf(vcf_path)\n",
+    "    vcf = allel.read_vcf(vcf_path, fields=\"*\")\n",
     "    samples = vcf['samples']\n",
     "    # keep only samples in qcpass metadata \n",
     "    sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n",

From a77ad5744997cc15d4623b87522b010fab5a57f0 Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 15:12:23 +0100
Subject: [PATCH 08/10] adding dipclust nb but not integrating yet

---
 workflow/notebooks/diplotype-clustering.ipynb | 332 ++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 workflow/notebooks/diplotype-clustering.ipynb

diff --git a/workflow/notebooks/diplotype-clustering.ipynb b/workflow/notebooks/diplotype-clustering.ipynb
new file mode 100644
index 0000000..4b1aead
--- /dev/null
+++ b/workflow/notebooks/diplotype-clustering.ipynb
@@ -0,0 +1,332 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c01954f",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import allel\n",
+    "import numpy as np\n",
+    "import pandas as pd \n",
+    "import plotly.express as px\n",
+    "\n",
+    "def load_vcf(vcf_path, metadata):\n",
+    "    \"\"\"\n",
+    "    Load VCF and filter poor-quality samples\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    sampleIDs = metadata.sampleID.to_list()\n",
+    "    \n",
+    "    # load vcf and get genotypes and positions\n",
+    "    vcf = allel.read_vcf(vcf_path, fields=\"*\")\n",
+    "    samples = vcf['samples']\n",
+    "    # keep only samples in qcpass metadata \n",
+    "    sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n",
+    "    \n",
+    "    # remove low quality samples \n",
+    "    geno = allel.GenotypeArray(vcf['calldata/GT'])\n",
+    "    geno = geno.compress(sample_mask, axis=1)\n",
+    "    pos = vcf['variants/POS']\n",
+    "    contig = vcf['variants/CHROM']\n",
+    "    indel = vcf['variants/INDEL']\n",
+    "    \n",
+    "    # remove any indels \n",
+    "    geno = geno.compress(~indel, axis=0)\n",
+    "    pos = pos[~indel]\n",
+    "    contig = contig[~indel]\n",
+    "    \n",
+    "    metadata = metadata.set_index('sampleID')\n",
+    "    samples = samples[sample_mask]\n",
+    "    \n",
+    "    return geno, pos, contig, metadata.loc[samples, :]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6302ed5",
+   "metadata": {
+    "tags": [
+     "parameters",
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "dataset = 'ampseq-vigg002'\n",
+    "vcf_path = f\"../../results/vcfs/targets/{dataset}.annot.vcf\"\n",
+    "metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n",
+    "cohort_cols = 'taxon,location'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "696938a0",
+   "metadata": {},
+   "source": [
+    "## Diplotype clustering"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a7ae6392",
+   "metadata": {
+    "tags": [
+     "remove-input",
+     "remove-output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "cohort_cols = cohort_cols.split(\",\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e53849e",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import numba\n",
+    "from scipy.spatial.distance import squareform  # type: ignore\n",
+    "\n",
+    "@numba.njit(parallel=True)\n",
+    "def multiallelic_diplotype_pdist(X, metric):\n",
+    "    \"\"\"Optimised implementation of pairwise distance between diplotypes.\n",
+    "\n",
+    "    N.B., here we assume the array X provides diplotypes as genotype allele\n",
+    "    counts, with axes in the order (n_samples, n_sites, n_alleles).\n",
+    "\n",
+    "    Computation will be faster if X is a contiguous (C order) array.\n",
+    "\n",
+    "    The metric argument is the function to compute distance for a pair of\n",
+    "    diplotypes. This can be a numba jitted function.\n",
+    "\n",
+    "    \"\"\"\n",
+    "    n_samples = X.shape[0]\n",
+    "    n_pairs = (n_samples * (n_samples - 1)) // 2\n",
+    "    out = np.zeros(n_pairs, dtype=np.float32)\n",
+    "\n",
+    "    # Loop over samples, first in pair.\n",
+    "    for i in range(n_samples):\n",
+    "        x = X[i, :, :]\n",
+    "\n",
+    "        # Loop over observations again, second in pair.\n",
+    "        for j in numba.prange(i + 1, n_samples):\n",
+    "            y = X[j, :, :]\n",
+    "\n",
+    "            # Compute distance for the current pair.\n",
+    "            d = metric(x, y)\n",
+    "\n",
+    "            # Store result for the current pair.\n",
+    "            k = square_to_condensed(i, j, n_samples)\n",
+    "            out[k] = d\n",
+    "\n",
+    "    return out\n",
+    "\n",
+    "\n",
+    "@numba.njit\n",
+    "def square_to_condensed(i, j, n):\n",
+    "    \"\"\"Convert distance matrix coordinates from square form (i, j) to condensed form.\"\"\"\n",
+    "\n",
+    "    assert i != j, \"no diagonal elements in condensed matrix\"\n",
+    "    if i < j:\n",
+    "        i, j = j, i\n",
+    "    return n * j - j * (j + 1) // 2 + i - 1 - j\n",
+    "\n",
+    "\n",
+    "@numba.njit\n",
+    "def multiallelic_diplotype_mean_cityblock(x, y):\n",
+    "    \"\"\"Compute the mean cityblock distance between two diplotypes x and y. The\n",
+    "    diplotype vectors are expected as genotype allele counts, i.e., x and y\n",
+    "    should have the same shape (n_sites, n_alleles).\n",
+    "\n",
+    "    N.B., here we compute the mean value of the distance over sites where\n",
+    "    both individuals have a called genotype. This avoids computing distance\n",
+    "    at missing sites.\n",
+    "\n",
+    "    \"\"\"\n",
+    "    n_sites = x.shape[0]\n",
+    "    n_alleles = x.shape[1]\n",
+    "    distance = np.float32(0)\n",
+    "    n_sites_called = np.float32(0)\n",
+    "\n",
+    "    # Loop over sites.\n",
+    "    for i in range(n_sites):\n",
+    "        x_is_called = False\n",
+    "        y_is_called = False\n",
+    "        d = np.float32(0)\n",
+    "\n",
+    "        # Loop over alleles.\n",
+    "        for j in range(n_alleles):\n",
+    "            # Access allele counts.\n",
+    "            xc = np.float32(x[i, j])\n",
+    "            yc = np.float32(y[i, j])\n",
+    "\n",
+    "            # Check if any alleles observed.\n",
+    "            x_is_called = x_is_called or (xc > 0)\n",
+    "            y_is_called = y_is_called or (yc > 0)\n",
+    "\n",
+    "            # Compute cityblock distance (absolute difference).\n",
+    "            d += np.fabs(xc - yc)\n",
+    "\n",
+    "        # Accumulate distance for the current pair, but only if both samples\n",
+    "        # have a called genotype.\n",
+    "        if x_is_called and y_is_called:\n",
+    "            distance += d\n",
+    "            n_sites_called += np.float32(1)\n",
+    "\n",
+    "    # Compute the mean distance over sites with called genotypes.\n",
+    "    if n_sites_called > 0:\n",
+    "        mean_distance = distance / n_sites_called\n",
+    "    else:\n",
+    "        mean_distance = np.nan\n",
+    "\n",
+    "    return mean_distance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f0977f1",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n",
+    "\n",
+    "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "783b9b6c",
+   "metadata": {
+    "tags": [
+     "remove-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "ac = allel.GenotypeArray(geno).to_allele_counts(max_allele=3)\n",
+    "X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))\n",
+    "\n",
+    "dists = multiallelic_diplotype_pdist(X, metric=multiallelic_diplotype_mean_cityblock)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2821a092",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from malariagen_data.plotly_dendrogram import plot_dendrogram"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d50cbd1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "distance_metric = 'cityblock'\n",
+    "\n",
+    "fig, leaf_data = plot_dendrogram(\n",
+    "    dist=dists,\n",
+    "    linkage_method=\"complete\",\n",
+    "    count_sort=True,\n",
+    "    distance_sort=False,\n",
+    "    render_mode=\"svg\",\n",
+    "    width=800,\n",
+    "    height=500,\n",
+    "    title=dataset,\n",
+    "    line_width=0.5,\n",
+    "    line_color='black',\n",
+    "    marker_size=5,\n",
+    "    leaf_data=df_samples.reset_index(),\n",
+    "    leaf_hover_name=\"sampleID\",\n",
+    "    leaf_hover_data=cohort_cols,\n",
+    "    leaf_color=\"taxon\",\n",
+    "    leaf_symbol=None,\n",
+    "    leaf_y=-0.05,\n",
+    "    leaf_color_discrete_map=None,\n",
+    "    leaf_category_orders=None,\n",
+    "    template=\"simple_white\",\n",
+    "    y_axis_title=f\"Distance ({distance_metric})\",\n",
+    "    y_axis_buffer=0.1,\n",
+    ")\n",
+    "\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64b6f43b",
+   "metadata": {},
+   "source": [
+    "### Diplotype clustering at target loci"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a996ce9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "major_loci = '2L:2_000_000-3_000_000'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9404b863",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_bed = pd.read_csv(\"../../config/ag-vampir.bed\", sep=\"\\t\", header=None)\n",
+    "df_bed"
+   ]
+  }
+ ],
+ "metadata": {
+  "celltoolbar": "Tags",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From bc3bad756072d3a3df7e34e13c2d645f06171b8c Mon Sep 17 00:00:00 2001
From: Sanjay Nagi <sanjay.c.nagi@gmail.com>
Date: Sun, 13 Oct 2024 15:13:39 +0100
Subject: [PATCH 09/10] adding dipclust nb but not integrating yet

---
 workflow/notebooks/population-structure.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb
index 5a03cbe..1e8e25f 100644
--- a/workflow/notebooks/population-structure.ipynb
+++ b/workflow/notebooks/population-structure.ipynb
@@ -331,7 +331,7 @@
    },
    "outputs": [],
    "source": [
-    "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n",
+    "df_samples = pd.read_csv(metadata_path, sep=\"\\t\", index_col=0)\n",
     "\n",
     "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)"
    ]

From 8556c2a29d491e8cacd48d425c9b780db132a751 Mon Sep 17 00:00:00 2001
From: Sanjay Curtis Nagi <34922269+sanjaynagi@users.noreply.github.com>
Date: Sun, 13 Oct 2024 15:28:16 +0100
Subject: [PATCH 10/10] Update workflow/notebooks/population-structure.ipynb

---
 workflow/notebooks/population-structure.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb
index 1e8e25f..76ac3be 100644
--- a/workflow/notebooks/population-structure.ipynb
+++ b/workflow/notebooks/population-structure.ipynb
@@ -361,7 +361,7 @@
     "    leaf_data=df_samples.reset_index(),\n",
     "    color=\"location\",\n",
     "    hover_name=\"sampleID\",\n",
-    "    hover_data=cohort_cols.split(\",\"),\n",
+    "    hover_data=cohort_cols,\n",
     ")"
    ]
   },