From 987308caa1f93cd457ea42d2a5b4ec022a5e9d4e Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 13:31:43 +0100 Subject: [PATCH 01/10] add-njt --- workflow/envs/AmpSeeker-python.yaml | 3 +- ...lysis.ipynb => population-structure.ipynb} | 189 ++++++++++++++++-- 2 files changed, 179 insertions(+), 13 deletions(-) rename workflow/notebooks/{principal-component-analysis.ipynb => population-structure.ipynb} (51%) diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml index ab21f28..f47c406 100644 --- a/workflow/envs/AmpSeeker-python.yaml +++ b/workflow/envs/AmpSeeker-python.yaml @@ -17,4 +17,5 @@ dependencies: - igv_notebook - papermill - scikit-allel - - pysam \ No newline at end of file + - pysam + - anjl \ No newline at end of file diff --git a/workflow/notebooks/principal-component-analysis.ipynb b/workflow/notebooks/population-structure.ipynb similarity index 51% rename from workflow/notebooks/principal-component-analysis.ipynb rename to workflow/notebooks/population-structure.ipynb index 36e2344..35a096d 100644 --- a/workflow/notebooks/principal-component-analysis.ipynb +++ b/workflow/notebooks/population-structure.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "8c01954f", "metadata": { "tags": [ @@ -24,7 +24,7 @@ " sampleIDs = metadata.sampleID.to_list()\n", " \n", " # load vcf and get genotypes and positions\n", - " vcf = allel.read_vcf(vcf_path, fields='*')\n", + " vcf = allel.read_vcf(vcf_path)\n", " samples = vcf['samples']\n", " # keep only samples in qcpass metadata \n", " sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n", @@ -34,14 +34,17 @@ " geno = geno.compress(sample_mask, axis=1)\n", " pos = vcf['variants/POS']\n", " contig = vcf['variants/CHROM']\n", - " indel = vcf['variants/INDEL']\n", + "# indel = vcf['variants/INDEL']\n", + " \n", + "# # remove any indels \n", + "# geno = geno.compress(~indel, axis=0)\n", + "# pos = pos[~indel]\n", + "# contig = contig[~indel]\n", " \n", - " # remove indels \n", - " geno = geno.compress(~indel, axis=0)\n", - " pos = pos[~indel]\n", - " contig = contig[~indel]\n", + " metadata = metadata.set_index('sampleID')\n", + " samples = samples[sample_mask]\n", " \n", - " return geno, pos, contig, samples[sample_mask]\n", + " return geno, pos, contig, metadata.loc[samples, :]\n", "\n", "def pca(metadata_path, vcf_path, n_components = 6):\n", " \"\"\"\n", @@ -114,7 +117,7 @@ }, "outputs": [], "source": [ - "dataset = 'ampseq-vigg-01'\n", + "dataset = 'ampseq-vigg002'\n", "vcf_path = f\"../../results/vcfs/targets/{dataset}.annot.vcf\"\n", "metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n", "cohort_cols = 'taxon,location'" @@ -151,7 +154,7 @@ "id": "8b018edf", "metadata": {}, "source": [ - "### Variance explained\n", + "#### Variance explained\n", "\n", "As a general rule of thumb, when the variance explained for each PC begins to flatten out, that is when the PCs are no longer informative." ] @@ -202,10 +205,172 @@ " fig2.show()" ] }, + { + "cell_type": "markdown", + "id": "82bbe3f6", + "metadata": {}, + "source": [ + "## NJT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e53849e", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import numba\n", + "from scipy.spatial.distance import squareform # type: ignore\n", + "\n", + "@numba.njit(parallel=True)\n", + "def multiallelic_diplotype_pdist(X, metric):\n", + " \"\"\"Optimised implementation of pairwise distance between diplotypes.\n", + "\n", + " N.B., here we assume the array X provides diplotypes as genotype allele\n", + " counts, with axes in the order (n_samples, n_sites, n_alleles).\n", + "\n", + " Computation will be faster if X is a contiguous (C order) array.\n", + "\n", + " The metric argument is the function to compute distance for a pair of\n", + " diplotypes. This can be a numba jitted function.\n", + "\n", + " \"\"\"\n", + " n_samples = X.shape[0]\n", + " n_pairs = (n_samples * (n_samples - 1)) // 2\n", + " out = np.zeros(n_pairs, dtype=np.float32)\n", + "\n", + " # Loop over samples, first in pair.\n", + " for i in range(n_samples):\n", + " x = X[i, :, :]\n", + "\n", + " # Loop over observations again, second in pair.\n", + " for j in numba.prange(i + 1, n_samples):\n", + " y = X[j, :, :]\n", + "\n", + " # Compute distance for the current pair.\n", + " d = metric(x, y)\n", + "\n", + " # Store result for the current pair.\n", + " k = square_to_condensed(i, j, n_samples)\n", + " out[k] = d\n", + "\n", + " return out\n", + "\n", + "\n", + "@numba.njit\n", + "def square_to_condensed(i, j, n):\n", + " \"\"\"Convert distance matrix coordinates from square form (i, j) to condensed form.\"\"\"\n", + "\n", + " assert i != j, \"no diagonal elements in condensed matrix\"\n", + " if i < j:\n", + " i, j = j, i\n", + " return n * j - j * (j + 1) // 2 + i - 1 - j\n", + "\n", + "\n", + "@numba.njit\n", + "def multiallelic_diplotype_mean_cityblock(x, y):\n", + " \"\"\"Compute the mean cityblock distance between two diplotypes x and y. The\n", + " diplotype vectors are expected as genotype allele counts, i.e., x and y\n", + " should have the same shape (n_sites, n_alleles).\n", + "\n", + " N.B., here we compute the mean value of the distance over sites where\n", + " both individuals have a called genotype. This avoids computing distance\n", + " at missing sites.\n", + "\n", + " \"\"\"\n", + " n_sites = x.shape[0]\n", + " n_alleles = x.shape[1]\n", + " distance = np.float32(0)\n", + " n_sites_called = np.float32(0)\n", + "\n", + " # Loop over sites.\n", + " for i in range(n_sites):\n", + " x_is_called = False\n", + " y_is_called = False\n", + " d = np.float32(0)\n", + "\n", + " # Loop over alleles.\n", + " for j in range(n_alleles):\n", + " # Access allele counts.\n", + " xc = np.float32(x[i, j])\n", + " yc = np.float32(y[i, j])\n", + "\n", + " # Check if any alleles observed.\n", + " x_is_called = x_is_called or (xc > 0)\n", + " y_is_called = y_is_called or (yc > 0)\n", + "\n", + " # Compute cityblock distance (absolute difference).\n", + " d += np.fabs(xc - yc)\n", + "\n", + " # Accumulate distance for the current pair, but only if both samples\n", + " # have a called genotype.\n", + " if x_is_called and y_is_called:\n", + " distance += d\n", + " n_sites_called += np.float32(1)\n", + "\n", + " # Compute the mean distance over sites with called genotypes.\n", + " if n_sites_called > 0:\n", + " mean_distance = distance / n_sites_called\n", + " else:\n", + " mean_distance = np.nan\n", + "\n", + " return mean_distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0977f1", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n", + "\n", + "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "783b9b6c", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import anjl\n", + "\n", + "ac = allel.GenotypeArray(geno).to_allele_counts(max_allele=3)\n", + "X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))\n", + "\n", + "dists = multiallelic_diplotype_pdist(X, metric=multiallelic_diplotype_mean_cityblock)\n", + "dists = squareform(dists)\n", + "Z = anjl.rapid_nj(dists)\n", + "\n", + "anjl.plot(\n", + " Z,\n", + " leaf_data=df_samples.reset_index(),\n", + " color=\"location\",\n", + " hover_name=\"sampleID\",\n", + " hover_data=cohort_cols.split(\",\"),\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "62029894", + "id": "7d50cbd1", "metadata": {}, "outputs": [], "source": [] @@ -228,7 +393,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.10.12" } }, "nbformat": 4, From 4b267b58019188901bd2694d548e30a431fcd084 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 13:35:33 +0100 Subject: [PATCH 02/10] add-njt --- .test/config/config_agvampir.yaml | 2 +- .test/config/config_fastqauto.yaml | 2 +- .test/config/config_fastqmetadata.yaml | 2 +- config/config.yaml | 2 +- workflow/notebooks/population-structure.ipynb | 14 +++++++------- workflow/rules/analysis.smk | 10 +++++----- workflow/rules/common.smk | 4 ++-- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.test/config/config_agvampir.yaml b/.test/config/config_agvampir.yaml index 32dac14..a8f5313 100644 --- a/.test/config/config_agvampir.yaml +++ b/.test/config/config_agvampir.yaml @@ -40,7 +40,7 @@ quality-control: analysis: igv: True sample-map: True - pca: True + population-structure: True genetic-diversity: True allele-frequencies: True diff --git a/.test/config/config_fastqauto.yaml b/.test/config/config_fastqauto.yaml index 44bcf40..d22cd04 100644 --- a/.test/config/config_fastqauto.yaml +++ b/.test/config/config_fastqauto.yaml @@ -42,7 +42,7 @@ quality-control: analysis: igv: True sample-map: False # needs lat and longs - pca: True + population-structure: True genetic-diversity: True allele-frequencies: True diff --git a/.test/config/config_fastqmetadata.yaml b/.test/config/config_fastqmetadata.yaml index 4b77b52..5e66d5d 100644 --- a/.test/config/config_fastqmetadata.yaml +++ b/.test/config/config_fastqmetadata.yaml @@ -39,7 +39,7 @@ quality-control: analysis: igv: True sample-map: True - pca: True + population-structure: True genetic-diversity: True allele-frequencies: True diff --git a/config/config.yaml b/config/config.yaml index 1a8c097..32187f9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -40,7 +40,7 @@ quality-control: analysis: igv: True sample-map: False # needs lat and longs - pca: True + population-structure: True genetic-diversity: True allele-frequencies: True diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb index 35a096d..8c96bf4 100644 --- a/workflow/notebooks/population-structure.ipynb +++ b/workflow/notebooks/population-structure.ipynb @@ -34,12 +34,12 @@ " geno = geno.compress(sample_mask, axis=1)\n", " pos = vcf['variants/POS']\n", " contig = vcf['variants/CHROM']\n", - "# indel = vcf['variants/INDEL']\n", + " indel = vcf['variants/INDEL']\n", " \n", - "# # remove any indels \n", - "# geno = geno.compress(~indel, axis=0)\n", - "# pos = pos[~indel]\n", - "# contig = contig[~indel]\n", + " # remove any indels \n", + " geno = geno.compress(~indel, axis=0)\n", + " pos = pos[~indel]\n", + " contig = contig[~indel]\n", " \n", " metadata = metadata.set_index('sampleID')\n", " samples = samples[sample_mask]\n", @@ -128,9 +128,9 @@ "id": "696938a0", "metadata": {}, "source": [ - "## PCA\n", + "## Population structure\n", "\n", - "In this notebook, we run a principal components analysis on the amplicon sequencing variant data, plotting PC1 v PC2 and PC3 v PC4, and the variance explained by the model." + "In this notebook, we run a principal components analysis and build a neighbour joining tree on the amplicon sequencing variant data. For the PCA, we will plot PC1 v PC2 and PC3 v PC4, and the variance explained by the model." ] }, { diff --git a/workflow/rules/analysis.smk b/workflow/rules/analysis.smk index 64a009b..c79f0b6 100644 --- a/workflow/rules/analysis.smk +++ b/workflow/rules/analysis.smk @@ -25,20 +25,20 @@ rule igv_notebook: """ -rule pca: +rule population_structure: input: - nb=f"{workflow.basedir}/notebooks/principal-component-analysis.ipynb", + nb=f"{workflow.basedir}/notebooks/population-structure.ipynb", kernel="results/.kernel.set", vcf=expand("results/vcfs/targets/{dataset}.annot.vcf", dataset=dataset), metadata="results/config/metadata.qcpass.tsv", taxon_complete = "results/.taxon.complete" if panel == "ag-vampir" else [], output: - nb="results/notebooks/principal-component-analysis.ipynb", - docs_nb="docs/ampseeker-results/notebooks/principal-component-analysis.ipynb", + nb="results/notebooks/population-structure.ipynb", + docs_nb="docs/ampseeker-results/notebooks/population-structure.ipynb", conda: "../envs/AmpSeeker-python.yaml" log: - "logs/notebooks/principal-component-analysis.log", + "logs/notebooks/population-structure.log", params: dataset=dataset, cohort_cols=cohort_cols, diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 232f176..e79c1a3 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -179,8 +179,8 @@ def AmpSeekerOutputs(wildcards): inputs.extend( expand( [ - "results/notebooks/principal-component-analysis.ipynb", - "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb", + "results/notebooks/population-structure.ipynb", + "docs/ampseeker-results/notebooks/population-structure.ipynb", ], ) ) From 061545ff96380f2110dfffec40bcc8afe8e87afd Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 13:37:43 +0100 Subject: [PATCH 03/10] fixes --- workflow/rules/common.smk | 2 +- workflow/rules/jupyter-book.smk | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e79c1a3..2ece668 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -175,7 +175,7 @@ def AmpSeekerOutputs(wildcards): ) ) - if config["analysis"]["pca"]: + if config["analysis"]["population-structure"]: inputs.extend( expand( [ diff --git a/workflow/rules/jupyter-book.smk b/workflow/rules/jupyter-book.smk index 13ea993..649df7e 100644 --- a/workflow/rules/jupyter-book.smk +++ b/workflow/rules/jupyter-book.smk @@ -17,9 +17,9 @@ rule jupyterbook: if config["quality-control"]["coverage"] else [] ), - pca=( - "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb" - if config["analysis"]["pca"] + pop_structure=( + "docs/ampseeker-results/notebooks/population-structure.ipynb" + if config["analysis"]["population-structure"] else [] ), af=( @@ -91,9 +91,9 @@ rule process_notebooks: if config["quality-control"]["coverage"] else [] ), - pca=( - "docs/ampseeker-results/notebooks/principal-component-analysis.ipynb" - if config["analysis"]["pca"] + pop_structure=( + "docs/ampseeker-results/notebooks/population-structure.ipynb" + if config["analysis"]["population-structure"] else [] ), af=( From 3c259f85708f8d201d6acaba33ba54f3135be176 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 14:11:48 +0100 Subject: [PATCH 04/10] fixes --- .github/workflows/github-action-ampseq.yml | 2 +- .github/workflows/github-action-ampseq_agvampir.yml | 2 +- .github/workflows/github-action-ampseq_fastqauto.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/github-action-ampseq.yml b/.github/workflows/github-action-ampseq.yml index 176c47a..f5307a9 100644 --- a/.github/workflows/github-action-ampseq.yml +++ b/.github/workflows/github-action-ampseq.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8'] + python-version: ['3.10'] steps: - name: Set up python3 uses: actions/setup-python@v2 diff --git a/.github/workflows/github-action-ampseq_agvampir.yml b/.github/workflows/github-action-ampseq_agvampir.yml index d9c5021..1ba63d3 100644 --- a/.github/workflows/github-action-ampseq_agvampir.yml +++ b/.github/workflows/github-action-ampseq_agvampir.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.10', '3.11'] + python-version: ['3.11', '3.12'] steps: - name: Set up python3 uses: actions/setup-python@v2 diff --git a/.github/workflows/github-action-ampseq_fastqauto.yml b/.github/workflows/github-action-ampseq_fastqauto.yml index 077ce0a..d8a8be6 100644 --- a/.github/workflows/github-action-ampseq_fastqauto.yml +++ b/.github/workflows/github-action-ampseq_fastqauto.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9'] + python-version: ['3.10'] steps: - name: Set up python3 uses: actions/setup-python@v2 From e4532ceab8bb74964c8c1ece6630b8b65a642cb8 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 14:31:49 +0100 Subject: [PATCH 05/10] fixes --- workflow/envs/AmpSeeker-python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml index f47c406..ba880a6 100644 --- a/workflow/envs/AmpSeeker-python.yaml +++ b/workflow/envs/AmpSeeker-python.yaml @@ -4,7 +4,7 @@ channels: - conda-forge - anaconda dependencies: - - python=3.9 + - python=3.10 - numpy - pandas=2.1.4 - plotly From 49f8bdcbea279e134dae88e948fd3a63afa13e1a Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 14:33:09 +0100 Subject: [PATCH 06/10] upgrade pytthon version in pyhton yaml --- workflow/envs/AmpSeeker-python.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/AmpSeeker-python.yaml b/workflow/envs/AmpSeeker-python.yaml index ba880a6..9e22042 100644 --- a/workflow/envs/AmpSeeker-python.yaml +++ b/workflow/envs/AmpSeeker-python.yaml @@ -4,7 +4,7 @@ channels: - conda-forge - anaconda dependencies: - - python=3.10 + - python>=3.10,<3.13 - numpy - pandas=2.1.4 - plotly From d61973d4936d84083cbdf37815ad400ae11b0dc3 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 14:46:07 +0100 Subject: [PATCH 07/10] fix --- workflow/notebooks/population-structure.ipynb | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb index 8c96bf4..5a03cbe 100644 --- a/workflow/notebooks/population-structure.ipynb +++ b/workflow/notebooks/population-structure.ipynb @@ -20,11 +20,9 @@ " \"\"\"\n", " Load VCF and filter poor-quality samples\n", " \"\"\"\n", - " \n", - " sampleIDs = metadata.sampleID.to_list()\n", - " \n", + " \n", " # load vcf and get genotypes and positions\n", - " vcf = allel.read_vcf(vcf_path)\n", + " vcf = allel.read_vcf(vcf_path, fields=\"*\")\n", " samples = vcf['samples']\n", " # keep only samples in qcpass metadata \n", " sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n", From a77ad5744997cc15d4623b87522b010fab5a57f0 Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 15:12:23 +0100 Subject: [PATCH 08/10] adding dipclust nb but not integrating yet --- workflow/notebooks/diplotype-clustering.ipynb | 332 ++++++++++++++++++ 1 file changed, 332 insertions(+) create mode 100644 workflow/notebooks/diplotype-clustering.ipynb diff --git a/workflow/notebooks/diplotype-clustering.ipynb b/workflow/notebooks/diplotype-clustering.ipynb new file mode 100644 index 0000000..4b1aead --- /dev/null +++ b/workflow/notebooks/diplotype-clustering.ipynb @@ -0,0 +1,332 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "8c01954f", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import allel\n", + "import numpy as np\n", + "import pandas as pd \n", + "import plotly.express as px\n", + "\n", + "def load_vcf(vcf_path, metadata):\n", + " \"\"\"\n", + " Load VCF and filter poor-quality samples\n", + " \"\"\"\n", + " \n", + " sampleIDs = metadata.sampleID.to_list()\n", + " \n", + " # load vcf and get genotypes and positions\n", + " vcf = allel.read_vcf(vcf_path, fields=\"*\")\n", + " samples = vcf['samples']\n", + " # keep only samples in qcpass metadata \n", + " sample_mask = np.isin(vcf['samples'], metadata.sampleID)\n", + " \n", + " # remove low quality samples \n", + " geno = allel.GenotypeArray(vcf['calldata/GT'])\n", + " geno = geno.compress(sample_mask, axis=1)\n", + " pos = vcf['variants/POS']\n", + " contig = vcf['variants/CHROM']\n", + " indel = vcf['variants/INDEL']\n", + " \n", + " # remove any indels \n", + " geno = geno.compress(~indel, axis=0)\n", + " pos = pos[~indel]\n", + " contig = contig[~indel]\n", + " \n", + " metadata = metadata.set_index('sampleID')\n", + " samples = samples[sample_mask]\n", + " \n", + " return geno, pos, contig, metadata.loc[samples, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6302ed5", + "metadata": { + "tags": [ + "parameters", + "remove-input" + ] + }, + "outputs": [], + "source": [ + "dataset = 'ampseq-vigg002'\n", + "vcf_path = f\"../../results/vcfs/targets/{dataset}.annot.vcf\"\n", + "metadata_path = \"../../results/config/metadata.qcpass.tsv\"\n", + "cohort_cols = 'taxon,location'" + ] + }, + { + "cell_type": "markdown", + "id": "696938a0", + "metadata": {}, + "source": [ + "## Diplotype clustering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7ae6392", + "metadata": { + "tags": [ + "remove-input", + "remove-output" + ] + }, + "outputs": [], + "source": [ + "cohort_cols = cohort_cols.split(\",\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e53849e", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "import numba\n", + "from scipy.spatial.distance import squareform # type: ignore\n", + "\n", + "@numba.njit(parallel=True)\n", + "def multiallelic_diplotype_pdist(X, metric):\n", + " \"\"\"Optimised implementation of pairwise distance between diplotypes.\n", + "\n", + " N.B., here we assume the array X provides diplotypes as genotype allele\n", + " counts, with axes in the order (n_samples, n_sites, n_alleles).\n", + "\n", + " Computation will be faster if X is a contiguous (C order) array.\n", + "\n", + " The metric argument is the function to compute distance for a pair of\n", + " diplotypes. This can be a numba jitted function.\n", + "\n", + " \"\"\"\n", + " n_samples = X.shape[0]\n", + " n_pairs = (n_samples * (n_samples - 1)) // 2\n", + " out = np.zeros(n_pairs, dtype=np.float32)\n", + "\n", + " # Loop over samples, first in pair.\n", + " for i in range(n_samples):\n", + " x = X[i, :, :]\n", + "\n", + " # Loop over observations again, second in pair.\n", + " for j in numba.prange(i + 1, n_samples):\n", + " y = X[j, :, :]\n", + "\n", + " # Compute distance for the current pair.\n", + " d = metric(x, y)\n", + "\n", + " # Store result for the current pair.\n", + " k = square_to_condensed(i, j, n_samples)\n", + " out[k] = d\n", + "\n", + " return out\n", + "\n", + "\n", + "@numba.njit\n", + "def square_to_condensed(i, j, n):\n", + " \"\"\"Convert distance matrix coordinates from square form (i, j) to condensed form.\"\"\"\n", + "\n", + " assert i != j, \"no diagonal elements in condensed matrix\"\n", + " if i < j:\n", + " i, j = j, i\n", + " return n * j - j * (j + 1) // 2 + i - 1 - j\n", + "\n", + "\n", + "@numba.njit\n", + "def multiallelic_diplotype_mean_cityblock(x, y):\n", + " \"\"\"Compute the mean cityblock distance between two diplotypes x and y. The\n", + " diplotype vectors are expected as genotype allele counts, i.e., x and y\n", + " should have the same shape (n_sites, n_alleles).\n", + "\n", + " N.B., here we compute the mean value of the distance over sites where\n", + " both individuals have a called genotype. This avoids computing distance\n", + " at missing sites.\n", + "\n", + " \"\"\"\n", + " n_sites = x.shape[0]\n", + " n_alleles = x.shape[1]\n", + " distance = np.float32(0)\n", + " n_sites_called = np.float32(0)\n", + "\n", + " # Loop over sites.\n", + " for i in range(n_sites):\n", + " x_is_called = False\n", + " y_is_called = False\n", + " d = np.float32(0)\n", + "\n", + " # Loop over alleles.\n", + " for j in range(n_alleles):\n", + " # Access allele counts.\n", + " xc = np.float32(x[i, j])\n", + " yc = np.float32(y[i, j])\n", + "\n", + " # Check if any alleles observed.\n", + " x_is_called = x_is_called or (xc > 0)\n", + " y_is_called = y_is_called or (yc > 0)\n", + "\n", + " # Compute cityblock distance (absolute difference).\n", + " d += np.fabs(xc - yc)\n", + "\n", + " # Accumulate distance for the current pair, but only if both samples\n", + " # have a called genotype.\n", + " if x_is_called and y_is_called:\n", + " distance += d\n", + " n_sites_called += np.float32(1)\n", + "\n", + " # Compute the mean distance over sites with called genotypes.\n", + " if n_sites_called > 0:\n", + " mean_distance = distance / n_sites_called\n", + " else:\n", + " mean_distance = np.nan\n", + "\n", + " return mean_distance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f0977f1", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n", + "\n", + "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "783b9b6c", + "metadata": { + "tags": [ + "remove-input" + ] + }, + "outputs": [], + "source": [ + "ac = allel.GenotypeArray(geno).to_allele_counts(max_allele=3)\n", + "X = np.ascontiguousarray(np.swapaxes(ac.values, 0, 1))\n", + "\n", + "dists = multiallelic_diplotype_pdist(X, metric=multiallelic_diplotype_mean_cityblock)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2821a092", + "metadata": {}, + "outputs": [], + "source": [ + "from malariagen_data.plotly_dendrogram import plot_dendrogram" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d50cbd1", + "metadata": {}, + "outputs": [], + "source": [ + "distance_metric = 'cityblock'\n", + "\n", + "fig, leaf_data = plot_dendrogram(\n", + " dist=dists,\n", + " linkage_method=\"complete\",\n", + " count_sort=True,\n", + " distance_sort=False,\n", + " render_mode=\"svg\",\n", + " width=800,\n", + " height=500,\n", + " title=dataset,\n", + " line_width=0.5,\n", + " line_color='black',\n", + " marker_size=5,\n", + " leaf_data=df_samples.reset_index(),\n", + " leaf_hover_name=\"sampleID\",\n", + " leaf_hover_data=cohort_cols,\n", + " leaf_color=\"taxon\",\n", + " leaf_symbol=None,\n", + " leaf_y=-0.05,\n", + " leaf_color_discrete_map=None,\n", + " leaf_category_orders=None,\n", + " template=\"simple_white\",\n", + " y_axis_title=f\"Distance ({distance_metric})\",\n", + " y_axis_buffer=0.1,\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "64b6f43b", + "metadata": {}, + "source": [ + "### Diplotype clustering at target loci" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a996ce9", + "metadata": {}, + "outputs": [], + "source": [ + "major_loci = '2L:2_000_000-3_000_000'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9404b863", + "metadata": {}, + "outputs": [], + "source": [ + "df_bed = pd.read_csv(\"../../config/ag-vampir.bed\", sep=\"\\t\", header=None)\n", + "df_bed" + ] + } + ], + "metadata": { + "celltoolbar": "Tags", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bc3bad756072d3a3df7e34e13c2d645f06171b8c Mon Sep 17 00:00:00 2001 From: Sanjay Nagi Date: Sun, 13 Oct 2024 15:13:39 +0100 Subject: [PATCH 09/10] adding dipclust nb but not integrating yet --- workflow/notebooks/population-structure.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb index 5a03cbe..1e8e25f 100644 --- a/workflow/notebooks/population-structure.ipynb +++ b/workflow/notebooks/population-structure.ipynb @@ -331,7 +331,7 @@ }, "outputs": [], "source": [ - "df_samples = pd.read_csv(\"../../results/config/metadata.qcpass.tsv\", sep=\"\\t\", index_col=0)\n", + "df_samples = pd.read_csv(metadata_path, sep=\"\\t\", index_col=0)\n", "\n", "geno, pos, contig, df_samples = load_vcf(vcf_path, metadata=df_samples)" ] From 8556c2a29d491e8cacd48d425c9b780db132a751 Mon Sep 17 00:00:00 2001 From: Sanjay Curtis Nagi <34922269+sanjaynagi@users.noreply.github.com> Date: Sun, 13 Oct 2024 15:28:16 +0100 Subject: [PATCH 10/10] Update workflow/notebooks/population-structure.ipynb --- workflow/notebooks/population-structure.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/notebooks/population-structure.ipynb b/workflow/notebooks/population-structure.ipynb index 1e8e25f..76ac3be 100644 --- a/workflow/notebooks/population-structure.ipynb +++ b/workflow/notebooks/population-structure.ipynb @@ -361,7 +361,7 @@ " leaf_data=df_samples.reset_index(),\n", " color=\"location\",\n", " hover_name=\"sampleID\",\n", - " hover_data=cohort_cols.split(\",\"),\n", + " hover_data=cohort_cols,\n", ")" ] },