added vignette

cantinilab · Sep 5, 2024 · ca28fd9 · ca28fd9
1 parent 25b8b84
commit ca28fd9
Showing 1 changed file with 277 additions and 0 deletions.
diff --git a/docs/source/vignettes/TF enrichemnt.ipynb b/docs/source/vignettes/TF enrichemnt.ipynb
@@ -0,0 +1,277 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Motif and Transcription Factor enrichment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The following code was used in the original paper to perform motif enrichment analysis from chromatin accessibility. For more information, visit https://github.com/cantinilab/mowgli_reproducibility.\n",
+    "\n",
+    "In addition, we provide example code to perform TF enrichment from gene expression using a TF-target database."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Motif enrichment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Python\n",
+    "n_peaks = 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Python\n",
+    "# Get the peak dictionaries\n",
+    "H_mowgli = mdata[\"atac\"].uns[\"H_OT\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Python\n",
+    "def top_mowgli(dim, n):\n",
+    "    \"\"\"\n",
+    "    Get the top n peaks for a given dimension.\n",
+    "    \"\"\"\n",
+    "    H_scaled = H_mowgli/H_mowgli.sum(axis=1, keepdims=True)\n",
+    "    return H_scaled[:, dim].argsort()[::-1][:n]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Python\n",
+    "# Initialize the top and bottom peaks.\n",
+    "mdata[\"atac\"].var_names = mdata[\"atac\"].var_names.str.replace(\"atac:\", \"\")\n",
+    "top_in_mowgli = mdata[\"atac\"].var.copy()\n",
+    "\n",
+    "# Fill the Mowgli top peaks.\n",
+    "for dim in range(H_mowgli.shape[1]):\n",
+    "    col_name = f\"top_in_dim_{dim}\"\n",
+    "    idx = top_in_mowgli.index[top_mowgli(dim, n_peaks)]\n",
+    "    top_in_mowgli[col_name] = False\n",
+    "    top_in_mowgli.loc[idx, col_name] = True\n",
+    "\n",
+    "# Save Mowgli's top peaks.\n",
+    "top_in_mowgli.to_csv(\"top_in_mowgli.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Imports.\n",
+    "library(GenomicRanges)\n",
+    "library(motifmatchr)\n",
+    "library(chromVAR)\n",
+    "library(TFBSTools)\n",
+    "library(JASPAR2022)\n",
+    "library(Signac)\n",
+    "library(BSgenome.Hsapiens.UCSC.hg38)\n",
+    "library(chromVARmotifs)\n",
+    "library(MuData)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Read atac file.\n",
+    "in_atac <- \"/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/top_in_mowgli.csv\" # nolint\n",
+    "peaks_csv <- read.csv(in_atac, row.names = 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Remove exotic chromosomes.\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000194.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270721.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n",
+    "peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270713.1\", ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Convert the peaks to GRanges.\n",
+    "chromosomes <- peaks_csv[\"Chromosome\"][, 1]\n",
+    "ranges <- IRanges::IRanges(\n",
+    "    start = peaks_csv[\"Start\"][, 1],\n",
+    "    end = peaks_csv[\"End\"][, 1]\n",
+    ")\n",
+    "peaks <- GenomicRanges::GRanges(seqnames = chromosomes, ranges = ranges)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Get JASPAR motifs.\n",
+    "opts <- list()\n",
+    "opts[\"species\"] <- \"Homo sapiens\"\n",
+    "opts[\"collection\"] <- \"CORE\"\n",
+    "motifs <- TFBSTools::getMatrixSet(JASPAR2022::JASPAR2022, opts)\n",
+    "motifs_pwm <- TFBSTools::toPWM(motifs)\n",
+    "\n",
+    "# Get cisBP motifs.\n",
+    "data(\"human_pwms_v2\")\n",
+    "\n",
+    "# Fuse JASPAR and cisBP motifs.\n",
+    "for (name in names(motifs_pwm)) {\n",
+    "    human_pwms_v2[name] <- motifs_pwm[name]\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Create a Signac object from the peaks.\n",
+    "# Actually giving peaks_csv is nonsense.\n",
+    "# But we only care about the rownames so it's fine.\n",
+    "assay <- Signac::CreateChromatinAssay(\n",
+    "    peaks_csv,\n",
+    "    ranges = peaks,\n",
+    "    sep = c(\":\", \"-\")\n",
+    ")\n",
+    "\n",
+    "# Create statistics about peaks.\n",
+    "assay <- Signac::RegionStats(\n",
+    "    object = assay,\n",
+    "    genome = BSgenome.Hsapiens.UCSC.hg38\n",
+    ")\n",
+    "\n",
+    "# Add the downloaded motif PWM annotation.\n",
+    "assay <- Signac::AddMotifs(\n",
+    "    object = assay,\n",
+    "    genome = BSgenome.Hsapiens.UCSC.hg38,\n",
+    "    pfm = human_pwms_v2\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "r"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# R\n",
+    "# Define where to save the motif enrichment outputs.\n",
+    "out_motif <- \"motifs_\"\n",
+    "\n",
+    "# Get all top peaks.\n",
+    "background <- c()\n",
+    "for (dim in 0:49) {\n",
+    "\n",
+    "    # Get the top peaks for that dimension.\n",
+    "    features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n",
+    "\n",
+    "    background <- c(background, features)\n",
+    "}\n",
+    "\n",
+    "# Iterate over Mowgli's dimensions.\n",
+    "for (dim in 0:49) {\n",
+    "\n",
+    "    # Get the top peaks for that dimension.\n",
+    "    features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n",
+    "\n",
+    "    # Do motif enrichment analysis.\n",
+    "    enriched_motifs <- Signac::FindMotifs(\n",
+    "        object = assay,\n",
+    "        features = features,\n",
+    "        background = background\n",
+    "    )\n",
+    "\n",
+    "    # Save the enrichment.\n",
+    "    write.csv(enriched_motifs, paste0(out_motif, dim, \".csv\"))\n",
+    "}"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}