Skip to content

Commit

Permalink
added vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
gjhuizing committed Sep 5, 2024
1 parent 25b8b84 commit ca28fd9
Showing 1 changed file with 277 additions and 0 deletions.
277 changes: 277 additions & 0 deletions docs/source/vignettes/TF enrichemnt.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Motif and Transcription Factor enrichment"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following code was used in the original paper to perform motif enrichment analysis from chromatin accessibility. For more information, visit https://github.com/cantinilab/mowgli_reproducibility.\n",
"\n",
"In addition, we provide example code to perform TF enrichment from gene expression using a TF-target database."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Motif enrichment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Python\n",
"n_peaks = 100"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Python\n",
"# Get the peak dictionaries\n",
"H_mowgli = mdata[\"atac\"].uns[\"H_OT\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Python\n",
"def top_mowgli(dim, n):\n",
" \"\"\"\n",
" Get the top n peaks for a given dimension.\n",
" \"\"\"\n",
" H_scaled = H_mowgli/H_mowgli.sum(axis=1, keepdims=True)\n",
" return H_scaled[:, dim].argsort()[::-1][:n]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Python\n",
"# Initialize the top and bottom peaks.\n",
"mdata[\"atac\"].var_names = mdata[\"atac\"].var_names.str.replace(\"atac:\", \"\")\n",
"top_in_mowgli = mdata[\"atac\"].var.copy()\n",
"\n",
"# Fill the Mowgli top peaks.\n",
"for dim in range(H_mowgli.shape[1]):\n",
" col_name = f\"top_in_dim_{dim}\"\n",
" idx = top_in_mowgli.index[top_mowgli(dim, n_peaks)]\n",
" top_in_mowgli[col_name] = False\n",
" top_in_mowgli.loc[idx, col_name] = True\n",
"\n",
"# Save Mowgli's top peaks.\n",
"top_in_mowgli.to_csv(\"top_in_mowgli.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Imports.\n",
"library(GenomicRanges)\n",
"library(motifmatchr)\n",
"library(chromVAR)\n",
"library(TFBSTools)\n",
"library(JASPAR2022)\n",
"library(Signac)\n",
"library(BSgenome.Hsapiens.UCSC.hg38)\n",
"library(chromVARmotifs)\n",
"library(MuData)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Read atac file.\n",
"in_atac <- \"/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/top_in_mowgli.csv\" # nolint\n",
"peaks_csv <- read.csv(in_atac, row.names = 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Remove exotic chromosomes.\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000194.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270721.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n",
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270713.1\", ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Convert the peaks to GRanges.\n",
"chromosomes <- peaks_csv[\"Chromosome\"][, 1]\n",
"ranges <- IRanges::IRanges(\n",
" start = peaks_csv[\"Start\"][, 1],\n",
" end = peaks_csv[\"End\"][, 1]\n",
")\n",
"peaks <- GenomicRanges::GRanges(seqnames = chromosomes, ranges = ranges)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Get JASPAR motifs.\n",
"opts <- list()\n",
"opts[\"species\"] <- \"Homo sapiens\"\n",
"opts[\"collection\"] <- \"CORE\"\n",
"motifs <- TFBSTools::getMatrixSet(JASPAR2022::JASPAR2022, opts)\n",
"motifs_pwm <- TFBSTools::toPWM(motifs)\n",
"\n",
"# Get cisBP motifs.\n",
"data(\"human_pwms_v2\")\n",
"\n",
"# Fuse JASPAR and cisBP motifs.\n",
"for (name in names(motifs_pwm)) {\n",
" human_pwms_v2[name] <- motifs_pwm[name]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Create a Signac object from the peaks.\n",
"# Actually giving peaks_csv is nonsense.\n",
"# But we only care about the rownames so it's fine.\n",
"assay <- Signac::CreateChromatinAssay(\n",
" peaks_csv,\n",
" ranges = peaks,\n",
" sep = c(\":\", \"-\")\n",
")\n",
"\n",
"# Create statistics about peaks.\n",
"assay <- Signac::RegionStats(\n",
" object = assay,\n",
" genome = BSgenome.Hsapiens.UCSC.hg38\n",
")\n",
"\n",
"# Add the downloaded motif PWM annotation.\n",
"assay <- Signac::AddMotifs(\n",
" object = assay,\n",
" genome = BSgenome.Hsapiens.UCSC.hg38,\n",
" pfm = human_pwms_v2\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"vscode": {
"languageId": "r"
}
},
"outputs": [],
"source": [
"# R\n",
"# Define where to save the motif enrichment outputs.\n",
"out_motif <- \"motifs_\"\n",
"\n",
"# Get all top peaks.\n",
"background <- c()\n",
"for (dim in 0:49) {\n",
"\n",
" # Get the top peaks for that dimension.\n",
" features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n",
"\n",
" background <- c(background, features)\n",
"}\n",
"\n",
"# Iterate over Mowgli's dimensions.\n",
"for (dim in 0:49) {\n",
"\n",
" # Get the top peaks for that dimension.\n",
" features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n",
"\n",
" # Do motif enrichment analysis.\n",
" enriched_motifs <- Signac::FindMotifs(\n",
" object = assay,\n",
" features = features,\n",
" background = background\n",
" )\n",
"\n",
" # Save the enrichment.\n",
" write.csv(enriched_motifs, paste0(out_motif, dim, \".csv\"))\n",
"}"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit ca28fd9

Please sign in to comment.