-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
277 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,277 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Motif and Transcription Factor enrichment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"The following code was used in the original paper to perform motif enrichment analysis from chromatin accessibility. For more information, visit https://github.com/cantinilab/mowgli_reproducibility.\n", | ||
"\n", | ||
"In addition, we provide example code to perform TF enrichment from gene expression using a TF-target database." | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Motif enrichment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Python\n", | ||
"n_peaks = 100" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Python\n", | ||
"# Get the peak dictionaries\n", | ||
"H_mowgli = mdata[\"atac\"].uns[\"H_OT\"]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Python\n", | ||
"def top_mowgli(dim, n):\n", | ||
" \"\"\"\n", | ||
" Get the top n peaks for a given dimension.\n", | ||
" \"\"\"\n", | ||
" H_scaled = H_mowgli/H_mowgli.sum(axis=1, keepdims=True)\n", | ||
" return H_scaled[:, dim].argsort()[::-1][:n]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Python\n", | ||
"# Initialize the top and bottom peaks.\n", | ||
"mdata[\"atac\"].var_names = mdata[\"atac\"].var_names.str.replace(\"atac:\", \"\")\n", | ||
"top_in_mowgli = mdata[\"atac\"].var.copy()\n", | ||
"\n", | ||
"# Fill the Mowgli top peaks.\n", | ||
"for dim in range(H_mowgli.shape[1]):\n", | ||
" col_name = f\"top_in_dim_{dim}\"\n", | ||
" idx = top_in_mowgli.index[top_mowgli(dim, n_peaks)]\n", | ||
" top_in_mowgli[col_name] = False\n", | ||
" top_in_mowgli.loc[idx, col_name] = True\n", | ||
"\n", | ||
"# Save Mowgli's top peaks.\n", | ||
"top_in_mowgli.to_csv(\"top_in_mowgli.csv\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Imports.\n", | ||
"library(GenomicRanges)\n", | ||
"library(motifmatchr)\n", | ||
"library(chromVAR)\n", | ||
"library(TFBSTools)\n", | ||
"library(JASPAR2022)\n", | ||
"library(Signac)\n", | ||
"library(BSgenome.Hsapiens.UCSC.hg38)\n", | ||
"library(chromVARmotifs)\n", | ||
"library(MuData)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Read atac file.\n", | ||
"in_atac <- \"/users/csb/huizing/Documents/PhD/Code/mowgli_reproducibility/enrich/top_in_mowgli.csv\" # nolint\n", | ||
"peaks_csv <- read.csv(in_atac, row.names = 2)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Remove exotic chromosomes.\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000194.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000205.2\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"GL000219.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270721.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270726.1\", ]\n", | ||
"peaks_csv <- peaks_csv[peaks_csv[\"Chromosome\"] != \"KI270713.1\", ]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Convert the peaks to GRanges.\n", | ||
"chromosomes <- peaks_csv[\"Chromosome\"][, 1]\n", | ||
"ranges <- IRanges::IRanges(\n", | ||
" start = peaks_csv[\"Start\"][, 1],\n", | ||
" end = peaks_csv[\"End\"][, 1]\n", | ||
")\n", | ||
"peaks <- GenomicRanges::GRanges(seqnames = chromosomes, ranges = ranges)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Get JASPAR motifs.\n", | ||
"opts <- list()\n", | ||
"opts[\"species\"] <- \"Homo sapiens\"\n", | ||
"opts[\"collection\"] <- \"CORE\"\n", | ||
"motifs <- TFBSTools::getMatrixSet(JASPAR2022::JASPAR2022, opts)\n", | ||
"motifs_pwm <- TFBSTools::toPWM(motifs)\n", | ||
"\n", | ||
"# Get cisBP motifs.\n", | ||
"data(\"human_pwms_v2\")\n", | ||
"\n", | ||
"# Fuse JASPAR and cisBP motifs.\n", | ||
"for (name in names(motifs_pwm)) {\n", | ||
" human_pwms_v2[name] <- motifs_pwm[name]\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Create a Signac object from the peaks.\n", | ||
"# Actually giving peaks_csv is nonsense.\n", | ||
"# But we only care about the rownames so it's fine.\n", | ||
"assay <- Signac::CreateChromatinAssay(\n", | ||
" peaks_csv,\n", | ||
" ranges = peaks,\n", | ||
" sep = c(\":\", \"-\")\n", | ||
")\n", | ||
"\n", | ||
"# Create statistics about peaks.\n", | ||
"assay <- Signac::RegionStats(\n", | ||
" object = assay,\n", | ||
" genome = BSgenome.Hsapiens.UCSC.hg38\n", | ||
")\n", | ||
"\n", | ||
"# Add the downloaded motif PWM annotation.\n", | ||
"assay <- Signac::AddMotifs(\n", | ||
" object = assay,\n", | ||
" genome = BSgenome.Hsapiens.UCSC.hg38,\n", | ||
" pfm = human_pwms_v2\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"vscode": { | ||
"languageId": "r" | ||
} | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# R\n", | ||
"# Define where to save the motif enrichment outputs.\n", | ||
"out_motif <- \"motifs_\"\n", | ||
"\n", | ||
"# Get all top peaks.\n", | ||
"background <- c()\n", | ||
"for (dim in 0:49) {\n", | ||
"\n", | ||
" # Get the top peaks for that dimension.\n", | ||
" features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n", | ||
"\n", | ||
" background <- c(background, features)\n", | ||
"}\n", | ||
"\n", | ||
"# Iterate over Mowgli's dimensions.\n", | ||
"for (dim in 0:49) {\n", | ||
"\n", | ||
" # Get the top peaks for that dimension.\n", | ||
" features <- rownames(assay)[peaks_csv[paste0(\"top_in_dim_\", dim)] == \"True\"]\n", | ||
"\n", | ||
" # Do motif enrichment analysis.\n", | ||
" enriched_motifs <- Signac::FindMotifs(\n", | ||
" object = assay,\n", | ||
" features = features,\n", | ||
" background = background\n", | ||
" )\n", | ||
"\n", | ||
" # Save the enrichment.\n", | ||
" write.csv(enriched_motifs, paste0(out_motif, dim, \".csv\"))\n", | ||
"}" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |