From ebaacb2a34cd607432a56860addd79e45cb71271 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 25 Jun 2024 14:08:16 -0400 Subject: [PATCH 1/2] Update footer --- src/indra_cogex/apps/chat_page/app/public/index.html | 8 ++++---- src/indra_cogex/apps/templates/base.html | 8 ++++---- src/indra_cogex/apps/templates/downtime/index.html | 12 +++++------- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/indra_cogex/apps/chat_page/app/public/index.html b/src/indra_cogex/apps/chat_page/app/public/index.html index 0ad659d1b..7c98f081c 100644 --- a/src/indra_cogex/apps/chat_page/app/public/index.html +++ b/src/indra_cogex/apps/chat_page/app/public/index.html @@ -448,11 +448,11 @@

IndraLab

Back to top

- Developed by the INDRA Lab in the - Harvard Program in Therapeutic Science (HiTS).
+ Developed by the Gyori Lab at + Northeastern University.
INDRA CoGEx (Context Graph Extension) is an automatically assembled biomedical knowledge graph which integrates - causal mechanisms from INDRA with non-causal contextual relations including properties, ontology, and data.
- INDRA CoGEx is funded by the DARPA Young Faculty Award W911NF2010255 (PI: Benjamin M. Gyori).
+ causal mechanisms from INDRA with non-causal contextual relations including properties, ontologies, and data.
+ INDRA CoGEx is funded by grant HR00112220036 under the DARPA ASKEM / ARPA-H BDF programs.

diff --git a/src/indra_cogex/apps/templates/base.html b/src/indra_cogex/apps/templates/base.html index bc2f1dfc9..a10db858e 100644 --- a/src/indra_cogex/apps/templates/base.html +++ b/src/indra_cogex/apps/templates/base.html @@ -154,11 +154,11 @@ Back to top

- Developed by the INDRA Lab in the - Harvard Program in Therapeutic Science (HiTS).
+ Developed by the Gyori Lab at + Northeastern University.
INDRA CoGEx (Context Graph Extension) is an automatically assembled biomedical knowledge graph which integrates - causal mechanisms from INDRA with non-causal contextual relations including properties, ontology, and data.
- INDRA CoGEx is funded by the DARPA Young Faculty Award W911NF2010255 (PI: Benjamin M. Gyori).
+ causal mechanisms from INDRA with non-causal contextual relations including properties, ontologies, and data.
+ INDRA CoGEx is funded by grant HR00112220036 under the DARPA ASKEM / ARPA-H BDF programs.

diff --git a/src/indra_cogex/apps/templates/downtime/index.html b/src/indra_cogex/apps/templates/downtime/index.html index a1c527092..321250380 100644 --- a/src/indra_cogex/apps/templates/downtime/index.html +++ b/src/indra_cogex/apps/templates/downtime/index.html @@ -47,13 +47,11 @@

The INDRA Discovery service for INDRA CoGEx is currently Back to top

- Developed by the INDRA Lab in the - Harvard Program in Therapeutic Science (HiTS).
- INDRA CoGEx (Context Graph Extension) is an automatically assembled biomedical knowledge graph which - integrates - causal mechanisms from INDRA with non-causal contextual relations including properties, ontology, and - data.
- INDRA CoGEx is funded by the DARPA Young Faculty Award W911NF2010255 (PI: Benjamin M. Gyori).
+ Developed by the Gyori Lab at + Northeastern University.
+ INDRA CoGEx (Context Graph Extension) is an automatically assembled biomedical knowledge graph which integrates + causal mechanisms from INDRA with non-causal contextual relations including properties, ontologies, and data.
+ INDRA CoGEx is funded by grant HR00112220036 under the DARPA ASKEM / ARPA-H BDF programs.

From 30aec0d2e1a6e8a6bd1f50735c5b3f27275afeb3 Mon Sep 17 00:00:00 2001 From: Ben Gyori Date: Tue, 25 Jun 2024 14:18:43 -0400 Subject: [PATCH 2/2] Add notebook --- notebooks/phosphoproteomics_geffen/README.md | 3 + .../phosphoprot_explanation.ipynb | 1569 +++++++++++++++++ 2 files changed, 1572 insertions(+) create mode 100644 notebooks/phosphoproteomics_geffen/README.md create mode 100644 notebooks/phosphoproteomics_geffen/phosphoprot_explanation.ipynb diff --git a/notebooks/phosphoproteomics_geffen/README.md b/notebooks/phosphoproteomics_geffen/README.md new file mode 100644 index 000000000..3d69739b0 --- /dev/null +++ b/notebooks/phosphoproteomics_geffen/README.md @@ -0,0 +1,3 @@ +This notebook demonstrates finding explanations for phosphorylation +changes detected in https://www.cell.com/cell/fulltext/S0092-8674(23)00781-X#gr2 +between homologous recombination deficient (HRD) and non-deficient samples. diff --git a/notebooks/phosphoproteomics_geffen/phosphoprot_explanation.ipynb b/notebooks/phosphoproteomics_geffen/phosphoprot_explanation.ipynb new file mode 100644 index 000000000..df3d5520f --- /dev/null +++ b/notebooks/phosphoproteomics_geffen/phosphoprot_explanation.ipynb @@ -0,0 +1,1569 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "38f360f7", + "metadata": {}, + "outputs": [], + "source": [ + "# This is the URL for the supplementary table\n", + "url = 'https://www.cell.com/cms/10.1016/j.cell.2023.07.013/attachment/b342834f-0ab7-4d68-be07-66e9ba38e3df/mmc3.xlsx'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "938eb46b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7cb32c76", + "metadata": {}, + "outputs": [], + "source": [ + "# Naively trying to load the table from the URL errors with 403: Forbidden\n", + "# sheets = pd.read_excel(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "44277396", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0gene_namelogFCAveExprtP.Valueadj.P.ValBqvalpropMissing...propMissingOutidid.descriptionvariableSitesaccession_numberfeaturegsea_rankgsea_rank_pcausalpath_adjusted_idprot_residue
0NP_001269315.1_K345k_1_1_345_345IDH1-3.904846-0.941172-6.9442468.017521e-070.0005654.0150430.0004400.685185...0.583333HRDisocitrate dehydrogenase [NADP] cytoplasmic G...['K345k', 'K345k']NP_001269315.1acetylome-7.854981e+00-23.803785NP_001269315.1_K345k_1_1_345_345IDH1_K345k_1_1_345_345
1NP_006752.1_K142k_1_1_142_142YWHAE-1.0199370.148372-4.0705771.611649e-040.0495970.8202750.0386500.111111...0.125000HRD14-3-3 protein epsilon GN=YWHAE['K142k', 'K142k']NP_006752.1acetylome-1.209124e+00-3.868344NP_006752.1_K142k_1_1_142_142YWHAE_K142k_1_1_142_142
2NP_001609.2_K105k_1_1_105_105PARP11.236072-0.5161863.9814272.110521e-040.0495970.5790850.0386500.092593...0.083333HRDpoly [ADP-ribose] polymerase 1 GN=PARP1['K105k', 'K105k']NP_001609.2acetylome1.428431e+004.543317NP_001609.2_K105k_1_1_105_105PARP1_K105k_1_1_105_105
3NP_001122321.1_K455k_1_1_455_455SMARCA40.913719-0.7966263.4851689.466467e-040.118682-0.7688560.0924860.000000...0.000000HRDtranscription activator BRG1 isoform A GN=SMA...['K455k', 'K455k']NP_001122321.1acetylome8.427154e-012.762915NP_001122321.1_K455k_1_1_455_455SMARCA4_K455k_1_1_455_455
4NP_001609.2_K621k_1_1_621_621PARP10.734708-0.1903763.4901969.621225e-040.118682-0.7706460.0924860.055556...0.041667HRDpoly [ADP-ribose] polymerase 1 GN=PARP1['K621k', 'K621k']NP_001609.2acetylome6.776145e-012.216444NP_001609.2_K621k_1_1_621_621PARP1_K621k_1_1_621_621
..................................................................
5388ENSG00000097007.19ABL10.0024315.8780270.0173149.862142e-010.993643-6.4743160.406492NaN...NaNHRDNaNNaNNaNtranscriptome5.029325e-060.000015ENSG00000097007.19ENSG00000097007.19
5389ENSG00000102977.17ACD-0.0018022.935979-0.0127829.898221e-010.994550-6.0626900.406863NaN...NaNHRDNaNNaNNaNtranscriptome-2.292426e-06-0.000008ENSG00000102977.17ENSG00000102977.17
5390ENSG00000167325.15RRM1-0.0013536.557619-0.0115149.908319e-010.994550-6.5031150.406863NaN...NaNHRDNaNNaNNaNtranscriptome-1.463072e-06-0.000005ENSG00000167325.15ENSG00000167325.15
5391ENSG00000161036.13LRWD1-0.0010063.850618-0.0070139.944159e-010.996278-6.2240210.407570NaN...NaNHRDNaNNaNNaNtranscriptome-7.332363e-07-0.000002ENSG00000161036.13ENSG00000161036.13
5392ENSG00000127616.18SMARCA40.0008266.5435720.0043489.965375e-010.996538-6.5021250.407676NaN...NaNHRDNaNNaNNaNtranscriptome3.851904e-070.000001ENSG00000127616.18ENSG00000127616.18
\n", + "

5393 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 gene_name logFC AveExpr \\\n", + "0 NP_001269315.1_K345k_1_1_345_345 IDH1 -3.904846 -0.941172 \n", + "1 NP_006752.1_K142k_1_1_142_142 YWHAE -1.019937 0.148372 \n", + "2 NP_001609.2_K105k_1_1_105_105 PARP1 1.236072 -0.516186 \n", + "3 NP_001122321.1_K455k_1_1_455_455 SMARCA4 0.913719 -0.796626 \n", + "4 NP_001609.2_K621k_1_1_621_621 PARP1 0.734708 -0.190376 \n", + "... ... ... ... ... \n", + "5388 ENSG00000097007.19 ABL1 0.002431 5.878027 \n", + "5389 ENSG00000102977.17 ACD -0.001802 2.935979 \n", + "5390 ENSG00000167325.15 RRM1 -0.001353 6.557619 \n", + "5391 ENSG00000161036.13 LRWD1 -0.001006 3.850618 \n", + "5392 ENSG00000127616.18 SMARCA4 0.000826 6.543572 \n", + "\n", + " t P.Value adj.P.Val B qval propMissing ... \\\n", + "0 -6.944246 8.017521e-07 0.000565 4.015043 0.000440 0.685185 ... \n", + "1 -4.070577 1.611649e-04 0.049597 0.820275 0.038650 0.111111 ... \n", + "2 3.981427 2.110521e-04 0.049597 0.579085 0.038650 0.092593 ... \n", + "3 3.485168 9.466467e-04 0.118682 -0.768856 0.092486 0.000000 ... \n", + "4 3.490196 9.621225e-04 0.118682 -0.770646 0.092486 0.055556 ... \n", + "... ... ... ... ... ... ... ... \n", + "5388 0.017314 9.862142e-01 0.993643 -6.474316 0.406492 NaN ... \n", + "5389 -0.012782 9.898221e-01 0.994550 -6.062690 0.406863 NaN ... \n", + "5390 -0.011514 9.908319e-01 0.994550 -6.503115 0.406863 NaN ... \n", + "5391 -0.007013 9.944159e-01 0.996278 -6.224021 0.407570 NaN ... \n", + "5392 0.004348 9.965375e-01 0.996538 -6.502125 0.407676 NaN ... \n", + "\n", + " propMissingOut id id.description \\\n", + "0 0.583333 HRD isocitrate dehydrogenase [NADP] cytoplasmic G... \n", + "1 0.125000 HRD 14-3-3 protein epsilon GN=YWHAE \n", + "2 0.083333 HRD poly [ADP-ribose] polymerase 1 GN=PARP1 \n", + "3 0.000000 HRD transcription activator BRG1 isoform A GN=SMA... \n", + "4 0.041667 HRD poly [ADP-ribose] polymerase 1 GN=PARP1 \n", + "... ... ... ... 
\n", + "5388 NaN HRD NaN \n", + "5389 NaN HRD NaN \n", + "5390 NaN HRD NaN \n", + "5391 NaN HRD NaN \n", + "5392 NaN HRD NaN \n", + "\n", + " variableSites accession_number feature gsea_rank \\\n", + "0 ['K345k', 'K345k'] NP_001269315.1 acetylome -7.854981e+00 \n", + "1 ['K142k', 'K142k'] NP_006752.1 acetylome -1.209124e+00 \n", + "2 ['K105k', 'K105k'] NP_001609.2 acetylome 1.428431e+00 \n", + "3 ['K455k', 'K455k'] NP_001122321.1 acetylome 8.427154e-01 \n", + "4 ['K621k', 'K621k'] NP_001609.2 acetylome 6.776145e-01 \n", + "... ... ... ... ... \n", + "5388 NaN NaN transcriptome 5.029325e-06 \n", + "5389 NaN NaN transcriptome -2.292426e-06 \n", + "5390 NaN NaN transcriptome -1.463072e-06 \n", + "5391 NaN NaN transcriptome -7.332363e-07 \n", + "5392 NaN NaN transcriptome 3.851904e-07 \n", + "\n", + " gsea_rank_p causalpath_adjusted_id prot_residue \n", + "0 -23.803785 NP_001269315.1_K345k_1_1_345_345 IDH1_K345k_1_1_345_345 \n", + "1 -3.868344 NP_006752.1_K142k_1_1_142_142 YWHAE_K142k_1_1_142_142 \n", + "2 4.543317 NP_001609.2_K105k_1_1_105_105 PARP1_K105k_1_1_105_105 \n", + "3 2.762915 NP_001122321.1_K455k_1_1_455_455 SMARCA4_K455k_1_1_455_455 \n", + "4 2.216444 NP_001609.2_K621k_1_1_621_621 PARP1_K621k_1_1_621_621 \n", + "... ... ... ... 
\n", + "5388 0.000015 ENSG00000097007.19 ENSG00000097007.19 \n", + "5389 -0.000008 ENSG00000102977.17 ENSG00000102977.17 \n", + "5390 -0.000005 ENSG00000167325.15 ENSG00000167325.15 \n", + "5391 -0.000002 ENSG00000161036.13 ENSG00000161036.13 \n", + "5392 0.000001 ENSG00000127616.18 ENSG00000127616.18 \n", + "\n", + "[5393 rows x 21 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load a local copy of the table and select Table 3G\n", + "df = pd.read_excel('mmc3.xlsx', sheet_name='Table 3G')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "5e41eac5", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter the table to adjusted p-values less than 0.055 to retain significant results\n", + "df = df[df['adj.P.Val'] < 0.055]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "608b94bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'phosphoproteome': 206,\n", + " 'transcriptome': 124,\n", + " 'phosphoproteome_res': 69,\n", + " 'proteome': 64,\n", + " 'acetylome': 3,\n", + " 'acetylome_res': 1})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Look at statistics of different modification types that are significant\n", + "from collections import Counter\n", + "Counter(df.feature)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1f852e08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{('ATAD5', 'hgnc', 'S', '44'),\n", + " ('ATR', 'hgnc', 'T', '1989'),\n", + " ('ATRIP', 'hgnc', 'S', '224'),\n", + " ('ATRIP', 'hgnc', 'S', '239'),\n", + " ('ATRIP', 'hgnc', 'S', '518'),\n", + " ('BAZ1B', 'hgnc', 'S', '330'),\n", + " ('BAZ1B', 'hgnc', 'S', '349'),\n", + " ('BLM', 'hgnc', 'S', '28'),\n", + " ('BOD1L1', 'hgnc', 'S', '2905'),\n", + " ('BRCA2', 'hgnc', 'S', '93'),\n", + " ('BRIP1', 'hgnc', 'S', '226'),\n", + " 
('CDC25B', 'hgnc', 'S', '321'),\n", + " ('CDC25B', 'hgnc', 'S', '353'),\n", + " ('CDC25B', 'hgnc', 'S', '375'),\n", + " ('CDC6', 'hgnc', 'S', '54'),\n", + " ('CDK1', 'hgnc', 'T', '14'),\n", + " ('CDK1', 'hgnc', 'T', '161'),\n", + " ('CDK1', 'hgnc', 'Y', '15'),\n", + " ('CHAF1A', 'hgnc', 'S', '775'),\n", + " ('CHEK2', 'hgnc', 'S', '303'),\n", + " ('CHTF18', 'hgnc', 'S', '225'),\n", + " ('CHTF18', 'hgnc', 'S', '64'),\n", + " ('CHTF18', 'hgnc', 'S', '871'),\n", + " ('CLSPN', 'hgnc', 'S', '225'),\n", + " ('CLSPN', 'hgnc', 'S', '83'),\n", + " ('CLSPN', 'hgnc', 'S', '846'),\n", + " ('CLSPN', 'hgnc', 'T', '1287'),\n", + " ('CUL4B', 'hgnc', 'S', '180'),\n", + " ('DBF4', 'hgnc', 'S', '359'),\n", + " ('DBF4', 'hgnc', 'S', '381'),\n", + " ('DBF4', 'hgnc', 'S', '508'),\n", + " ('DBF4', 'hgnc', 'T', '345'),\n", + " ('DBF4', 'hgnc', 'T', '553'),\n", + " ('DONSON', 'hgnc', 'S', '34'),\n", + " ('DTL', 'hgnc', 'S', '485'),\n", + " ('DTL', 'hgnc', 'S', '490'),\n", + " ('DTL', 'hgnc', 'S', '512'),\n", + " ('DTL', 'hgnc', 'S', '679'),\n", + " ('DTL', 'hgnc', 'S', '697'),\n", + " ('DTL', 'hgnc', 'T', '429'),\n", + " ('ERCC6L', 'hgnc', 'S', '1028'),\n", + " ('ERCC6L', 'hgnc', 'S', '14'),\n", + " ('ERCC6L', 'hgnc', 'S', '820'),\n", + " ('EXO1', 'hgnc', 'S', '598'),\n", + " ('EXO1', 'hgnc', 'S', '610'),\n", + " ('EXO1', 'hgnc', 'S', '639'),\n", + " ('EXO1', 'hgnc', 'S', '700'),\n", + " ('EXO1', 'hgnc', 'S', '702'),\n", + " ('EXO1', 'hgnc', 'S', '714'),\n", + " ('EXO1', 'hgnc', 'S', '746'),\n", + " ('EXO1', 'hgnc', 'S', '815'),\n", + " ('EXO1', 'hgnc', 'T', '475'),\n", + " ('FANCD2', 'hgnc', 'S', '1435'),\n", + " ('FANCE', 'hgnc', 'S', '249'),\n", + " ('FANCM', 'hgnc', 'S', '34'),\n", + " ('INO80B', 'hgnc', 'T', '60'),\n", + " ('KPNA2', 'hgnc', 'S', '490'),\n", + " ('LIG1', 'hgnc', 'S', '881'),\n", + " ('LIG1', 'hgnc', 'S', '883'),\n", + " ('LIG1', 'hgnc', 'T', '165'),\n", + " ('LIG1', 'hgnc', 'T', '203'),\n", + " ('LRWD1', 'hgnc', 'S', '243'),\n", + " ('LRWD1', 'hgnc', 'S', '259'),\n", + 
" ('MBD4', 'hgnc', 'S', '422'),\n", + " ('MCM2', 'hgnc', 'S', '139'),\n", + " ('MCM2', 'hgnc', 'S', '381'),\n", + " ('MCM2', 'hgnc', 'S', '40'),\n", + " ('MCM2', 'hgnc', 'S', '41'),\n", + " ('MCM3', 'hgnc', 'S', '756'),\n", + " ('MCM3', 'hgnc', 'T', '758'),\n", + " ('MCM3', 'hgnc', 'T', '767'),\n", + " ('MCM4', 'hgnc', 'S', '120'),\n", + " ('MCM4', 'hgnc', 'S', '131'),\n", + " ('MCM4', 'hgnc', 'T', '110'),\n", + " ('MCM6', 'hgnc', 'S', '13'),\n", + " ('MCM6', 'hgnc', 'S', '762'),\n", + " ('MCMBP', 'hgnc', 'T', '160'),\n", + " ('MDC1', 'hgnc', 'S', '1820'),\n", + " ('MDC1', 'hgnc', 'S', '453'),\n", + " ('MDC1', 'hgnc', 'T', '1157'),\n", + " ('MDC1', 'hgnc', 'T', '1239'),\n", + " ('MDC1', 'hgnc', 'T', '455'),\n", + " ('MPLKIP', 'hgnc', 'S', '66'),\n", + " ('MSH6', 'hgnc', 'S', '227'),\n", + " ('MSH6', 'hgnc', 'S', '309'),\n", + " ('MSH6', 'hgnc', 'S', '830'),\n", + " ('MSH6', 'hgnc', 'S', '91'),\n", + " ('NSD2', 'hgnc', 'T', '110'),\n", + " ('NSD2', 'hgnc', 'T', '114'),\n", + " ('NSD2', 'hgnc', 'T', '115'),\n", + " ('NSD2', 'hgnc', 'T', '544'),\n", + " ('NUDT5', 'hgnc', 'S', '3'),\n", + " ('ORC1', 'hgnc', 'S', '201'),\n", + " ('ORC1', 'hgnc', 'S', '273'),\n", + " ('ORC1', 'hgnc', 'S', '287'),\n", + " ('ORC2', 'hgnc', 'S', '280'),\n", + " ('ORC2', 'hgnc', 'T', '226'),\n", + " ('ORC6', 'hgnc', 'T', '195'),\n", + " ('PALB2', 'hgnc', 'S', '781'),\n", + " ('PARG', 'hgnc', 'S', '68'),\n", + " ('PARP1', 'hgnc', 'S', '179'),\n", + " ('PARP1', 'hgnc', 'S', '257'),\n", + " ('PARP1', 'hgnc', 'S', '782'),\n", + " ('PCLAF', 'hgnc', 'S', '72'),\n", + " ('PKMYT1', 'hgnc', 'S', '143'),\n", + " ('PLK1', 'hgnc', 'T', '210'),\n", + " ('PMS2', 'hgnc', 'T', '573'),\n", + " ('POLD3', 'hgnc', 'T', '277'),\n", + " ('POLQ', 'hgnc', 'S', '1587'),\n", + " ('POLR2C', 'hgnc', 'S', '124'),\n", + " ('RAD18', 'hgnc', 'S', '103'),\n", + " ('RAD18', 'hgnc', 'S', '99'),\n", + " ('RAD50', 'hgnc', 'S', '635'),\n", + " ('RAD51AP1', 'hgnc', 'S', '19'),\n", + " ('RAD51AP1', 'hgnc', 'S', '21'),\n", + " 
('RAD51AP1', 'hgnc', 'S', '294'),\n", + " ('RAD51AP1', 'hgnc', 'T', '66'),\n", + " ('RECQL5', 'hgnc', 'S', '727'),\n", + " ('REV1', 'hgnc', 'S', '1144'),\n", + " ('RFC1', 'hgnc', 'S', '156'),\n", + " ('RIF1', 'hgnc', 'S', '1454'),\n", + " ('RIF1', 'hgnc', 'S', '1542'),\n", + " ('RIF1', 'hgnc', 'S', '1579'),\n", + " ('RIF1', 'hgnc', 'S', '1616'),\n", + " ('RIF1', 'hgnc', 'S', '1688'),\n", + " ('RIF1', 'hgnc', 'S', '1873'),\n", + " ('RIF1', 'hgnc', 'S', '2157'),\n", + " ('RIF1', 'hgnc', 'S', '2172'),\n", + " ('RIF1', 'hgnc', 'S', '2176'),\n", + " ('RIF1', 'hgnc', 'S', '2205'),\n", + " ('RIF1', 'hgnc', 'S', '2243'),\n", + " ('RIF1', 'hgnc', 'S', '2265'),\n", + " ('RIF1', 'hgnc', 'S', '2348'),\n", + " ('RIF1', 'hgnc', 'S', '2393'),\n", + " ('RIF1', 'hgnc', 'S', '782'),\n", + " ('RNF168', 'hgnc', 'S', '134'),\n", + " ('RPA1', 'hgnc', 'S', '384'),\n", + " ('SAMHD1', 'hgnc', 'T', '592'),\n", + " ('SLF2', 'hgnc', 'S', '710'),\n", + " ('SLF2', 'hgnc', 'T', '711'),\n", + " ('SMARCA5', 'hgnc', 'S', '755'),\n", + " ('SMARCC1', 'hgnc', 'T', '398'),\n", + " ('SMC6', 'hgnc', 'S', '11'),\n", + " ('TERF2', 'hgnc', 'S', '365'),\n", + " ('TERF2IP', 'hgnc', 'S', '36'),\n", + " ('TICRR', 'hgnc', 'S', '1750'),\n", + " ('TICRR', 'hgnc', 'S', '599'),\n", + " ('TICRR', 'hgnc', 'S', '865'),\n", + " ('TICRR', 'hgnc', 'S', '923'),\n", + " ('TICRR', 'hgnc', 'T', '1678'),\n", + " ('TOP2A', 'hgnc', 'S', '1106'),\n", + " ('TOP2A', 'hgnc', 'S', '1213'),\n", + " ('TOP2A', 'hgnc', 'S', '1247'),\n", + " ('TOP2A', 'hgnc', 'S', '1374'),\n", + " ('TOP2A', 'hgnc', 'S', '1377'),\n", + " ('TOP2A', 'hgnc', 'S', '1474'),\n", + " ('TOP2A', 'hgnc', 'S', '1504'),\n", + " ('TOP2A', 'hgnc', 'S', '1525'),\n", + " ('TOP2B', 'hgnc', 'S', '1236'),\n", + " ('TOPBP1', 'hgnc', 'S', '1504'),\n", + " ('TOPBP1', 'hgnc', 'S', '888'),\n", + " ('TP53BP1', 'hgnc', 'S', '1623'),\n", + " ('TP53BP1', 'hgnc', 'S', '1670'),\n", + " ('TP53BP1', 'hgnc', 'S', '1683'),\n", + " ('TP53BP1', 'hgnc', 'S', '1706'),\n", + " ('TP53BP1', 
'hgnc', 'S', '1763'),\n", + " ('TP53BP1', 'hgnc', 'S', '1764'),\n", + " ('TTK', 'hgnc', 'S', '281'),\n", + " ('TTK', 'hgnc', 'S', '436'),\n", + " ('TTK', 'hgnc', 'S', '821'),\n", + " ('TTK', 'hgnc', 'T', '33'),\n", + " ('UBE2T', 'hgnc', 'S', '184'),\n", + " ('UFD1', 'hgnc', 'S', '245'),\n", + " ('UFD1', 'hgnc', 'S', '247'),\n", + " ('UFD1', 'hgnc', 'S', '299'),\n", + " ('USP1', 'hgnc', 'S', '313'),\n", + " ('USP1', 'hgnc', 'S', '327'),\n", + " ('USP1', 'hgnc', 'S', '475'),\n", + " ('USP10', 'hgnc', 'S', '547'),\n", + " ('USP37', 'hgnc', 'S', '650'),\n", + " ('USP37', 'hgnc', 'S', '652'),\n", + " ('WDHD1', 'hgnc', 'S', '333'),\n", + " ('WDHD1', 'hgnc', 'S', '868'),\n", + " ('WDHD1', 'hgnc', 'S', '958'),\n", + " ('WRN', 'hgnc', 'S', '1133'),\n", + " ('XRCC1', 'hgnc', 'S', '266'),\n", + " ('XRCC1', 'hgnc', 'T', '257'),\n", + " ('XRCC6', 'hgnc', 'S', '2'),\n", + " ('XRCC6', 'hgnc', 'T', '455')}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Construct a list of all phosphorylation sites that have significantly\n", + "# increased compared to control, and represent these as tuples compatible\n", + "# with Protmapper (gene_name, 'hgnc', residue, position).\n", + "import re\n", + "sites = set()\n", + "for _, row in df.iterrows():\n", + " if row['feature'] == 'phosphoproteome' and row['logFC'] > 0:\n", + " matches = re.findall(r'([STY]\\d+)', row['variableSites'])\n", + " sites |= {(row['gene_name'], 'hgnc', match[0], match[1:]) for match in matches}\n", + "sites" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b15177e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Mapping sites: 0it [00:00, ?it/s]INFO: [2024-06-25 12:55:01] protmapper.uniprot_client - Loading Swissprot sequences...\n", + "INFO: [2024-06-25 12:55:03] protmapper.uniprot_client - Loading Uniprot isoform sequences...\n", + "Mapping sites: 189it [00:11, 16.96it/s]\n" + ] 
+ } + ], + "source": [ + "# Use the protmapper to map the sites to human reference\n", + "import protmapper\n", + "mapped_sites = protmapper.default_mapper.map_sitelist_to_human_ref(sites)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c5904b00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Counter({'VALID': 174, 'NO_MAPPING_FOUND': 15})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Print counts of mapping results\n", + "Counter([ms.description for ms in mapped_sites])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "26a813ea", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter to valid or validly mapped sites\n", + "valid_sites = [ms for ms in mapped_sites if ms.valid or ms.mapped_id]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "93f16f6a", + "metadata": {}, + "outputs": [], + "source": [ + "# Query the INDRA DB for Phosphorylation statements whose substrate is one\n", + "# of the proteins whose phosphorylation appears in the site list\n", + "from indra.sources.indra_db_rest import get_statements_from_query\n", + "from indra.sources.indra_db_rest.query import HasAgent, HasType" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7f816be1", + "metadata": {}, + "outputs": [], + "source": [ + "stmts_by_target = {}\n", + "unique_genes = {ms.gene_name for ms in valid_sites}" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1b294e7b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/78 [00:00