Skip to content

13C labeled E. coli SIP proteomic search tutorial

XYZ(YiXiong) edited this page Mar 21, 2023 · 4 revisions

This tutorial shows how to perform a stable isotope probing (SIP) proteomic search using Sipros on 13C-labeled E. coli DDA mass spectrometry data. The workflow has been run on WSL Ubuntu 20.04 (Windows 11) and on CentOS 7.

Install environment

# py2: Python 2.7 with scikit-learn, used for the Sipros Python scripts
conda create --name py2 scikit-learn python=2.7
# mono: runtime used to run Raxport.exe
conda create --name mono --channel conda-forge mono
# r: R with the packages used by the Rscript steps
conda create --name r --channel conda-forge --channel bioconda \
    r-base r-stringr r-tidyr bioconductor-biostrings

Make folder for the workflow

# fasta:   protein databases        raw:     downloaded .raw files
# ft:      converted spectra        regular: unlabeled search results
# sip:     labeled search results   configs: generated search configs
# bin:     Sipros release
mkdir \
    fasta raw ft \
    regular sip \
    configs bin

Download raw file

# Download the raw files into raw/
cd raw
# NOTE(review): both URLs below look truncated (no file name before ".raw");
# replace them with the full PRIDE archive paths of the two raw files.
# Download raw file with 1% 13C
wget ftp://ftp.pride.ebi.ac.uk/pride/data/archive/.raw
# Download raw file with 50% 13C
wget ftp://ftp.pride.ebi.ac.uk/pride/data/archive/.raw
# Return to the workflow folder; the following steps use paths relative to it
cd ..

Download Sipros program

# Download and unpack the Sipros release inside bin/
cd bin
wget https://github.com/xyz1396/SiprosToolKits/releases/download/4.0/siprosRelease.zip
unzip siprosRelease.zip
# Make the unpacked programs executable (they end up in bin/bin/ as seen
# from the workflow folder)
chmod +x bin/*
# Return to the workflow folder; later steps call bin/bin/... from there
cd ..

Before searching, you may need to change the "FASTA_Database = " entry in the .cfg file to the path of your own FASTA file.

Convert Raw files

# Raxport.exe is run through mono
conda activate mono
# Convert the files in raw/ and write the results to ft/
# -j sets the maximum number of threads to use
mono bin/bin/Raxport.exe \
    -i raw \
    -o ft \
    -j 8

Make fasta database with reverse decoy

cd fasta
# Download the E. coli reference proteome (protein FASTA) from UniProt
wget https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/reference_proteomes/Bacteria/UP000000625/UP000000625_83333.fasta.gz
gunzip UP000000625_83333.fasta.gz

conda activate py2
# Build Decoy.fasta, the database with reverse decoy sequences
python ../bin/EnsembleScripts/sipros_prepare_protein_database.py \
    -i UP000000625_83333.fasta \
    -o Decoy.fasta \
    -c ../bin/configTemplates/SiprosEnsembleConfig.cfg
# Return to the workflow folder; the search steps use paths relative to it
cd ..

Unlabeled search

# OMP_NUM_THREADS caps the number of threads the search may use
export OMP_NUM_THREADS=10
# Search the scans against the fasta database (this takes a long time);
# results are written to regular/
bin/bin/SiprosEnsembleOMP \
    -f ft/Pan_062822_X1iso5.FT2 \
    -c bin/configTemplates/SiprosEnsembleConfig.cfg \
    -o regular

conda activate py2
# Step 1: tabulate PSMs (.Spe2Pep.txt -> .tab)
python bin/EnsembleScripts/sipros_psm_tabulating.py \
    -i regular \
    -c bin/configTemplates/SiprosEnsembleConfig.cfg \
    -o regular
# Step 2: filter PSMs; qualified PSMs are written to .psm.txt
python bin/EnsembleScripts/sipros_ensemble_filtering.py \
    -i regular \
    -c bin/configTemplates/SiprosEnsembleConfig.cfg \
    -o regular
# Step 3: assemble protein groups from peptides; proteins are written to .pro.txt
python bin/EnsembleScripts/sipros_peptides_assembling.py \
    -c bin/configTemplates/SiprosEnsembleConfig.cfg -w regular

conda activate r
# Control protein FDR; qualified protein groups go to .proRefineFDR.txt
Rscript bin/V4Scripts/refineProteinFDR.R \
    -pro regular/*.pro.txt \
    -psm regular/*.psm.txt \
    -fdr 0.005 \
    -o regular/coli
# Count spectra for each protein group; counts go to .SPcount.txt
Rscript bin/V4Scripts/getSpectraCountInEachFT.R \
    -pro regular/*.proRefineFDR.txt \
    -psm regular/*.psm.txt \
    -o regular/coli

Labeled search

# Generate the search configs in configs/ from the SiprosV4 template
bin/bin/configGenerator \
    -i bin/configTemplates/SiprosV4Config.cfg \
    -o configs \
    -e C

conda activate r

# Build a database of the proteins identified by SiprosEnsemble.
# This step can be skipped if the protein database is small;
# the original protein database is also OK.
Rscript bin/V4Scripts/makeDBforLabelSearch.R \
    -pro regular/Pan_062822_X1iso5.SE.pro.txt -faa fasta/UP000000625_83333.fasta \
    -o fasta/db.faa

# Search the scans against the fasta database once per generated config
# (this takes a long time); results are written to sip/.
# OMP_NUM_THREADS caps the threads of each search process. xargs -P 8 runs
# up to 8 searches at once, so total thread use can reach 8 x OMP_NUM_THREADS.
export OMP_NUM_THREADS=10
configs=(configs/*.cfg)
# Pass the config paths NUL-delimited and quote "$1" so that paths containing
# spaces or quotes reach SiprosV4OMP intact; "_" only fills $0 of the inner shell.
printf '%s\0' "${configs[@]}" | xargs -0 -n 1 -P 8 \
        bash -c 'bin/bin/SiprosV4OMP -f ft/Pan_052322_X13.FT2 -c "$1" -o sip' _

conda activate py2

# Step 1: filter PSMs
python bin/V4Scripts/sipros_peptides_filtering.py \
    -c bin/configTemplates/SiprosV4Config.cfg -w sip

# Step 2: filter proteins
python bin/V4Scripts/sipros_peptides_assembling.py \
    -c bin/configTemplates/SiprosV4Config.cfg -w sip

# Step 3: cluster the SIP abundance of each protein
python bin/V4Scripts/ClusterSip.py \
    -c bin/configTemplates/SiprosV4Config.cfg -w sip

conda activate r

# Refine the protein FDR of the labeled search
Rscript bin/V4Scripts/refineProteinFDR.R \
    -pro sip/*.pro.txt -psm sip/*.psm.txt \
    -fdr 0.01 -o sip/coli

# Get the SIP abundance of each protein in each FT2 file
Rscript bin/V4Scripts/getLabelPCTinEachFT.R \
    -pro sip/coli.proRefineFDR.txt -psm sip/*.psm.txt \
    -thr 5 -o sip/coli

The isotopic abundance of PSMs and peptides is in the "SearchName" column in *.psm.txt. The percentage in "SearchName" is multiplied by 1000

The isotopic abundance of Proteins is in the "AverageEnrichmentLevel" column in *.pro.cluster.txt and in *.LabelPCTcount.txt. The percentage in "AverageEnrichmentLevel" is multiplied by 1000