Skip to content

Commit

Permalink
fix and tidy sgd pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Aug 28, 2023
1 parent f3571cb commit 5d67bc7
Show file tree
Hide file tree
Showing 12 changed files with 27,894 additions and 28,696 deletions.
2 changes: 1 addition & 1 deletion allele_transvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def main(genome_file, allele_results_file, exclude_transcripts_file, output_file

anno_db = get_anno_db(transvardb, genome_fasta)
print('Running transvar on variants... (will take a while)')
data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts), axis=1)
data_exploded['transvar_coordinates'] = data_exploded.progress_apply(get_transvar_coordinates, args=(anno_db, genome, exclude_transcripts, sgd_mode), axis=1)

aggregated_data = data_exploded[['systematic_id', 'allele_description', 'allele_type', 'transvar_coordinates']].groupby(['systematic_id', 'allele_description', 'allele_type'], as_index=False).agg({'transvar_coordinates': lambda x: '|'.join(sum(x, []))})

Expand Down
19,405 changes: 9,622 additions & 9,783 deletions data/sgd/alleles_description_name.tsv

Large diffs are not rendered by default.

11,723 changes: 5,870 additions & 5,853 deletions data/sgd/alleles_description_semicolon.tsv

Large diffs are not rendered by default.

1,454 changes: 645 additions & 809 deletions data/sgd/alleles_sgd_raw.tsv

Large diffs are not rendered by default.

7,035 changes: 3,194 additions & 3,841 deletions data/sgd/features.gff

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions get_data_sgd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Download and prepare the SGD input data under data/sgd/.
# Expects to be started from the directory that contains data/sgd/
# (the script changes into it and later returns with `cd ../..`).
#
# -e: abort on the first failing command; -u: abort on use of an unset variable.
set -eu

cd data/sgd/

# All downloads use curl -f so that an HTTP error (e.g. 404) makes curl exit
# non-zero and stops the script, instead of saving the error page as data.
# NOTE(review): -k disables TLS certificate verification; drop it if the
# servers' certificates validate correctly.

# Download and extract gff, delete the rest
curl -fkL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_Current_Release.tgz -o sgd_genome.tgz
tar -xvzf sgd_genome.tgz -C .
rm sgd_genome.tgz
mv S288C*/*.gff.gz sgd_genome.gff.gz
rm -rf S288C*
gzip -fd sgd_genome.gff.gz

# Split into fasta and gff
# Everything after the ##FASTA marker line goes to the sequence file...
perl -ne 'if ($found) { print; } elsif (m/##FASTA/) { $found = 1; }' sgd_genome.gff > genome_sequence.fsa
# ...and everything up to and including the ##FASTA marker line goes to the features file.
perl -ne 'print; last if /##FASTA/' sgd_genome.gff > features.gff

# Remove problematic ARS lines that are not used
grep -v 'SGD ARS' features.gff > features.gff.tmp
mv features.gff.tmp features.gff

# Use intermine to get the latest SGD alleles.
# TODO: include unique identifier
python get_sgd_alleles.py alleles_sgd_raw.tsv

# TODO: download the latest genome
# convert the gff to embl, using emblmygff3 docker image (see translation*.json), which are used for the transformation
bash convert_sgd_gff2embl.sh

# Download all previous protein sequence (visit the repo), as well as the current protein sequences, to make the dictionary.
curl -fkL https://raw.githubusercontent.com/pombase/all_previous_sgd_peptide_sequences/master/all_previous_seqs.tsv -o all_previous_seqs.tsv
# Same host as the genome download above, so use the same flags (follow redirects).
curl -fkL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans_all.fasta.gz -o current_protein_seqs.fasta.gz
gzip -fd current_protein_seqs.fasta.gz

cd ../..

# Extract allele descriptions from their name or description field.
# TODO: use a description field provided by SGD, this applies also to all commands below
# that use _description_name or description_semicolon
python format_alleles_sgd.py

# Load the genome to a pickle file
python load_genome.py --output data/sgd/genome.pickle --config data/sgd/config.sgd.json data/sgd/genome_embl_files/*.embl

# Remove unknown ids (not in gff), or pseudogene (YLL016W), no main feature (YJL018W)
# TODO: Check why these are missing
missing_genes="R0010W YSC0029 R0040C YLL016W YSC0032 YJL018W"
allele_files="data/sgd/alleles_description_name.tsv data/sgd/alleles_description_semicolon.tsv"

# $missing_genes and $allele_files are intentionally unquoted in the `for`
# lists so that they split on whitespace into individual items.
for missing_gene in $missing_genes; do
    for allele_file in $allele_files; do
        # Drop every line that contains the gene id. -F matches the id as a
        # literal string rather than a regular expression; the result is
        # written to a temporary file and then moved over the original.
        grep -vF -- "$missing_gene" "$allele_file" > "$allele_file.tmp"
        mv "$allele_file.tmp" "$allele_file"
    done
done
28 changes: 2 additions & 26 deletions results/sgd/description_name/allele_auto_fix.tsv
Original file line number Diff line number Diff line change
@@ -1,27 +1,3 @@
systematic_id allele_id allele_name allele_description allele_type change_description_to change_name_to change_type_to auto_fix_comment sequence_error solution_index allele_parts rules_applied reference
YBR010W 719 hht1-R72A R72A amino_acid_dummy R73A amino_acid_mutation histone_fix R72 R72A amino_acid_mutation:single_aa PMID:33843274
YEL061C 3694 cin8-D528K D528K amino_acid_dummy D490K cin8-D490K amino_acid_mutation multi_shift_fix D528 D528K amino_acid_mutation:single_aa PMID:34387192
YEL061C 3695 cin8-G522N G522N amino_acid_dummy G484N cin8-G484N amino_acid_mutation multi_shift_fix G522 G522N amino_acid_mutation:single_aa PMID:34387192
YEL061C 3696 cin8-K516M K516M amino_acid_dummy K478M cin8-K478M amino_acid_mutation multi_shift_fix K516 K516M amino_acid_mutation:single_aa PMID:34387192
YEL061C 3697 cin8-M526T M526T amino_acid_dummy M488T cin8-M488T amino_acid_mutation multi_shift_fix M526 M526T amino_acid_mutation:single_aa PMID:34387192
YKR063C 8662 las1-G4A G4A amino_acid_dummy G132A las1-G132A amino_acid_mutation multi_shift_fix G4 G4A amino_acid_mutation:single_aa PMID:32220933
YKR063C 8663 las1-H2D H2D amino_acid_dummy H130D las1-H130D amino_acid_mutation multi_shift_fix H2 H2D amino_acid_mutation:single_aa PMID:32220933
YKR063C 8664 las1-H2N H2N amino_acid_dummy H130N las1-H130N amino_acid_mutation multi_shift_fix H2 H2N amino_acid_mutation:single_aa PMID:32220933
YKR063C 8665 las1-H2R H2R amino_acid_dummy H130R las1-H130R amino_acid_mutation multi_shift_fix H2 H2R amino_acid_mutation:single_aa PMID:32220933
YKR063C 8666 las1-H6A H6A amino_acid_dummy H134A las1-H134A amino_acid_mutation multi_shift_fix H6 H6A amino_acid_mutation:single_aa PMID:32220933
YKR063C 8667 las1-H6N H6N amino_acid_dummy H134N las1-H134N amino_acid_mutation multi_shift_fix H6 H6N amino_acid_mutation:single_aa PMID:32220933
YKR063C 8668 las1-R1E R1E amino_acid_dummy R129E las1-R129E amino_acid_mutation multi_shift_fix R1 R1E amino_acid_mutation:single_aa PMID:32220933
YKR063C 8669 las1-R1K R1K amino_acid_dummy R129K las1-R129K amino_acid_mutation multi_shift_fix R1 R1K amino_acid_mutation:single_aa PMID:32220933
YKR063C 8670 las1-T5A T5A amino_acid_dummy T133A las1-T133A amino_acid_mutation multi_shift_fix T5 T5A amino_acid_mutation:single_aa PMID:32220933
YKR063C 8671 las1-T5S T5S amino_acid_dummy T133S las1-T133S amino_acid_mutation multi_shift_fix T5 T5S amino_acid_mutation:single_aa PMID:32220933
YKR063C 8672 las1-W3F W3F amino_acid_dummy W131F las1-W131F amino_acid_mutation multi_shift_fix W3 W3F amino_acid_mutation:single_aa PMID:32220933
YKR063C 8673 las1-W3L W3L amino_acid_dummy W131L las1-W131L amino_acid_mutation multi_shift_fix W3 W3L amino_acid_mutation:single_aa PMID:32220933
YNL304W 11804 ypt11-G40D G40D amino_acid_dummy G102D ypt11-G102D amino_acid_mutation old_coords_fix, revision 051111: G40 G40D amino_acid_mutation:single_aa PMID:12391144,PMID:18595704
YNL304W 11805 ypt11-I144N I144N amino_acid_dummy I206N ypt11-I206N amino_acid_mutation old_coords_fix, revision 051111: I144 I144N amino_acid_mutation:single_aa PMID:12391144
YNL304W 11815 ypt11-V246D V246D amino_acid_dummy V308D ypt11-V308D amino_acid_mutation old_coords_fix, revision 051111: V246 V246D amino_acid_mutation:single_aa PMID:12391144
YOL012C 12037 htz1-I109T I109T amino_acid_dummy I110T amino_acid_mutation histone_fix I109 I109T amino_acid_mutation:single_aa PMID:24098487
YOL012C 12038 htz1-S111P S111P amino_acid_dummy S112P amino_acid_mutation histone_fix S111 S111P amino_acid_mutation:single_aa PMID:24098487
YOR211C 12809 mgm1-E114A E114A amino_acid_dummy E93A mgm1-E93A amino_acid_mutation old_coords_fix, revision 061006: E114 E114A amino_acid_mutation:single_aa PMID:31764998
YOR211C 12829 mgm1-R78A R78A amino_acid_dummy R57A mgm1-R57A amino_acid_mutation old_coords_fix, revision 061006: R78 R78A amino_acid_mutation:single_aa PMID:31764998
YOR211C 12830 mgm1-R79A R79A amino_acid_dummy R58A mgm1-R58A amino_acid_mutation old_coords_fix, revision 061006: R79 R79A amino_acid_mutation:single_aa PMID:31764998
YOR330C 13107 mip1-A630T A630T amino_acid_dummy A604T mip1-A604T amino_acid_mutation old_coords_fix, revision 051202: A630 A630T amino_acid_mutation:single_aa PMID:32303542
YNL304W 11667 ypt11-G40D G40D amino_acid_dummy G102D ypt11-G102D amino_acid_mutation old_coords_fix, revision 051111: G40 G40D amino_acid_mutation:single_aa PMID:12391144,PMID:18595704
YNL304W 11677 ypt11-V246D V246D amino_acid_dummy V308D ypt11-V308D amino_acid_mutation old_coords_fix, revision 051111: V246 V246D amino_acid_mutation:single_aa PMID:12391144
Loading

0 comments on commit 5d67bc7

Please sign in to comment.