-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
27,894 additions
and
28,696 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
# Fetch and prepare the SGD data used by the allele scripts.
# Must be started from the directory that contains data/sgd/, because every
# path below is relative to it.
# -e: stop at the first failing command; -u: treat unset variables as errors.
set -eu

cd data/sgd/

# Download the current reference genome release and keep only the gff.
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as if it were the requested file.
# NOTE(review): -k disables TLS certificate verification; kept from the
# original command, confirm it is still required.
curl -fkL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/genome_releases/S288C_reference_genome_Current_Release.tgz -o sgd_genome.tgz
tar -xvzf sgd_genome.tgz -C .
rm sgd_genome.tgz
mv S288C*/*.gff.gz sgd_genome.gff.gz
rm -rf S288C*
gzip -fd sgd_genome.gff.gz

# Split the combined file at the ##FASTA marker:
#   genome_sequence.fsa gets every line after the marker,
#   features.gff gets every line up to and including the marker.
perl -ne 'if ($found) { print; } elsif (m/##FASTA/) { $found = 1; }' sgd_genome.gff > genome_sequence.fsa
perl -ne 'print; last if /##FASTA/' sgd_genome.gff > features.gff

# Remove problematic ARS lines that are not used.
# NOTE(review): gff columns are normally tab-separated; confirm that the
# whitespace inside this pattern matches the downloaded file.
grep -v 'SGD ARS' features.gff > features.gff.tmp
mv features.gff.tmp features.gff

# Use intermine to get the latest SGD alleles.
# TODO: include unique identifier
python get_sgd_alleles.py alleles_sgd_raw.tsv

# TODO: download the latest genome
# Convert the gff to embl with the emblmygff3 docker image; the
# translation*.json files are used for the transformation.
bash convert_sgd_gff2embl.sh

# Download all previous protein sequences (see the repo in the URL) as well as
# the current protein sequences, to make the dictionary.
curl -fkL https://raw.githubusercontent.com/pombase/all_previous_sgd_peptide_sequences/master/all_previous_seqs.tsv -o all_previous_seqs.tsv
# -L added for consistency with the downloads above, so that a redirect is
# followed instead of its body being stored as the archive.
curl -fL http://sgd-archive.yeastgenome.org/sequence/S288C_reference/orf_protein/orf_trans_all.fasta.gz -o current_protein_seqs.fasta.gz
gzip -fd current_protein_seqs.fasta.gz

cd ../..

# Extract allele descriptions from their name or description field.
# TODO: use a description field provided by SGD, this applies also to all commands below
# that use _description_name or description_semicolon
python format_alleles_sgd.py

# Load the genome to a pickle file
python load_genome.py --output data/sgd/genome.pickle --config data/sgd/config.sgd.json data/sgd/genome_embl_files/*.embl

# Remove unknown ids (not in gff), or pseudogene (YLL016W), no main feature (YJL018W)
# TODO: Check why these are missing
missing_genes="R0010W YSC0029 R0040C YLL016W YSC0032 YJL018W"

# $missing_genes is expanded unquoted on purpose: word splitting yields one id
# per iteration.
for missing_gene in $missing_genes; do
  for allele_file in \
    data/sgd/alleles_description_name.tsv \
    data/sgd/alleles_description_semicolon.tsv; do
    # -F matches the id as a literal string; every line containing it is dropped.
    # Under set -e the script still stops if grep selects no line at all
    # (exit status 1), i.e. if the filtered file would be empty.
    grep -v -F -- "$missing_gene" "$allele_file" > "$allele_file.tmp"
    mv -- "$allele_file.tmp" "$allele_file"
  done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,3 @@ | ||
systematic_id allele_id allele_name allele_description allele_type change_description_to change_name_to change_type_to auto_fix_comment sequence_error solution_index allele_parts rules_applied reference | ||
YBR010W 719 hht1-R72A R72A amino_acid_dummy R73A amino_acid_mutation histone_fix R72 R72A amino_acid_mutation:single_aa PMID:33843274 | ||
YEL061C 3694 cin8-D528K D528K amino_acid_dummy D490K cin8-D490K amino_acid_mutation multi_shift_fix D528 D528K amino_acid_mutation:single_aa PMID:34387192 | ||
YEL061C 3695 cin8-G522N G522N amino_acid_dummy G484N cin8-G484N amino_acid_mutation multi_shift_fix G522 G522N amino_acid_mutation:single_aa PMID:34387192 | ||
YEL061C 3696 cin8-K516M K516M amino_acid_dummy K478M cin8-K478M amino_acid_mutation multi_shift_fix K516 K516M amino_acid_mutation:single_aa PMID:34387192 | ||
YEL061C 3697 cin8-M526T M526T amino_acid_dummy M488T cin8-M488T amino_acid_mutation multi_shift_fix M526 M526T amino_acid_mutation:single_aa PMID:34387192 | ||
YKR063C 8662 las1-G4A G4A amino_acid_dummy G132A las1-G132A amino_acid_mutation multi_shift_fix G4 G4A amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8663 las1-H2D H2D amino_acid_dummy H130D las1-H130D amino_acid_mutation multi_shift_fix H2 H2D amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8664 las1-H2N H2N amino_acid_dummy H130N las1-H130N amino_acid_mutation multi_shift_fix H2 H2N amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8665 las1-H2R H2R amino_acid_dummy H130R las1-H130R amino_acid_mutation multi_shift_fix H2 H2R amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8666 las1-H6A H6A amino_acid_dummy H134A las1-H134A amino_acid_mutation multi_shift_fix H6 H6A amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8667 las1-H6N H6N amino_acid_dummy H134N las1-H134N amino_acid_mutation multi_shift_fix H6 H6N amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8668 las1-R1E R1E amino_acid_dummy R129E las1-R129E amino_acid_mutation multi_shift_fix R1 R1E amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8669 las1-R1K R1K amino_acid_dummy R129K las1-R129K amino_acid_mutation multi_shift_fix R1 R1K amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8670 las1-T5A T5A amino_acid_dummy T133A las1-T133A amino_acid_mutation multi_shift_fix T5 T5A amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8671 las1-T5S T5S amino_acid_dummy T133S las1-T133S amino_acid_mutation multi_shift_fix T5 T5S amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8672 las1-W3F W3F amino_acid_dummy W131F las1-W131F amino_acid_mutation multi_shift_fix W3 W3F amino_acid_mutation:single_aa PMID:32220933 | ||
YKR063C 8673 las1-W3L W3L amino_acid_dummy W131L las1-W131L amino_acid_mutation multi_shift_fix W3 W3L amino_acid_mutation:single_aa PMID:32220933 | ||
YNL304W 11804 ypt11-G40D G40D amino_acid_dummy G102D ypt11-G102D amino_acid_mutation old_coords_fix, revision 051111: G40 G40D amino_acid_mutation:single_aa PMID:12391144,PMID:18595704 | ||
YNL304W 11805 ypt11-I144N I144N amino_acid_dummy I206N ypt11-I206N amino_acid_mutation old_coords_fix, revision 051111: I144 I144N amino_acid_mutation:single_aa PMID:12391144 | ||
YNL304W 11815 ypt11-V246D V246D amino_acid_dummy V308D ypt11-V308D amino_acid_mutation old_coords_fix, revision 051111: V246 V246D amino_acid_mutation:single_aa PMID:12391144 | ||
YOL012C 12037 htz1-I109T I109T amino_acid_dummy I110T amino_acid_mutation histone_fix I109 I109T amino_acid_mutation:single_aa PMID:24098487 | ||
YOL012C 12038 htz1-S111P S111P amino_acid_dummy S112P amino_acid_mutation histone_fix S111 S111P amino_acid_mutation:single_aa PMID:24098487 | ||
YOR211C 12809 mgm1-E114A E114A amino_acid_dummy E93A mgm1-E93A amino_acid_mutation old_coords_fix, revision 061006: E114 E114A amino_acid_mutation:single_aa PMID:31764998 | ||
YOR211C 12829 mgm1-R78A R78A amino_acid_dummy R57A mgm1-R57A amino_acid_mutation old_coords_fix, revision 061006: R78 R78A amino_acid_mutation:single_aa PMID:31764998 | ||
YOR211C 12830 mgm1-R79A R79A amino_acid_dummy R58A mgm1-R58A amino_acid_mutation old_coords_fix, revision 061006: R79 R79A amino_acid_mutation:single_aa PMID:31764998 | ||
YOR330C 13107 mip1-A630T A630T amino_acid_dummy A604T mip1-A604T amino_acid_mutation old_coords_fix, revision 051202: A630 A630T amino_acid_mutation:single_aa PMID:32303542 | ||
YNL304W 11667 ypt11-G40D G40D amino_acid_dummy G102D ypt11-G102D amino_acid_mutation old_coords_fix, revision 051111: G40 G40D amino_acid_mutation:single_aa PMID:12391144,PMID:18595704 | ||
YNL304W 11677 ypt11-V246D V246D amino_acid_dummy V308D ypt11-V308D amino_acid_mutation old_coords_fix, revision 051111: V246 V246D amino_acid_mutation:single_aa PMID:12391144 |
Oops, something went wrong.