Skip to content

Commit

Permalink
address #53
Browse files Browse the repository at this point in the history
  • Loading branch information
manulera committed Jul 17, 2023
1 parent 362734e commit 119a9a3
Show file tree
Hide file tree
Showing 11 changed files with 16,562 additions and 95,972 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
manual_fixes_pombase/~*
manual_fixes_pombase/*.tsv
test_data/
svn_protein_modification_files_corrected/
svn_protein_modification_files_corrected/
data/phenotype_annotations.phaf
14 changes: 14 additions & 0 deletions data/allele_type_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"amino acid insertion and deletion": "amino_acid_insertion_and_deletion",
"amino acid insertion and substitution": "amino_acid_insertion_and_mutation",
"amino acid insertion": "amino_acid_insertion",
"amino acid substitution(s)": "amino_acid_mutation",
"fusion or chimera": "fusion_or_chimera",
"nonsense mutation": "nonsense_mutation",
"nucleotide insertion": "nucleotide_insertion",
"nucleotide substitution(s)": "nucleotide_mutation",
"partial deletion and amino acid change": "amino_acid_deletion_and_mutation",
"partial deletion, amino acid": "partial_amino_acid_deletion",
"partial deletion, nucleotide": "partial_nucleotide_deletion",
"wild type": "wild_type"
}
6,229 changes: 3,665 additions & 2,564 deletions data/alleles.tsv

Large diffs are not rendered by default.

12,427 changes: 12,427 additions & 0 deletions data/alleles_pre_format_canto.tsv

Large diffs are not rendered by default.

File renamed without changes.
93,324 changes: 0 additions & 93,324 deletions data/phenotype_annotations.phaf

This file was deleted.

42 changes: 31 additions & 11 deletions format_alleles.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,45 @@
> Printed output
SPAC1006.08 unknown etd1 etd1-1 unknown PMID:15933715,PMID:7845361
To use: python format_alleles.py input_file.tsv output_file.tsv
To use: python format_alleles.py
"""

import sys
import pandas
import json


def main(input_file, output_file):
data = pandas.read_csv(input_file, sep='\t', na_filter=False)
def main():

# Rename columns
data.columns = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
with open('data/allele_type_mapping.json', 'r') as f:
allele_type_mapping = json.load(f)

data_canto = pandas.read_csv('data/alleles_pre_format_canto.tsv', sep='\t', na_filter=False)
data_phaf = pandas.read_csv('data/alleles_pre_format_phaf.tsv', sep='\t', na_filter=False)

# Join with comma-separated
# Format alleles from phaf
data_phaf.columns = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
unique_identifiers = ['systematic_id', 'allele_description', 'allele_name']
data2merge = data[unique_identifiers + ['reference']].groupby(unique_identifiers, as_index=False).agg({'reference': ','.join})
data = data.drop(columns=['reference']).merge(data2merge, on=unique_identifiers).drop_duplicates()
data.to_csv(output_file, sep='\t', index=False)
data2merge = data_phaf[unique_identifiers + ['reference']].groupby(unique_identifiers, as_index=False).agg({'reference': ','.join})
data_phaf = data_phaf.drop(columns=['reference']).merge(data2merge, on=unique_identifiers).drop_duplicates()

# Rename columns
data_canto.rename(columns={'gene_systematic_id': 'systematic_id', 'references': 'reference', 'allele_synonyms': 'allele_synonym'}, inplace=True)

# Map allele types
data_canto['allele_type'] = data_canto['allele_type'].apply(lambda x: allele_type_mapping.get(x, x))
data_canto = data_canto[(data_canto.allele_type != 'deletion') & (data_canto.allele_type != 'wild_type') & (data_canto.annotation_count > 0)].copy()

# Include only the rows from the canto file that do not exist in the phaf file (i.e. rows whose combination of systematic_id and allele_description is not present in the phaf file)
# We prioritise the phaf file because it includes all synonyms, and addresses the case of missing names
data_canto['identifier'] = data_canto['systematic_id'] + '$$$$' + data_canto['allele_description']
data_phaf['identifier'] = data_phaf['systematic_id'] + '$$$$' + data_phaf['allele_description']
data_canto = data_canto[~data_canto.identifier.isin(data_phaf.identifier)].copy()

# Sort the columns, merge and save
column_order = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
output_data = pandas.concat([data_canto[column_order], data_phaf[column_order]])
output_data[column_order].sort_values(['systematic_id', 'allele_name', 'allele_description']).to_csv('data/alleles.tsv', sep='\t', index=False)


if __name__ == "__main__":
main(sys.argv[1], sys.argv[2])
main()
5 changes: 3 additions & 2 deletions get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ curl -k https://www.pombase.org/data/annotations/Phenotype_annotations/phenotype
gzip -fd data/phenotype_annotations.phaf.gz

# Get unique lines with allele types, and remove deletion and wild-type alleles
cut -f 2,4,9,10,11,12,18 data/phenotype_annotations.phaf|sort|uniq|grep -v $'\t'deletion|grep -v wild_type > data/alleles_pre_format.tsv
python format_alleles.py data/alleles_pre_format.tsv data/alleles.tsv
cut -f 2,4,9,10,11,12,18 data/phenotype_annotations.phaf|sort|uniq|grep -v $'\t'deletion|grep -v wild_type > data/alleles_pre_format_phaf.tsv
curl -k https://curation.pombase.org/data/pombe-allele-table.tsv --output data/alleles_pre_format_canto.tsv
python format_alleles.py


echo -e "${GREEN}Getting contig files${NC}"
Expand Down
Loading

0 comments on commit 119a9a3

Please sign in to comment.