address #53

pombase · Jul 17, 2023 · 119a9a3 · 119a9a3
1 parent 362734e
commit 119a9a3
Show file tree

Hide file tree

Showing 11 changed files with 16,562 additions and 95,972 deletions.
diff --git a/.gitignore b/.gitignore
@@ -5,4 +5,5 @@
 manual_fixes_pombase/~*
 manual_fixes_pombase/*.tsv
 test_data/
-svn_protein_modification_files_corrected/
+svn_protein_modification_files_corrected/
+data/phenotype_annotations.phaf
diff --git a/data/allele_type_mapping.json b/data/allele_type_mapping.json
@@ -0,0 +1,14 @@
+{
+    "amino acid insertion and deletion": "amino_acid_insertion_and_deletion",
+    "amino acid insertion and substitution": "amino_acid_insertion_and_mutation",
+    "amino acid insertion": "amino_acid_insertion",
+    "amino acid substitution(s)": "amino_acid_mutation",
+    "fusion or chimera": "fusion_or_chimera",
+    "nonsense mutation": "nonsense_mutation",
+    "nucleotide insertion": "nucleotide_insertion",
+    "nucleotide substitution(s)": "nucleotide_mutation",
+    "partial deletion and amino acid change": "amino_acid_deletion_and_mutation",
+    "partial deletion, amino acid": "partial_amino_acid_deletion",
+    "partial deletion, nucleotide": "partial_nucleotide_deletion",
+    "wild type": "wild_type"
+}
diff --git a/data/alleles.tsv b/data/alleles.tsv
diff --git a/data/alleles_pre_format_canto.tsv b/data/alleles_pre_format_canto.tsv
diff --git a/data/alleles_pre_format.tsv → data/alleles_pre_format_phaf.tsv b/data/alleles_pre_format.tsv → data/alleles_pre_format_phaf.tsv
diff --git a/data/phenotype_annotations.phaf b/data/phenotype_annotations.phaf
diff --git a/format_alleles.py b/format_alleles.py
@@ -8,25 +8,45 @@
 > Printed output
 SPAC1006.08	unknown	etd1	etd1-1		unknown	PMID:15933715,PMID:7845361
 
-To use: python format_alleles.py input_file.tsv output_file.tsv
+To use: python format_alleles.py
 """
 
-import sys
 import pandas
+import json
 
 
-def main(input_file, output_file):
-    data = pandas.read_csv(input_file, sep='\t', na_filter=False)
+def main():
 
-    # Rename columns
-    data.columns = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
+    with open('data/allele_type_mapping.json', 'r') as f:
+        allele_type_mapping = json.load(f)
+
+    data_canto = pandas.read_csv('data/alleles_pre_format_canto.tsv', sep='\t', na_filter=False)
+    data_phaf = pandas.read_csv('data/alleles_pre_format_phaf.tsv', sep='\t', na_filter=False)
 
-    # Join with comma-separated
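+    # Format alleles from phaf: name the columns, then join the references of rows sharing the same systematic_id, allele_description and allele_name into one comma-separated value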
+    data_phaf.columns = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
     unique_identifiers = ['systematic_id', 'allele_description', 'allele_name']
-    data2merge = data[unique_identifiers + ['reference']].groupby(unique_identifiers, as_index=False).agg({'reference': ','.join})
-    data = data.drop(columns=['reference']).merge(data2merge, on=unique_identifiers).drop_duplicates()
-    data.to_csv(output_file, sep='\t', index=False)
+    data2merge = data_phaf[unique_identifiers + ['reference']].groupby(unique_identifiers, as_index=False).agg({'reference': ','.join})
+    data_phaf = data_phaf.drop(columns=['reference']).merge(data2merge, on=unique_identifiers).drop_duplicates()
+
+    # Rename the canto columns so that they match the phaf column names
+    data_canto.rename(columns={'gene_systematic_id': 'systematic_id', 'references': 'reference', 'allele_synonyms': 'allele_synonym'}, inplace=True)
+
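+    # Map canto allele types using allele_type_mapping.json (types missing from the mapping are kept unchanged),
+    # then drop deletion and wild_type alleles, and rows whose annotation_count is not greater than zero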
+    data_canto['allele_type'] = data_canto['allele_type'].apply(lambda x: allele_type_mapping.get(x, x))
+    data_canto = data_canto[(data_canto.allele_type != 'deletion') & (data_canto.allele_type != 'wild_type') & (data_canto.annotation_count > 0)].copy()
+
+    # Include only the rows from the canto file that do not exist in the phaf file (i.e. rows whose combination of systematic_id and allele_description is not present there)
+    # We prioritise the phaf file because it includes all synonyms, and addresses the case of missing names
+    data_canto['identifier'] = data_canto['systematic_id'] + '$$$$' + data_canto['allele_description']
+    data_phaf['identifier'] = data_phaf['systematic_id'] + '$$$$' + data_phaf['allele_description']
+    data_canto = data_canto[~data_canto.identifier.isin(data_phaf.identifier)].copy()
+
+    # Put the columns of both tables in the same order, concatenate them, sort the rows and save
+    column_order = ['systematic_id', 'allele_description', 'gene_name', 'allele_name', 'allele_synonym', 'allele_type', 'reference']
+    output_data = pandas.concat([data_canto[column_order], data_phaf[column_order]])
+    output_data[column_order].sort_values(['systematic_id', 'allele_name', 'allele_description']).to_csv('data/alleles.tsv', sep='\t', index=False)
 
 
 if __name__ == "__main__":
-    main(sys.argv[1], sys.argv[2])
+    main()
diff --git a/get_data.sh b/get_data.sh
@@ -12,8 +12,9 @@ curl -k https://www.pombase.org/data/annotations/Phenotype_annotations/phenotype
 gzip -fd data/phenotype_annotations.phaf.gz
 
 # Get unique lines with allele types, and remove deletion and wild-type alleles
-cut -f 2,4,9,10,11,12,18 data/phenotype_annotations.phaf|sort|uniq|grep -v $'\t'deletion|grep -v wild_type > data/alleles_pre_format.tsv
-python format_alleles.py data/alleles_pre_format.tsv data/alleles.tsv
+cut -f 2,4,9,10,11,12,18 data/phenotype_annotations.phaf|sort|uniq|grep -v $'\t'deletion|grep -v wild_type > data/alleles_pre_format_phaf.tsv
+curl -k https://curation.pombase.org/data/pombe-allele-table.tsv --output data/alleles_pre_format_canto.tsv
+python format_alleles.py
 
 
 echo -e "${GREEN}Getting contig files${NC}"