Skip to content

Commit

Permalink
[fauna] Move strain name fixes to annotations.tsv
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Jul 26, 2023
1 parent 4ac8d52 commit 46b5b02
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 143 deletions.
35 changes: 3 additions & 32 deletions ingest/bin/post_process_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,9 @@ def parse_args():

return parser.parse_args()

def define_strain_fixes(fname):
    '''
    Load whole-strain-name fixes from a tab-separated file.

    The file is read with a header row and must provide ``label`` and
    ``fix`` columns. Lines whose first character is ``#`` are skipped as
    comments. Backslash escape sequences written literally in a label are
    decoded before the label is used as a dictionary key. If a label occurs
    more than once, the last row wins.

    From: https://github.com/nextstrain/fauna/blob/bda9e474e3815490904ee230605f49e532d4d77d/vdb/upload.py#L142-L150

    Parameters
    ----------
    fname : str
        Path to the tab-separated strain name fix file.

    Returns
    -------
    dict
        Mapping of decoded ``label`` values to their ``fix`` values.
    '''
    fix_whole_name = {}
    # Open the file in a context manager so the handle is closed as soon as
    # parsing finishes instead of lingering until garbage collection.
    with open(fname) as handle:
        reader = csv.DictReader(
            (row for row in handle if row[0] != '#'),
            delimiter='\t',
        )
        for line in reader:
            # NOTE(review): encode() followed by 'unicode-escape' decoding
            # treats the bytes as Latin-1, so non-ASCII labels would be
            # altered -- confirm labels are expected to be ASCII.
            label = line['label'].encode().decode('unicode-escape')
            fix_whole_name[label] = line['fix']
    return fix_whole_name

def _set_strain_name(record, fixes):
def _set_strain_name(record):
"""Replace spaces, dashes, and periods with underscores in strain name."""
strain_name = record["strain"]

if strain_name in fixes:
return(fixes[strain_name])

strain_name = strain_name.replace('Zika_virus', '').replace('Zikavirus', '').replace('Zika virus', '').replace('Zika', '').replace('ZIKV', '')
strain_name = strain_name.replace('Human', '').replace('human', '').replace('H.sapiens_wt', '').replace('H.sapiens-wt', '').replace('H.sapiens_tc', '').replace('Hsapiens_tc', '').replace('H.sapiens-tc', '').replace('Homo_sapiens', '').replace('Homo sapiens', '').replace('Hsapiens', '').replace('H.sapiens', '')
Expand All @@ -45,13 +31,10 @@ def _set_strain_name(record, fixes):
strain_name = strain_name.replace('Aaegypti_wt', 'Aedes_aegypti').replace('Aedessp', 'Aedes_sp')
strain_name = strain_name.replace(' ', '').replace('\'', '').replace('(', '').replace(')', '').replace('//', '/').replace('__', '_').replace('.', '').replace(',', '')
strain_name = re.sub('^[\/\_\-]', '', strain_name)

if strain_name in fixes:
return(fixes[strain_name])

try:
strain_name = 'V' + str(int(strain_name))
except:
except ValueError:
pass

return (
Expand All @@ -62,17 +45,6 @@ def _set_strain_name(record, fixes):
.replace(")", "_")
)

def define_location_fixes(fname):
    '''
    Load location fixes from a tab-separated file.

    The file is read with a header row and must provide ``label`` and
    ``fix`` columns. Lines whose first character is ``#`` are skipped as
    comments. Backslash escape sequences written literally in a label are
    decoded before the label is used as a dictionary key. If a label occurs
    more than once, the last row wins.

    From: https://github.com/nextstrain/fauna/blob/bda9e474e3815490904ee230605f49e532d4d77d/vdb/upload.py#L152-L160

    Parameters
    ----------
    fname : str
        Path to the tab-separated location fix file.

    Returns
    -------
    dict
        Mapping of decoded ``label`` values to their ``fix`` values.
    '''
    fix_location = {}
    # Open the file in a context manager so the handle is closed as soon as
    # parsing finishes instead of lingering until garbage collection.
    with open(fname) as handle:
        reader = csv.DictReader(
            (row for row in handle if row[0] != '#'),
            delimiter='\t',
        )
        for line in reader:
            # NOTE(review): encode() followed by 'unicode-escape' decoding
            # treats the bytes as Latin-1, so non-ASCII labels would be
            # altered -- confirm labels are expected to be ASCII.
            label = line['label'].encode().decode('unicode-escape')
            fix_location[label] = line['fix']
    return fix_location


def _set_url(record):
"""Set url column from accession"""
Expand All @@ -93,13 +65,12 @@ def _set_paper_url(record):
def main():
args = parse_args()

strain_fixes = define_strain_fixes(args.strain_fixes)
# location_fixes = define_location_fixes('source-data/zika_location_fix.tsv')
# date_fixes = define_date_fixes('source-data/zika_date_fix.tsv')

for index, record in enumerate(stdin):
record = json.loads(record)
record["strain"] = _set_strain_name(record, strain_fixes)
record["strain"] = _set_strain_name(record)
record["url"] = _set_url(record)
record["paper_url"] = _set_paper_url(record)
record["authors"] = record["abbr_authors"]
Expand Down
2 changes: 0 additions & 2 deletions ingest/config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ transform:
# Local geolocation rules that are only applicable to dengue data
# Local rules can overwrite the general geolocation rules provided above
local_geolocation_rules: 'source-data/geolocation-rules.tsv'
# User strain fixes file
strain_fixes: 'source-data/zika_strain_name_fix.tsv'
# User annotations file
annotations: 'source-data/annotations.tsv'
# ID field used to merge annotations
Expand Down
23 changes: 23 additions & 0 deletions ingest/source-data/annotations.tsv
Original file line number Diff line number Diff line change
@@ -1,3 +1,26 @@
KX922703 strain USA/2016/FL021
KY765326 strain NIC/6188_13A1/2016
KX922707 strain USA/2016/FL039
KU922923 strain MEX/InDRE/2016
KY075934 strain PuertoRico/2016/FL016U
KY765327 strain NIC/5005_13A1/2016
KX922705 strain USA/2016/FL032
KY075938 strain Aedes_aegypti/USA/2016/FL06
KX922704 strain USA/2016/FL030
KX673530 strain PHE_Guadeloupe
KY075935 strain USA/2016/FL022
KX838906 strain Aedes_aegypti/USA/2016/FL03
KY075933 strain PuertoRico/2016/FL008U
KX838904 strain Aedes_aegypti/USA/2016/FL01
KX838905 strain Aedes_aegypti/USA/2016/FL02
KY765320 strain NIC/6406_13A1/2016
KY075936 strain USA/2016/FL036
KY075932 strain Martinique/2016/FL001Sa
KY765321 strain NIC/4886_12A1/2016
KY075939 strain Aedes_aegypti/USA/2016/FL08
KX922706 strain USA/2016/FL038
KY075937 strain Aedes_aegypti/USA/2016/FL05
KX922708 strain Aedes_aegypti/USA/2016/FL04
KY014295 strain USA/2016/FL010
MT377503 strain V151144
MF988734 strain SG_EHI_/33164Y17
Expand Down
107 changes: 0 additions & 107 deletions ingest/source-data/zika_strain_name_fix.tsv

This file was deleted.

2 changes: 0 additions & 2 deletions ingest/workflow/snakemake_rules/transform.smk
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ rule transform:
input:
sequences_ndjson="data/sequences_{serotype}.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
strain_fixes=config["transform"]["strain_fixes"],
annotations=config["transform"]["annotations"],
output:
metadata="data/metadata_{serotype}.tsv",
Expand Down Expand Up @@ -87,7 +86,6 @@ rule transform:
| ./bin/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/post_process_metadata.py \
--strain-fixes {input.strain_fixes} \
| ./bin/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down

0 comments on commit 46b5b02

Please sign in to comment.