Skip to content

Commit

Permalink
Append usvi data
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Nov 21, 2023
1 parent a53b5e4 commit 8d87951
Show file tree
Hide file tree
Showing 5 changed files with 365 additions and 6 deletions.
14 changes: 8 additions & 6 deletions phylogenetic/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ rule files:

files = rules.files.params

include: "workflow/snakemake_rules/usvi.smk"

rule download:
"""Downloading sequences and metadata from data.nextstrain.org"""
output:
Expand Down Expand Up @@ -53,8 +55,8 @@ rule filter:
- minimum genome length of {params.min_length} (50% of Zika virus genome)
"""
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata.tsv",
sequences = "data/sequences_all.fasta",
metadata = "data/metadata_all.tsv",
exclude = files.dropped_strains
output:
sequences = "results/filtered.fasta"
Expand Down Expand Up @@ -122,7 +124,7 @@ rule refine:
input:
tree = "results/tree_raw.nwk",
alignment = "results/aligned.fasta",
metadata = "data/metadata.tsv"
metadata = "data/metadata_all.tsv"
output:
tree = "results/tree.nwk",
node_data = "results/branch_lengths.json"
Expand Down Expand Up @@ -189,7 +191,7 @@ rule traits:
"""
input:
tree = "results/tree.nwk",
metadata = "data/metadata.tsv"
metadata = "data/metadata_all.tsv"
output:
node_data = "results/traits.json",
params:
Expand All @@ -212,7 +214,7 @@ rule export:
"""Exporting data files for auspice"""
input:
tree = "results/tree.nwk",
metadata = "data/metadata.tsv",
metadata = "data/metadata_all.tsv",
branch_lengths = "results/branch_lengths.json",
traits = "results/traits.json",
nt_muts = "results/nt_muts.json",
Expand Down Expand Up @@ -242,7 +244,7 @@ rule export:
rule final_strain_name:
input:
auspice_json="results/raw_zika.json",
metadata="data/metadata.tsv",
metadata="data/metadata_all.tsv",
root_sequence="results/raw_zika_root-sequence.json",
output:
auspice_json="auspice/zika.json",
Expand Down
2 changes: 2 additions & 0 deletions phylogenetic/example_data/metadata_usvi.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
genbank_accession genbank_accession_rev accession strain date region country division location length host release_date update_date sra_accessions authors institution url
USVI/37/2016 VI37 USVI/37/2016 2016-10-06 North America Usvi Saint Croix Saint Croix 10807 Homo sapiens Black et al FH https://github.com/blab/zika-usvi/
137 changes: 137 additions & 0 deletions phylogenetic/example_data/sequences_usvi.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
>USVI/37/2016
nnnnnnnnnnnnnnnnnnnnnnnnnnnngacagttcgagtttgaagcgaaagctagcaacagtatcaacaggttttattt
tggatttggaaacgagagtttctggtcatgaaaaacccaaaaaagaaatccggaggattccggattgtcaatatgctaaa
acgcggagtagcccgtgtgagcccctttgggggcttgaagaggctgccagccggacttctgctgggtcatgggcccatca
ggatggtcttggcgattctagcctttttgagattcacggcaatcaagccatcactgggcctcatcaatagatggggttca
gtggggaaaaaagaggctatggaaacaataaagaagttcaagaaagatctggctgccatgctgagaataatcaatgctag
gaaggagaagaagagacgaggcgcagatactagtgtcggaattgttggcctcctgctgaccacagctatggcagcggagg
tcactagacgtgggagtgcatactatatgtacttggacagaaacgatgctggggaggccatatcttttccaaccacattg
gggatgaataagtgttatatacagatcatggatcttggacacatgtgtgatgccaccatgagctatgaatgccctatgct
ggatgagggggtggaaccagatgacgtcgattgttggtgcaacacgacgtcaacttgggttgtgtacggaacctgccatc
acaaaaaaggtgaagcacggagatctagaagagctgtgacgctcccctcccattccaccaggaagctgcaaacgcggtcg
caaacctggttggaatcaagagaatacacaaagcacttgattagagtcgaaaattggatattcaggaaccctggcttcgc
gttagcagcagctgccatcgcttggcttttgggaagctcaacgagccaaaaagtcatatacttggtcatgatactgctga
ttgccccggcatacagcatcaggtgcataggagtcagcaatagggactttgtggaaggtatgtcaggtgggacttgggtt
gatgttgtcttggaacatggaggttgtgtcaccgtaatggcacaggacaaaccgactgtcgacatagagctggttacaac
aacagtcagcaacatggcggaggtaagatcctactgctatgaggcatcaatatcagacatggcttctgacagccgctgcc
caacacaaggtgaagcctaccttgacaagcaatcagacactcaatatgtctgcaaaagaacgttagtggacagaggctgg
ggaaatggatgtggactttttggcaaagggagcctggtgacatgcgctaagtttgcatgctccaagaaaatgaccgggaa
gagcatccagccagagaatctggagtaccggataatgctgtcagttcatggctcccagcacagtgggatgatcgttaatg
acacaggacatgaaactgatgagaatagagcgaaagttgagataacgcccaattcaccgagagccgaagccaccctgggg
ggttttggaagcctaggacttgattgtgaaccgaggacaggccttgacttttcagatttgtattacttgactatgaataa
caagcactggttggttcacaaggagtggttccacgacattccattaccttggcacgctggggcagacaccggaactccac
actggaacaacaaagaagcactggtagagttcaaggacgcacatgccaaaaggcaaactgtcgtggttctagggagtcaa
gaaggagcagttcacacggcccttgctggagctctggaggctgagatggatggtgcaaagggaaggctgtcctctggcca
cttgaaatgtcgcctgaaaatggataaacttagattgaagggcgtgtcatactccttgtgtactgcagcgttcacattca
ccaagatcccggctgaaacactgcacgggacagtcacagtggaggtacagtacgcagggacagatggaccttgcaaggtt
ccagctcagatggcggtggacatgcaaactctgaccccagttgggaggttgataaccgctaaccccgtaatcactgaaag
cactgagaactctaagatgatgctggaacttgatccaccatttggggactcttacattgtcataggagtcggggagaaga
agatcacccaccactggcacaggagtggcagcaccattggaaaagcatttgaagccactgtgagaggtgccaagagaatg
gcagtcttgggagacacagcctgggactttggatcagttggaggcgctctcaactcattgggcaagggcatccatcaaat
ttttggagcagctttcaaatcattgtttggaggaatgtcctggttctcacaaattctcattggaacgttgctgatgtggt
tgggtctgaacacaaagaatggatctatttcccttatgtgcttggccttagggggagtgttgatcttcttatccacagcc
gtctctgctgatgtggggtgctcggtggacttctcaaagaaggagacgagatgcggtacaggggtgttcgtctataacga
cgttgaagcctggagggacaggtacaagtaccatcctgactccccccgtagattggcagcagcagttaagcaagcctggg
aagatggtatctgcgggatctcctctgtttcaagaatggaaaacatcatgtggagatcagtagaaggggagctcaacgca
atcctggaagagaatggagttcaactgacggtcgttgtgggatctgtaaaaaaccccatgtggagaggtccacagagatt
gcccgtgcctgtgaacgagctgccccacggctggaaggcttgggggaaatcgtacttcgtcagagcagcaaagacaaata
acagctttgtcgtggatggtgacacactgaaggaatgcccactcaaacatagagcatggaacagctttcttgtggaggat
catgggttcggggtatttcacactagtgtctggctcaaggttagagaagattattcattagagtgtgatccagccgttat
tggaacagctgttaagggaaaggaggctgtacacagtgatctaggctactggattgagagtgagaagaatgacacatgga
ggctggagagggcccatctgatcgagatgaaaacatgtgaatggccaaagtcccacacattgtggacagatggaatagaa
gagagtgatctgatcatacccaagtctttagctgggccactcagccatcacaataccagagagggctacaggacccaaat
gaaagggccatggcacagtgaagagcttgaaattcggtttgaggaatgcccaggcactaaggtccacgtggaggaaacat
gtggaacaagaggaccatctctgagatcaaccactgcaagcggaagggtgatcgaggaatggtgctgcagggagtgcaca
atgcccccactgtcgttccgggctaaagatggctgttggtatggaatggagataaggcccaggaaagaaccagaaagcaa
cttagtaaggtcaatggtgactgcaggatcaactgatcacatggaccacttctcccttggagtgcttgtgatcctgctca
tggtgcaggaagggctgaagaagagaatgaccacaaagatcatcataagcacatcaatggcagtgctggtagctatgatc
ctgggaggattttcaatgagtgacctggctaagcttgcaattttgatgggtgccaccttcgcggaaatgaacactggagg
agatgtagctcatctggcgctgatagcggcattcaaagtcagaccagcgttgctggtatctttcatcttcagagctaatt
ggacaccccgtgaaagcatgctgctggccttggcctcgtgtcttttgcaaactgcgatctccgccttggaaggcgacctg
atggttctcatcaatggttttgctttggcctggttggcaatacgagcgatggttgttccacgcactgataacatcacctt
ggcaatcctggctgctctgacaccactggcccggggcacactgcttgtggcgtggagagcaggccttgctacttgcgggg
ggtttatgctcctctctctgaagggaaaaggcagtgtgaagaagaacttaccatttgtcatggccctgggactaaccgct
gtgaggctggtcgaccccatcaacgtggtgggactgctgttgctcacaaggagtgggaagcggagctggccccctagcga
agtactcacagctgttggcctgatatgcgcattggctggagggttcgccaaggcagatatagagatggctgggcccatgg
ccgcggtcggtctgctaattgtcagttacgtggtctcaggaaagagtgtggacatgtacattgaaagagcaggtgacatc
acatgggaaaaagatgcggaagtcactggaaacagtccccggctcgatgtggcgctagatgagagtggtgatttctccct
ggtggaggatgacggtccccccatgagagagatcatactcaaggtggtcctgatgaccatctgtggcatgaacccaatag
ccataccctttgcagctggagcgtggtacgtatacgtgaagactggaaaaaggagtggtgctctatgggatgtgcctgct
cccaaggaagtaaaaaagggggagaccacagatggagtgtacagagtaatgactcgtagactgctaggttcaacacaagt
tggagtgggagttatgcaagagggggtctttcacactatgtggcacgtcacaaaaggatccgcgctgagaagcggtgaag
ggagacttgatccatactggggagatgtcaagcaggatctggtgtcatactgtggtccatggaagctagatgccgcctgg
gatgggcacagcgaggtgcagctcttggccgtgccccccggagagagagcgaggaacatccagactctgcccggaatatt
taagacaaaggatggggacattggagcggttgcgctggattacccagcaggaacttcaggatctccaatcctagacaagt
gtgggagagtgataggactttatggcaatggggtcgtgatcaaaaacgggagttatgttagtgccatcacccaagggagg
agggaggaagagactcctgttgagtgcttcgagccctcgatgctgaagaagaagcagctaactgtcttagacttgcatcc
tggagctgggaaaaccaggagagttcttcctgaaatagtccgtgaagccataaaaacaagactccgtactgtgatcttag
ctccaaccagggttgtcgctgctgaaatggaggaggcccttagagggcttccagtgcgttatatgacaacagcagtcaat
gtcacccactctggaacagaaatcgtcgacttaatgtgccatgccaccttcacttcacgtctactacagccaatcagagt
ccccaactataatctgtatattatggatgaggcccacttcacagatccctcaagtatagcagcaagaggatacatttcaa
caagggttgagatgggcgaggcggctgccatcttcatgaccgccacgccaccaggaacccgtgacgcatttccggactcc
aactcaccaattatggacaccgaagtggaagtcccagagagagcctggagctcaggctttgattgggtgacggatcattc
tggaaaaacagtttggtttgttccaagcgtgaggaacggcaatgagatcgcagcttgtctgacaaaggctggaaaacggg
tcatacagctcagcagaaagacttttgagacagagttccagaaaacaaaacatcaagagtgggactttgtcgtgacaact
gacatttcagagatgggcgccaactttaaagctgaccgtgtcatagattccaggagatgcctaaagccggtcatacttga
tggcgagagagtcattctggctggacccatgcctgtcacacatgccagcgctgcccagaggagggggcgcataggcagga
atcccaacaaacctggagatgagtatctgtatggaggtgggtgcgcagagactgacgaagaccatgcacactggcttgaa
gcaagaatgctccttgacaatatttacctccaagatggcctcatagcctcgctctatcgacctgaggccgacaaagtagc
agccattgagggagagttcaagcttaggacggagcaaaggaagacctttgtggaactcatgaaaagaggagatcttcctg
tttggctggcctatcaggttgcatctgccggaataacctacacagatagaagatggtgctttgatggcacgaccaacaac
accataatggaagacagtgtgccggcagaggtgtggaccagacacggagagaaaagagtgctcaaaccgaggtggatgga
cgccagagtttgttcagatcatgcggccctgaagtcattcaaggagtttgccgctgggaaaagaggagcggcttttggag
tgatggaagccctgggaacactgccaggacacatgacnnagagattccaggaagcnattgacaacctcgctgtgctcatg
cgngcagagactggaagcaggccttacaaagccgcggcggcccaattgccggagaccctagagaccataatgcntttggg
gttgctgggaacagtctcgctgggaatcttcttcgtcttgatgaggaacaagggcatagggaagatgggctttggaatgg
tgactcttggggccagcgcatggctcatgtggctctcggaaattgagccagccagaattgcatgtgtcctcattgttgtg
ttcctattgctggtggtgctcatacctgagccagaaaagcaaagatctccccaggacaaccaaatggcaatcatcatcat
ggtagcagtaggtcttttgggcttgattaccgccaatgaactcggatggttggagagaacaaagagtgacctaagccatc
taatgggaaggagagaggagggggcaaccataggattctcaatggacattgacctgcggccagcctcagcttgggccatc
tatgctgccttgacaactttcattaccccagccgtccaacatgcagtgaccacctcatacaacaactactccttaatggc
gatggccacgcaagctggagtgttgtttggcatgggcaaagggatgccattctacgcatgggactttggagtcccgctgc
taatgataggttgctactcacaattaacacccctgaccctaatagtggccatcattttgctcgtggcgcactacatgtac
ttgatcccagggctgcaggcagcagctgcgcgtgctgcccagaagagaacggcagctggcatcatgaagaaccctgttgt
ggatggaatagtggtgactgacattgacacaatgacaattgacccccaagtggagaaaaagatgggacaggtgctactca
tagcagtggccgtctccagcgccatactgtcgcggaccgcctgggggtggggggaggctggggctctgatcacagccgca
acttccactttgtgggaaggctctccgaacaagtactggaactcctctacagccacttcactgtgtaacatttttagggg
aagttacttggctggagcttctctaatctacacagtaacaagaaacgctggcttggtcaagagacgtgggggtggaacag
gagagaccctgggagagaaatggaaggcccgcttgaaccagatgtcggccctggagttctactcctacaaaaagtcaggc
atcaccgaggtgtgcagagaagaggcccgccgcgccctcaaggacggtgtggcaacgggaggccatgctgtgtcccgagg
aagtgcaaagctgagatggttggtggagcggggatacctgcagccctatggaaaggtcattgatcttggatgtggcagag
ggggctggagttactacgccgccaccatccgcaaagttcaagaagtgaaaggatacacaaaaggaggccctggtcatgaa
gaacccgtgttggtgcaaagctatgggtggaacatagtccgtcttaagagtggggtggacgtctttcatatggcggctga
gccgtgtgacacgttgctgtgtgacataggtgagtcatcatctagtcctgaagtggaagaagcacggacgctcagagtcc
tctccatggtgggggattggcttgaaaaaagaccaggagccttttgtataaaagtgttgtgcccatacaccagcactatg
atggaaaccctggagcgactgcagcgtaggtatgggggaggactggtcagagtgccactctcccgcaactctacacatga
gatgtactgggtctctggagcgaaaagcaacaccataaaaagtgtgtccaccacgagccagctcctcttggggcgcatgg
acgggcctaggaggccagtgaaatatgaggaggatgtgaatctcggctctggcacgcgggctgtggtaagctgcgctgaa
gctcccaacatgaagatcattggtaaccgcattgaaaggatccgcagtgagcacgcggaaacgtggttctttgacgagaa
ccacccatataggacatgggcttaccatggaagctatgaggcccccacacaagggtcagcgtcctctctaataaacgggg
ttgtcaggctcctgtcaaaaccctgggatgtggtgactggagtcacaggaatagccatgaccgacaccacaccgtatggt
cagcaaagagttttcaaggaaaaagtggacactagggtgccagacccccaagaaggcactcgtcaggttatgagcatggt
ctcttcctggttgtggaaagagctaggcaaacacaaacggccacgagtctgcaccaaagaagagttcatcaacaaggttc
gtagcaatgcagcattaggggcaatatttgaggaggaaaaagagtggaagactgcagtggaagctgtgaacgatccaagg
ttctgggctctagtggacaaggaaagagagcaccacctgagaggagagtgccagagctgtgtgtacaacatgatgggaaa
aagagaaaagaaacaaggggaatttggaaaggccaagggcagccgcgccatctggtatatgtggctaggggctagatttc
tagagttcgaagcccttggattcttgaacgaggatcactggatggggagagagaactcaggaggtggtgttgaagggctg
ggattacaaagactcggatatgtcctagaagagatgagtcgtataccaggaggaaggatgtatgcagatgacactgctgg
ctgggacacccgcattagcaggtttgatctggagaatgaagctctaatcaccaaccaaatggagaaagggcacagggcct
tggcattggccataatcaagtacacataccaaaacaaagtggtaaaggtccttagaccagctgaaaaagggaaaacagtt
atggacattatttcgagacaagaccaaagggggagcggacaagttgtcacttacgctcttaacacatttaccaacctagt
ggtgcaactcattcggaatatggaggctgaggaagttctagagatgcaagacttgtggctgctgcggaggtcagagaaag
tgaccaactggttgcagagcaacggatgggataggctcaaacgaatggcagtcagtggagatgattgcgttgtgaagcca
attgatgataggtttgcacatgccctcaggttcttgaatgatatgggaaaagttaggaaggacacacaagagtggaaacc
ctcaactggatgggacaactgggaagaagttccgttttgctcccaccacttcaacaagctccatctcaaggacgggaggt
ccattgtggttccctgccgccaccaagatgaactgattggtcgggcccgcgtctctccaggggcgggatggagcatccgg
gagactgcttgcctagcaaaatcatatgcgcaaatgtggcagctcctttatttccacagaagggacctccgactgatggc
caatgccatttgttcatctgtgccagttgactgggttccaactgggagaactacctggtcaatccatggaaagggagaat
ggatgaccactgaagacatgcttgtggtgtggaacagagtgtggatnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn
nnnnnnn
171 changes: 171 additions & 0 deletions phylogenetic/scripts/uniq_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#! /usr/bin/env python

"""Harmonize and merge pandas DataFrames such that conflicting data is not lost.

Rows from a cached table and a new table are grouped by a shared id column
(default "strain"). Within each group, identical values collapse to a single
value and conflicting values are kept together as a comma separated list.
Placeholder values ("", "-N/A-", "?") and missing values are ignored.

Typical usage example:
one_df = pd.DataFrame(
{'strain': ['A', 'B', 'C'],
'date': ['2022-01-01', '2022-02-02', '2022-03-03'],
'clade': ['alpha', 'beta', 'gamma'],
'geo':['iowa', 'washington', np.nan]})
one_df['age'] = '-N/A-'
two_df = pd.DataFrame(
{'strain': ['D', 'B', 'C'],
'clade': ['delta', 'beta2', 'gamma'],
'patient': ['bob', 'marley', 'rick']})
two_df['col_date'] = np.nan
two_df['group'] = ''
print(one_df)
print(two_df)
merged_df = merge_two(one_df, two_df)
print(merged_df)
"""
# ===== Dependencies
import argparse
import os
import sys

import numpy as np
import pandas as pd


# (2) Define command line arguments
def parse_args():
    """Parse the command line options of the merge script.

    Returns:
        An ``argparse.Namespace`` with the attributes ``cache``, ``new``,
        ``cache_delim``, ``new_delim``, ``outfile``, ``outfile_excel``,
        ``outfile_delim``, ``groupby_col`` and ``drop_uninformative_cols``.
        ``drop_uninformative_cols`` is always a real ``bool``.
    """

    def _str_to_bool(text):
        """Convert a command line token such as "True"/"false"/"1" to a bool."""
        lowered = text.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays falsy, as it was before this option was typed.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (true/false), got {text!r}"
        )

    parser = argparse.ArgumentParser(
        description="Harmonize and merge pandas DataTables such that conflicting data is not lost."
    )
    parser.add_argument("--cache", help="Path to cache of cleaned data.", required=True)
    parser.add_argument("--new", help="Path to new data.", required=True)
    parser.add_argument(
        "--cache_delim",
        default="\t",
        help="delimiter for cache of cleaned data.",
        required=False,
    )
    parser.add_argument(
        "--new_delim", default="\t", help="delimiter for new data.", required=False
    )
    parser.add_argument(
        "--outfile",
        default="merged_cache_new.tsv",
        help="Merged file [default: merged_cache_new.tsv].",
        required=False,
    )
    parser.add_argument(
        "--outfile_excel",
        help="Will export an Excel merged file if defined. [Example: merged_cache_new.xlsx].",
        required=False,
    )
    parser.add_argument(
        "--outfile_delim",
        default="\t",
        help="delimiter for outfile data.",
        required=False,
    )
    parser.add_argument(
        "--groupby_col",
        default="strain",
        help="Group by column name [default 'strain'].",
        required=False,
    )
    # Without an explicit type every supplied value is a non-empty string and
    # therefore truthy, so "--drop_uninformative_cols False" used to enable the
    # option. Parse the value as a boolean; a bare flag (no value) means True.
    parser.add_argument(
        "--drop_uninformative_cols",
        nargs="?",
        const=True,
        default=False,
        type=_str_to_bool,
        help="Drop uninformative columns; accepts true/false, a bare flag means true [default False].",
        required=False,
    )

    return parser.parse_args()


# ===== Reusable functions
def _drop_uninformative_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without the columns that carry no information.

    Empty strings and the "-N/A-" placeholder are turned into NaN first, then
    every column that is NaN in all rows is removed. The placeholders stay
    replaced by NaN in the columns that are kept. Used by merge_two.
    """
    without_blanks = df.replace("", np.nan)
    without_placeholders = without_blanks.replace("-N/A-", np.nan)
    return without_placeholders.dropna(axis=1, how="all")


def _uniq_merge(x: "pd.Series[str]") -> str:
    """Collapse the values of one group into a single comma separated string.

    Missing values and the placeholders "", "-N/A-" and "?" are ignored.
    Values that already contain commas are split into their parts, so merging
    previously merged data does not nest lists. Each distinct part appears
    once, in the order it is first seen, which keeps the output reproducible
    between runs. Used by merge_two.

    Args:
        x: The values of one column within one group.

    Returns:
        The distinct values joined by ",", or "" when nothing informative is
        left.
    """
    placeholders = ["", "-N/A-", "?"]
    informative = x.dropna()
    informative = informative[~informative.isin(placeholders)]

    # str() keeps non-string cells (e.g. numbers) from crashing on .split().
    parts = (
        part
        for value in informative.unique()
        for part in str(value).split(",")
    )
    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would emit the parts in hash order, which changes from run to run.
    return ",".join(dict.fromkeys(parts))

# Merge and harmonize two datasets, flag conflicts with commas
def merge_two(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    groupby_col: str = "strain",
    drop_uninformative_cols: bool = False,
) -> pd.DataFrame:
    """Harmonizes and merges two pandas DataFrames.

    Takes two pandas DataFrames through the following 3 steps:

    1. Optionally drops any columns in either which are all NA, "-N/A-", or
       empty strings
    2. Harmonizes their columns such that columns in the left DataFrame are
       listed first, followed by the columns only found in the right one
    3. Stacks both DataFrames and combines the rows of each group defined by
       groupby_col such that:
       * unique values are merged
       * conflicting values are joined in a comma separated list

    Args:
        df1:
            The left hand side (lhs) DataFrame, decides the column order of
            the merged DataFrame.
        df2:
            The right hand side (rhs) DataFrame, its additional columns are
            appended after those of df1.
        groupby_col:
            The id column shared by df1 and df2 that identifies which rows
            belong together.
        drop_uninformative_cols:
            When true, columns that are entirely empty are removed from each
            input before merging.

    Returns:
        A merged and harmonized DataFrame indexed by groupby_col, with one row
        per distinct id. NOTE(review): pandas groupby defaults apply, so rows
        are sorted by id and rows whose id is missing are left out.

    Raises:
        KeyError: Raised by pandas when groupby_col is not a column of the
            harmonized table, i.e. it is present in neither input.
    """
    # Optionally drop uninformative columns
    if drop_uninformative_cols:
        df1 = _drop_uninformative_cols(df1)
        df2 = _drop_uninformative_cols(df2)

    # Harmonize columns: build the lookup set once instead of once per column.
    left_cols = df1.columns.tolist()
    known_cols = set(left_cols)
    right_only_cols = [col for col in df2.columns.tolist() if col not in known_cols]
    harmonized_cols = left_cols + right_only_cols

    h_df1_df = df1.reindex(harmonized_cols, axis=1)
    h_df2_df = df2.reindex(harmonized_cols, axis=1)

    # Unique and merge conflicting data
    stacked_df = pd.concat([h_df1_df, h_df2_df])
    merged_df = stacked_df.groupby(groupby_col).agg([_uniq_merge])
    # agg() with a list yields (column, function name) pairs; keep the column.
    merged_df.columns = [pair[0] for pair in merged_df.columns]
    return merged_df


def main():
    """Read the cached and the new table, merge them and write the result.

    Both inputs are read with every column as string so values are compared
    and joined as text. When --outfile_excel is given the merged table is
    written as an Excel file to exactly that path; otherwise it is written as
    a delimited text file to --outfile.
    """
    args = parse_args()

    old = pd.read_csv(args.cache, sep=args.cache_delim, header=0, dtype=str)
    new = pd.read_csv(args.new, sep=args.new_delim, header=0, dtype=str)

    merged = merge_two(
        old,
        new,
        groupby_col=args.groupby_col,
        drop_uninformative_cols=args.drop_uninformative_cols,
    )

    # Export merged file as Excel or delimited file
    if args.outfile_excel is not None:
        # Use the path the user asked for. It used to be replaced by a name
        # derived from --outfile, which silently ignored the given value.
        merged.to_excel(args.outfile_excel)
    else:
        merged.to_csv(args.outfile, sep=args.outfile_delim)


# Run the command line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 8d87951

Please sign in to comment.