Skip to content

Commit

Permalink
Merge pull request #62 from gymrek-lab/feat/vcf_output
Browse files Browse the repository at this point in the history
Added SAMPLE Format Field
  • Loading branch information
mlamkin7 authored Jul 2, 2022
2 parents d35c1ec + 0dee04c commit a26a01f
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 8 deletions.
18 changes: 16 additions & 2 deletions haptools/sim_genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,10 @@ def output_vcf(breakpoints, model_file, vcf_file, sampleinfo_file, out):
# Iterate over VCF and output variants to file(in the beginning write out the header as well) until end of haplotype block for a sample (have to iterate over all samples each time to check)
# Once VCF is complete we've output everything we wanted
# VCF output have a FORMAT field where under format is GT:POP and our sample output is GT:POP ie 1|1:YRI|CEU
_write_vcf(breakpoints, hapblock_samples, current_bkps, output_samples, vcf, out+".vcf")
_write_vcf(breakpoints, hapblock_samples, vcf.samples, current_bkps, output_samples, vcf, out+".vcf")
return

def _write_vcf(breakpoints, hapblock_samples, current_bkps, out_samples, in_vcf, out_vcf):
def _write_vcf(breakpoints, hapblock_samples, vcf_samples, current_bkps, out_samples, in_vcf, out_vcf):
"""
in_vcf = cyvcf2 variants we are reading in
out_vcf = output vcf file we output too
Expand Down Expand Up @@ -119,6 +119,15 @@ def _write_vcf(breakpoints, hapblock_samples, current_bkps, out_samples, in_vcf,
("Description", "Origin Population of each respective allele in GT"),
],
)
write_vcf.header.add_meta(
"FORMAT",
items=[
("ID", "SAMPLE"),
("Number", 2),
("Type", "String"),
("Description", "Origin sample and haplotype of each respective allele in GT"),
],
)
for var in in_vcf:
rec = {
"contig": var.CHROM,
Expand Down Expand Up @@ -148,20 +157,25 @@ def _write_vcf(breakpoints, hapblock_samples, current_bkps, out_samples, in_vcf,
if hap > 0:
record.samples[f"Sample_{sample_num}"]["GT"] = tuple(gt)
record.samples[f"Sample_{sample_num}"]["POP"] = tuple(pops)
record.samples[f"Sample_{sample_num}"]["SAMPLE"] = tuple(samples)
record.samples[f"Sample_{sample_num}"].phased = True
gt = []
pops = []
samples = []
hap_var = var.genotypes[var_sample][hap % 2]
gt.append(hap_var)
pops.append(bkp.get_pop())
samples.append(vcf_samples[var_sample] + f"-{hap_var}")
else:
hap_var = var.genotypes[var_sample][hap % 2]
gt.append(hap_var)
pops.append(bkp.get_pop())
samples.append(vcf_samples[var_sample] + f"-{hap_var}")

sample_num = hap // 2
record.samples[f"Sample_{sample_num+1}"]["GT"] = tuple(gt)
record.samples[f"Sample_{sample_num+1}"]["POP"] = tuple(pops)
record.samples[f"Sample_{sample_num+1}"]["SAMPLE"] = tuple(samples)
record.samples[f"Sample_{sample_num+1}"].phased = True

# write the record to a file
Expand Down
32 changes: 32 additions & 0 deletions tests/data/outvcf_out.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##contig=<ID=1>
##contig=<ID=2>
##contig=<ID=3>
##contig=<ID=4>
##contig=<ID=5>
##contig=<ID=6>
##contig=<ID=7>
##contig=<ID=8>
##contig=<ID=9>
##contig=<ID=10>
##contig=<ID=11>
##contig=<ID=12>
##contig=<ID=13>
##contig=<ID=14>
##contig=<ID=15>
##contig=<ID=16>
##contig=<ID=17>
##contig=<ID=18>
##contig=<ID=19>
##contig=<ID=20>
##contig=<ID=21>
##contig=<ID=22>
##contig=<ID=23>
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=POP,Number=2,Type=String,Description="Origin Population of each respective allele in GT">
##FORMAT=<ID=SAMPLE,Number=2,Type=String,Description="Origin sample and haplotype of each respective allele in GT">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_1 Sample_2
1 10114 1:10114:T:C T C . . . GT:POP:SAMPLE 0|0:YRI,YRI:HG00100-0,HG00100-0 1|1:CEU,CEU:HG00096-1,HG00096-1
1 59423090 1:59423090:A:G A G . . . GT:POP:SAMPLE 0|1:CEU,YRI:HG00097-0,HG00100-1 1|0:YRI,CEU:HG00100-1,HG00096-0
2 10122 1:10122:A:G A G . . . GT:POP:SAMPLE 1|0:YRI,CEU:HG00100-1,HG00096-0 0|1:CEU,YRI:HG00097-0,HG00099-1
7 changes: 1 addition & 6 deletions tests/test_outputvcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def test_vcf_output():
# read in vcf file
vcf = VCF(str(out_prefix)+'.vcf')
for var in vcf:
print(var.format('SAMPLE'))
if var.CHROM == "1" and var.POS == 10114:
assert(var.genotypes[0] == [0,0,True])
assert(var.format('POP')[0] == "YRI,YRI")
Expand All @@ -86,12 +87,6 @@ def test_vcf_output():
return

def test_validate_params():
# TODO ensure that my code covers all incorrect cases
bkp_file, model_file, vcf_file, sampleinfo_file, out_prefix = _get_files()
chroms = ["1"]
mapdir = "dir"
validate_params(model_file, mapdir, chroms, 10000, vcf_file, sampleinfo_file)

# Ensure it errors when popsize <= 0
return

0 comments on commit a26a01f

Please sign in to comment.