Skip to content

Commit

Permalink
Merge pull request #1490 from dglemos/add/option_individual_zyg
Browse files Browse the repository at this point in the history
  • Loading branch information
nuno-agostinho authored Sep 22, 2023
2 parents 9dbb8d2 + b552c4c commit 90c044e
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 10 deletions.
11 changes: 10 additions & 1 deletion modules/Bio/EnsEMBL/VEP/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ our @VEP_PARAMS = (
'allow_non_variant', # allow non-variant VCF lines through
'process_ref_homs', # force processing of individuals with homozygous ref genotype
'individual=s', # give results by genotype for individuals
'individual_zyg=s', # reports genotypes for individuals
'phased', # force VCF genotypes to be interpreted as phased
'fork=i', # fork into N processes
'dont_skip', # don't skip vars that fail validation
Expand Down Expand Up @@ -315,6 +316,7 @@ our %DEFAULTS = (
# and so need to be converted to listrefs
our @LIST_FLAGS = qw(
individual
individual_zyg
cell_type
pick_order
fields
Expand Down Expand Up @@ -405,6 +407,13 @@ our @OPTION_SETS = (
},
},

{
flags => ['individual_zyg'],
set => {
allow_non_variant => 1,
},
},

{
flags => ['cell_type'],
set => {
Expand Down Expand Up @@ -596,7 +605,7 @@ our %INCOMPATIBLE = (
json => [qw(vcf tab)],
vcf => [qw(json tab)],
tab => [qw(vcf json)],
individual => [qw(minimal)],
individual => [qw(minimal individual_zyg)],
check_ref => [qw(lookup_ref)],
check_svs => [qw(offline)],
ga4gh_vrs => [qw(vcf)]
Expand Down
1 change: 1 addition & 0 deletions modules/Bio/EnsEMBL/VEP/Constants.pm
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ our @FLAG_FIELDS = (

# general
{ flag => 'individual', fields => ['IND','ZYG'] },
{ flag => 'individual_zyg', fields => ['ZYG'] },
{ flag => 'allele_number', fields => ['ALLELE_NUM'] },
{ flag => 'show_ref_allele', fields => ['REF_ALLELE'] },
{ flag => 'uploaded_allele', fields => ['UPLOADED_ALLELE'] },
Expand Down
24 changes: 20 additions & 4 deletions modules/Bio/EnsEMBL/VEP/OutputFactory.pm
Original file line number Diff line number Diff line change
Expand Up @@ -498,11 +498,17 @@ sub get_all_VariationFeatureOverlapAlleles {
$vfos = \@new;
}

# method name stub for getting *VariationAlleles
my $allele_method = $self->{process_ref_homs} ? 'get_all_' : 'get_all_alternate_';
my $method = $allele_method.'VariationFeatureOverlapAlleles';
# the option individual_zyg has to run a specific method if there is only one individual (unique_ind)
if($vf->{unique_ind}) {
return $self->filter_VariationFeatureOverlapAlleles([map {$_->get_reference_VariationFeatureOverlapAllele} @{$vfos}]);
}
else {
# method name stub for getting *VariationAlleles
my $allele_method = $self->{process_ref_homs} ? 'get_all_' : 'get_all_alternate_';
my $method = $allele_method.'VariationFeatureOverlapAlleles';

return $self->filter_VariationFeatureOverlapAlleles([map {@{$_->$method}} @{$vfos}]);
return $self->filter_VariationFeatureOverlapAlleles([map {@{$_->$method}} @{$vfos}]);
}
}


Expand Down Expand Up @@ -924,6 +930,16 @@ sub VariationFeature_to_output_hash {
$hash->{ZYG} = (scalar keys %unique > 1 ? 'HET' : 'HOM').(defined($vf->{hom_ref}) ? 'REF' : '');
}
}

# individual_zig
if(defined($vf->{genotype_ind})) {
my @tmp;
foreach my $geno_ind (keys %{$vf->{genotype_ind}}) {
my %unique = map {$_ => 1} @{$vf->{genotype_ind}->{$geno_ind}};
push @tmp, $geno_ind.":".(scalar keys %unique > 1 ? 'HET' : 'HOM').(defined($vf->{hom_ref}->{$geno_ind}) ? 'REF' : '');
}
$hash->{ZYG} = \@tmp;
}

# minimised?
$hash->{MINIMISED} = 1 if $vf->{minimised};
Expand Down
113 changes: 109 additions & 4 deletions modules/Bio/EnsEMBL/VEP/Parser/VCF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ sub new {
my $self = $class->SUPER::new(@_);

# add shortcuts to these params
$self->add_shortcuts([qw(allow_non_variant gp individual process_ref_homs phased max_sv_size)]);
$self->add_shortcuts([qw(allow_non_variant gp individual individual_zyg process_ref_homs phased max_sv_size)]);

return $self;
}
Expand Down Expand Up @@ -350,12 +350,14 @@ sub create_VariationFeatures {
$vf->{non_variant} = 1 if $non_variant;

# individual data?
if($self->{individual}) {
if($self->{individual} || $self->{individual_zyg}) {
my @changed_alleles = ($ref, @$alts);
my %allele_map = map {$original_alleles[$_] => $changed_alleles[$_]} 0..$#original_alleles;

my $method = $self->{individual} ? 'create_individual_VariationFeatures' : 'create_individuals_zyg_VariationFeature';

my @return =
map {@{$self->create_individual_VariationFeatures($_, \%allele_map)}}
my @return =
map {@{$self->$method($_, \%allele_map)}}
@{$self->post_process_vfs([$vf])};

# if all selected individuals had REF or missing genotypes @return will be empty
Expand Down Expand Up @@ -580,6 +582,16 @@ sub create_individual_VariationFeatures {

# get genotypes from parser
my $include = lc($self->{individual}->[0]) eq 'all' ? $parser->get_samples : $self->{individual};

# Compare sample names
if(lc($self->{individual}->[0]) ne 'all') {
my $found = _find_in_array($parser->get_samples, $include);

if(!$found) {
die("ERROR: Sample IDs given (", join(",", @{$include}), ") do not match samples from VCF (", join(",", @{$parser->get_samples}), ")\n");
}
}

my $ind_gts = $parser->get_samples_genotypes($include, 1 - ($self->{allow_non_variant} || 0));

foreach my $ind(@$include) {
Expand Down Expand Up @@ -628,4 +640,97 @@ sub create_individual_VariationFeatures {
return \@return;
}

=head2 create_individuals_zyg_VariationFeature
Arg 1 : Bio::EnsEMBL::VariationFeature $vf
Arg 2 : hashref $allele_map
Example : $vfs = $parser->create_individuals_zyg_VariationFeature($vf, $map);
Description: Create one VariationFeature object with
individual/sample info. Arg 2 $allele_map is a hashref mapping the
allele index to the actual ALT string it represents.
Returntype : arrayref of Bio::EnsEMBL::VariationFeature
Exceptions : none
Caller : create_VariationFeatures()
Status : Stable
=cut

sub create_individuals_zyg_VariationFeature {
my $self = shift;
my $vf = shift;
my $allele_map = shift;

my $parser = $self->parser();
my $record = $parser->{record};

my @alleles = split '\/', $vf->{allele_string};
my $ref = $alleles[0];

my @return;

# get genotypes from parser
my $include = lc($self->{individual_zyg}->[0]) eq 'all' ? $parser->get_samples : $self->{individual_zyg};

# Compare sample names
if(lc($self->{individual_zyg}->[0]) ne 'all') {
my $found = _find_in_array($parser->get_samples, $include);

if(!$found) {
die("ERROR: Sample IDs given (", join(",", @{$include}), ") do not match samples from VCF (", join(",", @{$parser->get_samples}), ")\n");
}
}

my $ind_gts = $parser->get_samples_genotypes($include, 1 - ($self->{allow_non_variant} || 0));

my $n_individuals = scalar(@{$include});

foreach my $ind(@$include) {
# get alleles present in this individual
my $gt = $ind_gts->{$ind};
next if (!$gt);
my @bits = map { $allele_map->{$_} } split /\||\/|\\/, $gt;
my $phased = ($gt =~ /\|/ ? 1 : 0);

# get non-refs, remembering to exclude "*"-types
my %non_ref = map {$_ => 1} grep {$_ ne $ref && $_ !~ /\*/} @bits;

# Genotype is reference
if(!scalar keys %non_ref) {
$vf->{hom_ref}->{$ind} = 1;
$vf->{non_variant}->{$ind} = 1;

if($n_individuals == 1) {
$vf->{allele_string} = $ref."/".$ref ;
$vf->{unique_ind} = 1;
}
}

# store phasing info
$vf->{phased}->{$ind} = $self->{phased} ? 1 : $phased;

# store GT
$vf->{genotype_ind}->{$ind} = \@bits;
}

push @return, $vf;

return \@return;
}

sub _find_in_array {
my $all_samples = shift;
my $samples = shift;

my %all = map { $_ => 1 } @{$all_samples};
my %input_samples = map { $_ => 1 } @{$samples};

foreach my $sample (keys %input_samples) {
if(!$all{$sample}) {
return 0;
}
}

return 1;
}

1;
4 changes: 3 additions & 1 deletion t/Config.t
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ is($cfg->param('format'), 'ensembl', 'ini file');
$cfg = Bio::EnsEMBL::VEP::Config->new({individual => 'dave,barry,keith'});
is_deeply($cfg->param('individual'), [qw(dave barry keith)], 'list conversion');

$cfg = Bio::EnsEMBL::VEP::Config->new({individual_zyg => 'dave,barry,keith'});
is_deeply($cfg->param('individual_zyg'), [qw(dave barry keith)], 'individual_zyg - list conversion');

# deprecated
throws_ok { Bio::EnsEMBL::VEP::Config->new({convert => 1}) } qr/deprecated/, 'deprecated no replacement';
#throws_ok { Bio::EnsEMBL::VEP::Config->new({gmaf => 1}) } qr/deprecated.+\-\-af/, 'deprecated with replacement';
Expand Down Expand Up @@ -150,4 +153,3 @@ is($cfg->param($_), undef, 'everything with database turns off '.$_) for qw(af_1
## DONE
#######
done_testing();

1 change: 1 addition & 0 deletions t/Haplo_Runner.t
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ SKIP: {
'allow_non_variant' => undef,
'gp' => undef,
'individual' => undef,
'individual_zyg' => undef,
'phased' => undef,
'max_sv_size' => 10000000,
}, 'Bio::EnsEMBL::VEP::Haplo::Parser::VCF' ), 'get_Parser');
Expand Down
45 changes: 45 additions & 0 deletions t/OutputFactory.t
Original file line number Diff line number Diff line change
Expand Up @@ -1996,6 +1996,51 @@ $of->reset_shifted_positions($vfoa->variation_feature);
ok(defined($new_cds_start) && !defined($vfoa->transcript_variation->{cds_start}), 'reset_shifted_positions');


### individual_zyg
$ib = get_annotated_buffer({
input_file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT dave barry jeff)],
[qw(21 25607429 indtest A G . . . GT 0|1 1/1 0/0)],
]),
individual_zyg => 'all',
});

$of->{individual_zyg} = ['all'];
my $result = $of->VariationFeature_to_output_hash($ib->buffer->[0]);
my $genotype = join ',', sort(@{$result->{ZYG}});

is($genotype, 'barry:HOM,dave:HET,jeff:HOMREF', 'VariationFeature_to_output_hash - individual_zyg');
delete($of->{individual_zyg});

### individual_zyg wrong sample name
dies_ok { get_annotated_buffer({
input_file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT dave barry jeff)],
[qw(21 25607429 indtest A G . . . GT 0|1 1/1 0/0)],
]),
individual_zyg => 'john',
});
} 'VariationFeature_to_output_hash - individual_zyg wrong sample name';

### individual_zyg correct sample name
$ib = get_annotated_buffer({
input_file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT dave barry jeff)],
[qw(21 25607429 indtest A G . . . GT 0|1 1/1 0/0)],
]),
individual_zyg => 'dave,barry',
});

$of->{individual_zyg} = ['dave,barry'];
my $result = $of->VariationFeature_to_output_hash($ib->buffer->[0]);
my $genotype = join ',', sort(@{$result->{ZYG}});

is($genotype, 'barry:HOM,dave:HET', 'VariationFeature_to_output_hash - individual_zyg correct sample name');
delete($of->{individual_zyg});


# done
done_testing();
Expand Down
71 changes: 71 additions & 0 deletions t/Parser_VCF.t
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,77 @@ $vf = $p->next;
is($vf->{allele_string}, 'C/-', 'individual - deletion alleles mapped');


### individual_zyg data
$p = Bio::EnsEMBL::VEP::Parser::VCF->new({
config => Bio::EnsEMBL::VEP::Config->new({%$base_testing_cfg, allow_non_variant => 1, individual_zyg => 'all'}),
file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT dave barry jeff)],
[qw(21 25587759 indtest A G . . . GT 0|1 1/1 0/0)],
]),
valid_chromosomes => [21]
});

$vf = $p->next;
delete($vf->{adaptor}); delete($vf->{_line});

is_deeply($vf, bless( {
'chr' => '21',
'strand' => 1,
'variation_name' => 'indtest',
'map_weight' => 1,
'allele_string' => 'A/G',
'end' => 25587759,
'start' => 25587759,
'seq_region_end' => 25587759,
'seq_region_start' => 25587759,
'genotype_ind' => {'jeff' => ['A', 'A'], 'barry' => ['G', 'G'], 'dave' => ['A', 'G']},
'non_variant' => { 'jeff' => 1 },
'phased' => {'jeff' => 0, 'barry' => 0, 'dave' => 1},
'hom_ref' => { 'jeff' => 1 },
}, 'Bio::EnsEMBL::Variation::VariationFeature' ), 'individual_zyg');

### individual_zyg data - test situation where a line has no valid ALT genotypes
$p = Bio::EnsEMBL::VEP::Parser::VCF->new({
config => Bio::EnsEMBL::VEP::Config->new({%$base_testing_cfg, individual_zyg => 'all'}),
file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT jeff)],
[qw(21 25587759 indtest1 A G . . . GT ./.)],
[qw(21 25587760 indtest2 C T . . . GT 1/1)],
]),
valid_chromosomes => [21]
});

$vf = $p->next;
is($vf->{variation_name}, 'indtest1', 'individual_zyg - return output even for lines with no valid GTs');

$vf = $p->next;
is($vf->{variation_name}, 'indtest2', 'individual_zyg - return output for lines with valid GTs');

is_deeply($vf->{genotype_ind}, ( {
'jeff' => ['T', 'T'],
}), 'individual_zyg - return genotype_ind for lines with valid GTs');

### individual_zyg data - test situation where a line has homozygous ref genotype
$p = Bio::EnsEMBL::VEP::Parser::VCF->new({
config => Bio::EnsEMBL::VEP::Config->new({%$base_testing_cfg, individual_zyg => 'all'}),
file => $test_cfg->create_input_file([
['##fileformat=VCFv4.1'],
[qw(#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT jeff)],
[qw(21 25587759 indtest1 A G . . . GT 0|0)],
]),
valid_chromosomes => [21]
});

$vf = $p->next;
is_deeply($vf->{genotype_ind}, ( {
'jeff' => ['A', 'A'],
}), 'individual_zyg - return genotype_ind for lines with homozygous ref GTs');







Expand Down
Loading

0 comments on commit 90c044e

Please sign in to comment.