Skip to content

Commit

Permalink
Merge pull request #139 from andrewjpage/missing_id
Browse files Browse the repository at this point in the history
Use locus tag when ID is missing from GFF
  • Loading branch information
andrewjpage committed Jun 2, 2015
2 parents d532f49 + 3c095f6 commit 7e2fc0c
Show file tree
Hide file tree
Showing 13 changed files with 1,152 additions and 39 deletions.
2 changes: 1 addition & 1 deletion dist.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name = Bio-Roary
version = 2.3.0
version = 2.3.1
author = Andrew J. Page <ap13@sanger.ac.uk>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
Expand Down
1 change: 1 addition & 0 deletions lib/Bio/Roary.pm
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Create a pan genome

use Moose;
use File::Copy;
use Bio::Perl;
use Bio::Roary::ParallelAllAgainstAllBlast;
use Bio::Roary::CombinedProteome;
use Bio::Roary::External::Cdhit;
Expand Down
28 changes: 22 additions & 6 deletions lib/Bio/Roary/BedFromGFFRole.pm
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,37 @@ sub _create_bed_file_from_gff {
next if !( $feature->primary_tag =~ /$tags_regex/ );

# Must have an ID tag
next unless ( $feature->has_tag('ID') );
my $gene_id = $self->_get_feature_id($feature);
next unless($gene_id);

#filter out small genes
next if ( ( $feature->end - $feature->start ) < $self->min_gene_size_in_nucleotides );

my ( $gene_id, @junk ) = $feature->get_tag_values('ID');
$gene_id =~ s!["']!!g;
next if ( $gene_id eq "" );

my $strand = ($feature->strand > 0)? '+':'-' ;
print {$bed_fh} join( "\t", ( $feature->seq_id, $feature->start -1, $feature->end, $gene_id, 1, $strand ) ) . "\n";
}
$gffio->close();
}


# Work out the identifier to use for a GFF feature.
#
# The first value of the 'ID' tag is preferred; when that tag is absent, or its
# value is empty once quote characters are stripped, the first value of the
# 'locus_tag' tag is used instead.
#
# Parameters:
#   $feature - object providing has_tag($name) and get_tag_values($name).
#
# Returns the identifier with single and double quotes removed, or nothing
# (undef in scalar context, empty list in list context) when neither tag
# yields a non-empty value.
sub _get_feature_id
{
    my ( $self, $feature ) = @_;

    # Tags are tried in priority order so an unusable ID falls through to locus_tag.
    for my $tag_name ( 'ID', 'locus_tag' )
    {
        next unless ( $feature->has_tag($tag_name) );

        # Only the first value of a multi-valued tag is used.
        my ($gene_id) = $feature->get_tag_values($tag_name);
        next unless ( defined $gene_id );

        $gene_id =~ s!["']!!g;
        return $gene_id if ( $gene_id ne "" );
    }

    # Bare return so list-context callers get an empty list rather than (undef).
    return;
}

1;
68 changes: 36 additions & 32 deletions lib/Bio/Roary/GeneNamesFromGFF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,42 +26,46 @@ has 'ids_to_product' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
sub _build_ids_to_gene_name {
my ($self) = @_;
my %id_to_gene_name;

open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
while(<$fh>)
{
chomp;
my $line = $_;
my $id_name;
if($line =~/ID=["']?([^;"']+)["']?;?/i)
{
$id_name = $1;
$id_name =~ s!"!!g;
}
else
{
next;
}

if($line =~/gene=["']?([^;"']+)["']?;?/i)
{
my $gene_name = $1;
$gene_name =~ s!"!!g;
next if ( $gene_name eq "" );
$id_to_gene_name{$id_name} = $gene_name;
}

if($line =~/product=["']?([^;,"']+)[,"']?;?/i)
{
my $product = $1;
$self->ids_to_product->{$id_name} = $product;
}


my $gffio = Bio::Tools::GFF->new( -file => $self->gff_file, -gff_version => 3 );
while ( my $feature = $gffio->next_feature() ) {
my $gene_id = $self->_get_feature_id($feature);
next unless ($gene_id);

if ( $feature->has_tag('gene') ) {
my ( $gene_name, @junk ) = $feature->get_tag_values('gene');
$gene_name =~ s!"!!g;
if ( $gene_name ne "" ) {
$id_to_gene_name{$gene_id} = $gene_name;
}
}
if ( $feature->has_tag('product') ) {
my ( $product, @junk ) = $feature->get_tag_values('product');
$self->ids_to_product->{$gene_id} = $product;
}

}
close($fh);

return \%id_to_gene_name;
}

# Work out the identifier to use for a GFF feature.
#
# The first value of the 'ID' tag is preferred; when that tag is absent, or its
# value is empty once quote characters are stripped, the first value of the
# 'locus_tag' tag is used instead.
#
# Parameters:
#   $feature - object providing has_tag($name) and get_tag_values($name).
#
# Returns the identifier with single and double quotes removed, or nothing
# (undef in scalar context, empty list in list context) when neither tag
# yields a non-empty value.
sub _get_feature_id {
    my ( $self, $feature ) = @_;

    # Tags are tried in priority order so an unusable ID falls through to locus_tag.
    for my $tag_name ( 'ID', 'locus_tag' ) {
        next unless ( $feature->has_tag($tag_name) );

        # Only the first value of a multi-valued tag is used.
        my ($gene_id) = $feature->get_tag_values($tag_name);
        next unless ( defined $gene_id );

        $gene_id =~ s!["']!!g;
        return $gene_id if ( $gene_id ne "" );
    }

    # Bare return so list-context callers get an empty list rather than (undef).
    return;
}

no Moose;
__PACKAGE__->meta->make_immutable;

Expand Down
10 changes: 10 additions & 0 deletions t/Bio/Roary/CommandLine/Roary.t
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,16 @@ ok((-e 'query_2.gff.proteome.faa'),'Check protein query_2.gff.proteome.faa is no
ok((-e 'query_5.gff.proteome.faa'),'Check protein query_5.gff.proteome.faa is not cleaned up');

cleanup_files();

# Run the script on the three GFF files under t/data/locus_tag_gffs with the
# '-j Local --dont_delete_files' options; the expected-output pair points at
# t/data/empty_file.
# NOTE(review): the meaning of the [0,6,7,8,9] argument is defined by the
# helper, which is not visible in this hunk - confirm against its definition.
%scripts_and_expected_files = (
'-j Local --dont_delete_files t/data/locus_tag_gffs/query_1.gff t/data/locus_tag_gffs/query_2.gff t/data/locus_tag_gffs/query_3.gff ' =>
[ 'empty_file', 't/data/empty_file' ],
);
mock_execute_script_and_check_output_sorted_groups( $script_name, \%scripts_and_expected_files, [0,6,7,8,9] );

# Each proteome file left in the working directory must match its '.expected'
# counterpart stored alongside the input GFF files.
for my $filename (('query_1.gff.proteome.faa','query_2.gff.proteome.faa','query_3.gff.proteome.faa')) {
is( read_file($filename), read_file( 't/data/locus_tag_gffs/' . $filename . '.expected' ), "content of proteome $filename as expected" );
}

SKIP:
{
Expand Down
22 changes: 22 additions & 0 deletions t/Bio/Roary/ExtractProteomeFromGFFs.t
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,26 @@ unlink('genbank1.gff.proteome.faa');
unlink('genbank2.gff.proteome.faa');
unlink('genbank3.gff.proteome.faa');




# Build an extractor over the three GFF files in t/data/locus_tag_gffs.
ok(
$plot_groups_obj = Bio::Roary::ExtractProteomeFromGFFs->new(
gff_files => [ 't/data/locus_tag_gffs/query_1.gff', 't/data/locus_tag_gffs/query_2.gff', 't/data/locus_tag_gffs/query_3.gff' ],
),
'initialise object with locus tag id gff files'
);
# Sort both lists so the comparison does not depend on the order in which
# fasta_files() reports the files.
@sorted_fasta_files = sort( @{ $plot_groups_obj->fasta_files() } );
@sorted_expected_files = sort( ( 'query_1.gff.proteome.faa', 'query_2.gff.proteome.faa', 'query_3.gff.proteome.faa' ) );

is_deeply( \@sorted_fasta_files, \@sorted_expected_files, 'locus tag id files created output' );

# Each proteome file must match its '.expected' counterpart stored alongside
# the input GFF files.
for my $filename (@sorted_expected_files) {
is( read_file($filename), read_file( 't/data/locus_tag_gffs/' . $filename . '.expected' ), "content of proteome $filename as expected" );
}

# Remove the proteome files from the working directory.
unlink('query_1.gff.proteome.faa');
unlink('query_2.gff.proteome.faa');
unlink('query_3.gff.proteome.faa');

done_testing();
22 changes: 22 additions & 0 deletions t/Bio/Roary/GeneNamesFromGFF.t
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,26 @@ is_deeply(
'ids to gene names as expected again'
);


# Build a gene-name lookup from t/data/locus_tag_gffs/query_1.gff.
ok(
$obj = Bio::Roary::GeneNamesFromGFF->new(
gff_file => 't/data/locus_tag_gffs/query_1.gff'
),
'initialise a GFF file with locus tags only'
);

# The resulting map must contain exactly these five identifier => gene name
# pairs (is_deeply requires an exact match of the whole structure).
is_deeply(
$obj->ids_to_gene_name,
{
'abc_00005' => 'speH',
'abc_00007' => 'argF',
'abc_00001' => 'different',
'abc_00016' => 'yfnB',
'abc_00008' => 'arcC1'
},
'ids to gene names with GFF file with locus tags only'
);



done_testing();
Loading

0 comments on commit 7e2fc0c

Please sign in to comment.