Skip to content

Commit

Permalink
Merge pull request #1 from andrewjpage/master
Browse files Browse the repository at this point in the history
Treat N as a gap
  • Loading branch information
andrewjpage committed Jan 24, 2012
2 parents bf72b92 + 413134d commit 8abbe4e
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 7 deletions.
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
*.o
gubbins
*.phylip
*.vcf
4 changes: 2 additions & 2 deletions alignment_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ int detect_snps(char reference_sequence[], char filename[], int length_of_genome
for(i = 0; i < length_of_genome; i++)
{
// If there is an indel in the reference sequence, replace with the first proper base you find
- if(reference_sequence[i] == '-' && seq->seq.s[i] != '-' )
+ if((reference_sequence[i] == '-' && seq->seq.s[i] != '-' ) || (toupper(reference_sequence[i]) == 'N' && seq->seq.s[i] != 'N' ))
{
reference_sequence[i] = toupper(seq->seq.s[i]);
}

- if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && reference_sequence[i] != toupper(seq->seq.s[i]))
+ if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && toupper(seq->seq.s[i]) != 'N' && reference_sequence[i] != toupper(seq->seq.s[i]))
{
reference_sequence[i] = '*';
number_of_snps++;
Expand Down
2 changes: 1 addition & 1 deletion parse_phylip.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ int does_column_contain_snps(int snp_column, char reference_base)
return 0;
}

- if(sequences[i][snp_column] != '-' && sequences[i][snp_column] != reference_base)
+ if(sequences[i][snp_column] != '-' && toupper(sequences[i][snp_column]) != 'N' && sequences[i][snp_column] != reference_base)
{
return 1;
}
Expand Down
Binary file modified snp_sites
Binary file not shown.
Binary file modified tests/check_snp_sites
Binary file not shown.
110 changes: 110 additions & 0 deletions tests/data/alignment_file_one_line_per_sequence.aln.phylip
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
109 5
2956_6_1 AACGC
2956_6_2 AACGC
2956_6_3 AACGC
2956_6_4 AACGC
2956_6_5 AACGC
2956_6_6 AACGC
3002_8_1 GACGC
3002_8_2 GACGC
3002_8_3 AACGC
3002_8_4 AACGC
3002_8_5 -ACGC
3002_8_6 GACGC
3002_8_7 AACGC
4056_2_10 GACGC
4056_2_11 AACGC
4056_2_1 AACGC
4056_2_12 AACGC
4056_2_2 AACGC
4056_2_3 -ACGC
4056_2_4 GACGT
4056_2_5 AACGC
4056_2_6 AACGC
4056_2_7 AACGC
4056_2_9 AACGC
4056_6_10 AACGC
4056_6_11 AACGC
4056_6_12 AACGC
4056_6_2 AACGC
4056_6_3 AACGC
4056_6_4 AACGC
4056_6_5 AACGC
4056_6_6 AACGC
4056_6_7 AACGC
4056_6_9 AACGC
4056_7_10 AACGC
4056_7_11 AACGC
4056_7_1 AACGC
4056_7_12 AACGC
4056_7_7 AACGC
4056_7_8 AACGC
4056_7_9 AACGC
4056_8_10 AACGC
4056_8_1 AACGC
4056_8_12 AACGC
4056_8_2 AACGC
4056_8_3 AACGC
4056_8_4 AACGC
4056_8_6 GACGC
4056_8_8 AACGC
4056_8_9 AACGC
4075_3_11 AACGC
4075_3_12 AACGC
4075_3_2 AACGC
4075_3_3 AACGC
4075_3_5 AACGC
4075_3_6 AACGC
4075_3_7 AACGC
4075_3_8 AACGC
4075_3_9 AACGC
4370_2_11 AACGC
4370_2_12 AACGC
4370_2_2 AACGC
4370_2_3 AACGC
4370_2_4 AACGC
4370_2_7 AACGC
4370_2_8 AACGC
4370_2_9 AACGC
4370_3_11 AACGC
4370_3_1 AACGC
4370_3_6 AACGC
4370_3_7 AACGC
4370_3_8 AACGC
5174_5_1 GACGC
5174_5_2 AACGC
5174_5_3 AACGC
5174_5_4 AACGC
5174_5_5 AACGC
5174_5_6 AACGC
5174_5_7 GACGC
5174_5_9 GACGC
5174_6_10 GACGC
5174_6_1 AACGC
5174_6_2 AACGC
5174_6_3 AACGC
5174_6_4 AACGC
5174_6_5 AACGC
5174_6_6 AACGC
5174_6_7 AACGC
5174_6_8 AACGC
5174_6_9 AACGC
5174_7_10 AACGC
5174_7_1 GACGC
5174_7_2 AACGC
5174_7_3 AACGC
5174_7_4 AACGC
5174_7_5 AACGC
5174_7_6 AACGC
5174_7_7 AACGC
5174_7_8 AACGC
5174_7_9 AACGC
5174_8_1 AACGC
5174_8_2 AACGC
5174_8_3 AACGC
5174_8_5 GACGC
5174_8_6 AACGC
5174_8_8 AACGC
5174_8_9 AACGC
Vibrio_parahaemolyticus -----
Vibrio_vulnificus -GGA-
8 changes: 8 additions & 0 deletions tests/data/alignment_file_one_line_per_sequence.aln.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.1
##INFO=<ID=AB,Number=1,Type=String,Description="Alt Base">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 2956_6_1 2956_6_2 2956_6_3 2956_6_4 2956_6_5 2956_6_6 3002_8_1 3002_8_2 3002_8_3 3002_8_4 3002_8_5 3002_8_6 3002_8_7 4056_2_10 4056_2_11 4056_2_1 4056_2_12 4056_2_2 4056_2_3 4056_2_4 4056_2_5 4056_2_6 4056_2_7 4056_2_9 4056_6_10 4056_6_11 4056_6_12 4056_6_2 4056_6_3 4056_6_4 4056_6_5 4056_6_6 4056_6_7 4056_6_9 4056_7_10 4056_7_11 4056_7_1 4056_7_12 4056_7_7 4056_7_8 4056_7_9 4056_8_10 4056_8_1 4056_8_12 4056_8_2 4056_8_3 4056_8_4 4056_8_6 4056_8_8 4056_8_9 4075_3_11 4075_3_12 4075_3_2 4075_3_3 4075_3_5 4075_3_6 4075_3_7 4075_3_8 4075_3_9 4370_2_11 4370_2_12 4370_2_2 4370_2_3 4370_2_4 4370_2_7 4370_2_8 4370_2_9 4370_3_11 4370_3_1 4370_3_6 4370_3_7 4370_3_8 5174_5_1 5174_5_2 5174_5_3 5174_5_4 5174_5_5 5174_5_6 5174_5_7 5174_5_9 5174_6_10 5174_6_1 5174_6_2 5174_6_3 5174_6_4 5174_6_5 5174_6_6 5174_6_7 5174_6_8 5174_6_9 5174_7_10 5174_7_1 5174_7_2 5174_7_3 5174_7_4 5174_7_5 5174_7_6 5174_7_7 5174_7_8 5174_7_9 5174_8_1 5174_8_2 5174_8_3 5174_8_5 5174_8_6 5174_8_8 5174_8_9 Vibrio_parahaemolyticus Vibrio_vulnificus
1 824 . A G . . AB . . . . . . . G G . . . G . G . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . G G G . . . . . . . . . . G . . . . . . . . . . . G . . . . .
1 1277 . A G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G
1 1280 . C G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G
1 1298 . G A . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . A
1 1434 . C T . . AB . . . . . . . . . . . . . . . . . . . . T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
4 changes: 2 additions & 2 deletions vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases
int num_alt_bases = 0;
for(i=0; i< number_of_samples; i++ )
{
- if((bases_for_snp[i] != reference_base) && (bases_for_snp[i] != '-'))
+ if((bases_for_snp[i] != reference_base) && (bases_for_snp[i] != '-') && (toupper(bases_for_snp[i]) != 'N') )
{
if(check_if_char_in_string(alt_bases, bases_for_snp[i], num_alt_bases) == 0)
{
Expand Down Expand Up @@ -156,7 +156,7 @@ void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base,

for(i=0; i < number_of_samples ; i++ )
{
- if((bases_for_snp[i] == reference_base) || (bases_for_snp[i] == '-'))
+ if((bases_for_snp[i] == reference_base) || (bases_for_snp[i] == '-') || (toupper(bases_for_snp[i]) == 'N') )
{
fprintf( vcf_file_pointer, "." );
}
Expand Down

0 comments on commit 8abbe4e

Please sign in to comment.