Skip to content

Commit

Permalink
Merge pull request #35 from bewt85/bt5_496468_qm_bases
Browse files Browse the repository at this point in the history
496468: Allow '?' in fasta input
  • Loading branch information
aslett1 committed Dec 2, 2015
2 parents cc4c259 + 527d09f commit 8e2bb64
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 11 deletions.
16 changes: 14 additions & 2 deletions src/alignment-file.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,18 @@ int build_reference_sequence_and_truncate(char reference_sequence[], char filena
return 1;
}

/*
 * Report whether a base character is a placeholder rather than a real base.
 *
 * The characters 'N' (in either case), '-' and '?' are treated as unknown.
 *
 * base: a single character read from a sequence.
 * Returns 1 if the character is unknown, 0 otherwise.
 */
int is_unknown(char base)
{
    /* <ctype.h> functions require a value representable as unsigned char
     * (or EOF); a plain char may be negative for non-ASCII input bytes,
     * so convert before calling toupper(). */
    switch (toupper((unsigned char) base)) {
    case 'N':
    case '-':
    case '?':
        return 1;
    default:
        return 0;
    }
}

int detect_snps(char reference_sequence[], char filename[], size_t length_of_genome)
{
int i;
Expand All @@ -199,12 +211,12 @@ int detect_snps(char reference_sequence[], char filename[], size_t length_of_gen
for(i = 0; i < length_of_genome; i++)
{
// If the reference base is unknown ('N', '-' or '?'), replace it with the first known base found at this position
if((reference_sequence[i] == '-' && seq->seq.s[i] != '-' ) || (toupper(reference_sequence[i]) == 'N' && seq->seq.s[i] != 'N' ))
if(is_unknown(reference_sequence[i]) && !is_unknown(seq->seq.s[i]))
{
reference_sequence[i] = toupper(seq->seq.s[i]);
}

if(reference_sequence[i] != '*' && seq->seq.s[i] != '-' && toupper(seq->seq.s[i]) != 'N' && reference_sequence[i] != toupper(seq->seq.s[i]))
if(! is_unknown(reference_sequence[i]) && reference_sequence[i] != '*' && ! is_unknown(seq->seq.s[i]) && (reference_sequence[i] != toupper(seq->seq.s[i])))
{
reference_sequence[i] = '*';
number_of_snps++;
Expand Down
1 change: 1 addition & 0 deletions src/alignment-file.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "kseq.h"

int is_unknown(char base);
int detect_snps(char reference_sequence[], char filename[], size_t length_of_genome);
int line_length(FILE * alignment_file_pointer);
int build_reference_sequence(char reference_sequence[], char filename[]);
Expand Down
2 changes: 1 addition & 1 deletion src/parse-phylip.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ int does_column_contain_snps(int snp_column, char reference_base)
return 0;
}

if(sequences[i][snp_column] != '-' && toupper(sequences[i][snp_column]) != 'N' && sequences[i][snp_column] != reference_base)
if(!is_unknown(sequences[i][snp_column]) && sequences[i][snp_column] != reference_base)
{
return 1;
}
Expand Down
4 changes: 2 additions & 2 deletions src/vcf.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ char * alternative_bases(char reference_base, char * bases_for_snp, int number_o
char * alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES+1, sizeof(char));
for(i=0; i< number_of_samples; i++ )
{
if((bases_for_snp[i] != reference_base) && (bases_for_snp[i] != '-') && (toupper(bases_for_snp[i]) != 'N') )
if(!is_unknown(bases_for_snp[i]) && (bases_for_snp[i] != reference_base))
{
if(check_if_char_in_string(alt_bases, bases_for_snp[i], num_alt_bases) == 0)
{
Expand All @@ -145,7 +145,7 @@ char * format_allele_index(char base, char reference_base, char * alt_bases)
assert(length_of_alt_bases < 100);
char * result = calloc(3, sizeof(char));
int index;
if (reference_base == base || toupper(base) == 'N' || base == '-')
if (reference_base == base || is_unknown(base))
{
sprintf(result, "0");
}
Expand Down
Loading

0 comments on commit 8e2bb64

Please sign in to comment.