diff --git a/VERSION b/VERSION index d00491f..227cea2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1 +2.0.0 diff --git a/src/Makefile.am b/src/Makefile.am index cf94178..8436241 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,6 +25,7 @@ TESTS = $(check_PROGRAMS) check_PROGRAMS = run-all-tests run_all_tests_SOURCES = \ ../tests/check-snp-sites.c \ + ../tests/check-vcf.c \ ../tests/helper-methods.c \ ../tests/run-all-tests.c run_all_tests_CFLAGS = -I../tests diff --git a/src/vcf.c b/src/vcf.c index e918828..46a7cea 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -26,8 +26,7 @@ #include "vcf.h" #include "alignment-file.h" #include "snp-sites.h" - - +#include void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples) { @@ -56,7 +55,7 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num { int i; fprintf( vcf_file_pointer, "##fileformat=VCFv4.1\n" ); - fprintf( vcf_file_pointer, "##INFO=\n" ); + fprintf( vcf_file_pointer, "##FORMAT=\n" ); fprintf( vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" ); for(i=0; i= MAXIMUM_NUMBER_OF_ALT_BASES) + { + fprintf(stderr, "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } alt_bases[num_alt_bases] = bases_for_snp[i]; num_alt_bases++; - alt_bases[num_alt_bases] = ','; - num_alt_bases++; } } } - if(num_alt_bases > 0 && alt_bases[num_alt_bases-1] == ',') + return alt_bases; +} + +char * format_allele_index(char base, char reference_base, char * alt_bases) +{ + int length_of_alt_bases = strlen(alt_bases); + assert(length_of_alt_bases < 100); + char * result = calloc(3, sizeof(char)); + int index; + if (reference_base == base || toupper(base) == 'N' || base == '-') { - alt_bases[num_alt_bases-1] = '\0'; + sprintf(result, "0"); } else { - alt_bases[num_alt_bases] = '\0'; + sprintf(result, "."); + for (index = 1; index <= length_of_alt_bases; index++) + { + if (alt_bases[index-1] == base) + { + sprintf(result, "%i", index); + break; + } + } + } + return result; +} + +char * format_alternative_bases(char * alt_bases) +{ + int number_of_alt_bases = strlen(alt_bases); + assert( number_of_alt_bases < MAXIMUM_NUMBER_OF_ALT_BASES ); + char * formatted_alt_bases = calloc(number_of_alt_bases*2 + 1, sizeof(char)); + int i; + formatted_alt_bases[0] = alt_bases[0]; + for (i = 1; i < number_of_alt_bases; i++) + { + formatted_alt_bases[i*2 - 1] = ','; + formatted_alt_bases[i*2] = alt_bases[i]; } + return formatted_alt_bases; } int check_if_char_in_string(char search_string[], char target_char, int search_string_length) @@ -152,20 +191,16 @@ int check_if_char_in_string(char search_string[], char target_char, int search_s return 0; } -void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * bases_for_snp, int number_of_samples) +void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples) { int i; + char * format; for(i=0; i < number_of_samples ; i++ ) { - if((bases_for_snp[i] == reference_base) || (bases_for_snp[i] == '-') || (toupper(bases_for_snp[i]) == 'N') ) - { - fprintf( vcf_file_pointer, "." ); - } - else - { - fprintf( vcf_file_pointer, "%c", (char) bases_for_snp[i] ); - } + format = format_allele_index(bases_for_snp[i], reference_base, alt_bases); + fprintf( vcf_file_pointer, "%s", format); + free(format); if(i+1 != number_of_samples) { fprintf( vcf_file_pointer, "\t"); diff --git a/src/vcf.h b/src/vcf.h index 764f5d1..51affe4 100644 --- a/src/vcf.h +++ b/src/vcf.h @@ -25,9 +25,12 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num void create_vcf_file(char filename[], int snp_locations[], int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples); void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_locations, int number_of_snps, int number_of_samples); void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples); -void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * bases_for_snp, int number_of_samples); -void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases[], int number_of_samples); +void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * alt_bases, char * bases_for_snp, int number_of_samples); +char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples); +char * format_alternative_bases(char *); +char * format_allele_index(char, char, char *); int check_if_char_in_string(char search_string[], char target_char, int search_string_length); #define MAX_FILENAME_SIZE 250 +#define MAXIMUM_NUMBER_OF_ALT_BASES 30 #endif diff --git a/tests/check-vcf.c b/tests/check-vcf.c new file mode 100644 index 0000000..8ae95cf --- /dev/null +++ b/tests/check-vcf.c @@ -0,0 +1,104 @@ +/* + * Wellcome Trust Sanger Institute + * Copyright (C) 2013 Wellcome Trust Sanger Institute + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 3 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "check-vcf.h" +#include "vcf.h" + +void check_alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples, char * expected_result) +{ + char * result; + result = alternative_bases(reference_base, bases_for_snp, number_of_samples); + ck_assert_str_eq(result, expected_result); + free(result); +} + +START_TEST (alternative_bases_test) +{ + check_alternative_bases('A', "AGCT-nN", 6, "GCT"); +} +END_TEST + +void check_format_alternative_bases(char * test_case, char * expected_result) +{ + char * result; + result = format_alternative_bases(test_case); + ck_assert_str_eq(result, expected_result); + free(result); +} + +START_TEST (format_alternative_bases_test) +{ + check_format_alternative_bases("", ""); + check_format_alternative_bases("A", "A"); + check_format_alternative_bases("AC", "A,C"); + check_format_alternative_bases("ACT", "A,C,T"); +} +END_TEST + +void check_format_allele_index(char test_base, char reference_base, char * alt_bases, char * expected_result) +{ + char * result; + result = format_allele_index(test_base, reference_base, alt_bases); + ck_assert_str_eq(result, expected_result); + free(result); +} + +START_TEST (format_allele_index_test) +{ + check_format_allele_index('A', 'A', "", "0"); + check_format_allele_index('A', 'A', "C", "0"); + check_format_allele_index('A', 'A', "CA", "0"); + + check_format_allele_index('A', 'C', "A", "1"); + check_format_allele_index('A', 'C', "GA", "2"); + + check_format_allele_index('A', 'C', "", "."); + check_format_allele_index('A', 'C', "G", "."); + + check_format_allele_index('A', 'B', "CDEFGHIJKLMNOPAQRST", "15"); + + check_format_allele_index('-', 'A', "C", "0"); + check_format_allele_index('N', 'A', "C", "0"); + check_format_allele_index('n', 'A', "C", "0"); +} +END_TEST + +Suite * vcf_suite (void) +{ + Suite *s = suite_create ("Creating_VCF_file"); + + TCase *tc_vcf_file = tcase_create ("vcf_file"); + tcase_add_test (tc_vcf_file, alternative_bases_test); + tcase_add_test (tc_vcf_file, format_alternative_bases_test); + tcase_add_test (tc_vcf_file, format_allele_index_test); + suite_add_tcase (s, tc_vcf_file); + + return s; +} + + + diff --git a/tests/check-vcf.h b/tests/check-vcf.h new file mode 100644 index 0000000..99f3167 --- /dev/null +++ b/tests/check-vcf.h @@ -0,0 +1,30 @@ +/* + * Wellcome Trust Sanger Institute + * Copyright (C) 2013 Wellcome Trust Sanger Institute + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 3 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef _CHECK_VCF_H_ +#define _CHECK_VCF_H_ + +void check_alternative_bases(char, char *, int, char *); +void check_format_alternative_bases(char *, char *); +void check_format_allele_index(char, char, char *, char *); +Suite * vcf_suite (void); +#endif + + + diff --git a/tests/data/alignment_file_one_line_per_sequence.aln.vcf b/tests/data/alignment_file_one_line_per_sequence.aln.vcf index 590e89a..d4a88eb 100644 --- a/tests/data/alignment_file_one_line_per_sequence.aln.vcf +++ b/tests/data/alignment_file_one_line_per_sequence.aln.vcf @@ -1,8 +1,8 @@ ##fileformat=VCFv4.1 -##INFO= +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 2956_6_1 2956_6_2 2956_6_3 2956_6_4 2956_6_5 2956_6_6 3002_8_1 3002_8_2 3002_8_3 3002_8_4 3002_8_5 3002_8_6 3002_8_7 4056_2_10 4056_2_11 4056_2_1 4056_2_12 4056_2_2 4056_2_3 4056_2_4 4056_2_5 4056_2_6 4056_2_7 4056_2_9 4056_6_10 4056_6_11 4056_6_12 4056_6_2 4056_6_3 4056_6_4 4056_6_5 4056_6_6 4056_6_7 4056_6_9 4056_7_10 4056_7_11 4056_7_1 4056_7_12 4056_7_7 4056_7_8 4056_7_9 4056_8_10 4056_8_1 4056_8_12 4056_8_2 4056_8_3 4056_8_4 4056_8_6 4056_8_8 4056_8_9 4075_3_11 4075_3_12 4075_3_2 4075_3_3 4075_3_5 4075_3_6 4075_3_7 4075_3_8 4075_3_9 4370_2_11 4370_2_12 4370_2_2 4370_2_3 4370_2_4 4370_2_7 4370_2_8 4370_2_9 4370_3_11 4370_3_1 4370_3_6 4370_3_7 4370_3_8 5174_5_1 5174_5_2 5174_5_3 5174_5_4 5174_5_5 5174_5_6 5174_5_7 5174_5_9 5174_6_10 5174_6_1 5174_6_2 5174_6_3 5174_6_4 5174_6_5 5174_6_6 5174_6_7 5174_6_8 5174_6_9 5174_7_10 5174_7_1 5174_7_2 5174_7_3 5174_7_4 5174_7_5 5174_7_6 5174_7_7 5174_7_8 5174_7_9 5174_8_1 5174_8_2 5174_8_3 5174_8_5 5174_8_6 5174_8_8 5174_8_9 Vibrio_parahaemolyticus Vibrio_vulnificus -1 825 . A G . . AB . . . . . . . G G . . . G . G . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . G G G . . . . . . . . . . G . . . . . . . . . . . G . . . . . -1 1278 . A G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1281 . C G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1299 . G A . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . A -1 1435 . C T . . AB . . . . . . . . . . . . . . . . . . . . T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +1 825 . A G . . . GT 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 +1 1278 . A G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1281 . C G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1299 . G A . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1435 . C T . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/data/alignment_file_with_n.aln.vcf b/tests/data/alignment_file_with_n.aln.vcf index 590e89a..d4a88eb 100644 --- a/tests/data/alignment_file_with_n.aln.vcf +++ b/tests/data/alignment_file_with_n.aln.vcf @@ -1,8 +1,8 @@ ##fileformat=VCFv4.1 -##INFO= +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 2956_6_1 2956_6_2 2956_6_3 2956_6_4 2956_6_5 2956_6_6 3002_8_1 3002_8_2 3002_8_3 3002_8_4 3002_8_5 3002_8_6 3002_8_7 4056_2_10 4056_2_11 4056_2_1 4056_2_12 4056_2_2 4056_2_3 4056_2_4 4056_2_5 4056_2_6 4056_2_7 4056_2_9 4056_6_10 4056_6_11 4056_6_12 4056_6_2 4056_6_3 4056_6_4 4056_6_5 4056_6_6 4056_6_7 4056_6_9 4056_7_10 4056_7_11 4056_7_1 4056_7_12 4056_7_7 4056_7_8 4056_7_9 4056_8_10 4056_8_1 4056_8_12 4056_8_2 4056_8_3 4056_8_4 4056_8_6 4056_8_8 4056_8_9 4075_3_11 4075_3_12 4075_3_2 4075_3_3 4075_3_5 4075_3_6 4075_3_7 4075_3_8 4075_3_9 4370_2_11 4370_2_12 4370_2_2 4370_2_3 4370_2_4 4370_2_7 4370_2_8 4370_2_9 4370_3_11 4370_3_1 4370_3_6 4370_3_7 4370_3_8 5174_5_1 5174_5_2 5174_5_3 5174_5_4 5174_5_5 5174_5_6 5174_5_7 5174_5_9 5174_6_10 5174_6_1 5174_6_2 5174_6_3 5174_6_4 5174_6_5 5174_6_6 5174_6_7 5174_6_8 5174_6_9 5174_7_10 5174_7_1 5174_7_2 5174_7_3 5174_7_4 5174_7_5 5174_7_6 5174_7_7 5174_7_8 5174_7_9 5174_8_1 5174_8_2 5174_8_3 5174_8_5 5174_8_6 5174_8_8 5174_8_9 Vibrio_parahaemolyticus Vibrio_vulnificus -1 825 . A G . . AB . . . . . . . G G . . . G . G . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . G G G . . . . . . . . . . G . . . . . . . . . . . G . . . . . -1 1278 . A G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1281 . C G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1299 . G A . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . A -1 1435 . C T . . AB . . . . . . . . . . . . . . . . . . . . T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +1 825 . A G . . . GT 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 +1 1278 . A G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1281 . C G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1299 . G A . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1435 . C T . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/run-all-tests.c b/tests/run-all-tests.c index 2ee66d1..f9faee6 100644 --- a/tests/run-all-tests.c +++ b/tests/run-all-tests.c @@ -22,16 +22,27 @@ #include #include #include "check-snp-sites.h" +#include "check-vcf.h" int main (void) { int number_failed; - Suite *s = snp_sites_suite (); - SRunner *sr = srunner_create (s); + Suite *s; + SRunner *sr; + + s = snp_sites_suite (); + sr = srunner_create (s); srunner_run_all (sr, CK_NORMAL); number_failed = srunner_ntests_failed (sr); srunner_free (sr); + + s = vcf_suite (); + sr = srunner_create (s); + srunner_run_all (sr, CK_NORMAL); + number_failed += srunner_ntests_failed (sr); + srunner_free (sr); + return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; }