From ab47c064596103670e9ccefd43071a5fe0d94156 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Tue, 14 Jul 2015 16:35:28 +0100 Subject: [PATCH 1/8] Add bounds check on the number of alternative bases There was an assumption that there would not be greater than 15 different characters used in a given position between sequences. This is unlikely to be possible in well formatted files but it could be the case if an input file is corrupted. --- src/vcf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/vcf.c b/src/vcf.c index e918828..f96ba39 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -27,7 +27,7 @@ #include "alignment-file.h" #include "snp-sites.h" - +const int MAXIMUM_NUMBER_OF_ALT_BASES = 30; void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples) { @@ -69,7 +69,7 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples) { char reference_base = bases_for_snp[0]; - char alt_bases[30]; + char alt_bases[MAXIMUM_NUMBER_OF_ALT_BASES]; if(reference_base == '\0') { return; @@ -122,6 +122,12 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases { if(check_if_char_in_string(alt_bases, bases_for_snp[i], num_alt_bases) == 0) { + if (num_alt_bases > MAXIMUM_NUMBER_OF_ALT_BASES - 2) + { + fprintf(stderr, "Unexpectedly large number of alternative bases found between sequences. 
Please check input file is not corrupted\n\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } alt_bases[num_alt_bases] = bases_for_snp[i]; num_alt_bases++; alt_bases[num_alt_bases] = ','; From 48349309409a6281b2013cbfcb7130fe2e58a790 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Tue, 14 Jul 2015 16:37:49 +0000 Subject: [PATCH 2/8] add format_alternative_bases --- src/Makefile.am | 1 + src/vcf.c | 29 +++++++++++++++++++-- src/vcf.h | 2 ++ tests/check-vcf.c | 60 +++++++++++++++++++++++++++++++++++++++++++ tests/check-vcf.h | 28 ++++++++++++++++++++ tests/run-all-tests.c | 15 +++++++++-- 6 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 tests/check-vcf.c create mode 100644 tests/check-vcf.h diff --git a/src/Makefile.am b/src/Makefile.am index cf94178..8436241 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -25,6 +25,7 @@ TESTS = $(check_PROGRAMS) check_PROGRAMS = run-all-tests run_all_tests_SOURCES = \ ../tests/check-snp-sites.c \ + ../tests/check-vcf.c \ ../tests/helper-methods.c \ ../tests/run-all-tests.c run_all_tests_CFLAGS = -I../tests diff --git a/src/vcf.c b/src/vcf.c index f96ba39..64b9d80 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -27,8 +27,6 @@ #include "alignment-file.h" #include "snp-sites.h" -const int MAXIMUM_NUMBER_OF_ALT_BASES = 30; - void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples) { FILE *vcf_file_pointer; @@ -145,6 +143,33 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases } } +char * format_alternative_bases(char * alt_bases) +{ + char * formatted_alt_bases = malloc(MAXIMUM_NUMBER_OF_ALT_BASES*2*sizeof(char)); + int i; + for (i = 0; i < MAXIMUM_NUMBER_OF_ALT_BASES; i++) + { + if (alt_bases[i] == '\0') + { + if (i == 0) + { + formatted_alt_bases[0] = '\0'; + } + else + { + formatted_alt_bases[i*2 - 1] = '\0'; + } + break; + } + else + { + formatted_alt_bases[i*2] = alt_bases[i]; + 
formatted_alt_bases[i*2 + 1] = ','; + } + } + return formatted_alt_bases; +} + int check_if_char_in_string(char search_string[], char target_char, int search_string_length) { int i; diff --git a/src/vcf.h b/src/vcf.h index 764f5d1..9cddba3 100644 --- a/src/vcf.h +++ b/src/vcf.h @@ -27,7 +27,9 @@ void output_vcf_snps(FILE * vcf_file_pointer, char ** bases_for_snps, int * snp_ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples); void output_vcf_row_samples_bases(FILE * vcf_file_pointer, char reference_base, char * bases_for_snp, int number_of_samples); void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases[], int number_of_samples); +char * format_alternative_bases(char * alt_bases); int check_if_char_in_string(char search_string[], char target_char, int search_string_length); #define MAX_FILENAME_SIZE 250 +#define MAXIMUM_NUMBER_OF_ALT_BASES 30 #endif diff --git a/tests/check-vcf.c b/tests/check-vcf.c new file mode 100644 index 0000000..731f2b9 --- /dev/null +++ b/tests/check-vcf.c @@ -0,0 +1,60 @@ +/* + * Wellcome Trust Sanger Institute + * Copyright (C) 2013 Wellcome Trust Sanger Institute + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 3 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "check-vcf.h" +#include "vcf.h" + +void check_format_alternative_bases(char * test_case, char * expected_result) +{ + char * result; + result = format_alternative_bases(test_case); + ck_assert_str_eq(result, expected_result); + free(result); +} + +START_TEST (format_alternative_bases_test) +{ + check_format_alternative_bases("", ""); + check_format_alternative_bases("A", "A"); + check_format_alternative_bases("AC", "A,C"); + check_format_alternative_bases("ACT", "A,C,T"); +} +END_TEST + +Suite * vcf_suite (void) +{ + Suite *s = suite_create ("Creating_VCF_file"); + + TCase *tc_vcf_file = tcase_create ("vcf_file"); + tcase_add_test (tc_vcf_file, format_alternative_bases_test); + suite_add_tcase (s, tc_vcf_file); + + return s; +} + + + diff --git a/tests/check-vcf.h b/tests/check-vcf.h new file mode 100644 index 0000000..6b11178 --- /dev/null +++ b/tests/check-vcf.h @@ -0,0 +1,28 @@ +/* + * Wellcome Trust Sanger Institute + * Copyright (C) 2013 Wellcome Trust Sanger Institute + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 3 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ + +#ifndef _CHECK_VCF_H_ +#define _CHECK_VCF_H_ + +void check_format_alternative_bases(char *, char *); +Suite * vcf_suite (void); +#endif + + + diff --git a/tests/run-all-tests.c b/tests/run-all-tests.c index 2ee66d1..f9faee6 100644 --- a/tests/run-all-tests.c +++ b/tests/run-all-tests.c @@ -22,16 +22,27 @@ #include #include #include "check-snp-sites.h" +#include "check-vcf.h" int main (void) { int number_failed; - Suite *s = snp_sites_suite (); - SRunner *sr = srunner_create (s); + Suite *s; + SRunner *sr; + + s = snp_sites_suite (); + sr = srunner_create (s); srunner_run_all (sr, CK_NORMAL); number_failed = srunner_ntests_failed (sr); srunner_free (sr); + + s = vcf_suite (); + sr = srunner_create (s); + srunner_run_all (sr, CK_NORMAL); + number_failed += srunner_ntests_failed (sr); + srunner_free (sr); + return (number_failed == 0) ? EXIT_SUCCESS : EXIT_FAILURE; } From dca6f466bf9cf10e213583e79d76b3205f150f95 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Wed, 15 Jul 2015 12:09:49 +0000 Subject: [PATCH 3/8] Track simple list of alleles, reformat in output --- src/vcf.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/vcf.c b/src/vcf.c index 64b9d80..0257fa4 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -89,7 +89,9 @@ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_locat // Need to look through list and find unique characters alternative_bases(reference_base, bases_for_snp, alt_bases, number_of_samples); - fprintf( vcf_file_pointer, "%s\t", alt_bases); + char * alternative_bases_string = format_alternative_bases(alt_bases); + fprintf( vcf_file_pointer, "%s\t", alternative_bases_string ); + free(alternative_bases_string); // QUAL fprintf( vcf_file_pointer, ".\t"); @@ -128,19 +130,10 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases } alt_bases[num_alt_bases] = bases_for_snp[i]; num_alt_bases++; - alt_bases[num_alt_bases] = ','; - num_alt_bases++; } } } - 
if(num_alt_bases > 0 && alt_bases[num_alt_bases-1] == ',') - { - alt_bases[num_alt_bases-1] = '\0'; - } - else - { - alt_bases[num_alt_bases] = '\0'; - } + alt_bases[num_alt_bases] = '\0'; } char * format_alternative_bases(char * alt_bases) From 3b27b408dd95de39c982423899b760023ed70f92 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Wed, 15 Jul 2015 15:46:16 +0000 Subject: [PATCH 4/8] Add method to calculate GT formatted allele GT format reports the index into a list of alleles rather than the relevant allele --- src/vcf.c | 29 +++++++++++++++++++++++++++++ src/vcf.h | 3 ++- tests/check-vcf.c | 31 +++++++++++++++++++++++++++++++ tests/check-vcf.h | 1 + 4 files changed, 63 insertions(+), 1 deletion(-) diff --git a/src/vcf.c b/src/vcf.c index 0257fa4..d49e253 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -136,6 +136,35 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases alt_bases[num_alt_bases] = '\0'; } +char * format_allele_index(char base, char reference_base, char * alt_bases) +{ + int maximum_format_length = (int) log10((double) MAXIMUM_NUMBER_OF_ALT_BASES) + 1; + char * result = malloc((maximum_format_length + 1)*sizeof(char)); + int index; + if (reference_base == base) + { + sprintf(result, "0"); + } + else + { + sprintf(result, "."); + for (index = 1; index Date: Thu, 16 Jul 2015 10:45:38 +0000 Subject: [PATCH 5/8] Update output format to show Genotype The previous behaviour output an Alt Base which was not understood by some other tools. This changes the format to GT. 
--- src/vcf.c | 27 ++++++++----------- src/vcf.h | 2 +- tests/check-vcf.c | 4 +++ ...ignment_file_one_line_per_sequence.aln.vcf | 12 ++++----- tests/data/alignment_file_with_n.aln.vcf | 12 ++++----- 5 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/vcf.c b/src/vcf.c index d49e253..d26af30 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -54,7 +54,7 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num { int i; fprintf( vcf_file_pointer, "##fileformat=VCFv4.1\n" ); - fprintf( vcf_file_pointer, "##INFO=\n" ); + fprintf( vcf_file_pointer, "##FORMAT=\n" ); fprintf( vcf_file_pointer, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" ); for(i=0; i +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 2956_6_1 2956_6_2 2956_6_3 2956_6_4 2956_6_5 2956_6_6 3002_8_1 3002_8_2 3002_8_3 3002_8_4 3002_8_5 3002_8_6 3002_8_7 4056_2_10 4056_2_11 4056_2_1 4056_2_12 4056_2_2 4056_2_3 4056_2_4 4056_2_5 4056_2_6 4056_2_7 4056_2_9 4056_6_10 4056_6_11 4056_6_12 4056_6_2 4056_6_3 4056_6_4 4056_6_5 4056_6_6 4056_6_7 4056_6_9 4056_7_10 4056_7_11 4056_7_1 4056_7_12 4056_7_7 4056_7_8 4056_7_9 4056_8_10 4056_8_1 4056_8_12 4056_8_2 4056_8_3 4056_8_4 4056_8_6 4056_8_8 4056_8_9 4075_3_11 4075_3_12 4075_3_2 4075_3_3 4075_3_5 4075_3_6 4075_3_7 4075_3_8 4075_3_9 4370_2_11 4370_2_12 4370_2_2 4370_2_3 4370_2_4 4370_2_7 4370_2_8 4370_2_9 4370_3_11 4370_3_1 4370_3_6 4370_3_7 4370_3_8 5174_5_1 5174_5_2 5174_5_3 5174_5_4 5174_5_5 5174_5_6 5174_5_7 5174_5_9 5174_6_10 5174_6_1 5174_6_2 5174_6_3 5174_6_4 5174_6_5 5174_6_6 5174_6_7 5174_6_8 5174_6_9 5174_7_10 5174_7_1 5174_7_2 5174_7_3 5174_7_4 5174_7_5 5174_7_6 5174_7_7 5174_7_8 5174_7_9 5174_8_1 5174_8_2 5174_8_3 5174_8_5 5174_8_6 5174_8_8 5174_8_9 Vibrio_parahaemolyticus Vibrio_vulnificus -1 825 . A G . . AB . . . . . . . G G . . . G . G . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . G G G . . . . . . . . . . G . . . . . . . 
. . . . G . . . . . -1 1278 . A G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1281 . C G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1299 . G A . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . A -1 1435 . C T . . AB . . . . . . . . . . . . . . . . . . . . T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +1 825 . A G . . . GT 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 +1 1278 . A G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1281 . C G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1299 . G A . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1435 . C T . . . 
GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 diff --git a/tests/data/alignment_file_with_n.aln.vcf b/tests/data/alignment_file_with_n.aln.vcf index 590e89a..d4a88eb 100644 --- a/tests/data/alignment_file_with_n.aln.vcf +++ b/tests/data/alignment_file_with_n.aln.vcf @@ -1,8 +1,8 @@ ##fileformat=VCFv4.1 -##INFO= +##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 2956_6_1 2956_6_2 2956_6_3 2956_6_4 2956_6_5 2956_6_6 3002_8_1 3002_8_2 3002_8_3 3002_8_4 3002_8_5 3002_8_6 3002_8_7 4056_2_10 4056_2_11 4056_2_1 4056_2_12 4056_2_2 4056_2_3 4056_2_4 4056_2_5 4056_2_6 4056_2_7 4056_2_9 4056_6_10 4056_6_11 4056_6_12 4056_6_2 4056_6_3 4056_6_4 4056_6_5 4056_6_6 4056_6_7 4056_6_9 4056_7_10 4056_7_11 4056_7_1 4056_7_12 4056_7_7 4056_7_8 4056_7_9 4056_8_10 4056_8_1 4056_8_12 4056_8_2 4056_8_3 4056_8_4 4056_8_6 4056_8_8 4056_8_9 4075_3_11 4075_3_12 4075_3_2 4075_3_3 4075_3_5 4075_3_6 4075_3_7 4075_3_8 4075_3_9 4370_2_11 4370_2_12 4370_2_2 4370_2_3 4370_2_4 4370_2_7 4370_2_8 4370_2_9 4370_3_11 4370_3_1 4370_3_6 4370_3_7 4370_3_8 5174_5_1 5174_5_2 5174_5_3 5174_5_4 5174_5_5 5174_5_6 5174_5_7 5174_5_9 5174_6_10 5174_6_1 5174_6_2 5174_6_3 5174_6_4 5174_6_5 5174_6_6 5174_6_7 5174_6_8 5174_6_9 5174_7_10 5174_7_1 5174_7_2 5174_7_3 5174_7_4 5174_7_5 5174_7_6 5174_7_7 5174_7_8 5174_7_9 5174_8_1 5174_8_2 5174_8_3 5174_8_5 5174_8_6 5174_8_8 5174_8_9 Vibrio_parahaemolyticus Vibrio_vulnificus -1 825 . A G . . AB . . . . . . . G G . . . G . G . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . . . . . . . . . . . . . . . . . . . . G . . . . . G G G . . . . . . . . . . G . . . . . . . . . . . G . . . . . -1 1278 . A G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
. . . . . . . . . . . . . . . . . . . . . . . . G -1 1281 . C G . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . G -1 1299 . G A . . AB . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . A -1 1435 . C T . . AB . . . . . . . . . . . . . . . . . . . . T . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +1 825 . A G . . . GT 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 +1 1278 . A G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1281 . C G . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1299 . G A . . . GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +1 1435 . C T . . . 
GT 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 From f4f5e0871b724e25e2bf19f858f2940beb7ec336 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Thu, 16 Jul 2015 12:52:29 +0100 Subject: [PATCH 6/8] Bump VERSION --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index d00491f..227cea2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1 +2.0.0 From d78487d52a8cc351f051d7abbb7ec648f4a74a07 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Thu, 16 Jul 2015 12:27:36 +0000 Subject: [PATCH 7/8] Tidy up check_format_allele_index --- tests/check-vcf.c | 8 +------- tests/check-vcf.h | 2 +- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/check-vcf.c b/tests/check-vcf.c index c836fd8..102284f 100644 --- a/tests/check-vcf.c +++ b/tests/check-vcf.c @@ -45,18 +45,12 @@ START_TEST (format_alternative_bases_test) } END_TEST -void check_format_allele_index(char test_base, char reference_base, char alt_bases_array[], char expected_result_array[]) +void check_format_allele_index(char test_base, char reference_base, char * alt_bases, char * expected_result) { char * result; - char * alt_bases = malloc(50*sizeof(char)); - strcpy(alt_bases, alt_bases_array); - char * expected_result = malloc(20*sizeof(char)); - strcpy(expected_result, expected_result_array); result = format_allele_index(test_base, reference_base, alt_bases); ck_assert_str_eq(result, expected_result); free(result); - free(alt_bases); - free(expected_result); } START_TEST (format_allele_index_test) diff --git a/tests/check-vcf.h b/tests/check-vcf.h index 0dc7ef4..3274bfb 100644 --- a/tests/check-vcf.h +++ b/tests/check-vcf.h @@ -21,7 +21,7 @@ #define _CHECK_VCF_H_ void check_format_alternative_bases(char *, char *); -void check_format_allele_index(char, char, char[], char[]); +void 
check_format_allele_index(char, char, char *, char *); Suite * vcf_suite (void); #endif From 497b6e0b56710a552b23bd2ac34015a18e5a574f Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Thu, 16 Jul 2015 14:39:01 +0000 Subject: [PATCH 8/8] Simplify vcf code with calloc Use calloc and strlen to simplify the vcf related code --- src/vcf.c | 49 +++++++++++++++++------------------------------ src/vcf.h | 2 +- tests/check-vcf.c | 15 +++++++++++++++ tests/check-vcf.h | 1 + 4 files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/vcf.c b/src/vcf.c index d26af30..46a7cea 100644 --- a/src/vcf.c +++ b/src/vcf.c @@ -26,6 +26,7 @@ #include "vcf.h" #include "alignment-file.h" #include "snp-sites.h" +#include void create_vcf_file(char filename[], int snp_locations[],int number_of_snps, char ** bases_for_snps, char ** sequence_names, int number_of_samples) { @@ -67,7 +68,6 @@ void output_vcf_header( FILE * vcf_file_pointer, char ** sequence_names, int num void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_location, int number_of_samples) { char reference_base = bases_for_snp[0]; - char alt_bases[MAXIMUM_NUMBER_OF_ALT_BASES]; if(reference_base == '\0') { return; @@ -88,7 +88,7 @@ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_locat // ALT // Need to look through list and find unique characters - alternative_bases(reference_base, bases_for_snp, alt_bases, number_of_samples); + char * alt_bases = alternative_bases(reference_base, bases_for_snp, number_of_samples); char * alternative_bases_string = format_alternative_bases(alt_bases); fprintf( vcf_file_pointer, "%s\t", alternative_bases_string ); free(alternative_bases_string); @@ -107,22 +107,24 @@ void output_vcf_row(FILE * vcf_file_pointer, char * bases_for_snp, int snp_locat // Bases for each sample output_vcf_row_samples_bases(vcf_file_pointer, reference_base, alt_bases, bases_for_snp, number_of_samples ); + free(alt_bases); fprintf( vcf_file_pointer, "\n"); } -void 
alternative_bases(char reference_base, char * bases_for_snp, char alt_bases[], int number_of_samples) +char * alternative_bases(char reference_base, char * bases_for_snp, int number_of_samples) { int i; int num_alt_bases = 0; + char * alt_bases = calloc(MAXIMUM_NUMBER_OF_ALT_BASES+1, sizeof(char)); for(i=0; i< number_of_samples; i++ ) { if((bases_for_snp[i] != reference_base) && (bases_for_snp[i] != '-') && (toupper(bases_for_snp[i]) != 'N') ) { if(check_if_char_in_string(alt_bases, bases_for_snp[i], num_alt_bases) == 0) { - if (num_alt_bases > MAXIMUM_NUMBER_OF_ALT_BASES - 2) + if (num_alt_bases >= MAXIMUM_NUMBER_OF_ALT_BASES) { fprintf(stderr, "Unexpectedly large number of alternative bases found between sequences. Please check input file is not corrupted\n\n"); fflush(stderr); @@ -133,13 +135,14 @@ void alternative_bases(char reference_base, char * bases_for_snp, char alt_bases } } } - alt_bases[num_alt_bases] = '\0'; + return alt_bases; } char * format_allele_index(char base, char reference_base, char * alt_bases) { - int maximum_format_length = (int) log10((double) MAXIMUM_NUMBER_OF_ALT_BASES) + 1; - char * result = malloc((maximum_format_length + 1)*sizeof(char)); + int length_of_alt_bases = strlen(alt_bases); + assert(length_of_alt_bases < 100); + char * result = calloc(3, sizeof(char)); int index; if (reference_base == base || toupper(base) == 'N' || base == '-') { @@ -148,17 +151,13 @@ char * format_allele_index(char base, char reference_base, char * alt_bases) else { sprintf(result, "."); - for (index = 1; index