forked from maip/novo_muta
-
Notifications
You must be signed in to change notification settings - Fork 0
/
simulation_model.cc
395 lines (354 loc) · 13.4 KB
/
simulation_model.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
/**
* @file simulation_model.cc
* @author Melissa Ip
*
* This file contains the implementation of the SimulationModel class.
*
* See top of simulation_model.h for a complete description.
*/
#include "simulation_model.h"
/**
* Constructor to initialize SimulationModel object that contains necessary
* parameters for the random generation of samples and probabilities. Currently
* only coverage and germline and somatic mutation rates can be modified. All
* other parameters are set to default.
*
* has_mutation_ keeps track of whether each site contains a mutation and is
* reused for all sites.
*
* @param coverage Coverage.
* @param germline_mutation_rate Germline mutation rate.
* @param somatic_mutation_rate Somatic mutation rate.
*/
SimulationModel::SimulationModel(unsigned int coverage,
double population_mutation_rate,
double germline_mutation_rate,
double somatic_mutation_rate)
: coverage_{coverage}, has_mutation_{false} {
params_.set_population_mutation_rate(population_mutation_rate);
params_.set_germline_mutation_rate(germline_mutation_rate);
params_.set_somatic_mutation_rate(somatic_mutation_rate);
}
/**
* Seeds rand function with time and creates a generator chosen by the
* environment variable GSL_RNG_TYPE.
*/
void SimulationModel::Seed() {
srand(time(NULL));
const gsl_rng_type *type;
gsl_rng_env_setup();
type = gsl_rng_default;
generator_ = gsl_rng_alloc(type);
}
/**
* Frees GSL random number generator.
*/
void SimulationModel::Free() {
gsl_rng_free(generator_);
}
/**
* Generates size random samples using population priors as weights and outputs
* their probabilities and whether that site contains a mutation
* (1=true, 0=false) to a text file. The file is tab separated and each site
* is on a new line.
*
* @param file_name File name.
* @param size Number of experiments or trios.
*/
void SimulationModel::WriteProbability(const string &file_name, int size) {
ofstream fout(file_name);
TrioVector random_trios = SimulationModel::GetRandomTrios(size);
for (int i = 0; i < size; ++i) {
double probability = params_.MutationProbability(random_trios[i]);
fout << probability << "\t" << has_mutation_vec_[i] << "\n";
}
fout.close();
}
/**
* Writes to a text file the index of the key trio, how many random trios had a
* mutation, how many random trios had no mutation, tab separated, each trio
* is placed on a new line. Assumes 4x coverage.
*
* @param file_name File name.
* @param size Number of random trios.
*/
void SimulationModel::WriteMutationCounts(const string &file_name, int size) {
ofstream fout(file_name);
TrioVector random_trios = SimulationModel::GetRandomTrios(size);
for (int i = 0; i < kTrioCount; ++i) {
vector<bool> mutations = mutation_table_[i];
int has_mutation_total = 0;
int has_no_mutation_total = 0;
if (mutations.size() > 0) {
has_mutation_total = count(mutations.begin(), mutations.end(), true);
has_no_mutation_total = mutations.size() - has_mutation_total;
}
fout << i << "\t" << has_mutation_total << "\t"
<< has_no_mutation_total << "\n";
}
fout.close();
}
/**
* Writes to stdout the index of the key trio, how many random trios had a
* mutation, how many random trios had no mutation, tab separated, each trio
* is placed on a new line. Assumes 4x coverage.
*
* @param size Number of random trios.
*/
void SimulationModel::PrintMutationCounts(int size) {
TrioVector random_trios = SimulationModel::GetRandomTrios(size);
for (int i = 0; i < kTrioCount; ++i) {
vector<bool> mutations = mutation_table_[i];
int has_mutation_total = 0;
int has_no_mutation_total = 0;
if (mutations.size() > 0) {
has_mutation_total = count(mutations.begin(), mutations.end(), true);
has_no_mutation_total = mutations.size() - has_mutation_total;
}
cout << i << "\t" << has_mutation_total << "\t"
<< has_no_mutation_total << "\n";
}
}
/**
* Generates a random sample from the range [0, K) using K size probabilities
* as weights.
*
* @param K Number of cateogories in probabilities.
* @param probabilitites RowVector of probabilities associated with each entry
* in the samples generated by K.
* @return Random element based on probabilities from samples
* generated by K.
*/
int SimulationModel::RandomDiscreteChoice(size_t K,
const RowVectorXd &probabilities) {
// Converts probabilities to double array p.
int length = probabilities.size();
double p[length];
for (int i = 0; i < length; ++i) {
p[i] = probabilities(i);
}
// Creates discrete random number generator.
gsl_ran_discrete_t *g = gsl_ran_discrete_preproc(K, p);
size_t random = gsl_ran_discrete(generator_, g);
gsl_ran_discrete_free(g);
return (int) random;
}
/**
* Generates RowVector of random samples from the range [0, K) using K size
* probabilities as weights.
*
* @param K Number of cateogories in probabilities.
* @param probabilitites RowVector of probabilities associated with each entry
* in the samples generated by K.
* @param size Number of samples.
* @return Random samples based on probabilities from samples
* generated by K.
*/
RowVectorXi SimulationModel::RandomDiscreteChoice(size_t K,
const RowVectorXd &probabilities,
int size) {
// Creates size RowVector to hold random samples.
RowVectorXi random_samples(size);
for (int i = 0; i < size; ++i) {
random_samples(i) = SimulationModel::RandomDiscreteChoice(K, probabilities);
}
return random_samples;
}
/**
* Mutates a numeric genotype based on either germline or somatic transition
* matrix in the TrioModel.
*
* is_germline is false by default to process somatic mutation. If this is set
* to true, then this method will process germline mutation and assume
* parent_genotype_idx is not -1.
*
* @param genotype_idx Index of genotype.
* @param is_germline False by default. Set to true to process germline
* mutation.
* @param parent_genotype_idx Index of parent genotype.
* @return Index of mutated genotype.
*/
int SimulationModel::Mutate(int genotype_idx, bool is_germline,
int parent_genotype_idx) {
// Sets probability matrices to use either germline or somatic probabilities.
RowVector16d mat = RowVector16d::Zero();
if (!is_germline) {
mat = params_.somatic_probability_mat().row(genotype_idx);
} else {
mat = params_.germline_probability_mat().col(parent_genotype_idx);
}
// Randomly mutates the genotype using the probabilities as weights.
int mutated_genotype_idx = SimulationModel::RandomDiscreteChoice(
kGenotypeCount,
mat
);
if (mutated_genotype_idx != genotype_idx) {
has_mutation_ = true;
}
return mutated_genotype_idx;
}
/**
* Returns a random allele in the parent genotype. Used to create a child
* genotype from two parents.
*
* @param parent_genotype Parent genotype.
* @return Random allele in the parent genotype.
*/
int SimulationModel::GetChildAllele(int parent_genotype) {
int bin = rand() % 2;
if (bin == 0) {
return parent_genotype / kNucleotideCount;
} else if (bin == 1) {
return parent_genotype % kNucleotideCount;
} else {
return -1; // ERROR: Random number should be 0 or 1.
}
}
/**
* Generates a numeric child genotype by picking a random allele from each of
* the given numeric parent genotypes.
*
* @param mother_genotype Numeric mother genotype.
* @param father_genotype Numeric father genotype.
* @return Numeric child genotype.
*/
int SimulationModel::GetChildGenotype(int mother_genotype, int father_genotype) {
int child_allele1 = SimulationModel::GetChildAllele(mother_genotype);
int child_allele2 = SimulationModel::GetChildAllele(father_genotype);
for (int i = 0; i < kGenotypeCount; ++i) {
if (child_allele1 == i / kNucleotideCount &&
child_allele2 == i % kNucleotideCount) {
return i;
}
}
return -1; // ERROR: This should not happen.
}
/**
* Generates a 3 x size Eigen matrix of random genotypes for child, mother,
* and father.
*
* @param size Number of genotypes to generate.
* @return 3 x size Eigen matrix of genotypes.
*/
MatrixXi SimulationModel::GetGenotypesMatrix(int size) {
// Generates random samples using population priors as weights.
RowVectorXi parent_genotypes = SimulationModel::RandomDiscreteChoice(
kGenotypePairCount,
params_.population_priors(),
size
);
// Extracts parent genotypes from samples and gets child genotypes.
MatrixXi genotypes_mat(3, size);
for (int i = 0; i < size; ++i) {
int mother_genotype = parent_genotypes(i) / kGenotypeCount;
int father_genotype = parent_genotypes(i) % kGenotypeCount;
genotypes_mat(1, i) = mother_genotype;
genotypes_mat(2, i) = father_genotype;
genotypes_mat(0, i) = SimulationModel::GetChildGenotype(
mother_genotype,
father_genotype
);
}
return genotypes_mat;
}
/**
* Uses alpha frequencies based on the somatic genotype to select nucleotide
* frequencies and uses these frequencies to draw sequencing reads at a
* specified coverage (Dirichlet multinomial). K is kNucleotideCount.
*
* @param genotype_idx Index of genotype.
* @return Read counts drawn from Dirichlet multinomial.
*/
ReadData SimulationModel::DirichletMultinomialSample(int genotype_idx) {
// Converts alpha to double array.
auto alpha_vec = params_.alphas().row(genotype_idx);
const double alpha[kNucleotideCount] = {alpha_vec(0), alpha_vec(1),
alpha_vec(2), alpha_vec(3)};
// Sets alpha frequencies using dirichlet distribution in theta.
double theta[kNucleotideCount] = {0.0};
gsl_ran_dirichlet(generator_, kNucleotideCount, alpha, theta);
// Sets sequencing reads using multinomial distribution in reads.
unsigned int reads[kNucleotideCount] = {0};
gsl_ran_multinomial(generator_, kNucleotideCount, coverage_, theta, reads);
// Converts reads to ReadData.
ReadData data = {0};
for (int i = 0; i < kNucleotideCount; ++i) {
data.reads[i] = reads[i];
}
return data;
}
/**
* Generates size random trios and keeps track of whether each ReadDataVector
* has a mutation by adding it to the has_mutation_vec_.
*
* @param size Number of random trios.
* @return TrioVector containing random trios.
*/
TrioVector SimulationModel::GetRandomTrios(int size) {
TrioVector random_trios;
MatrixXi genotypes_mat = SimulationModel::GetGenotypesMatrix(size);
TrioVector trio_vec = GetTrioVector(kNucleotideCount);
for (int i = 0; i < size; ++i) {
int child_genotype = genotypes_mat(0, i);
int mother_genotype = genotypes_mat(1, i);
int father_genotype = genotypes_mat(2, i);
// Processes germline mutation. Germline matrix requires no Kronecker.
int child_germline_genotype = SimulationModel::Mutate(
child_genotype,
true,
mother_genotype*kGenotypeCount + father_genotype
);
// Processes somatic mutation.
int child_somatic_genotype = SimulationModel::Mutate(child_germline_genotype);
int mother_somatic_genotype = SimulationModel::Mutate(mother_genotype);
int father_somatic_genotype = SimulationModel::Mutate(father_genotype);
// Creates reads from somatic genotypes using the Dirichlet multinomial.
ReadData child_read = SimulationModel::DirichletMultinomialSample(child_somatic_genotype);
ReadData mother_read = SimulationModel::DirichletMultinomialSample(mother_somatic_genotype);
ReadData father_read = SimulationModel::DirichletMultinomialSample(father_somatic_genotype);
ReadDataVector data_vec = {child_read, mother_read, father_read};
random_trios.push_back(data_vec);
// Records has_mutation_ in order relevant vector.
// Used only for MutationCounts at 4x coverage.
int trio_index = IndexOfReadDataVector(data_vec, trio_vec);
if (trio_index != -1) {
mutation_table_[trio_index].push_back(has_mutation_);
}
has_mutation_vec_.push_back(has_mutation_);
has_mutation_ = false; // Resets for the next simulation.
}
return random_trios;
}
unsigned int SimulationModel::coverage() const {
return coverage_;
}
void SimulationModel::set_coverage(unsigned int coverage) {
coverage_ = coverage;
}
double SimulationModel::population_mutation_rate() const {
return params_.population_mutation_rate();
}
void SimulationModel::set_population_mutation_rate(double rate) {
params_.set_population_mutation_rate(rate);
}
double SimulationModel::germline_mutation_rate() const {
return params_.germline_mutation_rate();
}
void SimulationModel::set_germline_mutation_rate(double rate) {
params_.set_germline_mutation_rate(rate);
}
double SimulationModel::somatic_mutation_rate() const {
return params_.somatic_mutation_rate();
}
void SimulationModel::set_somatic_mutation_rate(double rate) {
params_.set_somatic_mutation_rate(rate);
}
bool SimulationModel::has_mutation() const {
return has_mutation_;
}
void SimulationModel::set_has_mutation(bool has_mutation) {
has_mutation_ = has_mutation;
}
gsl_rng* SimulationModel::generator() const {
return generator_;
}