-
Notifications
You must be signed in to change notification settings - Fork 1
/
genAmultigpu.cu
958 lines (836 loc) · 45.9 KB
/
genAmultigpu.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
#include <omp.h>
#include <fstream>
#include <iostream>
#include <map>
#include <iomanip>
#include <cuda.h>
#include <curand.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include "load.hpp"
#include "parse.hpp"
// this value is per single node
#define MAX_SUPPORTED_GPUS 16
/* specifying # of threads for a given block, 256 block threads (index 0 to 255) */
const int BLOCK_SIZE=256;
/*************************************************************************************************
Defining the six pivotal functions for the genetic algorithm
(1) mateIt, (2) mutateIt, (3) scoreIt, (4) calcAreas, (5) moveEm, (6) getSumAreas
getSumAreas uses two other functions sumEm and sumEmIndex
*************************************************************************************************/
/************************************************************************************************
| function1: mateIt |
* creates offspring from a population, generating crossovers according to pCross
*
* @param Vs a global array of all the parent and child genomes
* @param ptrs array of pointers from logical indices to actual indices into Vs for each individual
* @param areas the probabilities for choosing each individual for mating
* @param sumArea pointer to the sum of all the individual areas
* @param rands array of random numbers
* @param pCross probability that crossover occurs
* @param pSize number of individuals in the population
* @param genomeSize number of genes in a genome
************************************************************************************************/
__global__ void mateIt(float *Vs, int *ptrs, const float *areas, const float *sumArea, const float *rands, const float pCross, const int pSize, const int genomeSize)
{
/*
figure out index blockId.x is the index by blocks, blockDIM.x is the elements per blocks (# of threads ina block)
threadIdx is the index for threads
*/
int i=blockIdx.x * blockDim.x + threadIdx.x;
/*
first parent, second parent, crossover random numbers
randi is three arrays with block and thread index of randoms numbers from 0 to 255;
The cross over random numbers
*/
int randi=i*3;
//multiply i by 2, as we will have 2 parents and 2 offspring multiplication is done using a left bitwise (<<) by 1
i<<=1;
/*
if we're in the population (sometimes warps may go past)
The statement if (i<psize) is common in cuda:
Before i index is used to access array elements, its value is checked against the number of elements, n,
to ensure there are no out-of-bounds memory accesses. This check is required for cases where the
number of elements in an array is not evenly divisible by the thread block size,
and as a result the number of threads launched by the kernel is larger than the array size.
*/
if(i<pSize){
int parent[2];
int j;
/* figure out parents */
parent[0]=parent[1]=-1;
/*
find parent where cumulative (cum) area (A) is less than random target (tgt) area
*/
float cumA=0.0f, tgtA=rands[randi++]* *sumArea;
while(cumA<=tgtA){
++parent[0];
cumA+=areas[ptrs[parent[0]]/genomeSize];
/* rands[randi-1] is the index back to zero since it is the first set of parents */
}
#if DEBUG>2
printf("rands[%d] ; %f ; %f=%f * %f\n",randi, cumA, tgtA, rands[randi-1], *sumArea);
printf("first parent\n");
#endif
cumA=0.0f; tgtA=rands[randi++]*
(*sumArea-areas[ptrs[parent[0]]/genomeSize]);
while(cumA<=tgtA){
++parent[1];
if(parent[1]==parent[0])
++parent[1];
cumA+=areas[ptrs[parent[1]]/genomeSize];
}
#if DEBUG>2
printf("Make offspring %d from %d and %d (%f=%f*(%f-%f)) %d\n", i, parent[0], parent[1], tgtA, rands[randi-1], *sumArea, areas[ptrs[parent[0]]/genomeSize], randi);
#endif
/* add offset of pSize to i because it is a child (next population) */
i+=pSize;
/* use ptrs to get indices into Vs */
int i0=ptrs[i], i1=ptrs[i+1];
parent[0]=ptrs[parent[0]];
parent[1]=ptrs[parent[1]];
/* set j to index for the next set of Vs */
j=i0+genomeSize;
/* put parent[0], parent[1], and i1 relative to i0, so we can just add i0 for index */
parent[0]-=i0;
parent[1]-=i0;
i1-=i0;
/* start with crossover pt at the end (no crossover) */
int crossPt=j;
/* check for crossover */
if(rands[randi]<pCross){
crossPt=i0+1+(int)(rands[randi]/pCross*(float)(genomeSize-1));
}
while(i0<crossPt){
/* load next bit from parent and increment i */
Vs[i0]=Vs[parent[0]+i0];
Vs[i1+i0]=Vs[parent[1]+i0];
++i0;
}
while(i0<j){
Vs[i0]=Vs[parent[1]+i0];
Vs[i1+i0]=Vs[parent[0]+i0];
++i0;
}
//i0=crossPt;
//for(i0;i0<j;i0++){
//Vs[i0]=Vs[parent[1]+i0];
//Vs[i1+i0]=Vs[parent[0]+i0];
//}
}
}
/************************************************************************************************
| function 2: mutateIt |
* @brief introduces mutations to the genomes in Vs, according to probability pMut, with a max perturbation of maxMut
*
* @param Vs a global array of all the parent and child genomes
* @param ptrs array of pointers from logical indices to actual indices into Vs for each individual
@param rands array of random numbers
* @param pSize number of individuals in the population
* @param pMut probability that a mutation occurs, evaluated for each gene
* @param maxMut maximum perturbation to an allele
* @param genomeSize number of genes in a genome
*************************************************************************************************/
__global__ void mutateIt(float *Vs, int *ptrs, const float *rands, const int pSize, const float pMut, const float maxMut, const int genomeSize)
{
/* figure out index */
int i=blockIdx.x * blockDim.x + threadIdx.x;
if(i<pSize){
// get index into random number array
int r=i*genomeSize;
i=ptrs[i];
int j=i+genomeSize;
// want random numbers from [-maxMut, maxMut). will subtract maxMut later
float scale=2.0f*maxMut/pMut;
// iterate through genome
while(i<j){
if(rands[r]<pMut){
// mutate the amplitude by adding perturbation
Vs[i]+=rands[r]*scale-maxMut;
}
++i;
++r;
}
}
}
/************************************************************************************************
| function 3: scoreIt |
* @brief calculates a score indicating the closeness of fit for each individual/chromosome (set of parameters) against the training set
* @param scores score for each conformation, calculated here
* @param areas weighting for each conformation, was formerly calculated here
* @param Vs a global array of all the parent and child genomes
* @param ptrs array of pointers from logical indices to actual indices into Vs for each individual
* @param tset training set
* @param tgts targets for training
* @param wts weights of each point in the training set
* @param breaks breaks in training set, where different data should not be compared across breaks
* @param nConf number of conformations in training set
* @param pSize number of individuals in the population
* @param genomeSize number of genes in a genome
* @param xx space to store energy differences for each conformation with test parameters
************************************************************************************************/
__global__ void scoreIt(float *scores, float *areas, const float *Vs, const int *ptrs, const float *tset, const float *tgts, const float *wts, const int *breaks, const int nConf, const int pSize, const int genomeSize, float *xx)
{
int i=blockIdx.x * blockDim.x + threadIdx.x;
//if((i<<1)<(pSize-1)*pSize){
if(i<pSize){
float *x=xx+i*nConf; // for the error of each conformation
// get reference to score
float *S=scores+i;
// set score to 0
*S=0.0f;
// accumulate little s for each set
float s;
// get first index in genome
int i0=ptrs[i];
// get index of next genome space for looping bounds
int j=i0+genomeSize;
// start with the first element in the training set
int t=0;
/* start at break 0 */
int b=0;
/* loop over conformations c */
int c=0;
while(c<nConf){
//int nP=0;
s=0.0f;
/* loop only in units without break points */
while(c<breaks[b+1]){
/*
start with delta E (tgts) for a given conformation (c) within a break; see load.cpp
conf (c) goes through until it reach a break. the loop will set delta E
*/
x[c]=tgts[c];
/*
subtract contributions from each parameter for conformation c
for each conformation
e.g deltaE - cos (dihedral * periodicity) * parameter generated from chromosomes
*/
for(i=i0;i<j;i++,t++){
x[c]-=tset[t]*Vs[i];
}
/* add differences in this error from all other errors */
for(int c2=breaks[b];c2<c;c2++){
float err=x[c]-x[c2];
s+=(err<0.0f?-err:err);
}
/* next conformation */
++c;
}
/* add little error to big error S, weighted by number of pairs */
*S+=s*wts[b];
/* go to next breakpoint */
++b;
}
#if DEBUG>1
printf("areas[%d]=%f\n",i0/genomeSize,areas[i0/genomeSize]);
#endif
}
}
/**************************************************************************************************
* | function 4: calcAreas | *
* *
* calculates the areas (the probability) each individual has of mating *
*___________________________________Parameters____________________________________________________*
* @param scores scores for each individual (set of parameters) *
* @param areas fitness for each individual, in terms of probability of mating *
* @param ptrs array of pointers from logical indices to actual indices into Vs for each individual*
* @param pSize number of individuals in the population *
* @param genomeSize number of genes in a genome *
**************************************************************************************************/
__global__ void calcAreas(float *scores, float *areas, const int *ptrs, const int pSize, const int genomeSize) {
int i=blockIdx.x * blockDim.x + threadIdx.x;
if(i<pSize){
areas[ptrs[i]/genomeSize]=__expf(-scores[i]/scores[0]);
}
}
/************************************************************************************************
* | function 5: moveEm |
*
* @brief simple helper function for copying data from oldF, oldI to neWF, newI
*
* @param newF pointer to new float array
* @param newI pointer to new int array
* @param oldF pointer to old float array
* @param oldI pointer to old int array
* @param N number of floats/ints to copy
*************************************************************************************************/
__global__ void moveEm(float * newF, int *newI, float *oldF, int *oldI, int N) {
int i=blockIdx.x * blockDim.x + threadIdx.x;
if(i<N){
newF[i]=oldF[i];
newI[i]=oldI[i];
}
}
/************************************************************************************************
| sumEm and sumEmIndex : helper function for getSumAreas |
* @brief performs a sum of each successive pair of N numbers in source and stores the sums in sums. intended to be run multiple times to sum over a whole array. if N is odd, the last sum index will be N/2-1 and contain the sum of the last 3 numbers
*
* @param sums where to store the sums
* @param source where to get the numbers to sum together
* @param N the dimension of source
*
* @return */
__global__ void sumEm(float *sums, float *source, int N){
int i=blockIdx.x*blockDim.x+threadIdx.x;
int j=(i<<1);
if(j+3<N)sums[i]=source[j]+source[j+1];
else if(j+3==N) sums[i]=source[j]+source[j+1]+source[j+2];
else if(j+2==N) sums[i]=source[j]+source[j+1];
}
/*
* @brief performs a sum of pairs of N numbers in source, using locations indicated by pointers. pointers has indices multiplied by genomeSize. intended to be run multiple times to sum over a whole array. if N is odd, the last sum index will be N/2-1 and contain the sum of the last 3 numbers
*
* @param sums where to store the sums
* @param source where to get the numbers to sum together
* @param N the dimension of source
* @param ptrs the indices to use when gathering pairs for summation
* @param genomeSize the number by which the indices in ptrs are scaled
*
* @return
*/
__global__ void sumEmIndex(float *sums, float *source, int N, const int *ptrs, const int genomeSize){
int i=blockIdx.x*blockDim.x+threadIdx.x;
int j=(i<<1);
if(j+3<N)sums[i]=source[ptrs[j]/genomeSize]+source[ptrs[j+1]/genomeSize];
else if(j+3==N) sums[i]=source[ptrs[j]/genomeSize]+source[ptrs[j+1]/genomeSize]+source[ptrs[j+2]/genomeSize];
else if(j+2==N) sums[i]=source[ptrs[j]/genomeSize]+source[ptrs[j+1]/genomeSize];
#if DEBUG>1
if(j+2<=N)printf(" %d:%f",i,sums[i]);
#endif
}
/*******************************| end of helper function |***************************************/
/*************************************************************************************************
* | function 6: getSumAreas | *
* ---------uses sumEmIndex and sumEM-------- *
* *
* @brief get sum of all areas
* *
* @param areas_d pointer to areas on device *
* @param ptrs_d pointer to indices for each individual in population
* @param pSize population size
* @param temp_d pointer to temporary array on device
* @param genomeSize number of alleles in genome
************************************************************************************************/
float *getSumAreas(float *areas_d, int *ptrs_d, int pSize, float *temp_d, const int & genomeSize){
int dim=pSize;
int offset=0;
/*
the triple chevron below describes an execution configuration
the first argument(((dim>>1)+BLOCK_SIZE-1)/BLOCK_SIZE) in the execution configuration specifies the
number of thread blocks in the grid, and the second specifies (BLOCK_SIZE) the number of threads in a thread block
*/
sumEmIndex <<<((dim>>1)+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (temp_d, areas_d, dim, ptrs_d, genomeSize);
#if DEBUG>1
std::cout << std::endl;
#endif
pSize >>= 1;
while((dim>>=1)>1){
offset^=pSize;
sumEm <<<((dim>>1)+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (temp_d+offset, temp_d+(offset^pSize), dim);
#if DEBUG>1
std::cout << std::endl;
#endif
}
return temp_d+offset;
}
struct DeviceParameters {
int curList;
int *ptrs_d, *breaks_d;
float *rands_d, *Vs_d, *tset_d, *tgts_d, *wts_d, *xx_d, *scores_d, *areas_d;
float *scores_ds[2];
int *ptrs_ds[2];
curandGenerator_t gen;
};
struct Parameters {
int pSize; // "Population Size (pSize): " << pSize << "\n\n";
int nGen; // "Number of Generations (nGen): " << nGen << "\n\n";
float pMut; // "Probability of Mutations (pMut): " << pMut << "\n\n";
float maxMut; // "Maximal permissible mutation (maxMut): " << maxMut << "\n\n";
float pCross; // "Probability of crossover (pCross): " << pCross << "\n\n";
int rseed; // "Random seed (rseed): " << rseed << "\n\n";
int peng; // "Print scores every " << peng << "generations (peng)\n\n";
int ncp; // "Print scores of only " << ncp << " chromosomes every peng \n\n";
int elitist; //elitist population we will keep every generation
int nEx; // number of migrants we will exchange
int iTime; //isolation time genetic algorithm runs until exchange
/*specify the string of the savefile, scorefile, loadfile name */
std::string saveFile,loadFile,scoreFile,scoreTest;
/* Hardcoding these input but we will make user options
nisland is the number of subpopulation, iTime is the isolation time, nMig is the number of
migrants added to migrant pool. nEx number of exchange btwn migrant pool and subpop */
int nIsland;
int exchrate; //the number of exchnages we do nGen/iTime
int genomeSize, nConf, N, save;
float *Vs, *tset, *tgts, *wts, *scores, *migrants;
int *ptrs, *breaks, nBreaks;
size_t nRands;
int maxGpuDevices;
int nBlocks;
DeviceParameters deviceParameters[MAX_SUPPORTED_GPUS];
};
void ParseArgs(int argc, char *argv[], Parameters ¶meters) {
if (!(argv[1]=="-p")) std::cout << "please use -p for param file";
ConfigFile cfg(argv[2]);
// check if keys exixt
if (!(cfg.keyExists("pSize"))) std::cout << "oops you forgot pSize"; //make it for others
//add the rest of parameters if exist as line above
// Retreive the value of keys pSize if key dont exist return value will be 1
//add new parameters here
parameters.pSize = cfg.getValueOfKey<int>("pSize", 1);
std::cout << "Population Size (pSize): " << parameters.pSize << "\n\n";
parameters.nGen = cfg.getValueOfKey<int>("nGen", 1);
std::cout << "Number of Generations (nGen): " << parameters.nGen << "\n\n";
parameters.pMut = cfg.getValueOfKey<float>("pMut", 1);
std::cout << "Probability of Mutations (pMut): " << parameters.pMut << "\n\n";
parameters.maxMut = cfg.getValueOfKey<float>("max", 1);
std::cout << "Maximal permissible mutation (max): " << parameters.maxMut << "\n\n";
parameters.pCross = cfg.getValueOfKey<float>("pCross", 1);
std::cout << "Probability of crossover (pCross): " << parameters.pCross << "\n\n";
parameters.rseed = cfg.getValueOfKey<int>("rseed", 1);
std::cout << "Random seed (rseed): " << parameters.rseed << "\n\n";
parameters.peng = cfg.getValueOfKey<int>("peng", 1);
std::cout << "Print scores every " << parameters.peng << "generations (peng)\n\n";
parameters.ncp = cfg.getValueOfKey<int>("ncp", 1);
std::cout << "Print scores of only " << parameters.ncp << " chromosomes every peng \n\n";
parameters.iTime = cfg.getValueOfKey<int>("iTime", 1);
std::cout << "isolation time is " << parameters.iTime << "\n\n";
parameters.nEx = cfg.getValueOfKey<int>("nEx", 1);
std::cout << "number of migrants to exchange is " << parameters.nEx << "\n\n";
parameters.elitist = cfg.getValueOfKey<int>("elitist", 10);
std::cout << "Elitist, the percent of parents we keep is " << parameters.elitist << "\n\n";
parameters.save=parameters.pSize/parameters.elitist; // % of the population we keep
parameters.exchrate=parameters.nGen/parameters.iTime; //get the number of exchange
/* dealing with loading the input file and save file string name */
for (int i=2;i<argc;i++){
if(i+1<argc){
if(argv[i][0]=='-'&&argv[i][1]=='r')parameters.saveFile=argv[++i];
else if(argv[i][0]=='-'&&argv[i][1]=='c')parameters.loadFile=argv[++i];
else if(argv[i][0]=='-'&&argv[i][1]=='s')parameters.scoreFile=argv[++i];
else if(argv[i][0]=='-'&&argv[i][1]=='f')parameters.scoreTest=argv[++i];
}
}
}
void LoadParameters(Parameters ¶meters, std::map<std::string,DihCorrection> &correctionMap) {
load(std::cin,
¶meters.tset,
¶meters.tgts,
¶meters.wts,
¶meters.nConf,
¶meters.breaks,
¶meters.nBreaks,
¶meters.genomeSize,
correctionMap);
}
void AllocateArrays(int gpuDevice, Parameters ¶meters) {
cudaError_t error;
cudaSetDevice(gpuDevice);
cudaMalloc((void **)¶meters.deviceParameters[gpuDevice].breaks_d, parameters.nBreaks*sizeof(int));
cudaMalloc((void **)¶meters.deviceParameters[gpuDevice].tgts_d, (parameters.nBreaks-1+parameters.nConf*(1+parameters.genomeSize))*sizeof(float));
parameters.deviceParameters[gpuDevice].wts_d=parameters.deviceParameters[gpuDevice].tgts_d+parameters.nConf;
parameters.deviceParameters[gpuDevice].tset_d=parameters.deviceParameters[gpuDevice].wts_d+parameters.nBreaks-1;
/**********************| initiate GPU blocks and # of random variable |***************************
* we need randoms, new pop 3xcrossover, genomeSizexmut *
* genome size is the number of genes which is all the parameters, *
* e.g for 4 periodicity and three dihedral fitting, then genomesize will be 4 * 3 = 12 *
* nRands is number of randoms we need for each set of parameters *
* e.g if psize (population size) is 10, then number of random number we will need is *
* (3+(# of periodicity x # of dihedral)) * psize *
* so for 4 periodicity and 3 dihedral fitting (chi1 chi2 chi3), then nRands = 3+12 * 10 = 150 *
*________________________________________________________________________________________________*
* nBlocks is dependent on the population size, it is use to figure out how many GPU blocks *
* we need to initialize the arrays for calculations. Each block has 256 threads. *
* one thread represent one individual (chromosome with soln parameters) from the population *
* e.g population size of 2000 will require (2000+256-1)/256 = 8.81 => 8 blocks *
* *
*************************************************************************************************/
parameters.nRands=(3+parameters.genomeSize)*parameters.pSize;
parameters.nBlocks=(parameters.pSize+BLOCK_SIZE-1)/BLOCK_SIZE;
/*******************************| initializing more host and device variables|*******************
* N is pSize multiply by 2, use to get more of the seed and to help storeoffspring *
* initiating the chromosomes which have the solns *
************************************************************************************************/
float *rands=(float *)malloc(parameters.nRands*sizeof(float));
//cudaMalloc((void **)&rands_d, nRands*sizeof(float));
parameters.N=(parameters.pSize<<1);
error=cudaMalloc((void **)¶meters.deviceParameters[gpuDevice].Vs_d, (parameters.N*(parameters.genomeSize+4)+parameters.pSize*parameters.nConf+parameters.nRands)*sizeof(float));
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
parameters.deviceParameters[gpuDevice].rands_d=parameters.deviceParameters[gpuDevice].Vs_d+parameters.N*parameters.genomeSize;
parameters.deviceParameters[gpuDevice].scores_d=parameters.deviceParameters[gpuDevice].rands_d+parameters.nRands;
parameters.deviceParameters[gpuDevice].areas_d=parameters.deviceParameters[gpuDevice].scores_d+(parameters.N<<1);
parameters.deviceParameters[gpuDevice].xx_d=parameters.deviceParameters[gpuDevice].areas_d+(parameters.N<<1);
parameters.scores=(float *)malloc(sizeof(*parameters.scores)*parameters.N);
parameters.deviceParameters[gpuDevice].scores_ds[0]=parameters.deviceParameters[gpuDevice].scores_d;
parameters.deviceParameters[gpuDevice].scores_ds[1]=parameters.deviceParameters[gpuDevice].scores_d+parameters.N;
parameters.Vs=(float *)malloc((parameters.N)*parameters.genomeSize*sizeof(float));
/*allocate the memory space to hold array of pointers (prts) of size N (2*pSize)
these pointers point to the individuals (chromosome) in the population */
parameters.ptrs=(int *)malloc(sizeof(int)*(parameters.N));
parameters.ptrs[0]=0;
for (int g=1;g<parameters.N;g++)parameters.ptrs[g]=parameters.ptrs[g-1]+parameters.genomeSize;
error = cudaMalloc((void **)¶meters.deviceParameters[gpuDevice].ptrs_d, parameters.N*2*sizeof(int));
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
parameters.deviceParameters[gpuDevice].ptrs_ds[0]=parameters.deviceParameters[gpuDevice].ptrs_d;
parameters.deviceParameters[gpuDevice].ptrs_ds[1]=parameters.deviceParameters[gpuDevice].ptrs_d+parameters.N;
parameters.migrants=(float *)malloc(parameters.nEx*parameters.maxGpuDevices*parameters.genomeSize*sizeof(float));
}
/*
copying over the arrays from the CPU to GPU
nbreaks is the # of dataset + 1. e.g if you are doing alpha and beta backbone set then nbreaks=3
genomesize is the # of fitting dihedral * periodicity, e.g 3 set of dihedral * 4 periodicity = 12
nconf is the # of conformations you are fitting
tset is (E_QMi-E_MMi) + (E_MMref-E_QMref) for each conformation, which = nconf, see load.cpp
tgts is the cos(dih*periodicity) for 4 periodicity for a dihedral for each conformation
so 20 conf will give tgts of 20 (nconf) * 12 (# of dih * periodicity) = 120
*/
void CopyArrays(int gpuDevice, Parameters ¶meters) {
cudaError_t error;
cudaSetDevice(gpuDevice);
error = cudaMemcpy(parameters.deviceParameters[gpuDevice].breaks_d, parameters.breaks, parameters.nBreaks*sizeof(parameters.breaks[0]), cudaMemcpyHostToDevice);
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
error = cudaMemcpy(parameters.deviceParameters[gpuDevice].tset_d, parameters.tset, parameters.nConf*parameters.genomeSize*sizeof(float), cudaMemcpyHostToDevice);
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
error = cudaMemcpy(parameters.deviceParameters[gpuDevice].tgts_d, parameters.tgts, parameters.nConf*sizeof(float), cudaMemcpyHostToDevice);
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
error = cudaMemcpy(parameters.deviceParameters[gpuDevice].wts_d, parameters.wts, (parameters.nBreaks-1)*sizeof(*parameters.wts), cudaMemcpyHostToDevice);
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
error = cudaMemcpy(parameters.deviceParameters[gpuDevice].ptrs_d, parameters.ptrs, sizeof(int)*parameters.N, cudaMemcpyHostToDevice);
if(error!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
}
/**************************| Create a random generator |********************************************
* curandCreateGenerator takes two parameters: pointer to generator (*gen), type of generator *
Once created,random number generators can be defined using the general options seed, offset,& order*
When rng_type is CURAND_RNG_PSEUDO_DEFAULT, the type chosen is CURAND_RNG_PSEUDO_XORWOW *
*__________________________________________________________________________________________________*
*curandSetPseudoRandomGeneratorSeed takes two parameters (1) the generator (gen) & (2) seed value *
* seed value # is used to initialize the generator and control the set of random numbers; *
* same seed will the give same set of random numbers of the psuedorandom generator *
* rseed is the random number specified from the 6th input) *
*__________________________________________________________________________________________________*
* curandGenerateNormal take 5 parameters: *
* (1) generator - Generator to use *
* (2) outputPtr - Pointer to device memory to store CUDA-generated results, *
or Pointer to host memory to store CPU-generated resluts *
* (3) num - Number of floats to generate *
* (4) mean - Mean of normal distribution *
* (5) stddev - Standard deviation of normal distribution *
* Results are 32-bit floating point values with mean and standard deviation. *
***************************************************************************************************/
void GenerateRandom(int gpuDevice, Parameters ¶meters) {
cudaError_t error;
cudaSetDevice(gpuDevice);
curandCreateGenerator(¶meters.deviceParameters[gpuDevice].gen, CURAND_RNG_PSEUDO_DEFAULT);
// initiate the generator with the random seed (rseed)
curandSetPseudoRandomGeneratorSeed(parameters.deviceParameters[gpuDevice].gen, parameters.rseed);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (normal)\n", cudaGetErrorString(error));}
curandGenerateNormal(parameters.deviceParameters[gpuDevice].gen, parameters.deviceParameters[gpuDevice].Vs_d, parameters.N*parameters.genomeSize, 0, 1);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (normal)\n", cudaGetErrorString(error));}
}
void LoadAmplitudeParameters(int gpuDevice, Parameters ¶meters) {
cudaSetDevice(gpuDevice);
// if we have a load Vs (amplitude parameters) from the loaded file and populate Vs
if(!parameters.loadFile.empty()) {
std::ifstream loadS(parameters.loadFile.c_str(), std::ios::in | std::ios::binary);
loadS.read((char*)parameters.Vs,parameters.pSize*parameters.genomeSize*sizeof(*parameters.Vs));
cudaMemcpy(parameters.deviceParameters[gpuDevice].Vs_d, parameters.Vs, parameters.pSize*parameters.genomeSize*sizeof(*parameters.Vs), cudaMemcpyHostToDevice);
}
}
/***************************| score of the first set of chromosomes |*******************************
* Here we score initial chromsomes *
***************************************************************************************************/
void
ScoreInitialChromsomes(int gpuDevice, Parameters ¶meters) {
cudaError_t error;
cudaSetDevice(gpuDevice);
/* lauch first kernel to score the initial set of chromsomes (Vs_d) and output scores in scores_ds
betweem the triple chervon is called the execution configuration that takes two parts
1st part takes the number of thread blocks and the second part take the number of threads in a block */
scoreIt <<<(parameters.N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.deviceParameters[gpuDevice].areas_d,
parameters.deviceParameters[gpuDevice].Vs_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.deviceParameters[gpuDevice].tset_d,
parameters.deviceParameters[gpuDevice].tgts_d,
parameters.deviceParameters[gpuDevice].wts_d,
parameters.deviceParameters[gpuDevice].breaks_d,
parameters.nConf,
parameters.pSize,
parameters.genomeSize,
parameters.deviceParameters[gpuDevice].xx_d
);
/* score of chromosomes out of psize since we initiated 2 times psize */
scoreIt <<<(parameters.N+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].areas_d,
parameters.deviceParameters[gpuDevice].Vs_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].tset_d,
parameters.deviceParameters[gpuDevice].tgts_d,
parameters.deviceParameters[gpuDevice].wts_d,
parameters.deviceParameters[gpuDevice].breaks_d,
parameters.nConf,
parameters.pSize,
parameters.genomeSize,
parameters.deviceParameters[gpuDevice].xx_d);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (1stscore)\n", cudaGetErrorString(error));}
/* sort the scores from each chromosome of the initial population (2 x pSize)
1st part is the begining of key sequence, 2nd end of key sequence, 3rd begining of values to sort*/
thrust::sort_by_key(
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.N),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList])
);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (1stsort)\n", cudaGetErrorString(error));}
}
/****************************| Let us begin the iterations through generations |********************
Genetic algorithm iterations through the number of generations or isolation time
****************************************************************************************************/
void EvolveGenerations(int gpuDevice, Parameters ¶meters) {
cudaError_t error;
cudaSetDevice(gpuDevice);
float cpMute=(float(gpuDevice/10)+float(1))*parameters.pMut;
/* for loop for the generation */
for (int currentGeneration=0; currentGeneration<parameters.iTime; currentGeneration++) {
/*************************| Step1: Generate random numbers |****************************************/
// create an array of random numbers (rands_d) used for mutations and crossover where the number of random #s is nRands
curandGenerateUniform(parameters.deviceParameters[gpuDevice].gen, parameters.deviceParameters[gpuDevice].rands_d, parameters.nRands);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s\n", cudaGetErrorString(error));}
/***| Step2: calculate the probabilities (areas) each individual (chromosome) has of mating |******/
calcAreas <<<parameters.nBlocks, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.deviceParameters[gpuDevice].areas_d,
parameters.deviceParameters[gpuDevice].ptrs_d,
parameters.pSize, parameters.genomeSize
);
/***| Step3: mate the individuals (chromosomes,Parent[0],[1]) selected for the next generation |***/
mateIt <<<parameters.nBlocks, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].Vs_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.deviceParameters[gpuDevice].areas_d,
getSumAreas(
parameters.deviceParameters[gpuDevice].areas_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.pSize,
parameters.deviceParameters[gpuDevice].areas_d+parameters.N,
parameters.genomeSize),
parameters.deviceParameters[gpuDevice].rands_d,
parameters.pCross,
parameters.pSize,
parameters.genomeSize
);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (mate)\n", cudaGetErrorString(error));}
/*****************| Step4: mutate individuals generated after mating |*****************************/
mutateIt <<<parameters.nBlocks, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].Vs_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].rands_d+parameters.pSize*3,
parameters.pSize,
cpMute,
parameters.maxMut,
parameters.genomeSize
);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (mutate)\n", cudaGetErrorString(error));}
/**************| Step5: Score the individuals to select for the next generation |*******************/
scoreIt <<<parameters.nBlocks, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].areas_d,
parameters.deviceParameters[gpuDevice].Vs_d,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].tset_d,
parameters.deviceParameters[gpuDevice].tgts_d,
parameters.deviceParameters[gpuDevice].wts_d,
parameters.deviceParameters[gpuDevice].breaks_d,
parameters.nConf,
parameters.pSize,
parameters.genomeSize,
parameters.deviceParameters[gpuDevice].xx_d);
if((error=cudaGetLastError())!=cudaSuccess){fprintf(stderr, "Cuda error: %s (score)\n", cudaGetErrorString(error));}
/*****| Step6: Sort the offspring and score for next generation |**/
moveEm <<<(parameters.save+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList^1],
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList^1],
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList],
parameters.save
); //elitist
moveEm <<<(parameters.pSize+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList^1]+parameters.save,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList^1]+parameters.save,
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize,
parameters.pSize
); //nOffspring);
moveEm <<<(parameters.pSize-parameters.save+BLOCK_SIZE-1)/BLOCK_SIZE, BLOCK_SIZE>>> (
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList^1]+parameters.save+parameters.pSize,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList^1]+parameters.save+parameters.pSize,
parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.save,
parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.save,
parameters.pSize-parameters.save);
// switch the index so we swap the buffers: input becomes output and output becomes input
parameters.deviceParameters[gpuDevice].curList ^= 1;
/* first sort from save1 in pSize1 to save2 in pSize 2 {whats in here}, the offsprings
|save{------pSize--------|save}-------pSize---------| */
thrust::sort_by_key(
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.save),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize+parameters.save),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.save)
);
/* then sort all those that fit within pSize */
thrust::sort_by_key(
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList]+parameters.pSize),
thrust::device_pointer_cast(parameters.deviceParameters[gpuDevice].ptrs_ds[parameters.deviceParameters[gpuDevice].curList])
);
/****************************************************************************************************
* Here you can print the score of chromosomes (total is 2 x population size) for each generation *
* by uncommenting the if and end DEBUG statement, need to make this an input option *
* such as -s which will mean print scores *
****************************************************************************************************/
//if generation is divisable by peng, print initial, final and interval of peng
if (currentGeneration % parameters.peng == 0) {
std::ofstream scorefile;
scorefile.open (parameters.scoreFile.c_str(), std::ios::out | std::ios::trunc); //it append to the writeout so make sure u delete scores file
scorefile << "#Generation" << std::setw(14) << "Chromosomes" << std::setw(12) << "Scores\n";
//copy over only ncp scores
cudaMemcpy(parameters.scores, parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList], sizeof(*parameters.scores)*parameters.ncp, cudaMemcpyDeviceToHost);
if (parameters.ncp > parameters.pSize) {
printf("Parmfile error: ncp should be smaller than psize! \n");
std::abort();
}
else {
for(int m = 0; m< parameters.ncp; m++){
scorefile << std::setw(6) << currentGeneration << std::setw(14) << m << std::setw(18) << parameters.scores[m] << "\n";
}
}
scorefile.close();
}
} // here the loop for generations ends
std::cout << "Device " << gpuDevice << " " << cpMute << std::endl;
}
float ComputeDiff(const float *arrayA, const float *arrayB, const int N) {
//arrayb - arraya
float diff, sumdiff = 0.0f;
for (int i=0;i<N;i++) {
diff = arrayA[i]-arrayB[i];
diff *= diff;
sumdiff += diff;
}
return sumdiff;
}
void CheckError(int gpuDevice, Parameters ¶meters) {
cudaSetDevice(gpuDevice);
cudaMemcpy(parameters.scores, parameters.deviceParameters[gpuDevice].scores_ds[parameters.deviceParameters[gpuDevice].curList], sizeof(float)*parameters.N, cudaMemcpyDeviceToHost);
}
void writeFrcmod(int gpuDevice, Parameters ¶meters, std::map<std::string,DihCorrection> &correctionMap) {
cudaSetDevice(gpuDevice);
//copy over the best results from multiple GPUs to the CPU to save the scores and parameters */
cudaMemcpy(parameters.Vs, parameters.deviceParameters[gpuDevice].Vs_d, sizeof(float)*parameters.genomeSize*1, cudaMemcpyDeviceToHost);
cudaMemcpy(parameters.ptrs, parameters.deviceParameters[gpuDevice].ptrs_ds[0], sizeof(int)*1, cudaMemcpyDeviceToHost);
cudaMemcpy(parameters.scores, parameters.deviceParameters[gpuDevice].scores_ds[0], sizeof(float)*1, cudaMemcpyDeviceToHost);
std::ofstream scoretest;
scoretest.open (parameters.scoreTest.c_str(), std::ios::out | std::ios::trunc);
for(int i=0;i<1;i++){
scoretest << std::setw(8) << gpuDevice << "\n";
/* these are the final scores for each individual in the population, print in the output file */
scoretest << std::setw(8) << parameters.scores[i] << "\n";
for(std::map<std::string,DihCorrection>::iterator it=correctionMap.begin(); it!=correctionMap.end(); ++it){
scoretest << std::setw(11) << it->second.setGenome(parameters.Vs+parameters.ptrs[i]);
}
scoretest.close();
}
}
void SelectMigrants (int gpuDevice, Parameters ¶meters) {
cudaSetDevice(gpuDevice);
cudaMemcpy(
¶meters.migrants[parameters.nEx*gpuDevice*parameters.genomeSize],
¶meters.deviceParameters[gpuDevice].Vs_d[0],
(parameters.nEx)*parameters.genomeSize*sizeof(*parameters.Vs),
cudaMemcpyDeviceToHost);
}
int SpecialRandom(int gpuDevice, int maxGpuDevices) {
cudaSetDevice(gpuDevice);
int temp= rand() % maxGpuDevices;
while (gpuDevice == temp) {
temp = rand() % maxGpuDevices;
}
return temp;
}
void Exchange (int gpuDevice, Parameters ¶meters) {
cudaSetDevice(gpuDevice);
for (int i=0;i<parameters.nEx;i++){
int otherGpu= SpecialRandom(gpuDevice, parameters.maxGpuDevices);
int indnEx = rand() % parameters.nEx; //random # range from 0 to nEx
//copy by random from migrants array to Vs_d at bottom (starting from pSize-nEx)
cudaMemcpy(
¶meters.deviceParameters[gpuDevice].Vs_d[((parameters.pSize-parameters.nEx+i)*parameters.genomeSize)],
¶meters.migrants[(parameters.nEx*otherGpu+indnEx)*parameters.genomeSize],
parameters.genomeSize*sizeof(*parameters.Vs),
cudaMemcpyHostToDevice);
}
}
int main(int argc, char *argv[]) {
Parameters parameters;
std::map<std::string,DihCorrection> correctionMap;
int deviceCount=0;
cudaGetDeviceCount(&deviceCount);
parameters.maxGpuDevices = deviceCount;
ParseArgs(argc, argv, parameters);
LoadParameters(parameters, correctionMap);
for (int device = 0; device < parameters.maxGpuDevices; device++)
parameters.deviceParameters[device].curList=0;
for (int device = 0; device < parameters.maxGpuDevices; device++)
AllocateArrays(device, parameters);
for (int device = 0; device < parameters.maxGpuDevices; device++)
CopyArrays(device, parameters);
for (int device = 0; device < parameters.maxGpuDevices; device++)
GenerateRandom(device, parameters);
for (int device = 0; device < parameters.maxGpuDevices; device++)
LoadAmplitudeParameters(device, parameters);
for (int device = 0; device < parameters.maxGpuDevices; device++)
ScoreInitialChromsomes(device, parameters);
double t1=omp_get_wtime();
//run the genetic algorithm for iTime, do exchanges
#pragma omp parallel for
for (int device = 0; device < parameters.maxGpuDevices; device++){
for (int run=0;run<parameters.exchrate;run++){
std::cout << "run " << run << std::endl;
EvolveGenerations(device, parameters);
SelectMigrants(device, parameters);
//std::cout<<"migrants array" << parameters.migrants[0] << std::endl;
Exchange(device, parameters);
//std::cout << "exchange done " << std::endl;
}
}
double t2=omp_get_wtime();
for (int device = 0; device < parameters.maxGpuDevices; device++) {
CheckError(device, parameters);
writeFrcmod(device, parameters, correctionMap);
//std::cout << "Device " << device << " " << parameters.scores[0] << std::endl;
}
std::cout<< "Computation time" << t2-t1 << std::endl;
/*****************| Free up Memory |******************************************************
free(ptrs);
curandDestroyGenerator(gen);
//cudaFree(xx_d);
cudaFree(Vs_d);
cudaFree(ptrs_d);
cudaFree(breaks_d);
cudaFree(tgts_d);
free(Vs);
free(scores);
//cudaFree(rands_d);
free(rands);
return 0;
curandDestroyGenerator(gen);
//cudaFree(xx_d);
cudaFree(Vs_d);
cudaFree(ptrs_d);
cudaFree(breaks_d);
cudaFree(tgts_d);
free(Vs);
free(scores);
//cudaFree(rands_d);
free(rands);*/
return 0;
}