-
Notifications
You must be signed in to change notification settings - Fork 31
/
apop_stats.m4.c
963 lines (840 loc) · 40.7 KB
/
apop_stats.m4.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
/** \file apop_stats.c Basic moments and some distributions. */
/* Copyright (c) 2006--2007, 2013 by Ben Klemens. Licensed under the GPLv2; see COPYING. */
#include "apop_internal.h"
#include <gsl/gsl_rng.h>
#include <gsl/gsl_eigen.h>
#define Check_vw \
Apop_stopif(!v, return GSL_NAN, 0, "data vector is NULL. Returning NaN.\n"); \
Apop_stopif(!v->size, return GSL_NAN, 0, "data vector has size 0. Returning NaN.\n"); \
Apop_stopif(weights && weights->size != v->size, return GSL_NAN, 0, "data vector has size %zu; weighting vector has size %zu. Returning NaN.\n", v->size, weights->size);
/** Returns the sum of the data in the given vector.
*/
long double apop_vector_sum(const gsl_vector *in){
Apop_stopif(!in, return 0, 1, "You just asked me to sum a NULL. Returning zero.");
long double out = 0;
for (size_t i=0; i< in->size; i++)
out += gsl_vector_get(in, i);
return out;
}
/** \def apop_sum(in)
An alias for \ref apop_vector_sum. Returns the sum of the data in the given vector.
*/
/** \def apop_mean(v)
Returns the mean of the elements of the vector \c v.
\param v A \ref gsl_vector.
*/
/** \def apop_var(in)
An alias for \ref apop_vector_var.
Returns the variance of the data in the given vector.
*/
/** Returns an unbiased estimate of the sample skew of the data in the given vector.
*/
double apop_vector_skew(const gsl_vector *in){
return apop_vector_skew_pop(in) * gsl_pow_2(in->size)/((in->size -1.)*(in->size -2.)); }
/** Returns the sample fourth central moment of the data in the given
vector. Corrections are made to produce an unbiased result as per <a
href="http://modelingwithdata.org/pdfs/moments.pdf">Appendix M</a> (PDF) of <em>Modeling
with data</em>.
\li This is an estimate of the fourth central moment without normalization. The kurtosis
of a \f${\cal N}(0,1)\f$ is \f$3 \sigma^4\f$, not three, one, or zero.
\see \ref apop_vector_kurtosis_pop
*/
double apop_vector_kurtosis(const gsl_vector *in){
size_t n = in->size;
long double coeff0= n*n/(gsl_pow_3(n-1)*(gsl_pow_2(n)-3*n+3));
long double coeff1= n*gsl_pow_2(n-1)+ (6*n-9);
long double coeff2= n*(6*n-9);
return coeff0 *(coeff1 * apop_vector_kurtosis_pop(in) - coeff2 * gsl_pow_2(apop_vector_var(in)*(n-1.)/n));
}
static double wskewkurt(const gsl_vector *v, const gsl_vector *w, const int exponent, const char *fn_name){
long double wsum = 0, sumcu = 0, vv, ww, mu;
//Using the E(x - \bar x)^3 form, which is lazy.
mu = apop_vector_mean(v, w);
for (size_t i=0; i< w->size; i++){
vv = gsl_vector_get(v,i);
ww = gsl_vector_get(w,i);
sumcu+= ww * gsl_pow_int(vv - mu, exponent);
wsum += ww;
}
double len = wsum < 1.1 ? w->size : wsum;
return sumcu/len;
}
/** Returns the population skew \f$(\sum_i (x_i - \mu)^3/n))\f$ of the data in the given vector. Observations may be weighted.
\param v The data vector
\param weights The weight vector. Default: equal weights for all observations.
\return The weighted skew.
\li Some people like to normalize the skew by dividing by (variance)\f$^{3/2}\f$; that's not done here, so you'll have to do so separately if need be.
\li Apophenia tries to be smart about reading the weights. If weights
sum to one, then the system uses \c w->size as the number of elements,
and returns the usual sum over \f$n-1\f$. If weights > 1, then the
system uses the total weights as \f$n\f$. Thus, you can use the weights
as standard weightings or to represent elements that appear repeatedly.
*/
APOP_VAR_HEAD double apop_vector_skew_pop(gsl_vector const *v, gsl_vector const *weights){
gsl_vector const * apop_varad_var(v, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
Check_vw
APOP_VAR_ENDHEAD
if (weights) return wskewkurt(v, weights, 3, "apop_vector_weighted_skew");
//This code is cut/pasted/modified from the GSL.
//I reimplement the skew calculation here without the division by var^3/2 that the GSL does.
size_t n = v->size;
long double avg = 0;
long double mean = apop_vector_mean(v);
for (size_t i = 0; i < n; i++) {
const long double x = gsl_vector_get(v, i) - mean;
avg += (gsl_pow_3(x) - avg)/(i + 1);
}
return avg;
}
/** Returns the population fourth central moment [\f$\sum_i (x_i - \mu)^4/n)\f$] of the data in
the given vector, with an optional weighting.
\param v The data vector
\param weights The weight vector. If NULL, assume equal weights.
\return The weighted kurtosis.
\li Some people like to normalize the fourth central moment by dividing by variance
squared, or by subtracting three; those things are not done here, so you'll have to
do them separately if desired.
\li This function uses the \ref designated syntax for inputs.
\see \ref apop_vector_kurtosis for the unbiased sample version.
*/
APOP_VAR_HEAD double apop_vector_kurtosis_pop(gsl_vector const *v, gsl_vector const *weights){
gsl_vector const * apop_varad_var(v, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
Check_vw
APOP_VAR_ENDHEAD
if (weights) return wskewkurt(v, weights, 4, "apop_vector_weighted_kurtosis");
//This code is cut/pasted/modified from the GSL.
//I reimplement the kurtosis calculation here without the division by var^2 that the GSL does.
size_t n = v->size;
long double avg = 0;
long double mean = apop_vector_mean(v);
for (size_t i = 0; i < n; i++) {
const long double x = gsl_vector_get(v, i) - mean;
avg += (gsl_pow_4(x) - avg)/(i + 1);
}
return avg;
}
/** Returns the variance of the data in the given vector, given that you've already calculated the mean.
\param in the vector in question
\param mean the mean, which you've already calculated using \ref apop_vector_mean.
\see apop_vector_var
*/
double apop_vector_var_m(const gsl_vector *in, const double mean){
return gsl_stats_variance_m(in->data,in->stride, in->size, mean); }
/** Returns the correlation coefficient of two vectors:
\f$ {\hbox{cov}(a,b)\over \sqrt{\hbox{var}(a)} \sqrt{\hbox{var}(b)}}.\f$
An example
\code
gsl_matrix *m = apop_text_to_data("indata")->matrix;
printf("The correlation coefficient between rows two "
"and three is %g.\n", apop_vector_correlation(Apop_mrv(m, 2), Apop_mrv(m, 3)));
\endcode
\param ina, inb Two vectors of equal length (no default, must not be NULL)
\param weights Replicate weights for the observations. (default: equal weights for all observations)
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD double apop_vector_correlation(const gsl_vector *ina, const gsl_vector *inb, const gsl_vector *weights){
gsl_vector const * apop_varad_var(ina, NULL);
gsl_vector const * apop_varad_var(inb, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
APOP_VAR_ENDHEAD
return apop_vector_cov(ina, inb, weights)
/ sqrt(apop_vector_var(ina, weights) * apop_vector_var(inb, weights)); }
/** Returns the distance between two vectors, where distance is defined
based on the third (optional) parameter:
- 'e' (the default): scalar distance (standard Euclidean metric) between two vectors. \f$\sqrt{\sum_i{(a_i - b_i)^2}},\f$
where \f$i\f$ iterates over dimensions.
- 'm' Returns the Manhattan metric distance between two vectors: \f$\sum_i{|a_i - b_i|},\f$
where \f$i\f$ iterates over dimensions.
- 'd' The discrete norm: if \f$a = b\f$, return zero, else return one.
- 's' The sup norm: find the dimension where \f$|a_i - b_i|\f$ is largest, return the distance along that one dimension.
- 'l' or 'L' The \f$L_p\f$ norm, \f$\left(\sum_i{|a_i - b_i|^2}\right)^{1/p}\f$. The value of \f$p\f$ is set by the fourth (optional) argument.
\param ina First vector (No default, must not be \c NULL)
\param inb Second vector (Default = zero)
\param metric The type of metric, as above.
\param norm If you are using an \f$L_p\f$ norm, this is \f$p\f$. Must be strictly greater than zero. (default = 2)
\li The defaults are such that
\code
apop_vector_distance(v);
apop_vector_distance(v, .metric = 's');
apop_vector_distance(v, .metric = 'm');
\endcode
gives you the standard Euclidean length of \c v, its longest element, and its sum.
\li This function uses the \ref designated syntax for inputs.
\include test_distances.c
*/
APOP_VAR_HEAD double apop_vector_distance(const gsl_vector *ina, const gsl_vector *inb, const char metric, const double norm){
static threadlocal gsl_vector *zero = NULL;
const gsl_vector * apop_varad_var(ina, NULL);
Apop_stopif(!ina, return NAN, 1, "The first vector is NULL. Returning NAN");
const gsl_vector * apop_varad_var(inb, NULL);
if (!inb){
if (!zero || zero->size !=ina->size){
if (zero) gsl_vector_free(zero);
zero = gsl_vector_calloc(ina->size);
}
inb = zero;
}
const char apop_varad_var(metric, 'e');
const double apop_varad_var(norm, 2);
APOP_VAR_ENDHEAD
Apop_stopif(ina->size != inb->size, return GSL_NAN, 0, "I need equal-sized vectors, but "
"you sent a vector of size %zu and a vector of size %zu. Returning NaN.", ina->size, inb->size);
double dist = 0;
if (metric == 'e' || metric == 'E'){
for (size_t i=0; i< ina->size; i++)
dist += gsl_pow_2(gsl_vector_get(ina, i) - gsl_vector_get(inb, i));
return sqrt(dist);
}
if (metric == 'm' || metric == 'M'){ //redundant with vector_grid_distance, below.
for (size_t i=0; i< ina->size; i++)
dist += fabs(gsl_vector_get(ina, i) - gsl_vector_get(inb, i));
return dist;
}
if (metric == 'd' || metric == 'D'){
for (size_t i=0; i< ina->size; i++)
if (gsl_vector_get(ina, i) != gsl_vector_get(inb, i))
return 1;
return 0;
}
if (metric == 's' || metric == 'S'){
for (size_t i=0; i< ina->size; i++)
dist = GSL_MAX(dist, fabs(gsl_vector_get(ina, i) - gsl_vector_get(inb, i)));
return dist;
}
if (metric == 'l' || metric == 'L'){
for (size_t i=0; i< ina->size; i++)
dist += pow(fabs(gsl_vector_get(ina, i) - gsl_vector_get(inb, i)), norm);
return pow(dist, 1./norm);
}
Apop_stopif(1, return NAN, 1, "I couldn't find the metric type you gave, %c, in my list of supported types. Returning NaN", metric);
}
/** This function will normalize a vector, either such that it has mean
zero and variance one, or ranges between zero and one, or sums to one.
\param in A \c gsl_vector with the un-normalized data. \c NULL
input gives \c NULL output. (No default)
\param out If normalizing in place, \c NULL.
If not, the address of a <tt>gsl_vector*</tt>. Do not allocate. (default = \c NULL.)
\param normalization_type
\c 'p': normalized vector will sum to one. E.g., start with a set of observations in bins, end with the percentage of observations in each bin. (the default)<br>
\c 'r': normalized vector will range between zero and one. Replace each X with (X-min) / (max - min).<br>
\c 's': normalized vector will have mean zero and (sample) variance one. Replace
each X with \f$(X-\mu) / \sigma\f$, where \f$\sigma\f$ is the sample
standard deviation.<br>
\c 'm': normalize to mean zero: Replace each X with \f$(X-\mu)\f$<br>
\b Example
\code
\endcode
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD void apop_vector_normalize(gsl_vector *in, gsl_vector **out, const char normalization_type){
gsl_vector * apop_varad_var(in, NULL);
Apop_stopif(!in, return, 1, "Input vector is NULL. Doing nothing.");
gsl_vector ** apop_varad_var(out, NULL);
const char apop_varad_var(normalization_type, 'p');
APOP_VAR_END_HEAD
double mu, min, max;
if (!out) out = ∈
else {
*out = gsl_vector_alloc (in->size);
gsl_vector_memcpy(*out, in);
}
if (normalization_type == 's'){
mu = apop_vector_mean(in);
Apop_stopif(!isfinite(mu), return, 0, "normalization failed: the mean of the vector is not finite.");
gsl_vector_add_constant(*out, -mu);
double scaling = 1./(sqrt(apop_vector_var_m(*out, 0)));
Apop_stopif(!isfinite(scaling), return, 0, "normalization failed: 1/(std error) of the vector is not finite.");
gsl_vector_scale(*out, scaling);
}
else if (normalization_type == 'r'){
gsl_vector_minmax(in, &min, &max);
gsl_vector_add_constant(*out, -min);
gsl_vector_scale(*out, 1/(max-min));
}
else if (normalization_type == 'p'){
long double sum = apop_sum(in);
Apop_stopif(!sum, return, 0, "the vector sums to zero, so I can't normalize it to sum to one.");
gsl_vector_scale(*out, 1/sum);
}
else if (normalization_type == 'm'){
mu = apop_vector_mean(in);
Apop_stopif(!isfinite(mu), return, 0, "normalization failed: the mean of the vector is not finite.");
gsl_vector_add_constant(*out, -mu);
}
}
/** Returns the sum of the elements of a matrix. Occasionally convenient.
\param m the matrix to be summed.
*/
long double apop_matrix_sum(const gsl_matrix *m){
Apop_stopif(!m, return 0, 1, "You just asked me to sum a NULL. Returning zero.");
long double sum = 0;
for (size_t j=0; j< m->size1; j++)
for (size_t i=0; i< m->size2; i++)
sum += gsl_matrix_get(m, j, i);
return sum;
}
/** Returns the mean of all elements of a matrix.
\param data The matrix to be averaged. If \c NULL, return zero.
\return The mean of all cells of the matrix.
*/
double apop_matrix_mean(const gsl_matrix *data){
if (!data) return 0;
long double avg = 0;
int cnt = 0;
for(size_t i=0; i < data->size1; i++)
for(size_t j=0; j < data->size2; j++){
double x = gsl_matrix_get(data, i,j);
long double ratio = cnt/(cnt+1.0);
cnt++;
avg*= ratio;
avg+= x/cnt;
}
return avg;
}
/** Returns the mean and population variance of all elements of a matrix.
\li If \c NULL, return \f$\mu=0, \sigma^2=NaN\f$.
\li Gives the population variance (sum of squares divided by \f$N\f$).
If you want sample variance, multiply the result by \f$N/(N-1)\f$:
\code
double mu, var;
apop_data *data= apop_query_to_data("select * from indata");
apop_matrix_mean_and_var(data->matrix, &mu, &var);
var *= (data->size1*data->size2)/(data->size1*data->size2-1.0);
\endcode
\param data the matrix to be averaged.
\param mean where to put the mean to be calculated.
\param var where to put the variance to be calculated.
*/
void apop_matrix_mean_and_var(const gsl_matrix *data, double *mean, double *var){
if (!data) {*mean=0; *var=GSL_NAN; return;}
long double avg = 0,
avg2 = 0;
size_t cnt= 0;
long double x, ratio;
for(size_t i=0; i < data->size1; i++)
for(size_t j=0; j < data->size2; j++){
x = gsl_matrix_get(data, i,j);
ratio = cnt/(cnt+1.0);
cnt ++;
avg *= ratio;
avg2 *= ratio;
avg += x/(cnt +0.0);
avg2 += gsl_pow_2(x)/(cnt +0.0);
}
*mean = avg;
*var = avg2 - gsl_pow_2(avg); //E[x^2] - E^2[x]
}
/** Put summary information about the columns of a table (mean, std dev, variance, min, median, max) in a table.
\param indata The table to be summarized. An \ref apop_data structure. May have a <tt>weights</tt> element.
\return An \ref apop_data structure with one row for each column in the original
table, and a column for each summary statistic.
\exception out->error='a' Allocation error.
\li This function gives more columns than you probably want; use \ref apop_data_prune_columns to pick the ones you want to see.
\li See apop_data_prune_columns for an example.
*/
apop_data * apop_data_summarize(apop_data *indata){
Apop_stopif(!indata, return NULL, 0, "You sent me a NULL apop_data set. Returning NULL.");
Apop_stopif(!indata->matrix, return NULL, 0, "You sent me an apop_data set with a NULL matrix. Returning NULL.");
apop_data *out = apop_data_alloc(indata->matrix->size2, 6);
double mean, var;
char rowname[10000]; //crashes on more than 10^9995 columns.
apop_name_add(out->names, "mean", 'c');
apop_name_add(out->names, "std dev", 'c');
apop_name_add(out->names, "variance", 'c');
apop_name_add(out->names, "min", 'c');
apop_name_add(out->names, "median", 'c');
apop_name_add(out->names, "max", 'c');
if (indata->names !=NULL){
apop_name_stack(out->names,indata->names, 'r', 'c');
if (indata->names->title && strlen(indata->names->title)){
char *title;
Asprintf(&title, "summary for %s", indata->names->title);
apop_name_add(out->names, title, 'h');
free(title);
}
}
else
for (size_t i=0; i< indata->matrix->size2; i++){
sprintf(rowname, "col %zu", i);
apop_name_add(out->names, rowname, 'r');
}
for (size_t i=0; i< indata->matrix->size2; i++){
gsl_vector *v = Apop_cv(indata, i);
if (!indata->weights){
mean = apop_vector_mean(v);
var = apop_vector_var_m(v, mean);
} else {
mean = apop_vector_mean(v, indata->weights);
var = apop_vector_var(v, indata->weights);
}
double *pctiles = apop_vector_percentiles(v);
gsl_matrix_set(out->matrix, i, 0, mean);
gsl_matrix_set(out->matrix, i, 1, sqrt(var));
gsl_matrix_set(out->matrix, i, 2, var);
gsl_matrix_set(out->matrix, i, 3, pctiles[0]);
gsl_matrix_set(out->matrix, i, 4, pctiles[50]);
gsl_matrix_set(out->matrix, i, 5, pctiles[100]);
free(pctiles);
}
return out;
}
/** Returns an array of size 101, where \c returned_vector[95] gives the value of the
95th percentile, for example. \c Returned_vector[100] is always the maximum value,
and \c returned_vector[0] is always the min (regardless of rounding rule).
\param data A \c gsl_vector with the data. (No default, must not be \c NULL.)
\param rounding Either be \c 'u', \c 'd', or \c 'a'. Unless your data is
exactly a multiple of 101, some percentiles will be ambiguous. If \c 'u', then round
up (use the next highest value); if \c 'd', round down to the next lowest value; if \c
'a', take the mean of the two nearest points. (Default = \c 'd'.)
\li If the rounding method is \c 'u' or \c 'a', then you can say "5% or more of
the sample is below returned_vector[5]"; if \c 'd' or \c 'a', then you can say "5%
or more of the sample is above returned_vector[5]".
\li You may eventually want to \c free() the array returned by this function.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD double * apop_vector_percentiles(gsl_vector *data, char rounding){
gsl_vector *apop_varad_var(data, NULL);
Apop_stopif(!data, return NULL, 0, "You gave me NULL data.");
char apop_varad_var(rounding, 'd');
APOP_VAR_ENDHEAD
gsl_vector *sorted = gsl_vector_alloc(data->size);
double *pctiles = malloc(sizeof(double) * 101);
gsl_vector_memcpy(sorted,data);
gsl_sort_vector(sorted);
for(int i=0; i<101; i++){
int index = i*(data->size-1)/100.0;
if (rounding == 'u' && index != i*(data->size-1)/100.0)
index ++; //index was rounded down, but should be rounded up.
if (rounding == 'a' && index != i*(data->size-1)/100.0)
pctiles[i] = (gsl_vector_get(sorted, index)+gsl_vector_get(sorted, index+1))/2.;
else pctiles[i] = gsl_vector_get(sorted, index);
}
gsl_vector_free(sorted);
return pctiles;
}
/** Find the mean, weighted or unweighted.
\param v The data vector
\param weights The weight vector. Default: assume equal weights.
\return The weighted mean
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD double apop_vector_mean(gsl_vector const *v, gsl_vector const *weights){
gsl_vector const * apop_varad_var(v, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
Check_vw
APOP_VAR_END_HEAD
if (!weights) return gsl_stats_mean(v->data, v->stride, v->size);
long double sum = 0, wsum = 0;
for (size_t i=0; i< weights->size; i++){
sum += gsl_vector_get(weights, i) * gsl_vector_get(v,i);
wsum += gsl_vector_get(weights, i);
}
return sum/wsum;
}
/** Find the sample variance of a vector, weighted or unweighted.
\param v The data vector
\param weights The weight vector. If NULL (the default), assume equal weights.
\return The weighted sample variance.
\li This uses (n-1) in the denominator of the sum; i.e., it corrects for the bias
introduced by using \f$\bar x\f$ instead of \f$\mu\f$.
\li Multiply the output by (n-1)/n if you need population variance.
\li Apophenia tries to be smart about reading the weights. If weights
sum to one, then the system uses \c w->size as the number of elements,
and returns the usual sum over \f$n-1\f$. If weights > 1, then the
system uses the total weights as \f$n\f$. Thus, you can use the weights
as standard weightings or to represent elements that appear repeatedly.
\li This function uses the \ref designated syntax for inputs.
\see apop_vector_var_m for the case where you already have the vector's mean.
*/
APOP_VAR_HEAD double apop_vector_var(gsl_vector const *v, gsl_vector const *weights){
gsl_vector const * apop_varad_var(v, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
Check_vw
APOP_VAR_END_HEAD
if (!weights) return gsl_stats_variance(v->data, v->stride, v->size);
//Using the E(x^2) - E^2(x) form.
long double sum = 0, wsum = 0, sumsq = 0, vv, ww;
for (size_t i=0; i< weights->size; i++){
vv = gsl_vector_get(v, i);
ww = gsl_vector_get(weights, i);
sum += ww * vv;
sumsq += ww * gsl_pow_2(vv);
wsum += ww;
}
double len = (wsum < 1.1 ? weights->size : wsum);
return (sumsq/len - gsl_pow_2(sum/len)) * len/(len -1.);
}
/** Find the sample covariance of a pair of vectors, with an optional weighting. This only
makes sense if the weightings are identical, so the function takes only one weighting vector for both.
\param v1, v2 The data vectors (no default; must not be \c NULL)
\param weights The weight vector. (default equal weights for all elements)
\return The sample covariance
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD double apop_vector_cov(const gsl_vector *v1, const gsl_vector *v2, const gsl_vector *weights){
gsl_vector const * apop_varad_var(v1, NULL);
gsl_vector const * apop_varad_var(v2, NULL);
gsl_vector const * apop_varad_var(weights, NULL);
Apop_stopif(!v1, return GSL_NAN, 0, "first data vector is NULL. Returning NaN.");
Apop_stopif(!v2, return GSL_NAN, 0, "second data vector is NULL. Returning NaN.");
Apop_stopif(!v1->size, return GSL_NAN, 0, "first data vector has size 0. Returning NaN.");
Apop_stopif(!v2->size, return GSL_NAN, 0, "second data vector has size 0. Returning NaN.");
Apop_stopif(v1->size!= v2->size, return GSL_NAN, 0, "data vectors have sizes %zu and %zu. Returning NaN.", v1->size, v2->size);
Apop_stopif(weights && ((weights->size != v1->size) || (weights->size != v2->size)), return GSL_NAN, 0, "data vectors have sizes %zu and %zu; weighting vector has size %zu. Returning NaN.", v1->size, v2->size, weights->size);
APOP_VAR_ENDHEAD
if (!weights) return gsl_stats_covariance(v1->data, v1->stride, v2->data, v2->stride, v2->size);
long double sum1 = 0, sum2 = 0, wsum = 0, sumsq = 0, vv1, vv2, ww;
//Using the E(x^2) - E^2(x) form.
for (size_t i=0; i< weights->size; i++){
vv1 = gsl_vector_get(v1,i);
vv2 = gsl_vector_get(v2,i);
ww = gsl_vector_get(weights,i);
sum1 += ww * vv1;
sum2 += ww * vv2;
sumsq+= ww * vv1 * vv2;
wsum += ww;
}
double len = (wsum < 1.1 ? weights->size : wsum);
return (sumsq/len - sum1*sum2/gsl_pow_2(len)) *(len/(len-1));
}
/** Returns the sample variance/covariance matrix relating each column of the matrix to each other column.
\param in An \ref apop_data set. If the weights vector is set, I'll take it into account.
\li This is the sample covariance---dividing by \f$n-1\f$, not \f$n\f$. If you need the population variance, use
\code
apop_data *popcov = apop_data_covariance(indata);
int size=indata->matrix->size1;
gsl_matrix_scale(popcov->matrix, size/(size-1.));
\endcode
\return Returns an \ref apop_data set the variance/covariance matrix.
\exception out->error='a' Allocation error.
*/
apop_data *apop_data_covariance(const apop_data *in){
Apop_stopif(!in, return NULL, 1, "You sent me a NULL apop_data set. Returning NULL.");
Apop_stopif(!in->matrix, return NULL, 1, "You sent me an apop_data set with a NULL matrix. Returning NULL.");
apop_data *out = apop_data_alloc(in->matrix->size2, in->matrix->size2);
Apop_stopif(out->error, return out, 0, "allocation error.");
for (size_t i=0; i < in->matrix->size2; i++){
for (size_t j=i; j < in->matrix->size2; j++){
double var = apop_vector_cov(Apop_cv(in, i), Apop_cv(in, j), in->weights);
gsl_matrix_set(out->matrix, i,j, var);
if (i!=j) gsl_matrix_set(out->matrix, j,i, var);
}
}
apop_name_stack(out->names, in->names, 'c');
apop_name_stack(out->names, in->names, 'r', 'c');
return out;
}
/** Returns the matrix of correlation coefficients \f$(\sigma^2_{xy}/(\sigma_x\sigma_y))\f$ relating each column with each other.
\param in A data matrix: rows are observations, columns are variables. If you give me a weights vector, I'll use it.
\return Returns the square variance/covariance matrix with dimensions equal to the number of input columns.
\exception out->error='a' Allocation error.
*/
apop_data *apop_data_correlation(const apop_data *in){
apop_data *out = apop_data_covariance(in);
if (!out) return NULL;
for(size_t i=0; i< in->matrix->size2; i++){
double std_dev = sqrt(apop_vector_var(Apop_cv(in, i), in->weights));
gsl_vector_scale(Apop_cv(out, i), 1.0/std_dev);
gsl_vector_scale(Apop_rv(out, i), 1.0/std_dev);
}
return out;
}
/** Given a vector representing a probability distribution of observations, calculate the entropy, \f$\sum_i -\ln(v_i)v_i\f$.
\li You may input a vector giving frequencies (normalized to sum to one) or counts (arbitrary sum).
\li The entropy of a data set depends only on the frequency with which elements are
observed, not the value of the elements themselves. The \ref apop_data_pmf_compress
function will reduce an input \ref apop_data set to one weighted line per observation, and
the weights would determine the entropy:
\code
apop_data *data = apop_text_to_data("indata");
apop_data_pmf_compress(data);
data_entropy = apop_vector_entropy(d->weights);
\endcode
\li The entropy is calculated using natural logs. To convert to base 2, divide by \f$\ln(2)\f$; see the example.
\li The entropy of an empty data set (\c NULL or a total weight of zero) is zero. Print a warning when given \c NULL
input and <tt>apop_opts.verbose >=1</tt>.
\li If the input vector has negative elements, return \c NaN; print a warning when <tt>apop_opts.verbose >= 0</tt>.
Sample code:
\include entropy_vector.c
*/
long double apop_vector_entropy(gsl_vector *in){
Apop_stopif(!in, return 0, 1, "Entropy of a NULL vector ≡ 0");
Apop_stopif(!in->size, return 0, 1, "Entropy of a zero-length vector ≡ 0");//can't happen.
//User may or may not have normalized in, so scale everything by the sum.
long double sum = apop_vector_sum(in);
Apop_stopif(sum<0, return NAN, 0, "Vector sums to a negative value (%Lg). Returning NaN.\n", sum);
if (!sum) return 0;
long double out=0;
for (int i=0; i< in->size; i++){
double val = gsl_vector_get(in, i)/sum;
Apop_stopif(val<0, return NAN, 0, "negative value (%g) in vector position %i. Returning NaN.\n", val, i);
if (!val) continue;
out -= logl(val)*val;
}
return out;
}
static long double norment(apop_model *m){
double sigma_sq = gsl_pow_2(apop_data_get(m->parameters, 1));
return (log(2*M_PI*sigma_sq) +1)/2.;
}
double get_ll(apop_data *d, void *m){ return apop_log_likelihood(d, m); }
/** Calculate the entropy of a model: \f$\int -\ln(p(x))p(x)dx\f$, which is the expected
value of \f$-\ln(p(x))\f$.
The default method is to make draws using \ref apop_model_draws, then
evaluate the log likelihood at those points using the model's \c log_likelihood method.
There are a number of routines for specific models, inlcuding the \ref apop_normal and \ref apop_pmf models.
\li If you want the entropy of a data set, see \ref apop_vector_entropy.
\li The entropy is calculated using natural logs. If you prefer base-2 logs, just divide by \f$\ln(2)\f$: <tt>apop_model_entropy(my_model)/log(2)</tt>.
\param in A parameterized \ref apop_model. That is, you have already used \ref apop_estimate or \ref apop_model_set_parameters to estimate/set the model parameters.
\param draws If using the default method of making random draws, how many random draws to make (default=1,000)
Sample code:
\include entropy_model.c
*/
APOP_VAR_HEAD long double apop_model_entropy(apop_model *in, int draws){
apop_model * apop_varad_var(in, NULL);
Apop_stopif(!in, return NAN, 0, "NULL input model. Returning NaN.");
int apop_varad_var(draws, 1000);
APOP_VAR_ENDHEAD
static int setup=0; if (!(setup++)){
apop_entropy_vtable_add(norment, apop_normal);
}
apop_entropy_type e_fn = apop_entropy_vtable_get(in);
if (e_fn) return e_fn(in);
apop_data *d = apop_model_draws(in, draws);
apop_data *lls = apop_map(d, .fn_rp=get_ll, .param=in);
long double out = -apop_vector_mean(lls->vector);
apop_data_free(d);
apop_data_free(lls);
return out;
}
/** Kullback-Leibler divergence.
This measure of the divergence of one distribution from another has the form \f$ D(p,q)
= \sum_i \ln(p_i/q_i) p_i \f$. Notice that it is not a distance, because there is an
asymmetry between \f$p\f$ and \f$q\f$, so one can expect that \f$D(p, q) \neq D(q, p)\f$.
\param from the \f$p\f$ in the above formula. (No default; must not be \c NULL)
\param to the \f$q\f$ in the above formula. (No default; must not be \c NULL)
\param draw_ct If I do the calculation via random draws, how many? (Default = 1e5)
\param rng A \c gsl_rng. If \c NULL or number of threads is greater than 1, I'll take care of the RNG; see \ref apop_rng_get_thread. (Default = \c NULL)
This function can take empirical histogram-type models (\ref apop_pmf) or continuous
models like \ref apop_loess or \ref apop_normal.
If the \c from distribution is a PMF (determined by checking whether its \c p function
is that of \ref apop_pmf), then I'll step through it for the points in the summation.
\li If you have two empirical distributions in the form of \ref apop_pmf, they must
be synced: if \f$p_i>0\f$ but \f$q_i=0\f$, then the function returns \c GSL_NEGINF. If
<tt>apop_opts.verbose >=1</tt> I print a message as well.
If the \c from distribution is not a PMF, then I will take \c draw_ct random draws
from \c from and evaluate at those points.
\li Set <tt>apop_opts.verbose = 3</tt> for observation-by-observation info.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD long double apop_kl_divergence(apop_model *from, apop_model *to, int draw_ct, gsl_rng *rng){
apop_model * apop_varad_var(from, NULL);
apop_model * apop_varad_var(to, NULL);
Apop_stopif(!from, return NAN, 0, "The first model is NULL; returning NaN.");
Apop_stopif(!to, return NAN, 0, "The second model is NULL.");
double apop_varad_var(draw_ct, 1e5);
gsl_rng * apop_varad_var(rng, NULL);
APOP_VAR_ENDHEAD
double div = 0;
Apop_notify(3, "p(from)\tp(to)\tfrom*log(from/to)\n");
if (from->p == apop_pmf->p){
apop_data *p = from->data;
apop_pmf_settings *settings = Apop_settings_get_group(from, apop_pmf);
Get_vmsizes(p); //maxsize
OMP_for_reduce (+:div, int i=0; i < maxsize; i++){
double pi = p->weights ? gsl_vector_get(p->weights, i)/settings->total_weight : 1./maxsize;
if (!pi){
Apop_notify(3, "0\t--\t0");
continue;
} //else:
double qi = apop_p(Apop_r(p, i), to);
Apop_notify(3,"%g\t%g\t%g", pi, qi, pi ? pi * log(pi/qi):0);
Apop_stopif(!qi, div+=GSL_NEGINF; break, 1, "The PMFs aren't synced: from-distribution has a value where "
"to-distribution doesn't (which produces infinite divergence).");
div += pi * log(pi/qi);
}
} else { //the version with the RNG.
Apop_stopif(!from->dsize, return GSL_NAN, 0, "I need to make random draws from the 'from' model, "
"but its dsize (draw size)==0. Returning NaN.");
OMP_for_reduce(+:div, int i=0; i < draw_ct; i++){
double draw[from->dsize];
apop_draw(draw, rng, from);
gsl_matrix_view dm = gsl_matrix_view_array(draw, 1, from->dsize);
double pi = apop_p(&(apop_data){.matrix=&(dm.matrix)}, from);
double qi = apop_p(&(apop_data){.matrix=&(dm.matrix)}, to);
double val = pi ? log(pi/qi): 0; //each row already has probability p_i
Apop_notify(3,"%g\t%g\t%g", pi, qi, val);
div += val;
Apop_stopif(!qi, break, 1, "From-distribution has a value where "
"to-distribution doesn't (which produces infinite divergence).");
}
div /= draw_ct; //div is an expected value of ln(pi/qi)
}
return div;
}
/** The multivariate generalization of the Gamma distribution.
\f[
\Gamma_p(a)=
\pi^{p(p-1)/4}\prod_{j=1}^p
\Gamma\left[ a+(1-j)/2\right]. \f]
Because \f$\Gamma(x)\f$ is undefined for \f$x\in\{0, -1, -2, ...\}\f$, this function returns \c NAN when \f$a+(1-j)/2\f$ takes on one of those values.
See also \ref apop_multivariate_lngamma, which is more numerically stable in most cases.
*/
long double apop_multivariate_gamma(double a, int p){
Apop_stopif(-(a+(1-p)/2) == (int)-(a+(1-p)/2) && a+(1-p)/2 <=0, return NAN, 1, "Undefined when a + (1-p)/2 = 0, -1, -2, ... [you sent a=%g, p=%i]", a, p);
long double out = pow(M_PI, p*(p-1.)/4.);
long double factor = 1;
for (int i=1; i<=p; i++)
factor *= gsl_sf_gamma(a+(1-i)/2.);
return out * factor;
}
/** The log of the multivariate generalization of the Gamma; see also
\ref apop_multivariate_gamma.
*/
long double apop_multivariate_lngamma(double a, int p){
Apop_stopif(-(a+(1-p)/2) == (int)-(a+(1-p)/2) && a+(1-p)/2 <=0, return NAN, 1, "Undefined when a + (1-p)/2 = 0, -1, -2, ... [you sent a=%g, p=%i]", a, p);
long double out = M_LNPI * p*(p-1.)/4.;
for (int i=1; i<=p; i++)
out += gsl_sf_lngamma(a+(1-i)/2.);
return out;
}
static void find_eigens(gsl_matrix **subject, gsl_vector *eigenvals, gsl_matrix *eigenvecs){
gsl_eigen_symmv_workspace * w = gsl_eigen_symmv_alloc((*subject)->size1);
gsl_eigen_symmv(*subject, eigenvals, eigenvecs, w);
gsl_eigen_symmv_free (w);
gsl_matrix_free(*subject); *subject = NULL;
}
static void diagonal_copy(gsl_vector *v, gsl_matrix *m, char in_or_out){
gsl_vector_view dv = gsl_matrix_diagonal(m);
if (in_or_out == 'i') gsl_vector_memcpy(&(dv.vector), v);
else gsl_vector_memcpy(v, &(dv.vector));
}
static double diagonal_size(gsl_matrix *m){
gsl_vector_view dv = gsl_matrix_diagonal(m);
return apop_sum(&dv.vector);
}
static double biggest_elmt(gsl_matrix *d){
return GSL_MAX(fabs(gsl_matrix_max(d)), fabs(gsl_matrix_min(d)));
}
/** Test whether the input matrix is positive semidefinite (PSD).
A covariance matrix will always be PSD, so this function can tell you whether your matrix is a valid covariance matrix.
Consider the 1x1 matrix in the upper left of the input, then the 2x2 matrix in the
upper left, on up to the full matrix. If the matrix is PSD, then each of these has
a positive determinant. This function thus calculates \f$N\f$ determinants for an
\f$N\f$x\f$N\f$ matrix.
\param m The matrix to test. If \c NULL, I will return zero---not PSD.
\param semi If anything but \c 's', check for positive definite, not semidefinite. (default 's')
See also \ref apop_matrix_to_positive_semidefinite, which will change the input to something PSD.
\li This function uses the \ref designated syntax for inputs.
*/
APOP_VAR_HEAD int apop_matrix_is_positive_semidefinite(gsl_matrix *m, char semi){
gsl_matrix * apop_varad_var(m, NULL);
Apop_stopif(!m, return 0, 1, "You gave me a NULL matrix. I will take this as not positive semidefinite; returning zero.");
char apop_varad_var(semi, 's');
APOP_VAR_ENDHEAD
for (int i=1; i<= m->size1; i++){
gsl_matrix mv =gsl_matrix_submatrix (m, 0, 0, i, i).matrix;
double det = apop_matrix_determinant(&mv);
if ((semi == 'd' && det <0) || det <=0)
return 0;
}
return 1;
}
void vfabs(double *x){*x = fabs(*x);}
/** This function takes in a matrix and converts it in place to the `closest' positive semidefinite matrix.
\param m On input, any matrix; on output, a positive semidefinite matrix. If \c NULL, return \c NaN and print an error.
\return the distance between the original and new matrices.
\li See also the test function \ref apop_matrix_is_positive_semidefinite.
\li This function can be used as the core of a model constraint.
\li Adapted from the R Matrix package's nearPD, which is
Copyright (2007) Jens Oehlschlägel [under the GPL].
*/
double apop_matrix_to_positive_semidefinite(gsl_matrix *m){
Apop_stopif(!m, return NAN, 0, "Got a NULL matrix. Returning NaN.");
if (apop_matrix_is_positive_semidefinite(m)) return 0;
double diffsize=0, dsize;
apop_data *qdq;
gsl_matrix *d = apop_matrix_copy(m);
gsl_matrix *original = apop_matrix_copy(m);
double orig_diag_size = fabs(diagonal_size(d));
int size = d->size1;
gsl_vector *diag = gsl_vector_alloc(size);
diagonal_copy(diag, d, 'o');
apop_vector_apply(diag, vfabs);
double origsize = biggest_elmt(d);
do {
//get eigenvals
apop_data *eigenvecs = apop_data_alloc(size, size);
gsl_vector *eigenvals = gsl_vector_calloc(size);
gsl_matrix *junk_copy = apop_matrix_copy(d);
find_eigens(&junk_copy, eigenvals, eigenvecs->matrix);//junk freed here.
//prune positive only
int j=0;
int plussize = eigenvecs->matrix->size1;
int *mask = calloc(eigenvals->size , sizeof(int));
for (int i=0; i< eigenvals->size; i++)
plussize -=
mask[i] = (gsl_vector_get(eigenvals, i) <= 0);
//construct Q = pruned eigenvals
apop_data_rm_columns(eigenvecs, mask);
if (!eigenvecs->matrix) break;
//construct D = positive eigen diagonal
apop_data *eigendiag = apop_data_calloc(0, plussize, plussize);
for (int i=0; i< eigenvals->size; i++)
if (!mask[i]) {
apop_data_set(eigendiag, j, j, eigenvals->data[i]);
j++;
}
// Our candidate is QDQ', symmetrized, with the old diagonal subbed in.
apop_data *qd = apop_dot(eigenvecs, eigendiag);
qdq = apop_dot(qd, eigenvecs, .form2='t');
for (int i=0; i< qdq->matrix->size1; i++)
for (int j=i+1; j< qdq->matrix->size1; j++){
double avg = (apop_data_get(qdq, i, j) +apop_data_get(qdq, j, i)) /2.;
apop_data_set(qdq, i, j, avg);
apop_data_set(qdq, j, i, avg);
}
diagonal_copy(diag, qdq->matrix, 'i');
// Evaluate progress, clean up.
dsize = biggest_elmt(d);
gsl_matrix_sub(d, qdq->matrix);
diffsize = biggest_elmt(d);
apop_data_free(qd); gsl_matrix_free(d);
apop_data_free(eigendiag); free(mask);
apop_data_free(eigenvecs); gsl_vector_free(eigenvals);
d = qdq->matrix;
qdq->matrix=NULL; apop_data_free(qdq); qdq = NULL;
} while (diffsize/dsize > 1e-3);
apop_data *eigenvecs = apop_data_alloc(size, size);
gsl_vector *eigenvals = gsl_vector_calloc(size);
find_eigens(&d, eigenvals, eigenvecs->matrix);//free d here.
//make eigenvalues more positive
double score =0;
for (int i=0; i< eigenvals->size; i++){
double v = gsl_vector_get(eigenvals, i);
if (v < 1e-1){
gsl_vector_set(eigenvals, i, 1e-1);
score += 1e-1 - v;
}
}
for (int i=0; i< size; i++)
assert(eigenvals->data[i] >=0);
//if (score){
apop_data *eigendiag = apop_data_calloc(0, size, size);
diagonal_copy(eigenvals, eigendiag->matrix, 'i');
double new_diag_size = diagonal_size(eigendiag->matrix);
gsl_matrix_scale(eigendiag->matrix, orig_diag_size/new_diag_size);
apop_data *qd = apop_dot(eigenvecs, eigendiag);
qdq = apop_dot(qd, eigenvecs, .form2='t');
gsl_matrix_memcpy(m, qdq->matrix);
apop_data_free(qd);
apop_data_free(eigendiag);
//}
assert(apop_matrix_is_positive_semidefinite(m));
apop_data_free(qdq); gsl_vector_free(diag);
apop_data_free(eigenvecs); gsl_vector_free(eigenvals);
gsl_matrix_sub(original, m);
return biggest_elmt(original)/origsize;
}