## LeDuc, Pereira, Zhang
# A first pass at working with the project data: measure the distance between probability
# distributions of the depth of minimum sound speed (despite some constant names below, the
# distance actually computed here is the Kolmogorov-Smirnov statistic, not the KL divergence).
import numpy as np
import pandas as pd
import pylab as pl # used for plotting the homology counts at the end of the script
from abstract_simplicial_complex import Point, Simplex, vietoris_rips
from metrics import ks_test
from random import sample, seed
from scipy.interpolate import interp1d
from statfuncs import ecdf
# import sys
# sys.setrecursionlimit(1000)
# Global constants (bad)
# TODO: move these into a config file?
USE_RANDOM_SAMPLING = True # Read a random sample of points from the input data
MIN_POINTS_FOR_KL_DIVERGENCE = 4 # Minimum number of points needed to compute KL divergence
TEST_SAMPLE_SIZE = 2500 # Number of points to extract from dataset (non-random sampling)
MIN_SIGNIFICANCE_LEVEL = 0.05 # Used in Kolmogorov-Smirnov test
MAX_ASC_DIMENSION = 2 # Maximum dimension of the simplices considered in the output ASC
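# Pipeline overview (inferred from the main block below): read a sample of profiles, bin them
# into 3-degree latitude/longitude boxes, build an empirical CDF of the depth of minimum sound
# speed in each box, use the pairwise Kolmogorov-Smirnov statistic as a distance between boxes,
# then build Vietoris-Rips complexes over a range of radii and track their homology.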
# Read a subset of data from multiple years (currently, 2 years -- hardcoded)
def read_raw_data(sample_size=None, sample_randomly=True, random_seed=0):
seed(random_seed)
data1 = np.genfromtxt('data/MITprof_mar2016_argo0708.nc.csv', delimiter = ',')
data2 = np.genfromtxt('data/MITprof_mar2016_argo0910.nc.csv', delimiter = ',')
data = np.concatenate((data1[:,1:], data2[:, 1:]), axis = 1)
depths = data1[3:,0]
N = data.shape[1] # total number of data points
# Choose a subset of points to return, if sample_size is specified
if (sample_size is not None) and (sample_size < N):
if sample_randomly:
all_indices = range(data.shape[1])
selected_indices = sample(all_indices, sample_size)
else:
selected_indices = range(sample_size)
data = data[:, selected_indices]
return data, depths
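# Assumed layout of the MITprof CSV exports (inferred from the indexing below): row 0 holds
# latitudes, row 1 longitudes, row 2 MATLAB datenums, and rows 3+ the sound speed at each depth
# level; column 0 carries the depth labels and every remaining column is one profile.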
# Preprocess data to extract dates, ocean depths, latitudes, longitudes, and sound speed profiles
def extract_features(data):
    NUM_DAYS_TO_UNIX_EPOCH = 719529 # MATLAB datenum of the Unix epoch (January 1, 1970)
dates = pd.to_datetime(data[2, 1:] - NUM_DAYS_TO_UNIX_EPOCH, unit='D')
# Spatial dimensions
    latitudes = np.around(data[0, 1:] / 3) * 3 # rounded to the nearest 3 degrees; length: N
    longitudes = np.around(data[1, 1:] / 3) * 3 # rounded to the nearest 3 degrees; length: N
# Sound speed profiles
sound_speed_profiles = data[3:, 1:] # dimensions: K x N
return dates, latitudes, longitudes, sound_speed_profiles
# Divide the Earth's surface into geographic (latitude/longitude) boxes and return the
# coordinates of every box: the Cartesian product of the unique rounded latitudes and longitudes.
def locate_grid_boundaries(latitudes, longitudes):
# Can find probability density functions of sound speed minimum or maximum
# <unused>
# mask by month first, then divide into lat/lon boxes?
## sound_speed_minimums = np.argmin(sound_speed_profiles, axis = 0)
## min_depths = depths[sound_speed_minimums]
## for mm in [1]: # [1,2,3,4,5,6,7,8,9,10,11,12]
## month_mask = ( dates.month == mm )
## this_month_min_depths = min_depths[month_mask]
## this_month_lats = lats[month_mask]
## this_month_lons = lons[month_mask]
## ...
# </unused>
unique_latitudes = np.unique(latitudes)
unique_longitudes = np.unique(longitudes)
# Cartesian product between latitude and longitude gridpoints
    lat_grid, lon_grid = np.meshgrid(unique_latitudes, unique_longitudes)
    return lat_grid.ravel(), lon_grid.ravel()
# Compute a CDF of the minimum-sound-speed depth for each geographic box
def compute_boxed_cdfs(latitudes, longitudes, grid_latitudes, grid_longitudes, sound_speed_profiles,
                       depths, min_points_per_box=MIN_POINTS_FOR_KL_DIVERGENCE):
    min_speeds = np.argmin(sound_speed_profiles, axis = 0)
    min_depths = depths[min_speeds]
    num_boxes = len(grid_latitudes)
    spatial_cdf = np.zeros([len(depths), num_boxes])
    subsample_sizes = np.zeros(num_boxes)
    for k in range(num_boxes):
lat_indices = np.where(latitudes == grid_latitudes[k])
lon_indices = np.where(longitudes == grid_longitudes[k])
box_indices = np.intersect1d(lat_indices, lon_indices)
# Consider the number of points in this box
subsample_sizes[k] = len(np.unique(min_depths[box_indices]))
# and whether the subsample size is sufficient
if subsample_sizes[k] < min_points_per_box:
continue
# Compute empirical CDF on the depths
quantiles, probabilities = ecdf(min_depths[box_indices])
        # Interpolate the ecdf onto the depth grid we started with; we need these CDFs on a
        # common grid to measure the distance between points.
        interpolator = interp1d(quantiles, probabilities, fill_value="extrapolate")
        p = interpolator(depths) # interpolated cumulative probabilities
        # Extrapolation can push values outside [0, 1], so clip them back
        spatial_cdf[:, k] = np.clip(p, 0.0, 1.0)
    # Exclude boxes with too few data points
    mask = np.flatnonzero(subsample_sizes >= min_points_per_box)
    masked_lats = grid_latitudes[mask]
    masked_lons = grid_longitudes[mask]
    masked_cdfs = spatial_cdf[:, mask]
    return masked_lats, masked_lons, masked_cdfs
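# Note: `ecdf` comes from the local `statfuncs` module, which is not shown here. It is assumed
# to return the sorted sample values and the corresponding cumulative probabilities, so that
# interpolating its output onto the common depth grid yields one CDF column per box.
# An illustrative sketch only (this hypothetical body is not the project's implementation):
#
#     def ecdf(samples):
#         x = np.sort(samples)
#         return x, np.arange(1, len(x) + 1) / len(x)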
# Compute the Kolmogorov-Smirnov statistic between each pair of geographic boxes
def compute_pairwise_ks_statistics(latitudes, longitudes, cdfs):
# <unused>
## Kolmogorov-Smirnov constants
# a = MIN_SIGNIFICANCE_LEVEL
# c_a = np.sqrt(-np.log(a/2)*0.5)
## Value the metric needs to exceed to reject the ks test null hypothesis at the given significance level
# critical_value = c_a * np.sqrt(2 / len(depths))
# </unused>
num_boxes = len(latitudes)
assert(num_boxes == len(longitudes))
ks_matrix = np.zeros([num_boxes, num_boxes])
# Calculate the test statistic on each box
for i in range(0, num_boxes):
for j in range(i+1, num_boxes):
ks_matrix[i, j] = ks_test(cdfs[:, i], cdfs[:, j])
ks_matrix[j, i] = ks_matrix[i, j]
return ks_matrix
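# Note: `ks_test` comes from the local `metrics` module, which is not shown here. Since both
# CDFs are sampled on the same depth grid, it is assumed to return the two-sample
# Kolmogorov-Smirnov statistic, i.e. the largest absolute difference between the two CDFs.
# An illustrative sketch only (not the project's implementation):
#
#     def ks_test(cdf_a, cdf_b):
#         return np.max(np.abs(np.asarray(cdf_a) - np.asarray(cdf_b)))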
# Human-readable format for a geographic point
def stringify_coordinates(latitude, longitude):
    return f"{latitude}deg N by {longitude}deg E"
def generate_geographic_names(latitudes, longitudes):
    num_boxes = len(latitudes)
    assert num_boxes == len(longitudes)
    return [stringify_coordinates(latitudes[k], longitudes[k]) for k in range(num_boxes)]
# Create a set of N points, where N = |names| = number of columns of coords
# names: list of strings
# coords: 2D array whose k-th column holds the coordinates of the k-th point
def create_point_cloud(names, coords, metric):
cloud = set()
N = len(names)
assert(N == coords.shape[1])
for k in range(len(names)):
cloud.add(Point(names[k], coords[:, k], metric))
return cloud
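# Note: Point, Simplex, and vietoris_rips come from the local abstract_simplicial_complex
# module, which is not shown here. As used below, Point(name, coords, metric) wraps a named
# coordinate vector with a distance function, and vietoris_rips(points, max_dim, radius)
# returns a complex exposing .simplices, .boundary_matrix, .compute_boundary(k),
# .compute_homology(k), and .extract_kernel_cycles(k). These signatures are inferred from
# this file, not verified against that module.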
if __name__ == '__main__':
sample_data, depths = read_raw_data(sample_size=TEST_SAMPLE_SIZE, sample_randomly=USE_RANDOM_SAMPLING)
dates, latitudes, longitudes, sound_speed_profiles = extract_features(sample_data)
grid_latitudes, grid_longitudes = locate_grid_boundaries(latitudes, longitudes)
    masked_latitudes, masked_longitudes, masked_cdfs = compute_boxed_cdfs(
        latitudes, longitudes, grid_latitudes, grid_longitudes, sound_speed_profiles, depths)
# depths = np.genfromtxt('depths.csv')
# coordvals = np.genfromtxt( 'C:/Users/Matt/Desktop/Masters coursework/topology/project/results/sca depth/2000 pts/'
# +'boxcoords.csv', delimiter = ',')
#
# masked_latitudes = coordvals[0,:]
# masked_longitudes = coordvals[1,:]
print('There are ' + str(len(masked_latitudes)) + ' lat/lon boxes')
# masked_cdfs = np.genfromtxt('C:/Users/Matt/Desktop/Masters coursework/topology/project/results/sca depth/2000 pts/'
# +'ssp_cdfs.csv', delimiter = ',')
# distance_matrix = compute_pairwise_ks_statistics(masked_latitudes, masked_longitudes, masked_cdfs)
    minRadius = 0.1 # np.min(distance_matrix[distance_matrix > 0.1])
# Generate an ASC based on Kolmogorov-Smirnov distances fed into Vietoris-Rips algorithm
geographic_names = generate_geographic_names(masked_latitudes, masked_longitudes)
point_cloud = create_point_cloud(geographic_names, masked_cdfs, ks_test)
    a = MIN_SIGNIFICANCE_LEVEL
    c_a = np.sqrt(-np.log(a/2)*0.5) # ~1.358 for a = 0.05
    # Two-sample KS critical value, taking n = len(depths) as the sample size: the statistic must
    # exceed c_a * sqrt(2/n) to reject the null hypothesis F_1 = F_2 at significance level a.
    critical_value = c_a * np.sqrt(2 / len(depths))
    dr = 0.01
    radii = np.arange(minRadius, 0.46, dr)
    # Beyond roughly r = 0.29 the KS test rejects F_1 = F_2 at the 0.05 level, so the
    # distributions being grouped there are statistically dissimilar and grouping them may not
    # be meaningful. On a first pass, though, most of the interesting structure shows up around
    # that radius, so we keep scanning past it.
    homologies = np.zeros([2, len(radii)])
    for rndx, rr in enumerate(radii): # keep the index so we can store the homology dimensions
        rips_asc = vietoris_rips(point_cloud, MAX_ASC_DIMENSION, rr)
        print("Radius = " + str(rr))
        # Compute boundary matrices in each dimension
        for k in range(MAX_ASC_DIMENSION + 1):
            rips_asc.compute_boundary(k)
        for k in range(MAX_ASC_DIMENSION):
            if rips_asc.boundary_matrix[k+1] is not None:
                thisHomology = rips_asc.compute_homology(k)
                homologies[k, rndx] = len(thisHomology.chains)
            else:
                # If C_{k+1} is empty then Im(del_{k+1}) = {}, so H_k = ker(del_k)
                homologies[k, rndx] = len(rips_asc.extract_kernel_cycles(k).chains)
    # Plot the number of homology generators in dimensions 0 and 1 as a function of radius
    pl.plot(radii, homologies[0, :])
    pl.plot(radii, homologies[1, :])
    pl.show()
    # Write out the simplices of the complex at radii of interest (here, the KS critical value)
    coolrads = [np.round(critical_value, 2)]
    for rads in coolrads:
        persistent = vietoris_rips(point_cloud, MAX_ASC_DIMENSION, rads)
        with open('persistent_homology_simplex_names_radius' + str(rads) + '_mysterydata.txt', 'w') as simplexFile:
            for simplex in persistent.simplices:
                print(simplex)
                for point in simplex.points:
                    simplexFile.write(point.name + ' , ')
                simplexFile.write('\n')
coords = np.zeros([2, len(masked_latitudes)])
coords[0,:] = masked_latitudes
coords[1,:] = masked_longitudes
    np.savetxt('coords.csv', coords, delimiter = ',')
np.savetxt('cdfs_'+str(TEST_SAMPLE_SIZE)+'points.csv', masked_cdfs, delimiter = ',')