-
Notifications
You must be signed in to change notification settings - Fork 4
/
pyUPMASK.py
212 lines (179 loc) · 7.19 KB
/
pyUPMASK.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import os
from pathlib import Path
import numpy as np
from astropy.stats import RipleysKEstimator
import time as t
from modules import outer
from modules.dataIO import readFiles, readINI, dread, dmask, dxynorm, dwrite
import multiprocessing as mp
def main(
parallel_flag, parallel_procs, rnd_seed, verbose, ID_c, x_c, y_c,
data_cols, data_errs, oultr_method, stdRegion_nstd, OL_runs, resampleFlag,
PCAflag, PCAdims, GUMM_flag, GUMM_perc, KDEP_flag, IL_runs, N_membs,
N_cl_max, clust_method, clRjctMethod, C_thresh, cl_method_pars):
"""
"""
out_folder = "output"
# Create 'output' folder if it does not exist
Path('./{}'.format(out_folder)).mkdir(parents=True, exist_ok=True)
# Process all files inside the '/input' folder
inputfiles = readFiles()
for file_path in inputfiles:
print("\n")
print("===========================================================")
print("Processing : {}".format(file_path.name))
# Set a random seed for reproducibility
if rnd_seed == 'None':
seed = np.random.randint(100000)
else:
seed = int(rnd_seed)
print("Random seed : {}".format(seed))
np.random.seed(seed)
# Original data
full_data, cl_ID, cl_xy, cl_data, cl_errs, data_rjct = dread(
file_path, ID_c, x_c, y_c, data_cols, data_errs)
# Remove outliers
msk_data, ID, xy, data, data_err = dmask(
cl_ID, cl_xy, cl_data, cl_errs, oultr_method, stdRegion_nstd)
# Normalize (x, y) data to [0, 1]
xy01 = dxynorm(xy)
probs_all = dataProcess(
ID, xy01, data, data_err, verbose, OL_runs,
parallel_flag, parallel_procs, resampleFlag, PCAflag, PCAdims,
GUMM_flag, GUMM_perc, KDEP_flag, IL_runs, N_membs, N_cl_max,
clust_method, clRjctMethod, C_thresh, cl_method_pars)
if OL_runs > 1:
# Obtain the mean of all runs. This are the final probabilities
# assigned to each star in the frame
probs_mean = np.mean(probs_all, 0)
else:
probs_mean = probs_all[0]
# Write final data to file
dwrite(
out_folder, file_path, full_data, msk_data, data_rjct, probs_mean)
def dataProcess(
ID, xy, data, data_err, verbose, OL_runs, parallel_flag,
parallel_procs, resampleFlag, PCAflag, PCAdims, GUMM_flag, GUMM_perc,
KDEP_flag, IL_runs, N_membs, N_cl_max, clust_method, clRjctMethod,
C_thresh, cl_method_pars):
"""
"""
start_t = t.time()
# TODO this should be handled by the logging() module
# Set print() according to the 'verbose' parameter
if verbose == 0:
prfl = open(os.devnull, 'w')
else:
prfl = None
# Print input parameters to screen
if parallel_flag:
print("Parallel runs : {}".format(parallel_flag))
print("Processes : {}".format(parallel_procs))
print("Outer loop runs : {}".format(OL_runs))
if PCAflag:
print("Apply PCA : {}".format(PCAflag))
print(" PCA N_dims : {}".format(PCAdims))
if GUMM_flag:
print("Apply GUMM : {}".format(GUMM_flag))
print(" GUMM percentile : {}".format(GUMM_perc))
if KDEP_flag:
print("Obtain KDE probs : {}".format(KDEP_flag))
print("Inner loop runs : {}".format(IL_runs))
print("Stars per cluster : {}".format(N_membs))
print("Maximum clusters : {}".format(N_cl_max))
print("Clustering method : {}".format(clust_method))
if cl_method_pars:
for key, val in cl_method_pars.items():
print(" {:<17} : {}".format(key, val))
print("")
# print("Rejection method : {}".format(clRjctMethod))
# if clRjctMethod != 'rkfunc':
# print("Threshold : {:.2f}".format(C_thresh))
# Define RK test with an area of 1.
# Kest = None
# if clRjctMethod == 'rkfunc':
Kest = RipleysKEstimator(area=1, x_max=1, y_max=1, x_min=0, y_min=0)
# if clRjctMethod == 'kdetest' or clust_method == 'rkmeans':
# from rpy2.robjects import r
# from rpy2.robjects import numpy2ri
# from rpy2.robjects.packages import importr
# # cat(paste("R version: ",R.version.string,"\n"))
# importr('MASS')
# r("""
# set.seed(12345)
# """)
# numpy2ri.activate()
# r.assign('nruns', 2000)
# r.assign('nKde', 50)
# Arguments for the Outer Loop
OLargs = (
ID, xy, data, data_err, resampleFlag, PCAflag, PCAdims, GUMM_flag,
GUMM_perc, KDEP_flag, IL_runs, N_membs, N_cl_max, clust_method,
clRjctMethod, Kest, C_thresh, cl_method_pars, prfl)
# TODO: Breaks if verbose=0
if parallel_flag is True:
if parallel_procs == 'None':
# Use *almost* all the cores
N_cpu = mp.cpu_count() - 1
else:
N_cpu = int(parallel_procs)
with mp.Pool(processes=N_cpu) as p:
manager = mp.Manager()
KDE_vals = manager.dict({})
probs_all = p.starmap(
OLfunc, [(OLargs, KDE_vals) for _ in range(OL_runs)])
else:
KDE_vals = {}
probs_all = []
for _ in range(OL_runs):
print("\n--------------------------------------------------------")
print("OL run {}".format(_ + 1))
# The KDE_vals dictionary is updated after each OL run
probs, KDE_vals = outer.loop(*OLargs, KDE_vals)
probs_all.append(probs)
p_dist = [
(np.mean(probs_all, 0) > _).sum() for _ in
(.5, .75, .9, .95, .99)]
print("\nP>(.5, .75, .9, .95, .99): {}, {}, {}, {}, {}".format(
*p_dist), file=prfl)
elapsed = t.time() - start_t
if elapsed > 60.:
elapsed, ms_id = elapsed / 60., "minutes"
else:
ms_id = "seconds"
print("\nTime consumed: {:.1f} {}".format(elapsed, ms_id))
return probs_all
def OLfunc(args, KDE_vals):
"""
Here to handle the parallel runs.
"""
probs, _ = outer.loop(*args, KDE_vals)
return probs
if __name__ == '__main__':
# # Limit numpy's cores used to 1
# # Source: https://stackoverflow.com/a/58195413/1391441, also
# # https://stackoverflow.com/q/17053671/1391441
# parallel_flag, parallel_procs = params[:2]
# if parallel_flag:
# if parallel_procs == 'None':
# # Use *almost* all the cores
# parallel_procs = mp.cpu_count() - 1
# else:
# # Never use more than these cores
# parallel_procs = min(int(parallel_procs), mp.cpu_count() - 1)
# else:
# parallel_procs = 1
# Read input parameters.
params = readINI()
if params[0] is False:
# Disable numpy's multithreading
parallel_procs = str(1)
os.environ["OMP_NUM_THREADS"] = parallel_procs
os.environ["MKL_NUM_THREADS"] = parallel_procs
os.environ["OPENBLAS_NUM_THREADS"] = parallel_procs
os.environ["VECLIB_MAXIMUM_THREADS"] = parallel_procs
os.environ["NUMEXPR_NUM_THREADS"] = parallel_procs
else:
# If numpy is allowed to multithread, disable the parallel run
params[1] = False
main(*params[1:])