Commit eff8240
latest codes
pkumar81 committed Apr 23, 2024
1 parent 3b92033 commit eff8240
Showing 79 changed files with 414,780 additions and 111,140 deletions.
74 changes: 62 additions & 12 deletions PULSNAR/PULSNAR.py
@@ -20,16 +20,17 @@


class PULSNARClassifier:
-    def __init__(self, scar=True, csrdata=False, classifier='xgboost', n_clusters=0, max_clusters=25, covar_type='full',
-                 top50p_covars=False, bin_method='scott', bw_method='hist', lowerbw=0.01, upperbw=0.5, optim='global',
-                 calibration=False, calibration_data='PU', calibration_method='isotonic', calibration_n_bins=100,
-                 smooth_isotonic=False, classification_metrics=False, n_iterations=1, kfold=5, kflips=1,
-                 pulsnar_params_file=None):
+    def __init__(self, scar=True, csrdata=False, classifier='xgboost', clustering_method='gmm', n_clusters=0,
+                 max_clusters=25, covar_type='full', top50p_covars=False, bin_method='scott', bw_method='hist',
+                 lowerbw=0.01, upperbw=0.5, optim='global', calibration=False, calibration_data='PU',
+                 calibration_method='isotonic', calibration_n_bins=100, smooth_isotonic=False,
+                 classification_metrics=False, n_iterations=1, kfold=5, kflips=1, pulsnar_params_file=None, rseed=123):

# set class variables
self.scar = scar
self.csrdata = csrdata
self.classifier = classifier
+        self.clustering_method = clustering_method
self.n_clusters = n_clusters
self.max_clusters = max_clusters
self.covar_type = covar_type
@@ -49,6 +50,7 @@ def __init__(self, scar=True, csrdata=False, classifier='xgboost', n_clusters=0,
self.kfold = kfold
self.kflips = kflips
self.pulsnar_params_file = pulsnar_params_file
+        self.rseed = rseed

def pulsnar(self, data, label, tru_label=None, rec_list=None):
"""
@@ -132,7 +134,7 @@ def scar_data_processing(self, X, Y, Y_true, mv_list):
scar=True)

cls_metrics_preds, cls_metrics_y_true = mlpe.prediction_using_probable_positives(preds, y_ml, y_orig,
                                                                                  rec_ids)
itr_bs, itr_aps, itr_auc, itr_f1, itr_mcc, itr_acc = \
pulsnar_performance_metrics(cls_metrics_preds, cls_metrics_y_true, scar=True)

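The six values unpacked into itr_bs, itr_aps, itr_auc, itr_f1, itr_mcc and itr_acc presumably stand for Brier score, average precision, ROC AUC, F1, Matthews correlation and accuracy; that mapping is an inference from the variable names, not confirmed by the diff. A minimal scikit-learn sketch under that assumption, with invented data:

import numpy as np
from sklearn.metrics import (accuracy_score, average_precision_score, brier_score_loss,
                             f1_score, matthews_corrcoef, roc_auc_score)

y_true = np.array([0, 1, 1, 0, 1, 0])             # invented labels
preds = np.array([0.2, 0.8, 0.6, 0.4, 0.9, 0.3])  # invented calibrated probabilities
y_hat = preds > 0.5                               # hard labels at a 0.5 cutoff

itr_bs = brier_score_loss(y_true, preds)
itr_aps = average_precision_score(y_true, preds)
itr_auc = roc_auc_score(y_true, preds)
itr_f1 = f1_score(y_true, y_hat)
itr_mcc = matthews_corrcoef(y_true, y_hat)
itr_acc = accuracy_score(y_true, y_hat)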
@@ -193,19 +195,21 @@ def snar_data_processing(self, X, Y, Y_true, mv_list):
pickle.dump(impf, fh, protocol=4)
'''
# divide data into positive and unlabeled sets
-        ml_data = MLDataPreprocessing(rseed=123)
+        ml_data = MLDataPreprocessing(rseed=self.rseed)
X_pos, y_ml_pos, y_true_pos, mv_pos, X_unlab, y_ml_unlab, y_true_unlab, mv_unlab = \
ml_data.generate_pu_dataset(X, Y, Y_true, mv_list)

# divide positive data into clusters
# logging.info("Dividing positives into k clusters")
if self.n_clusters == 0:
clster_indx, f_idx = self.determine_clusters(impf, X_pos, self.covar_type, io_files['bic_plot_file'],
-                                                             n_clusters=None, csr=self.csrdata, top50p=self.top50p_covars)
+                                                             n_clusters=None, csr=self.csrdata, top50p=self.top50p_covars,
+                                                             clustering_method=self.clustering_method)
else:
clster_indx, f_idx = self.determine_clusters(impf, X_pos, self.covar_type, io_files['bic_plot_file'],
n_clusters=self.n_clusters, csr=self.csrdata,
-                                                             top50p=self.top50p_covars)
+                                                             top50p=self.top50p_covars,
+                                                             clustering_method=self.clustering_method)
# use only important features.
X_pos, X_unlab = X_pos[:, f_idx], X_unlab[:, f_idx]

@@ -423,11 +427,16 @@ def get_params(self, option='classifier'):
return dict(config['IO_params'])
else:
return pp.IO_params
-        elif option == 'clustering':
+        elif option == 'gmm_clustering':
if param_file:
return dict(config['GMM_params'])
else:
return pp.GMM_params
+        elif option == 'nmf_clustering':
+            if param_file:
+                return dict(config['NMF_params'])
+            else:
+                return pp.NMF_params
else:
logging.error("Invalid option !!!")

@@ -462,12 +471,22 @@ def get_model_imp_features(self, data, label):
impf = dict(zip(f_idx, f_imp_vals))
return impf

-    def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, csr=False, top50p=False):
+    def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, csr=False, top50p=False,
+                           clustering_method="gmm"):
"""
split labeled positive tests into k clusters
"""
+        if clustering_method == "gmm":
+            return self.determine_clusters_using_gmm(imf, X_pos, covar, bic_plt_file, n_clusters, csr, top50p)
+        elif clustering_method == "nmf":
+            return self.determine_clusters_using_nmf(imf, X_pos, bic_plt_file, n_clusters, csr, top50p)
+
+    def determine_clusters_using_gmm(self, imf, X_pos, covar, bic_plt_file, n_clusters, csr, top50p):
+        """
+        determine the cluster count and clusters for the positive records using the GMM method
+        """
# get GMM parameters
-        gmm_params = self.get_params(option='clustering')
+        gmm_params = self.get_params(option='gmm_clustering')

# set some parameters for clustering algorithm
gmm_params['covariance_type'] = covar
@@ -493,3 +512,34 @@ def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, c
cluster_indx, sel_idx = cls.divide_positives_into_clusters(X_pos, f_idx, f_imp_vals, n_clusters,
n_threads_blas=1, top50p=top50p, csr=csr)
return cluster_indx, sel_idx

+    def determine_clusters_using_nmf(self, imf, X_pos, bic_plt_file, n_clusters, csr, top50p):
+        """
+        determine the cluster count and clusters for the positive records using the NMF method
+        """
+        # get NMF parameters
+        nmf_params = self.get_params(option='nmf_clustering')
+
+        # set some parameters for the clustering algorithm
+        cls = ClusteringEstimator(clf="nmf", clf_params=nmf_params)
+
+        # dict to array - feature index and feature importance
+        f_idx, f_imp_vals = list(map(np.array, zip(*imf.items())))
+
+        # check if the number of clusters needs to be estimated
+        if n_clusters is None:
+            _, bic_vals, n_clusters = cls.find_cluster_count_nmf(X_pos, f_idx, f_imp_vals,
+                                                                 max_clusters=self.max_clusters,
+                                                                 n_threads_blas=1, top50p=top50p, csr=csr)
+            print('Number of clusters in the positive set: ', n_clusters)
+            plt = MiscUtils(bic_plot=True)
+
+            # generate BIC plot
+            cluster_list = [c + 1 for c in range(len(bic_vals))]
+            plt.draw_line_plot(cluster_list, bic_vals, bic_plt_file)
+            # n_clusters = int(input("Check the BIC plot and enter the number of clusters: "))
+
+        # divide positives into n clusters
+        cluster_indx, sel_idx = cls.divide_positives_into_clusters_nmf(X_pos, f_idx, f_imp_vals, n_clusters,
+                                                                       n_threads_blas=1, top50p=top50p, csr=csr)
+        return cluster_indx, sel_idx
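For orientation, a minimal usage sketch of the constructor after this commit. The clustering_method and rseed arguments are the additions; the import path, toy arrays and the res variable are illustrative assumptions, not code from the repository:

import numpy as np
from PULSNAR.PULSNAR import PULSNARClassifier

# toy positive-unlabeled data (invented for illustration)
X = np.random.rand(1000, 20)
y = np.random.choice([0, 1], size=1000)  # 1 = labeled positive, 0 = unlabeled
rec_ids = np.arange(len(y))

# SNAR setting with the new NMF-based clustering of the positives
pls = PULSNARClassifier(scar=False, clustering_method='nmf', rseed=123)
res = pls.pulsnar(X, y, tru_label=y, rec_list=rec_ids)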
6 changes: 0 additions & 6 deletions PULSNAR/pudata/SimulatedData.py
@@ -60,12 +60,6 @@ def generate_simulated_data(self, n_pos=10000, n_unlab=10000, pf_in_unlab=0.1, n

# print(self.params)

-        # self.params['n_informative'] = n_classes
-        #if scar:
-        #    self.params['n_clusters_per_class'] = 2  # use default value for the SCAR data
-        #else:
-        #    self.params['n_clusters_per_class'] = 1  # set it to 1

# basic checks before processing
if scar and n_classes != 2:
traceback.print_stack()
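The deleted comments refer to n_informative and n_clusters_per_class, which are parameters of scikit-learn's make_classification, so self.params presumably feeds that generator. A hedged sketch under that assumption, with invented values:

from sklearn.datasets import make_classification

# assumption: SimulatedData.params are ultimately forwarded to make_classification
X, y = make_classification(n_samples=20000, n_features=25, n_informative=10,
                           n_classes=2, n_clusters_per_class=2, random_state=7)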
4 changes: 2 additions & 2 deletions PULSNAR/puestimator/AlphaEstimate.py
@@ -193,8 +193,8 @@ def minimize_mse(b, probs, n_bins, hdensity):

_, kdensity = compute_beta_kernel_density(probs, n_bins=n_bins, bw=b)
# kdensity = compute_gaussian_kde(probs, b, n_bins)
-    # return np.mean(pow(kdensity - hdensity, 2))
-    return distance.jensenshannon(kdensity, hdensity)
+    return np.mean(pow(kdensity - hdensity, 2))
+    # return distance.jensenshannon(kdensity, hdensity)


def err_function(estrange, unlab_kde=None, case_kde=None):
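The commit switches the bandwidth objective in minimize_mse from the Jensen-Shannon distance back to the mean squared error between the kernel density and the histogram density. A standalone comparison of the two criteria on invented density vectors (not produced by compute_beta_kernel_density):

import numpy as np
from scipy.spatial import distance

hdensity = np.array([0.10, 0.30, 0.40, 0.20])  # invented histogram density
kdensity = np.array([0.15, 0.25, 0.35, 0.25])  # invented kernel density for some bandwidth b

mse = np.mean(pow(kdensity - hdensity, 2))      # criterion now in use
js = distance.jensenshannon(kdensity, hdensity) # previous criterion
print(mse, js)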
108 changes: 98 additions & 10 deletions PULSNAR/puestimator/MLEstimators.py
@@ -9,7 +9,8 @@
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
-from scipy.signal import argrelmin
+from sklearn.decomposition import NMF
+from scipy.signal import argrelmin, argrelmax
from collections import Counter
from scipy import sparse
import traceback
@@ -313,21 +314,24 @@ def __init__(self, clf=None, clf_params=None):
if clf is None or clf == "gmm":
# Use GMM if no clf is provided
model = GaussianMixture(**clf_params)
+        elif clf == "nmf":
+            # Use NMF for clustering
+            model = NMF(**clf_params)
else:
traceback.print_stack()
logging.error("MLEstimators.py needs to be modified to support {0}".format(clf))
exit(-1)

# check if the classifier supports fit and predict_proba
# check if the classifier supports fit and aic/bic
if not hasattr(model, "fit"):
traceback.print_stack()
logging.error("The selected algorithm {0} does not have fit() function".format(clf))
exit(-1)
-        if not hasattr(model, "bic"):
+        if not hasattr(model, "bic") and clf == "gmm":
traceback.print_stack()
logging.error("The selected algorithm {0} does not have bic() function".format(clf))
exit(-1)
-        if not hasattr(model, "aic"):
+        if not hasattr(model, "aic") and clf == "gmm":
traceback.print_stack()
logging.error("The selected algorithm {0} does not have aic() function".format(clf))
exit(-1)
@@ -375,11 +379,15 @@ def num_of_clusters(vals):
"""
vals = np.asarray(vals)
minima_bic_idx = argrelmin(vals)[0]
-        if len(minima_bic_idx) > 0:  # function is not monotonically increasing or decreasing
-            minima_vals = vals[minima_bic_idx]
-            i = np.argmin(minima_vals)
-            return minima_bic_idx[i] + 1  # cluster count starts from 1, not 0 and hence +1
-        else:  # use knee point detection algorithm
+        # print("minima_bic_idx: ", minima_bic_idx)
+        if len(minima_bic_idx) > 0:  # function is not monotonically increasing or decreasing
+            bic_slope = np.diff(vals)
+            return min(np.argmin(vals), minima_bic_idx[0], np.argmax(np.diff(bic_slope)) + 3) + 1
+
+            # minima_vals = vals[minima_bic_idx]
+            # i = np.argmin(minima_vals)
+            # return minima_bic_idx[i] + 1  # cluster count starts from 1, not 0 and hence +1
+        else:  # use knee point detection algorithm
diff_list = []
curr_val, prev_val, next_val = vals[0], vals[0], vals[0]
for m in range(len(vals) - 1):
@@ -396,7 +404,7 @@ def num_of_clusters(vals):
# create a dictionary with cluster count as key and local minimum in diff_list as value
local_min = {}
for m in minima_idx:
-                local_min[m+1] = diff_list[m]  # cluster count starts from 1, not 0. so m+1
+                local_min[m + 1] = diff_list[m]  # cluster count starts from 1, not 0. so m+1
local_min = Counter(local_min).most_common() # sort local_min in decreasing order

# find the angle for each local minimum
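For intuition, a toy run of the revised local-minima rule above: argrelmin returns the interior minima of the BIC curve, and the new branch caps the answer with the global minimum and a knee estimate from the second difference. The BIC values here are invented:

import numpy as np
from scipy.signal import argrelmin

vals = np.array([120.0, 95.0, 90.0, 92.0, 91.0, 99.0])  # invented BIC per cluster count
minima_idx = argrelmin(vals)[0]                          # interior local minima -> [2, 4]
bic_slope = np.diff(vals)
k = min(np.argmin(vals), minima_idx[0], np.argmax(np.diff(bic_slope)) + 3) + 1
print(k)  # -> 3 clusters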
@@ -482,3 +490,83 @@ def divide_positives_into_clusters(self, data, f_idx, f_imp_vals, n_clusters, n_
cluster_indx.append(idx)

return cluster_indx, sel_idx

+    def find_cluster_count_nmf(self, data, f_idx, f_imp_vals, max_clusters=25, n_threads_blas=1, top50p=True,
+                               csr=False):
+        """
+        This function finds the number of clusters using the NMF method
+        Parameters
+        ----------
+        data: ML data
+        f_idx: indices of the important features
+        f_imp_vals: importance values of the important features
+        max_clusters: max clusters to use in the clustering algorithm
+        n_threads_blas: number of threads for BLAS
+        top50p: select only the top 50 percent of the important features?
+        csr: are the data in CSR format?
+        Returns
+        -------
+        AIC values,
+        BIC values,
+        number of clusters
+        """
+
+        bic_values = []
+        # logging.info("Scale the data features by their importance value")
+        data, _ = preprocess_data(data, f_idx, f_imp_vals, top50p, csr)
+
+        # Run NMF clustering with updated params
+        with threadpool_limits(limits=n_threads_blas, user_api='blas'):
+            for n_clusters in range(1, max_clusters + 1, 1):
+                self.clf_params['n_components'] = n_clusters
+                nmf_model = NMF(**self.clf_params).fit(data)
+                W, H = nmf_model.transform(data), nmf_model.components_
+                delta = np.sqrt(np.sum(np.square(data - np.matmul(W, H))) / np.size(data))
+                bic = -2 * np.log(delta) + np.log(data.shape[0]) * n_clusters
+                bic_values.append(bic)
+        return bic_values, bic_values, np.argmax(np.diff(bic_values)) + 2

+    def divide_positives_into_clusters_nmf(self, data, f_idx, f_imp_vals, n_clusters, n_threads_blas=1, top50p=True,
+                                           csr=False):
+        """
+        This function divides the positive data into n clusters using the NMF algorithm
+        Parameters
+        ----------
+        data: feature matrix for ML
+        f_idx: indices of the important features
+        f_imp_vals: importance values of the important features
+        n_clusters: number of clusters
+        n_threads_blas: number of threads for BLAS
+        top50p: select only the top 50 percent of the important features?
+        csr: are the data in CSR format?
+        Returns
+        -------
+        indices of records in each cluster,
+        indices of important features
+        """
+
+        data, sel_idx = preprocess_data(data, f_idx, f_imp_vals, top50p, csr)
+
+        # Train an NMF model for clustering
+        self.clf_params['n_components'] = n_clusters
+        self.clf_params['random_state'] = 100 + n_clusters
+
+        # run NMF with the selected cluster count
+        with threadpool_limits(limits=n_threads_blas, user_api='blas'):
+            nmf_model = NMF(**self.clf_params).fit(data)
+            W, H = nmf_model.transform(data), nmf_model.components_
+
+        # predict labels using the trained model
+        labels = np.argmax(W, axis=1)
+
+        # group data by their labels
+        cluster_indx = []
+        for v in np.unique(labels):
+            idx = list(np.where(labels == v)[0])
+            cluster_indx.append(idx)
+
+        return cluster_indx, sel_idx
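To see the new selection heuristic in isolation: the BIC in find_cluster_count_nmf is built from the NMF reconstruction RMSE (delta), penalized by log(n) per extra component, and the cluster count is taken where the BIC curve jumps most. A self-contained sketch on random non-negative data; the data and loop range are invented:

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X = rng.random((200, 10))  # NMF requires non-negative input

bic_values = []
for k in range(1, 8):
    model = NMF(n_components=k, max_iter=500, random_state=101).fit(X)
    W, H = model.transform(X), model.components_
    delta = np.sqrt(np.sum(np.square(X - W @ H)) / X.size)  # reconstruction RMSE
    bic_values.append(-2 * np.log(delta) + np.log(X.shape[0]) * k)

n_clusters = np.argmax(np.diff(bic_values)) + 2  # same rule as find_cluster_count_nmf
print(n_clusters)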
9 changes: 8 additions & 1 deletion PULSNAR/puestimator/PulsnarParams.py
@@ -34,6 +34,14 @@
'random_state': 101
}

+# Parameters for the NMF algorithm. If you want to add more NMF parameters,
+# check the list here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
+NMF_params = {
+    'n_components': 1,
+    'max_iter': 500,
+    'random_state': 101
+}

# files to store IO
IO_params = {
'result_file': 'predictions.tsv',
@@ -47,4 +55,3 @@
'eta': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.7, 1],
'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15]
}
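get_params falls back to these module-level defaults only when no pulsnar_params_file is supplied; otherwise it reads sections such as config['NMF_params'] with configparser. A sketch of that mechanism; the inline file content is hypothetical, and note that configparser returns every value as a string:

import configparser

# hypothetical pulsnar_params_file content; the section name matches get_params
config = configparser.ConfigParser()
config.read_string("""
[NMF_params]
n_components = 1
max_iter = 500
random_state = 101
""")
print(dict(config['NMF_params']))  # {'n_components': '1', 'max_iter': '500', 'random_state': '101'}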

3 changes: 0 additions & 3 deletions examples/.idea/.gitignore

This file was deleted.

12 changes: 0 additions & 12 deletions examples/.idea/examples.iml

This file was deleted.

14 changes: 0 additions & 14 deletions examples/.idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions examples/.idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions examples/.idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions examples/.idea/modules.xml

This file was deleted.

