Commit eff8240
latest codes
pkumar81 committed Apr 23, 2024
1 parent 3b92033 commit eff8240
Showing 79 changed files with 414,780 additions and 111,140 deletions.
74 changes: 62 additions & 12 deletions PULSNAR/PULSNAR.py
@@ -20,16 +20,17 @@


class PULSNARClassifier:
-    def __init__(self, scar=True, csrdata=False, classifier='xgboost', n_clusters=0, max_clusters=25, covar_type='full',
-                 top50p_covars=False, bin_method='scott', bw_method='hist', lowerbw=0.01, upperbw=0.5, optim='global',
-                 calibration=False, calibration_data='PU', calibration_method='isotonic', calibration_n_bins=100,
-                 smooth_isotonic=False, classification_metrics=False, n_iterations=1, kfold=5, kflips=1,
-                 pulsnar_params_file=None):
+    def __init__(self, scar=True, csrdata=False, classifier='xgboost', clustering_method='gmm', n_clusters=0,
+                 max_clusters=25, covar_type='full', top50p_covars=False, bin_method='scott', bw_method='hist',
+                 lowerbw=0.01, upperbw=0.5, optim='global', calibration=False, calibration_data='PU',
+                 calibration_method='isotonic', calibration_n_bins=100, smooth_isotonic=False,
+                 classification_metrics=False, n_iterations=1, kfold=5, kflips=1, pulsnar_params_file=None, rseed=123):

# set class variables
self.scar = scar
self.csrdata = csrdata
self.classifier = classifier
+        self.clustering_method = clustering_method
self.n_clusters = n_clusters
self.max_clusters = max_clusters
self.covar_type = covar_type
@@ -49,6 +50,7 @@ def __init__(self, scar=True, csrdata=False, classifier='xgboost', n_clusters=0,
self.kfold = kfold
self.kflips = kflips
self.pulsnar_params_file = pulsnar_params_file
+        self.rseed = rseed

def pulsnar(self, data, label, tru_label=None, rec_list=None):
"""
@@ -132,7 +134,7 @@ def scar_data_processing(self, X, Y, Y_true, mv_list):
scar=True)

cls_metrics_preds, cls_metrics_y_true = mlpe.prediction_using_probable_positives(preds, y_ml, y_orig,
                                                                                  rec_ids)
itr_bs, itr_aps, itr_auc, itr_f1, itr_mcc, itr_acc = \
pulsnar_performance_metrics(cls_metrics_preds, cls_metrics_y_true, scar=True)

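The six values unpacked into itr_bs, itr_aps, itr_auc, itr_f1, itr_mcc and itr_acc presumably stand for Brier score, average precision, ROC AUC, F1, Matthews correlation and accuracy; that mapping is an inference from the variable names, not confirmed by the diff. A minimal scikit-learn sketch under that assumption, with invented data:

import numpy as np
from sklearn.metrics import (accuracy_score, average_precision_score, brier_score_loss,
                             f1_score, matthews_corrcoef, roc_auc_score)

y_true = np.array([0, 1, 1, 0, 1, 0])             # invented labels
preds = np.array([0.2, 0.8, 0.6, 0.4, 0.9, 0.3])  # invented calibrated probabilities
y_hat = preds > 0.5                               # hard labels at a 0.5 cutoff

itr_bs = brier_score_loss(y_true, preds)
itr_aps = average_precision_score(y_true, preds)
itr_auc = roc_auc_score(y_true, preds)
itr_f1 = f1_score(y_true, y_hat)
itr_mcc = matthews_corrcoef(y_true, y_hat)
itr_acc = accuracy_score(y_true, y_hat)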
@@ -193,19 +195,21 @@ def snar_data_processing(self, X, Y, Y_true, mv_list):
pickle.dump(impf, fh, protocol=4)
'''
# divide data into positive and unlabeled sets
-        ml_data = MLDataPreprocessing(rseed=123)
+        ml_data = MLDataPreprocessing(rseed=self.rseed)
X_pos, y_ml_pos, y_true_pos, mv_pos, X_unlab, y_ml_unlab, y_true_unlab, mv_unlab = \
ml_data.generate_pu_dataset(X, Y, Y_true, mv_list)

# divide positive data into clusters
# logging.info("Dividing positives into k clusters")
if self.n_clusters == 0:
clster_indx, f_idx = self.determine_clusters(impf, X_pos, self.covar_type, io_files['bic_plot_file'],
-                                                             n_clusters=None, csr=self.csrdata, top50p=self.top50p_covars)
+                                                             n_clusters=None, csr=self.csrdata, top50p=self.top50p_covars,
+                                                             clustering_method=self.clustering_method)
else:
clster_indx, f_idx = self.determine_clusters(impf, X_pos, self.covar_type, io_files['bic_plot_file'],
n_clusters=self.n_clusters, csr=self.csrdata,
-                                                             top50p=self.top50p_covars)
+                                                             top50p=self.top50p_covars,
+                                                             clustering_method=self.clustering_method)
# use only important features.
X_pos, X_unlab = X_pos[:, f_idx], X_unlab[:, f_idx]

@@ -423,11 +427,16 @@ def get_params(self, option='classifier'):
return dict(config['IO_params'])
else:
return pp.IO_params
-        elif option == 'clustering':
+        elif option == 'gmm_clustering':
if param_file:
return dict(config['GMM_params'])
else:
return pp.GMM_params
+        elif option == 'nmf_clustering':
+            if param_file:
+                return dict(config['NMF_params'])
+            else:
+                return pp.NMF_params
else:
logging.error("Invalid option !!!")

@@ -462,12 +471,22 @@ def get_model_imp_features(self, data, label):
impf = dict(zip(f_idx, f_imp_vals))
return impf

-    def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, csr=False, top50p=False):
+    def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, csr=False, top50p=False,
+                           clustering_method="gmm"):
"""
split labeled positive tests into k clusters
"""
+        if clustering_method == "gmm":
+            return self.determine_clusters_using_gmm(imf, X_pos, covar, bic_plt_file, n_clusters, csr, top50p)
+        elif clustering_method == "nmf":
+            return self.determine_clusters_using_nmf(imf, X_pos, bic_plt_file, n_clusters, csr, top50p)
+
+    def determine_clusters_using_gmm(self, imf, X_pos, covar, bic_plt_file, n_clusters, csr, top50p):
+        """
+        determine the cluster count and clusters for the positive records using the GMM method
+        """
# get GMM parameters
-        gmm_params = self.get_params(option='clustering')
+        gmm_params = self.get_params(option='gmm_clustering')

# set some parameters for clustering algorithm
gmm_params['covariance_type'] = covar
@@ -493,3 +512,34 @@ def determine_clusters(self, imf, X_pos, covar, bic_plt_file, n_clusters=None, c
cluster_indx, sel_idx = cls.divide_positives_into_clusters(X_pos, f_idx, f_imp_vals, n_clusters,
n_threads_blas=1, top50p=top50p, csr=csr)
return cluster_indx, sel_idx

+    def determine_clusters_using_nmf(self, imf, X_pos, bic_plt_file, n_clusters, csr, top50p):
+        """
+        determine the cluster count and clusters for the positive records using the NMF method
+        """
+        # get NMF parameters
+        nmf_params = self.get_params(option='nmf_clustering')
+
+        # set some parameters for the clustering algorithm
+        cls = ClusteringEstimator(clf="nmf", clf_params=nmf_params)
+
+        # dict to array - feature index and feature importance
+        f_idx, f_imp_vals = list(map(np.array, zip(*imf.items())))
+
+        # check if the number of clusters needs to be estimated
+        if n_clusters is None:
+            _, bic_vals, n_clusters = cls.find_cluster_count_nmf(X_pos, f_idx, f_imp_vals,
+                                                                 max_clusters=self.max_clusters,
+                                                                 n_threads_blas=1, top50p=top50p, csr=csr)
+            print('Number of clusters in the positive set: ', n_clusters)
+            plt = MiscUtils(bic_plot=True)
+
+            # generate BIC plot
+            cluster_list = [c + 1 for c in range(len(bic_vals))]
+            plt.draw_line_plot(cluster_list, bic_vals, bic_plt_file)
+            # n_clusters = int(input("Check the BIC plot and enter the number of clusters: "))
+
+        # divide positives into n clusters
+        cluster_indx, sel_idx = cls.divide_positives_into_clusters_nmf(X_pos, f_idx, f_imp_vals, n_clusters,
+                                                                       n_threads_blas=1, top50p=top50p, csr=csr)
+        return cluster_indx, sel_idx
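For orientation, a minimal usage sketch of the constructor after this commit. The clustering_method and rseed arguments are the additions; the import path, toy arrays and the res variable are illustrative assumptions, not code from the repository:

import numpy as np
from PULSNAR.PULSNAR import PULSNARClassifier

# toy positive-unlabeled data (invented for illustration)
X = np.random.rand(1000, 20)
y = np.random.choice([0, 1], size=1000)  # 1 = labeled positive, 0 = unlabeled
rec_ids = np.arange(len(y))

# SNAR setting with the new NMF-based clustering of the positives
pls = PULSNARClassifier(scar=False, clustering_method='nmf', rseed=123)
res = pls.pulsnar(X, y, tru_label=y, rec_list=rec_ids)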
6 changes: 0 additions & 6 deletions PULSNAR/pudata/SimulatedData.py
@@ -60,12 +60,6 @@ def generate_simulated_data(self, n_pos=10000, n_unlab=10000, pf_in_unlab=0.1, n

# print(self.params)

-        # self.params['n_informative'] = n_classes
-        #if scar:
-        #    self.params['n_clusters_per_class'] = 2  # use default value for the SCAR data
-        #else:
-        #    self.params['n_clusters_per_class'] = 1  # set it to 1

# basic checks before processing
if scar and n_classes != 2:
traceback.print_stack()
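The deleted comments refer to n_informative and n_clusters_per_class, which are parameters of scikit-learn's make_classification, so self.params presumably feeds that generator. A hedged sketch under that assumption, with invented values:

from sklearn.datasets import make_classification

# assumption: SimulatedData.params are ultimately forwarded to make_classification
X, y = make_classification(n_samples=20000, n_features=25, n_informative=10,
                           n_classes=2, n_clusters_per_class=2, random_state=7)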
4 changes: 2 additions & 2 deletions PULSNAR/puestimator/AlphaEstimate.py
@@ -193,8 +193,8 @@ def minimize_mse(b, probs, n_bins, hdensity):

_, kdensity = compute_beta_kernel_density(probs, n_bins=n_bins, bw=b)
# kdensity = compute_gaussian_kde(probs, b, n_bins)
-    # return np.mean(pow(kdensity - hdensity, 2))
-    return distance.jensenshannon(kdensity, hdensity)
+    return np.mean(pow(kdensity - hdensity, 2))
+    # return distance.jensenshannon(kdensity, hdensity)


def err_function(estrange, unlab_kde=None, case_kde=None):
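The commit switches the bandwidth objective in minimize_mse from the Jensen-Shannon distance back to the mean squared error between the kernel density and the histogram density. A standalone comparison of the two criteria on invented density vectors (not produced by compute_beta_kernel_density):

import numpy as np
from scipy.spatial import distance

hdensity = np.array([0.10, 0.30, 0.40, 0.20])  # invented histogram density
kdensity = np.array([0.15, 0.25, 0.35, 0.25])  # invented kernel density for some bandwidth b

mse = np.mean(pow(kdensity - hdensity, 2))      # criterion now in use
js = distance.jensenshannon(kdensity, hdensity) # previous criterion
print(mse, js)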
108 changes: 98 additions & 10 deletions PULSNAR/puestimator/MLEstimators.py
@@ -9,7 +9,8 @@
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.mixture import GaussianMixture
-from scipy.signal import argrelmin
+from sklearn.decomposition import NMF
+from scipy.signal import argrelmin, argrelmax
from collections import Counter
from scipy import sparse
import traceback
@@ -313,21 +314,24 @@ def __init__(self, clf=None, clf_params=None):
if clf is None or clf == "gmm":
# Use GMM if no clf is provided
model = GaussianMixture(**clf_params)
+        elif clf == "nmf":
+            # Use NMF for clustering
+            model = NMF(**clf_params)
else:
traceback.print_stack()
logging.error("MLEstimators.py needs to be modified to support {0}".format(clf))
exit(-1)

# check if the classifier supports fit and predict_proba
# check if the classifier supports fit and aic/bic
if not hasattr(model, "fit"):
traceback.print_stack()
logging.error("The selected algorithm {0} does not have fit() function".format(clf))
exit(-1)
-        if not hasattr(model, "bic"):
+        if not hasattr(model, "bic") and clf == "gmm":
traceback.print_stack()
logging.error("The selected algorithm {0} does not have bic() function".format(clf))
exit(-1)
-        if not hasattr(model, "aic"):
+        if not hasattr(model, "aic") and clf == "gmm":
traceback.print_stack()
logging.error("The selected algorithm {0} does not have aic() function".format(clf))
exit(-1)
@@ -375,11 +379,15 @@ def num_of_clusters(vals):
"""
vals = np.asarray(vals)
minima_bic_idx = argrelmin(vals)[0]
-        if len(minima_bic_idx) > 0:  # function is not monotonically increasing or decreasing
-            minima_vals = vals[minima_bic_idx]
-            i = np.argmin(minima_vals)
-            return minima_bic_idx[i] + 1  # cluster count starts from 1, not 0 and hence +1
-        else:  # use knee point detection algorithm
+        # print("minima_bic_idx: ", minima_bic_idx)
+        if len(minima_bic_idx) > 0:  # function is not monotonically increasing or decreasing
+            bic_slope = np.diff(vals)
+            return min(np.argmin(vals), minima_bic_idx[0], np.argmax(np.diff(bic_slope)) + 3) + 1
+
+            # minima_vals = vals[minima_bic_idx]
+            # i = np.argmin(minima_vals)
+            # return minima_bic_idx[i] + 1  # cluster count starts from 1, not 0 and hence +1
+        else:  # use knee point detection algorithm
diff_list = []
curr_val, prev_val, next_val = vals[0], vals[0], vals[0]
for m in range(len(vals) - 1):
@@ -396,7 +404,7 @@ def num_of_clusters(vals):
# create a dictionary with cluster count as key and local minimum in diff_list as value
local_min = {}
for m in minima_idx:
-                local_min[m+1] = diff_list[m]  # cluster count starts from 1, not 0. so m+1
+                local_min[m + 1] = diff_list[m]  # cluster count starts from 1, not 0. so m+1
local_min = Counter(local_min).most_common() # sort local_min in decreasing order

# find the angle for each local minimum
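For intuition, a toy run of the revised local-minima rule above: argrelmin returns the interior minima of the BIC curve, and the new branch caps the answer with the global minimum and a knee estimate from the second difference. The BIC values here are invented:

import numpy as np
from scipy.signal import argrelmin

vals = np.array([120.0, 95.0, 90.0, 92.0, 91.0, 99.0])  # invented BIC per cluster count
minima_idx = argrelmin(vals)[0]                          # interior local minima -> [2, 4]
bic_slope = np.diff(vals)
k = min(np.argmin(vals), minima_idx[0], np.argmax(np.diff(bic_slope)) + 3) + 1
print(k)  # -> 3 clusters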
@@ -482,3 +490,83 @@ def divide_positives_into_clusters(self, data, f_idx, f_imp_vals, n_clusters, n_
cluster_indx.append(idx)

return cluster_indx, sel_idx

+    def find_cluster_count_nmf(self, data, f_idx, f_imp_vals, max_clusters=25, n_threads_blas=1, top50p=True,
+                               csr=False):
+        """
+        This function finds the number of clusters using the NMF method
+        Parameters
+        ----------
+        data: ML data
+        f_idx: indices of the important features
+        f_imp_vals: importance values of the important features
+        max_clusters: max clusters to use in the clustering algorithm
+        n_threads_blas: number of threads for BLAS
+        top50p: select only the top 50 percent of the important features?
+        csr: are the data in CSR format?
+        Returns
+        -------
+        AIC values,
+        BIC values,
+        number of clusters
+        """
+
+        bic_values = []
+        # logging.info("Scale the data features by their importance value")
+        data, _ = preprocess_data(data, f_idx, f_imp_vals, top50p, csr)
+
+        # Run NMF clustering with updated params
+        with threadpool_limits(limits=n_threads_blas, user_api='blas'):
+            for n_clusters in range(1, max_clusters + 1, 1):
+                self.clf_params['n_components'] = n_clusters
+                nmf_model = NMF(**self.clf_params).fit(data)
+                W, H = nmf_model.transform(data), nmf_model.components_
+                delta = np.sqrt(np.sum(np.square(data - np.matmul(W, H))) / np.size(data))
+                bic = -2 * np.log(delta) + np.log(data.shape[0]) * n_clusters
+                bic_values.append(bic)
+        return bic_values, bic_values, np.argmax(np.diff(bic_values)) + 2

+    def divide_positives_into_clusters_nmf(self, data, f_idx, f_imp_vals, n_clusters, n_threads_blas=1, top50p=True,
+                                           csr=False):
+        """
+        This function divides the positive data into n clusters using the NMF algorithm
+        Parameters
+        ----------
+        data: feature matrix for ML
+        f_idx: indices of the important features
+        f_imp_vals: importance values of the important features
+        n_clusters: number of clusters
+        n_threads_blas: number of threads for BLAS
+        top50p: select only the top 50 percent of the important features?
+        csr: are the data in CSR format?
+        Returns
+        -------
+        indices of records in each cluster,
+        indices of important features
+        """
+
+        data, sel_idx = preprocess_data(data, f_idx, f_imp_vals, top50p, csr)
+
+        # Train an NMF model for clustering
+        self.clf_params['n_components'] = n_clusters
+        self.clf_params['random_state'] = 100 + n_clusters
+
+        # run NMF with the selected cluster count
+        with threadpool_limits(limits=n_threads_blas, user_api='blas'):
+            nmf_model = NMF(**self.clf_params).fit(data)
+            W, H = nmf_model.transform(data), nmf_model.components_
+
+        # predict labels using the trained model
+        labels = np.argmax(W, axis=1)
+
+        # group data by their labels
+        cluster_indx = []
+        for v in np.unique(labels):
+            idx = list(np.where(labels == v)[0])
+            cluster_indx.append(idx)
+
+        return cluster_indx, sel_idx
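To see the new selection heuristic in isolation: the BIC in find_cluster_count_nmf is built from the NMF reconstruction RMSE (delta), penalized by log(n) per extra component, and the cluster count is taken where the BIC curve jumps most. A self-contained sketch on random non-negative data; the data and loop range are invented:

import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X = rng.random((200, 10))  # NMF requires non-negative input

bic_values = []
for k in range(1, 8):
    model = NMF(n_components=k, max_iter=500, random_state=101).fit(X)
    W, H = model.transform(X), model.components_
    delta = np.sqrt(np.sum(np.square(X - W @ H)) / X.size)  # reconstruction RMSE
    bic_values.append(-2 * np.log(delta) + np.log(X.shape[0]) * k)

n_clusters = np.argmax(np.diff(bic_values)) + 2  # same rule as find_cluster_count_nmf
print(n_clusters)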
9 changes: 8 additions & 1 deletion PULSNAR/puestimator/PulsnarParams.py
@@ -34,6 +34,14 @@
'random_state': 101
}

+# Parameters for the NMF algorithm. If you want to add more NMF parameters,
+# check the list here: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
+NMF_params = {
+    'n_components': 1,
+    'max_iter': 500,
+    'random_state': 101
+}

# files to store IO
IO_params = {
'result_file': 'predictions.tsv',
@@ -47,4 +55,3 @@
'eta': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.7, 1],
'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15]
}
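get_params falls back to these module-level defaults only when no pulsnar_params_file is supplied; otherwise it reads sections such as config['NMF_params'] with configparser. A sketch of that mechanism; the inline file content is hypothetical, and note that configparser returns every value as a string:

import configparser

# hypothetical pulsnar_params_file content; the section name matches get_params
config = configparser.ConfigParser()
config.read_string("""
[NMF_params]
n_components = 1
max_iter = 500
random_state = 101
""")
print(dict(config['NMF_params']))  # {'n_components': '1', 'max_iter': '500', 'random_state': '101'}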

3 changes: 0 additions & 3 deletions examples/.idea/.gitignore

This file was deleted.

12 changes: 0 additions & 12 deletions examples/.idea/examples.iml

This file was deleted.

14 changes: 0 additions & 14 deletions examples/.idea/inspectionProfiles/Project_Default.xml

This file was deleted.

6 changes: 0 additions & 6 deletions examples/.idea/inspectionProfiles/profiles_settings.xml

This file was deleted.

4 changes: 0 additions & 4 deletions examples/.idea/misc.xml

This file was deleted.

8 changes: 0 additions & 8 deletions examples/.idea/modules.xml

This file was deleted.

