SOM plugin (#1269)
* som peaklet classification

* Update plugin; this needs straxen PR #745 (Waveform attributes)

* update plugin, this works now

* remove som type and propagate som classification up to peaks

* do not classify type 0

* Fix issue with array size

* add som_type

* Update peaklet_classification_som.py

fix error in data_to_log_deciles

* fix som_type issue

* update model + some function details

* new file + new type 3 added

* separate som_plugin from main pipeline

* Add tests to produce input vectors

* current status of plugin, detach plugin before we are ready for review

* start making new context

* fix plugin and context

* clean up plugin

* Update peaklet_classification_som.py

Comply with some of the pep8 suggestions

* conform to pep8 style

* Update peaklet_classification_som.py

missed a function with uppercase letters

* Update peaklet_classification_som.py

pep8 space

---------

Co-authored-by: ahiguera-mx <ahiguera-mx@fried.rice.edu>
LuisSanchez25 and ahiguera-mx authored Oct 9, 2023
1 parent 8c70f8f commit 02621e1
Showing 4 changed files with 371 additions and 2 deletions.
13 changes: 13 additions & 0 deletions straxen/contexts.py
@@ -8,6 +8,7 @@
import typing as ty
from pandas.util._decorators import deprecate_kwarg
import socket
from straxen.plugins.peaklets.peaklet_classification_som import PeakletClassificationSOM

common_opts = dict(
register_all=[],
@@ -111,6 +112,18 @@ def xenonnt(cmt_version='global_ONLINE', xedocs_version=None,
return st


def xenonnt_som(cmt_version='global_ONLINE', xedocs_version=None,
_from_cutax=False, **kwargs):
"""XENONnT context for the SOM"""

st = straxen.contexts.xenonnt(cmt_version=cmt_version, xedocs_version=xedocs_version,
_from_cutax=_from_cutax, **kwargs)
del st._plugin_class_registry['peaklet_classification']
st.register(PeakletClassificationSOM)

return st


def find_rucio_local_path(include_rucio_local, _rucio_local_path):
"""
Check the hostname to determine which rucio local path to use. Note that access to
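For reference, a minimal sketch of how the new context might be used once this commit is in place (the run id is illustrative, borrowed from the plugin's URLConfig default, and not prescribed by the commit):

import straxen

st = straxen.contexts.xenonnt_som()
# 'peaklet_classification' is now produced by PeakletClassificationSOM,
# and the new 'som_peaklet_data' type carries the SOM cluster assignments
som_data = st.get_array('045000', 'som_peaklet_data')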
297 changes: 297 additions & 0 deletions straxen/plugins/peaklets/peaklet_classification_som.py
@@ -0,0 +1,297 @@
import numpy as np
import numpy.lib.recfunctions as rfn
from scipy.spatial.distance import cdist
from straxen.plugins.peaklets.peaklet_classification import PeakletClassification
import numba

import strax
import straxen

export, __all__ = strax.exporter()


@export
class PeakletClassificationSOM(PeakletClassification):

"""
Self-Organizing Maps (SOM)
https://xe1t-wiki.lngs.infn.it/doku.php?id=xenon:xenonnt:lsanchez:unsupervised_neural_network_som_methods
For peaklet classification. We this pluggin will provide 2 data types, the 'type' we are
already familiar with, classifying peaklets as s1, s2 (using the new classification) or
unknown (from the previous classification). As well as a new data type, SOM type, which
will be assigned numbers based on the cluster in the SOM in which they are found. For
each version I will make some documentation in the corrections repository explaining
what I believe each cluster represents.
This correction/plugin is currently on the testing phase, feel free to use it if you are
curious or just want to test it or try it out but note this is note ready to be used in
analysis.
"""

__version__ = '0.0.1'

    depends_on = ('peaklets',)

provides = ('peaklet_classification', 'som_peaklet_data')
data_kind = dict(peaklet_classification='peaklets',
som_peaklet_data='peaklets')

parallel = True

som_files = straxen.URLConfig(
default='resource://xedocs://som_classifiers?attr=value&version=v1&run_id=045000&fmt=npy'
)

def infer_dtype(self):
dtype = {}
dtype['peaklet_classification'] = (
strax.peak_interval_dtype +
[('type', np.int8, 'Classification of the peak(let)')]
)
dtype['som_peaklet_data'] = (
strax.peak_interval_dtype +
[('som_type', np.int8, 'SOM type of the peak(let)')] +
[('loc_x_som', np.int16, 'x location of the peak(let) in the SOM')] +
[('loc_y_som', np.int16, 'y location of the peak(let) in the SOM')]
)

return dtype

def setup(self):

self.som_weight_cube = self.som_files['weight_cube']
self.som_img = self.som_files['som_img']
self.som_norm_factors = self.som_files['norm_factors']
self.som_s1_array = self.som_files['s1_array']
self.som_s2_array = self.som_files['s2_array']
self.som_s3_array = self.som_files['s3_array']
self.som_s0_array = self.som_files['s0_array']

def compute(self, peaklets):
# Current classification
ptype = np.zeros(len(peaklets), dtype=np.int8)

# Properties needed for classification:
rise_time = -peaklets['area_decile_from_midpoint'][:, 1]
n_channels = (peaklets['area_per_channel'] > 0).sum(axis=1)
n_top = self.n_top_pmts
area_top = peaklets['area_per_channel'][:, :n_top].sum(axis=1)
area_total = peaklets['area_per_channel'].sum(axis=1)
area_fraction_top = area_top / area_total

is_large_s1 = (peaklets['area'] >= 100)
is_large_s1 &= (rise_time <= self.s1_max_rise_time_post100)
is_large_s1 &= peaklets['tight_coincidence'] >= self.s1_min_coincidence

is_small_s1 = peaklets["area"] < 100
is_small_s1 &= rise_time < self.upper_rise_time_area_boundary(
peaklets["area"],
*self.s1_risetime_area_parameters,
)

is_small_s1 &= rise_time < self.upper_rise_time_aft_boundary(
area_fraction_top,
*self.s1_risetime_aft_parameters,
*self.s1_flatten_threshold_aft,
)

is_small_s1 &= peaklets['tight_coincidence'] >= self.s1_min_coincidence

ptype[is_large_s1 | is_small_s1] = 1

is_s2 = n_channels >= self.s2_min_pmts
is_s2[is_large_s1 | is_small_s1] = False
ptype[is_s2] = 2

# SOM classification
peaklets_w_type = peaklets.copy()
peaklets_w_type['type'] = ptype
mask_non_zero = peaklets_w_type['type'] != 0
peaklets_w_type = peaklets_w_type[mask_non_zero]
result = np.zeros(len(peaklets), dtype=self.dtype['peaklet_classification'])
som_info = np.zeros(len(peaklets), dtype=self.dtype['som_peaklet_data'])
som_type, x_som, y_som = recall_populations(peaklets_w_type, self.som_weight_cube,
self.som_img,
self.som_norm_factors)

som_info['time'] = peaklets['time']
som_info['length'] = peaklets['length']
som_info['dt'] = peaklets['dt']
som_info['som_type'][mask_non_zero] = som_type
som_info['loc_x_som'][mask_non_zero] = x_som
som_info['loc_y_som'][mask_non_zero] = y_som

strax_type = som_type_to_type(som_type,
self.som_s1_array,
self.som_s2_array,
self.som_s3_array,
self.som_s0_array)
result['time'] = peaklets['time']
result['length'] = peaklets['length']
result['dt'] = peaklets['dt']
result['type'][mask_non_zero] = strax_type

return dict(peaklet_classification=result, som_peaklet_data=som_info)



def recall_populations(dataset, weight_cube, som_cls_img, norm_factors):

"""
Master function that should let the user provide a weightcube,
a reference img as a np.array, a dataset and a set of normalization factors.
In theory, if these 5 things are provided, this function should output
the original data back with one added field with the name "SOM_type"
weight_cube: SOM weight cube (3D array)
som_cls_img: SOM reference image as a numpy array
dataset: Data to preform the recall on (Should be peaklet level data)
normfactos: A set of 11 numbers to normalize the data so we can preform a recall
"""

xdim, ydim, zdim = weight_cube.shape
img_xdim, img_ydim, img_zdim = som_cls_img.shape
unique_colors = np.unique(np.reshape(som_cls_img, [xdim * ydim, 3]), axis=0)
# Checks that the reference image matches the weight cube
assert xdim == img_xdim, f'Dimensions mismatch between SOM weight cube ({xdim}) and reference image ({img_xdim})'
assert ydim == img_ydim, f'Dimensions mismatch between SOM weight cube ({ydim}) and reference image ({img_ydim})'

assert all(dataset['type'] != 0), 'Dataset contains unclassified peaklets'
# Get the deciles representation of data for recall
decile_transform_check = data_to_log_decile_log_area_aft(dataset, norm_factors)
    # perform a recall of the dataset with the weight cube
# assign each population color a number (can do from previous function)
ref_map = generate_color_ref_map(som_cls_img, unique_colors, xdim, ydim)
som_cls_array = np.empty(len(dataset['area']))
som_cls_array[:] = np.nan
# Make new numpy structured array to save the SOM cls data
data_with_SOM_cls = rfn.append_fields(dataset, 'SOM_type', som_cls_array)
    # performs the recall and assigns the SOM_type label
output_data, x_som, y_som = som_cls_recall(data_with_SOM_cls, decile_transform_check, weight_cube, ref_map)
return output_data['SOM_type'], x_som, y_som


def generate_color_ref_map(color_image, unique_colors, xdim, ydim):
ref_map = np.zeros((xdim, ydim))
for color in np.arange(len(unique_colors)):
mask = np.all(np.equal(color_image, unique_colors[color, :]), axis=2)
indices = np.argwhere(mask) # generates a 2d mask
for loc in np.arange(len(indices)):
ref_map[indices[loc][0], indices[loc][1]] = color
return ref_map


def som_cls_recall(array_to_fill, data_in_som_fmt, weight_cube, reference_map):
som_xdim, som_ydim, _ = weight_cube.shape
distances = cdist(weight_cube.reshape(-1, weight_cube.shape[-1]), data_in_som_fmt, metric='euclidean')
w_neuron = np.argmin(distances, axis=0)
x_idx, y_idx = np.unravel_index(w_neuron, (som_xdim, som_ydim))
array_to_fill['SOM_type'] = reference_map[x_idx, y_idx]
return array_to_fill, x_idx, y_idx


def som_type_to_type(som_type, s1_array, s2_array, s3_array, s0_array):
"""
Converts the SOM type into either S1 or S2 type (1, 2)
som_type: array with integers corresponding to the different SOM types
s1_array: array containing the number corresponding to the SOM types which should
be converted to S1's
"""
som_type_copy = som_type.copy()
som_type_copy[np.isin(som_type_copy, s1_array)] = 1234
som_type_copy[np.isin(som_type_copy, s2_array)] = 5678
som_type_copy[np.isin(som_type_copy, s3_array)] = -5
som_type_copy[np.isin(som_type_copy, s0_array)] = -250
som_type_copy[som_type_copy == 1234] = 1
som_type_copy[som_type_copy == 5678] = 2
som_type_copy[som_type_copy == -5] = 3
som_type_copy[som_type_copy == -250] = 0

return som_type_copy
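
# Editor's note (illustration, not part of the committed file): the temporary
# sentinel values above (1234, 5678, -5, -250) avoid collisions when a SOM
# cluster id overlaps with a target strax type, e.g. a SOM type of 2 that
# belongs to s1_array must not be picked up again by a later reassignment.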


def data_to_log_decile_log_area_aft(peaklet_data, normalization_factor):
"""
Converts peaklet data into the current best inputs for the SOM,
log10(deciles) + log10(area) + AFT
Since we are dealing with logs, anything less than 1 will be set to 1
"""
    # turn deciles into appropriate 'normalized' format (maybe also consider L1 normalization of these inputs)
decile_data = compute_quantiles(peaklet_data, 10)
data = peaklet_data.copy()
decile_data[decile_data < 1] = 1

decile_log = np.log10(decile_data)
decile_log_over_max = np.divide(decile_log, normalization_factor[:10])
# Now lets deal with area
data['area'] = data['area'] + normalization_factor[11] + 1
peaklet_log_area = np.log10(data['area'])
peaklet_aft = np.sum(data['area_per_channel'][:, :straxen.n_top_pmts], axis=1) / peaklet_data['area']
peaklet_aft = np.where(peaklet_aft > 0, peaklet_aft, 0)
peaklet_aft = np.where(peaklet_aft < 1, peaklet_aft, 1)
    deciles_area_aft = np.concatenate(
        (decile_log_over_max,
         np.reshape(peaklet_log_area, (len(peaklet_log_area), 1)) / normalization_factor[10],
         np.reshape(peaklet_aft, (len(peaklet_log_area), 1))),
        axis=1)
return deciles_area_aft
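
# --- Editor's illustration (not part of the committed file) ---
# The SOM input built above is a 12-feature vector per peaklet:
# 10 normalized log-deciles, a normalized log-area, and the AFT, i.e. roughly
#   deciles = np.maximum(raw_deciles, 1)
#   features = np.concatenate([
#       np.log10(deciles) / norm[:10],
#       [np.log10(area + norm[11] + 1) / norm[10]],
#       [np.clip(aft, 0, 1)],
#   ])
# where raw_deciles, area, aft, and norm are per-peaklet placeholders.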


def compute_quantiles(peaks: np.ndarray, n_samples: int):

"""
Compute waveforms and quantiles for a given number of nodes(attributes)
:param peaks:
:param n_samples: number of nodes or attributes
:return:quantiles
"""

data = peaks['data'].copy()
data[data < 0.0] = 0.0
dt = peaks['dt']
q = compute_wf_attributes(data, dt, n_samples)
return q


@export
@numba.jit(nopython=True, cache=True)
def compute_wf_attributes(data, sample_length, n_samples: int):

"""
Compute waveform attribures
Quantiles: represent the amount of time elapsed for
a given fraction of the total waveform area to be observed in n_samples
i.e. n_samples = 10, then quantiles are equivalent deciles
Waveforms: downsampled waveform to n_samples
:param data: waveform e.g. peaks or peaklets
:param n_samples: compute quantiles for a given number of samples
:return: waveforms and quantiles of size n_samples
"""

    assert data.shape[0] == len(sample_length), "arrays must have the same size"

num_samples = data.shape[1]

quantiles = np.zeros((len(data), n_samples), dtype=np.float64)

    # Cannot compute with more samples than the actual waveform has
assert num_samples > n_samples, "cannot compute with more samples than the actual waveform"
assert num_samples % n_samples == 0, "number of samples must be a multiple of n_samples"

# Compute quantiles
inter_points = np.linspace(0., 1. - (1. / n_samples), n_samples)
cumsum_steps = np.zeros(n_samples + 1, dtype=np.float64)
frac_of_cumsum = np.zeros(num_samples + 1)
sample_number_div_dt = np.arange(0, num_samples + 1, 1)
for i, (samples, dt) in enumerate(zip(data, sample_length)):
if np.sum(samples) == 0:
continue
# reset buffers
frac_of_cumsum[:] = 0
cumsum_steps[:] = 0
frac_of_cumsum[1:] = np.cumsum(samples)
frac_of_cumsum[1:] = frac_of_cumsum[1:] / frac_of_cumsum[-1]
cumsum_steps[:-1] = np.interp(inter_points, frac_of_cumsum, sample_number_div_dt * dt)
cumsum_steps[-1] = sample_number_div_dt[-1] * dt
quantiles[i] = cumsum_steps[1:] - cumsum_steps[:-1]

return quantiles
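
To make the recall step concrete: som_cls_recall matches each normalized input vector to its nearest SOM neuron by Euclidean distance, then converts the flat winner index back to 2D grid coordinates. A small self-contained sketch with made-up dimensions and random data (nothing below comes from the actual trained SOM):

import numpy as np
from scipy.spatial.distance import cdist

rng = np.random.default_rng(0)
weight_cube = rng.random((4, 3, 12))   # toy 4x3 SOM grid with 12-dimensional weights
data = rng.random((5, 12))             # five toy input vectors (deciles + area + AFT)

# distance from every neuron (rows) to every input vector (columns)
distances = cdist(weight_cube.reshape(-1, 12), data, metric='euclidean')
winners = np.argmin(distances, axis=0)            # best-matching neuron per input
x_idx, y_idx = np.unravel_index(winners, (4, 3))  # back to SOM grid coordinates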
2 changes: 1 addition & 1 deletion straxen/plugins/peaks/peaks.py
@@ -56,4 +56,4 @@ def compute(self, peaklets, merged_s2s):

assert np.all(peaks['time'][to_check][1:]
>= strax.endtime(peaks)[to_check][:-1]), "Peaks not disjoint"
-        return peaks
+        return peaks