diff --git a/alea/__init__.py b/alea/__init__.py index 797c9f55..db25e2bf 100644 --- a/alea/__init__.py +++ b/alea/__init__.py @@ -1,15 +1,13 @@ __version__ = '0.0.0' -from . import model from .model import * -from . import models +from .models import * -from . import utils from .utils import * -from . import parameters from .parameters import * -from . import simulators from .simulators import * + +from .template_source import * diff --git a/alea/model.py b/alea/model.py index 82a327e9..8628df7e 100644 --- a/alea/model.py +++ b/alea/model.py @@ -7,6 +7,7 @@ from scipy.optimize import brentq from iminuit import Minuit from iminuit.util import make_func_code +from blueice.likelihood import _needs_data from inference_interface import toydata_to_file from alea.parameters import Parameters @@ -58,7 +59,18 @@ def __init__( confidence_interval_kind: str = "central", # one of central, upper, lower confidence_interval_threshold: Callable[[float], float] = None, ): - self._data = data + """Initialize a statistical model""" + if type(self) == StatisticalModel: + raise RuntimeError( + "You cannot instantiate the StatisticalModel class directly, " + "you must use a subclass where the likelihood function and data generation " + "method are implemented") + + # following https://github.com/JelleAalbers/blueice/blob/ + # 7c10222a13227e78dc7224b1a7e56ff91e4a8043/blueice/likelihood.py#L97 + self.is_data_set = False + if data is not None: + self.data = data self._confidence_level = confidence_level self._confidence_interval_kind = confidence_interval_kind self.confidence_interval_threshold = confidence_interval_threshold @@ -93,6 +105,7 @@ def _generate_data(self, **kwargs): "You must write a data-generation method (_generate_data) for your statistical model" " or use a subclass where it is written for you") + @_needs_data def ll(self, **kwargs) -> float: """ Likelihod function, returns the loglikelihood for the given parameters. @@ -143,6 +156,7 @@ def data(self, data): representing the data-sets of one or more likelihood terms. """ self._data = data + self.is_data_set = True def store_data( self, file_name, data_list, data_name_list=None, metadata = None): @@ -205,6 +219,7 @@ def cost(args): return cost + @_needs_data def fit(self, verbose=False, **kwargs) -> Tuple[dict, float]: """ Fit the model to the data by maximizing the likelihood diff --git a/alea/model_configs/unbinned_wimp_statistical_model.yaml b/alea/model_configs/unbinned_wimp_statistical_model.yaml index 2f69425d..c0665718 100644 --- a/alea/model_configs/unbinned_wimp_statistical_model.yaml +++ b/alea/model_configs/unbinned_wimp_statistical_model.yaml @@ -62,7 +62,7 @@ parameter_definition: likelihood_config: likelihood_weights: [1, 1, 1] - template_folder: alea/templates + template_folder: [] likelihood_terms: # SR0 - name: sr0 diff --git a/alea/models/__init__.py b/alea/models/__init__.py index 530eeafa..17b4e8a1 100644 --- a/alea/models/__init__.py +++ b/alea/models/__init__.py @@ -1,5 +1,3 @@ -from . import gaussian_model from .gaussian_model import * -from . import blueice_extended_model from .blueice_extended_model import * diff --git a/alea/models/blueice_extended_model.py b/alea/models/blueice_extended_model.py index 52694b19..0951f083 100644 --- a/alea/models/blueice_extended_model.py +++ b/alea/models/blueice_extended_model.py @@ -1,16 +1,17 @@ -from pydoc import locate # to lookup likelihood class from typing import List +from copy import deepcopy +from pydoc import locate import yaml import numpy as np import scipy.stats as stats from blueice.likelihood import LogAncillaryLikelihood, LogLikelihoodSum -from inference_interface import dict_to_structured_array +from inference_interface import dict_to_structured_array, structured_array_to_dict from alea.model import StatisticalModel -from alea.simulators import BlueiceDataGenerator -from alea.utils import adapt_likelihood_config_for_blueice from alea.parameters import Parameters +from alea.simulators import BlueiceDataGenerator +from alea.utils import adapt_likelihood_config_for_blueice, get_template_folder_list class BlueiceExtendedModel(StatisticalModel): @@ -73,22 +74,30 @@ def data(self, data: list): ll_term.set_data(d) self._data = data + self.is_data_set = True def get_expectation_values(self, **kwargs) -> dict: """ Return total expectation values (summed over all likelihood terms with the same name) given a number of named parameters (kwargs) + TODO: Current implementation is not elegant + because it calls the ll and requires data to be set. + We should update this function in the future after we stop using blueice. """ ret = dict() - # ancillary likelihood does not contribute - for ll in self._likelihood.likelihood_list[:-1]: - ll_pars = list(ll.rate_parameters.keys()) + list(ll.shape_parameters.keys()) + # calling ll need data to be set + self_copy = deepcopy(self) + self_copy.data = self_copy.generate_data() + + # ancillary likelihood does not contribute + for ll_term in self_copy._likelihood.likelihood_list[:-1]: + ll_pars = list(ll_term.rate_parameters.keys()) + list(ll_term.shape_parameters.keys()) ll_pars += ["livetime_days"] call_args = {k: i for k, i in kwargs.items() if k in ll_pars} - mus = ll(full_output=True, **call_args)[1] - for n, mu in zip(ll.source_name_list, mus): + mus = ll_term(full_output=True, **call_args)[1] + for n, mu in zip(ll_term.source_name_list, mus): ret[n] = ret.get(n, 0) + mu return ret @@ -101,16 +110,11 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": """ lls = [] + template_folder_list = get_template_folder_list(likelihood_config) + # Iterate through each likelihood term in the configuration for config in likelihood_config["likelihood_terms"]: likelihood_object = locate(config["likelihood_type"]) - if isinstance(likelihood_config["template_folder"], str): - template_folder_list = [likelihood_config["template_folder"]] - elif isinstance(likelihood_config["template_folder"], list): - template_folder_list = likelihood_config["template_folder"] - else: - raise ValueError( - "template_folder must be either a string or a list of strings.") blueice_config = adapt_likelihood_config_for_blueice( config, template_folder_list) @@ -134,7 +138,6 @@ def _build_ll_from_config(self, likelihood_config: dict) -> "LogLikelihoodSum": ll = likelihood_object(blueice_config) for source in config["sources"]: - # Set rate parameters rate_parameters = [ p for p in source["parameters"] if self.parameters[p].ptype == "rate"] @@ -283,10 +286,11 @@ def set_data(self, d: np.array): d (np.array): Data of ancillary measurements, stored as numpy array """ # This results in shifted constraint terms. - if set(d.keys()) != set(self.parameters.names): + d_dict = structured_array_to_dict(d) + if set(d_dict.keys()) != set(self.parameters.names): raise ValueError( "The data dict must contain all parameters as keys in CustomAncillaryLikelihood.") - self.constraint_functions = self._get_constraint_functions(**d) + self.constraint_functions = self._get_constraint_functions(**d_dict) def ancillary_likelihood_sum(self, evaluate_at: dict) -> float: """Return the sum of all constraint terms. diff --git a/alea/parameters.py b/alea/parameters.py index 589f5fc7..e7f26da0 100644 --- a/alea/parameters.py +++ b/alea/parameters.py @@ -274,8 +274,9 @@ def __call__( values[name] = new_val if new_val is not None else param.nominal_value if any(i is None for k, i in values.items()): emptypars = ", ".join([k for k, i in values.items() if i is None]) - raise AssertionError("All parameters must be set explicitly, or have a nominal value," - " encountered for: " + emptypars) + raise AssertionError( + "All parameters must be set explicitly, or have a nominal value," + " encountered for: " + emptypars) return values def __getattr__(self, name: str) -> Parameter: @@ -291,9 +292,9 @@ def __getattr__(self, name: str) -> Parameter: Raises: AttributeError: If the attribute is not found. """ - if name in self.parameters: - return self.parameters[name] - else: + try: + return super().__getattribute__('parameters')[name] + except KeyError: raise AttributeError(f"Attribute '{name}' not found.") def __getitem__(self, name: str) -> Parameter: diff --git a/alea/template_source.py b/alea/template_source.py index d5c31f34..dd4480c0 100644 --- a/alea/template_source.py +++ b/alea/template_source.py @@ -24,6 +24,7 @@ class TemplateSource(blueice.HistogramPdfSource): :param log10_bins: List of axis numbers. If True, bin edges on this axis in the root file are log10() of the actual bin edges. """ + def build_histogram(self): format_dict = { k: self.config[k] @@ -71,7 +72,6 @@ def build_histogram(self): stop=slice_axis_limits[1]) logging.debug(f"Normalization after slicing: {h.n}.") - if collapse_axis is not None: if collapse_slices is None: raise ValueError( @@ -94,9 +94,8 @@ def build_histogram(self): self.config['analysis_space']): expected_bin_edges = np.array(expected_bin_edges) seen_bin_edges = h.bin_edges[axis_i] - if len( - self.config['analysis_space'] - ) == 1: # If 1D, hist1d returns bin_edges straight, not as list + # If 1D, hist1d returns bin_edges straight, not as list + if len(self.config['analysis_space']) == 1: seen_bin_edges = h.bin_edges logging.debug("axis_i: " + str(axis_i)) logging.debug("expected_bin_edges: " + str(expected_bin_edges)) @@ -175,6 +174,7 @@ class CombinedSource(blueice.HistogramPdfSource): Must be 1 shorter than histnames, templatenames :param histogram_parameters: names of parameters that should be put in the hdf5/histogram names, """ + def build_histogram(self): weight_names = self.config.get("weight_names") weights = [ @@ -355,20 +355,6 @@ def simulate(self, n_events): return ret -def get_json_spectrum(fn): - """ - Translates bbf-style JSON files to spectra. - units are keV and /kev*day*kg - """ - contents = json.load(open(fn, "r")) - logging.debug(contents["description"]) - esyst = contents["coordinate_system"][0][1] - ret = interp1d( - np.linspace(*esyst), contents["map"], - bounds_error=False, fill_value=0.) - return ret - - class SpectrumTemplateSource(blueice.HistogramPdfSource): """ :param spectrum_name: name of bbf json-like spectrum _OR_ function that can be called @@ -376,6 +362,21 @@ class SpectrumTemplateSource(blueice.HistogramPdfSource): :param histname: histogram name :param named_parameters: list of config settings to pass to .format on histname and filename """ + + @staticmethod + def _get_json_spectrum(fn): + """ + Translates bbf-style JSON files to spectra. + units are keV and /kev*day*kg + """ + contents = json.load(open(fn, "r")) + logging.debug(contents["description"]) + esyst = contents["coordinate_system"][0][1] + ret = interp1d( + np.linspace(*esyst), contents["map"], + bounds_error=False, fill_value=0.) + return ret + def build_histogram(self): logging.debug("building a hist") format_dict = { @@ -387,7 +388,7 @@ def build_histogram(self): spectrum = self.config["spectrum"] if type(spectrum) is str: - spectrum = get_json_spectrum(spectrum.format(**format_dict)) + spectrum = self._get_json_spectrum(spectrum.format(**format_dict)) slice_args = self.config.get("slice_args", {}) if type(slice_args) is dict: diff --git a/alea/utils.py b/alea/utils.py index 834c9cd5..0dc2d30a 100644 --- a/alea/utils.py +++ b/alea/utils.py @@ -1,8 +1,13 @@ import os +import yaml +import pkg_resources +from copy import deepcopy from pydoc import locate +import logging import numpy as np -import alea + +logging.basicConfig(level=logging.INFO) def get_analysis_space(analysis_space: dict) -> list: @@ -31,37 +36,94 @@ def adapt_likelihood_config_for_blueice( Args: likelihood_config (dict): likelihood config dict - template_folder_list (list): list of possible base folders where - templates are located. If a folder starts with alea/, - the alea folder is used as base. + template_folder_list (list): list of possible base folders. Ordered by priority. Returns: dict: adapted likelihood config """ - template_folder = None - for template_folder in template_folder_list: - # if template folder starts with alea: get location of alea - if template_folder.startswith("alea/"): - alea_dir = os.path.dirname(os.path.abspath(alea.__file__)) - template_folder = os.path.join(alea_dir, template_folder.replace("alea/", "")) - # check if template folder exists - if not os.path.isdir(template_folder): - template_folder = None - else: - break - # raise error if no template folder is found - if template_folder is None: - raise FileNotFoundError("No template folder found. Please provide a valid template folder.") + likelihood_config_copy = deepcopy(likelihood_config) + + likelihood_config_copy["analysis_space"] = get_analysis_space( + likelihood_config_copy["analysis_space"]) + + likelihood_config_copy["default_source_class"] = locate( + likelihood_config_copy["default_source_class"]) + + for source in likelihood_config_copy["sources"]: + source["templatename"] = get_file_path( + source["template_filename"], template_folder_list) + return likelihood_config_copy + + +def load_yaml(file_name: str): + """Load data from yaml file.""" + with open(get_file_path(file_name), 'r') as file: + data = yaml.safe_load(file) + return data + + +def _get_abspath(file_name): + """Get the abspath of the file. Raise FileNotFoundError when not found in any subfolder""" + for sub_dir in ('model_configs', 'runner_configs', 'templates'): + p = os.path.join(_package_path(sub_dir), file_name) + if os.path.exists(p): + return p + raise FileNotFoundError(f'Cannot find {file_name}') + + +def _package_path(sub_directory): + """Get the abs path of the requested sub folder""" + return pkg_resources.resource_filename('alea', f'{sub_directory}') + + +def get_file_path(fname, folder_list=None): + """Find the full path to the resource file + Try 5 methods in the following order + + #. fname begin with '/', return absolute path + #. folder begin with '/', return folder + name + #. can get file from _get_abspath, return alea internal file path + #. can be found in local installed ntauxfiles, return ntauxfiles absolute path + #. can be downloaded from MongoDB, download and return cached path + """ + if folder_list is None: + folder_list = [] + # 1. From absolute path + # Usually Config.default is a absolute path + if fname.startswith('/'): + return fname + + # 2. From local folder + # Use folder as prefix + for folder in folder_list: + if folder.startswith('/'): + fpath = os.path.join(folder, fname) + if os.path.exists(fpath): + logging.info(f'Load {fname} successfully from {fpath}') + return fpath + + # 3. From alea internal files + try: + return _get_abspath(fname) + except FileNotFoundError: + pass - likelihood_config["analysis_space"] = get_analysis_space( - likelihood_config["analysis_space"]) + # raise error when can not find corresponding file + raise RuntimeError(f'Can not find {fname}, please check your file system') - likelihood_config["default_source_class"] = locate( - likelihood_config["default_source_class"]) - for source in likelihood_config["sources"]: - source["templatename"] = os.path.join( - template_folder, source["template_filename"]) - return likelihood_config +def get_template_folder_list(likelihood_config): + """Get a list of template_folder from likelihood_config""" + if "template_folder" not in likelihood_config: + # return empty list if template_folder is not specified + likelihood_config["template_folder"] = [] + if isinstance(likelihood_config["template_folder"], str): + template_folder_list = [likelihood_config["template_folder"]] + elif isinstance(likelihood_config["template_folder"], list): + template_folder_list = likelihood_config["template_folder"] + else: + raise ValueError( + "template_folder must be either a string or a list of strings.") + return template_folder_list diff --git a/requirements.txt b/requirements.txt index 809096e5..d66ad6fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ atomicwrites git+https://github.com/JelleAalbers/blueice h5py +iminuit git+https://github.com/XENONnT/inference_interface matplotlib mergedeep diff --git a/tests/test_blueice_extended_model.py b/tests/test_blueice_extended_model.py new file mode 100644 index 00000000..4ff09ed3 --- /dev/null +++ b/tests/test_blueice_extended_model.py @@ -0,0 +1,72 @@ +from unittest import TestCase + +from blueice.likelihood import LogLikelihoodSum +from alea.utils import load_yaml +from alea.models import BlueiceExtendedModel, CustomAncillaryLikelihood + + +class TestBlueiceExtendedModel(TestCase): + """Test of the BlueiceExtendedModel class""" + + @classmethod + def setUp(cls): + """Initialise the BlueiceExtendedModel instance""" + cls.config = load_yaml('unbinned_wimp_statistical_model.yaml') + cls.n_likelihood_terms = len(cls.config['likelihood_config']['likelihood_terms']) + cls.set_new_model(cls) + + def set_new_model(self): + """Set a new BlueiceExtendedModel instance""" + self.model = BlueiceExtendedModel( + parameter_definition=self.config['parameter_definition'], + likelihood_config=self.config['likelihood_config'], + ) + + def test_expectation_values(self): + """Test of the expectation_values method""" + self.set_new_model() + expectation_values = self.model.get_expectation_values() + + # should avoid accidentally set data + is_data_set = False + for ll_term in self.model._likelihood.likelihood_list[:-1]: + is_data_set |= ll_term.is_data_set + if is_data_set: + raise ValueError('Data should not be set after get_expectation_values.') + # TODO: assert expectation values after manually initialize template source + + def test_generate_data(self): + """Test of the generate_data method""" + data = self.model.generate_data() + self.assertEqual( + len(data), self.n_likelihood_terms + 2) + if not all(['source' in d.dtype.names for d in data[:-2]]): + raise ValueError('Data does not contain source information.') + + def test_likelihood(self): + """Test of the _likelihood attribute""" + self.assertIsInstance(self.model._likelihood, LogLikelihoodSum) + self.assertIsInstance(self.model._likelihood.likelihood_list[-1], CustomAncillaryLikelihood) + self.assertEqual( + len(self.model._likelihood.likelihood_list), + self.n_likelihood_terms + 1) + self.model.data = self.model.generate_data() + self.model.ll() + + def test_fit(self): + """Test of the fit method""" + self.model.data = self.model.generate_data() + fit_result, max_llh = self.model.fit() + # TODO: + # check whether all parameters are in fit_result + # and whether fittable parameters are fitted + # and whether the results are in boundaries + + +class TestCustomAncillaryLikelihood(TestCase): + """Test of the CustomAncillaryLikelihood class""" + + def test_ancillary_likelihood(self): + """Test of the ancillary_likelihood method""" + # TODO: + pass diff --git a/tests/test_gaussian_model.py b/tests/test_gaussian_model.py new file mode 100644 index 00000000..bc636233 --- /dev/null +++ b/tests/test_gaussian_model.py @@ -0,0 +1,55 @@ +from os import remove +from unittest import TestCase + +import scipy.stats as sps +import inference_interface + +from alea.models import GaussianModel + + +class TestGaussianModel(TestCase): + """Test of the GaussianModel class""" + + @classmethod + def setUp(cls): + """Initialise the GaussianModel instance""" + parameter_definition = { + 'mu': { + 'fit_guess': 0., + 'fittable': True, + 'nominal_value': 0., + }, + 'sigma': { + 'fit_guess': 1., + 'fit_limits': [ + 0., + None, + ], + 'fittable': True, + 'nominal_value': 1., + }, + } + cls.model = GaussianModel( + parameter_definition=parameter_definition) + + def test_data_generation(self): + """Test generation of data""" + self.model.data = self.model.generate_data(mu=0, sigma=2) + + def test_data_storage(self): + """Test storage of data to file and retrieval of data from file""" + toydata_file = 'simple_data.hdf5' + self.model.data = self.model.generate_data(mu=0, sigma=2) + self.model.store_data(toydata_file, [self.model.data]) + stored_data = inference_interface.toydata_from_file(toydata_file) + assert self.model.data == stored_data[0], 'Stored data disagrees with data!' + remove(toydata_file) + + def test_fit_result(self): + """Test fitting of data""" + self.model.data = self.model.generate_data(mu=0, sigma=2) + hat_meas = self.model.data[0]['hat_mu'].item() + best_fit, lf = self.model.fit(sigma=2) + hat_fit = best_fit['mu'] + self.assertAlmostEqual(hat_meas, hat_fit) + self.assertAlmostEqual(lf, sps.norm(hat_fit, 2).logpdf(hat_meas)) diff --git a/tests/test_model.py b/tests/test_model.py new file mode 100644 index 00000000..5d99488b --- /dev/null +++ b/tests/test_model.py @@ -0,0 +1,13 @@ +from alea import StatisticalModel + +def test_statistical_model(): + try: + error_raised = True + StatisticalModel() + error_raised = False + except Exception: + print('Error correctly raised when directly instantiating StatisticalModel') + else: + if not error_raised: + raise RuntimeError( + 'Should raise error when directly instantiating StatisticalModel') diff --git a/tests/test_parameter.py b/tests/test_parameter.py new file mode 100644 index 00000000..de2ff26c --- /dev/null +++ b/tests/test_parameter.py @@ -0,0 +1,20 @@ +from copy import deepcopy +from unittest import TestCase + +from alea.utils import load_yaml +from alea.parameters import Parameters + + +class TestParameters(TestCase): + """Test of the Parameters class""" + + @classmethod + def setUp(cls): + """Initialise the Parameters instance""" + config = load_yaml('unbinned_wimp_statistical_model.yaml') + cls.parameters = Parameters.from_config(config['parameter_definition']) + + def test_deep_copyable(self): + """Test of whether Parameters instance can be deepcopied""" + if deepcopy(self.parameters) != self.parameters: + raise ValueError('Parameters instance cannot be correctly deepcopied.') diff --git a/tests/test_statistical_model.py b/tests/test_statistical_model.py deleted file mode 100644 index 08e9ea40..00000000 --- a/tests/test_statistical_model.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_gaussian_model(): - pass diff --git a/tests/test_template_source.py b/tests/test_template_source.py new file mode 100644 index 00000000..36c4f4fd --- /dev/null +++ b/tests/test_template_source.py @@ -0,0 +1,12 @@ +from unittest import TestCase + +from alea.template_source import TemplateSource + + +class TestTemplateSource(TestCase): + """Test of the TemplateSource class""" + + def test_load_templates(self): + # TODO: not sure whether we want to test TemplateSource in alea + # not sure TemplateSource.build_histogram and TemplateSource.simulate are called + pass