From cf5523bb89ec60c85065e0c6dbc44362f301367f Mon Sep 17 00:00:00 2001 From: Thomas Reineking Date: Thu, 9 Oct 2014 22:03:05 +0200 Subject: [PATCH] refactor belief construction add IDM method for belief construction --- src/examples.py | 10 +++++++ src/pyds.py | 72 +++++++++++++++++++++++++++++++----------------- src/pyds_test.py | 29 +++++++++++-------- 3 files changed, 73 insertions(+), 38 deletions(-) diff --git a/src/examples.py b/src/examples.py index cb2aab9..2b77065 100644 --- a/src/examples.py +++ b/src/examples.py @@ -20,6 +20,7 @@ Shows different use cases of the library. """ +from __future__ import print_function from pyds import MassFunction from itertools import product @@ -89,3 +90,12 @@ print('vacuous extension of m_1 to {1, 2} =', extended) projected = extended.map(lambda h: (t[0] for t in h)) print('project m_1 back to its original frame =', projected) + +print('\n=== construct belief from data ===') +hist = {'a':2, 'b':0, 'c':1} +print('histogram:', hist) +print('maximum likelihood:', MassFunction.from_samples(hist, 'bayesian', s=0)) +print('Laplace smoothing:', MassFunction.from_samples(hist, 'bayesian', s=1)) +print('IDM:', MassFunction.from_samples(hist, 'idm')) +print('MaxBel:', MassFunction.from_samples(hist, 'maxbel')) +print('MCD:', MassFunction.from_samples(hist, 'mcd')) diff --git a/src/pyds.py b/src/pyds.py index 2155267..6562194 100644 --- a/src/pyds.py +++ b/src/pyds.py @@ -894,7 +894,7 @@ def _confidence_intervals(histogram, alpha): return p_lower, p_upper @staticmethod - def from_samples(histogram, alpha=0.05, mode='default'): + def from_samples(histogram, method='idm', alpha=0.05, s=1.0): """ Generate a mass function from an empirical probability distribution that was obtained from a limited number of samples. This makes the expected deviation of the empirical distribution from the true distribution explicit. 
@@ -902,33 +902,37 @@ def from_samples(histogram, alpha=0.05, mode='default'): 'histogram' represents the empirical distribution. It is a dictionary mapping each possible event to the respective number of observations (represented as integers). - 'mode' determines the algorithm used for generating the mass function. - Except for mode 'bayesian', all algorithms are based on the idea that the true probabilities lie within confidence intervals + 'method' determines the algorithm used for generating the mass function. + Except for methods 'bayesian' and 'idm', all algorithms are based on the idea that the true probabilities lie within confidence intervals represented by the mass function with confidence level 1 - 'alpha'. The following modes are supported: - 'default': Maximize the total belief by solving a linear program. (Attention: this becomes very expensive computationally + 'idm': Imprecise Dirichlet model. A small amount of mass (controlled by 's') is assigned to the entire frame. + For more information on 'idm', see: + P. Walley (1996), "Inferences from multinomial data: learning about a bag of marbles", + Journal of the Royal Statistical Society. Series B (Methodological), 3-57. + + 'maxbel': Maximize the total belief by solving a linear program. (Attention: this becomes very computationally expensive for larger numbers of events.) - 'ordered': Similar to 'default' except that the events are assumed to have a natural order (e.g., intervals), in which case + 'maxbel-ordered': Similar to 'maxbel' except that the events are assumed to have a natural order (e.g., intervals), in which case the mass function can be computed analytically and thus much faster. - For more information on 'default' and 'ordered', see: + For more information on 'maxbel' and 'maxbel-ordered', see: T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions", International Journal of Approximate Reasoning 42, 228-252. 
- 'consonant': Compute the least committed consonant mass function whose pignistic transformation lies within the confidence interval - induced by 'alpha'. Like 'default', it is based on solving a linear program and quickly becomes computationally expensive. + 'mcd': Compute the least committed consonant mass function whose pignistic transformation lies within the confidence interval + induced by 'alpha'. Like 'maxbel', it is based on solving a linear program and quickly becomes computationally expensive. - 'consonant-approximate': An approximation of 'consonant' that can be computed much more efficiently. + 'mcd-approximate': An approximation of 'mcd' that can be computed much more efficiently. - For more information on these two modes, see: + For more information on these two methods, see: A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence sets of pignistic probabilities", International Journal of Approximate Reasoning 49, 575-594. - 'bayesian': Disregard the number of samples and assume the true probability distribution is equal to the empirical one. - + 'bayesian': Construct a Bayesian mass function based on the relative frequencies. In addition, additive smoothing is applied (controlled by 's'). In case the sample number is 0, returns a vacuous mass function (or uniform distribution for 'bayesian'). 
@@ -943,24 +947,40 @@ def from_samples(histogram, alpha=0.05, mode='default'): return MassFunction() if sum(histogram.values()) == 0: # return vacuous/uniform belief if there are no samples vac = MassFunction({tuple(histogram.keys()):1}) - if mode == 'bayesian': + if method == 'bayesian': return vac.pignistic() else: return vac - if mode == 'bayesian': - return MassFunction({(h,):v for h, v in histogram.items()}).normalize() - elif mode == 'default': - return MassFunction._from_samples(histogram, alpha) - elif mode == 'ordered': - return MassFunction._from_samples(histogram, alpha, ordered=True) - elif mode == 'consonant': - return MassFunction._from_samples_consonant(histogram, alpha) - elif mode == 'consonant-approximate': - return MassFunction._from_samples_consonant(histogram, alpha, approximate=True) - raise ValueError('unknown mode: %s' % mode) + if method == 'bayesian': + return MassFunction({(h,):v + s for h, v in histogram.items()}).normalize() + elif method == 'idm': + return MassFunction._from_samples_idm(histogram, s) + elif method == 'maxbel': + return MassFunction._from_samples_maxbel(histogram, alpha) + elif method == 'maxbel-ordered': + return MassFunction._from_samples_maxbel(histogram, alpha, ordered=True) + elif method == 'mcd': + return MassFunction._from_samples_mcd(histogram, alpha) + elif method == 'mcd-approximate': + return MassFunction._from_samples_mcd(histogram, alpha, approximate=True) + raise ValueError('unknown method: %s' % method) + + @staticmethod + def _from_samples_idm(histogram, s): + """ + Reference: + P. Walley (1996), "Inferences from multinomial data: learning about a bag of marbles", + Journal of the Royal Statistical Society. Series B (Methodological), 3-57. 
+ """ + total = sum(histogram.values()) + m = MassFunction() + for h, c in histogram.items(): + m[(h,)] = float(c) / (total + s) + m[MassFunction._convert(histogram.keys())] = float(s) / (total + s) + return m @staticmethod - def _from_samples(histogram, alpha, ordered=False): + def _from_samples_maxbel(histogram, alpha, ordered=False): """ Reference: T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions", @@ -1011,7 +1031,7 @@ def p_lower_set(hs): return MassFunction.from_array(m_optimal, H) @staticmethod - def _from_samples_consonant(histogram, alpha, approximate=False): + def _from_samples_mcd(histogram, alpha, approximate=False): """ Reference: A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence diff --git a/src/pyds_test.py b/src/pyds_test.py index 1954ed1..2035de4 100644 --- a/src/pyds_test.py +++ b/src/pyds_test.py @@ -465,19 +465,19 @@ def test_confidence_intervals(self): def test_from_samples(self): """ - Example 1 (default) and example 7 (ordered) from: + Example 1 (maxbel) and example 7 (ordered) from: T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions", International Journal of Approximate Reasoning 42, 228-252. - Example 6 (consonant) from: + Example 6 (mcd) from: A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence sets of pignistic probabilities", International Journal of Approximate Reasoning 49, 575-594. 
""" precipitation_data = {1:48, 2:17, 3:19, 4:11, 5:6, 6:9} failure_mode_data = {1:5, 2:11, 3:19, 4:30, 5:58, 6:67, 7:92, 8:118, 9:173, 10:297} psych_data = {1:91, 2:49, 3:37, 4:43} - # default - m = MassFunction.from_samples(psych_data, 0.05, mode='default') + # maxbel + m = MassFunction.from_samples(psych_data, method='maxbel', alpha=0.05) p_lower, p_upper = MassFunction._confidence_intervals(psych_data, 0.05) def p_lower_set(hs): l = u = 0 @@ -495,8 +495,8 @@ def p_lower_set(hs): self.assertEqual(1, sum(m.values())) # constraint (25) self.assertGreaterEqual(min(m.values()), 0) # constraint (26) self.assertGreaterEqual(bel_sum, 6.23) # optimization criterion - # ordered - m = MassFunction.from_samples(precipitation_data, 0.05, mode='ordered') + # maxbel-ordered + m = MassFunction.from_samples(precipitation_data, method='maxbel-ordered', alpha=0.05) self.assertAlmostEqual(0.32, m[(1,)], 2) self.assertAlmostEqual(0.085, m[(2,)], 3) self.assertAlmostEqual(0.098, m[(3,)], 3) @@ -507,18 +507,23 @@ def p_lower_set(hs): self.assertAlmostEqual(0.11, m[range(1, 6)], 2) self.assertAlmostEqual(0.012, m[range(2, 6)], 2) self.assertAlmostEqual(0.14, m[range(2, 7)], 2) - # consonant + # mcd poss = {1: 0.171, 2: 0.258, 3: 0.353, 4: 0.462, 5: 0.688, 6: 0.735, 7: 0.804, 8: 0.867, 9: 0.935, 10: 1.0} # 8: 0.873 - m = MassFunction.from_samples(failure_mode_data, 0.1, mode='consonant') + m = MassFunction.from_samples(failure_mode_data, method='mcd', alpha=0.1) self._assert_equal_belief(MassFunction.from_possibility(poss), m, 1) - # consonant-approximate - m = MassFunction.from_samples(failure_mode_data, 0.1, mode='consonant-approximate') + # mcd-approximate + m = MassFunction.from_samples(failure_mode_data, method='mcd-approximate', alpha=0.1) poss = {1: 0.171, 2: 0.258, 3: 0.353, 4: 0.462, 5: 0.688, 6: 0.747, 7: 0.875, 8: 0.973, 9: 1.0, 10: 1.0} self._assert_equal_belief(MassFunction.from_possibility(poss), m, 2) # bayesian - m = MassFunction.from_samples(precipitation_data, 0.05, 
mode='bayesian') + m = MassFunction.from_samples(precipitation_data, method='bayesian', s=0) for e, n in precipitation_data.items(): - self.assertEqual(n / float(sum(precipitation_data.values())), m[(e,)]) + self.assertEqual(n / float(sum(precipitation_data.values())), m[(e,)]) + # idm + m = MassFunction.from_samples(precipitation_data, method='idm', s=1) + self.assertAlmostEqual(1. / float(sum(precipitation_data.values()) + 1), m[MassFunction._convert(precipitation_data.keys())]) + for e, n in precipitation_data.items(): + self.assertAlmostEqual(n / float(sum(precipitation_data.values()) + 1), m[(e,)]) def test_powerset(self): s = range(2)