From cf5523bb89ec60c85065e0c6dbc44362f301367f Mon Sep 17 00:00:00 2001
From: Thomas Reineking <thomas.reineking@gmail.com>
Date: Thu, 9 Oct 2014 22:03:05 +0200
Subject: [PATCH] refactor belief construction; add IDM method for belief
 construction

---
 src/examples.py  | 10 +++++++
 src/pyds.py      | 72 +++++++++++++++++++++++++++++++-----------------
 src/pyds_test.py | 29 +++++++++++--------
 3 files changed, 73 insertions(+), 38 deletions(-)

diff --git a/src/examples.py b/src/examples.py
index cb2aab9..2b77065 100644
--- a/src/examples.py
+++ b/src/examples.py
@@ -20,6 +20,7 @@
 Shows different use cases of the library.
 """
 
+from __future__ import print_function
 from pyds import MassFunction
 from itertools import product
 
@@ -89,3 +90,12 @@
 print('vacuous extension of m_1 to {1, 2} =', extended)
 projected = extended.map(lambda h: (t[0] for t in h))
 print('project m_1 back to its original frame =', projected)
+
+print('\n=== construct belief from data ===')
+hist = {'a':2, 'b':0, 'c':1}
+print('histogram:', hist)
+print('maximum likelihood:', MassFunction.from_samples(hist, 'bayesian', s=0))
+print('Laplace smoothing:', MassFunction.from_samples(hist, 'bayesian', s=1))
+print('IDM:', MassFunction.from_samples(hist, 'idm'))
+print('MaxBel:', MassFunction.from_samples(hist, 'maxbel'))
+print('MCD:', MassFunction.from_samples(hist, 'mcd'))
diff --git a/src/pyds.py b/src/pyds.py
index 2155267..6562194 100644
--- a/src/pyds.py
+++ b/src/pyds.py
@@ -894,7 +894,7 @@ def _confidence_intervals(histogram, alpha):
         return p_lower, p_upper
     
     @staticmethod
-    def from_samples(histogram, alpha=0.05, mode='default'):
+    def from_samples(histogram, method='idm', alpha=0.05, s=1.0):
         """
         Generate a mass function from an empirical probability distribution that was obtained from a limited number of samples.
         This makes the expected deviation of the empirical distribution from the true distribution explicit.
@@ -902,33 +902,37 @@ def from_samples(histogram, alpha=0.05, mode='default'):
         'histogram' represents the empirical distribution. It is a dictionary mapping each possible event to the respective
         number of observations (represented as integers).
         
-        'mode' determines the algorithm used for generating the mass function.
-        Except for mode 'bayesian', all algorithms are based on the idea that the true probabilities lie within confidence intervals
+        'method' determines the algorithm used for generating the mass function.
+        Except for methods 'bayesian' and 'idm', all algorithms are based on the idea that the true probabilities lie within confidence intervals
         represented by the mass function with confidence level 1 - 'alpha'.
         
         The following modes are supported:
         
-        'default': Maximize the total belief by solving a linear program. (Attention: this becomes very expensive computationally
+        'idm': Imprecise Dirichlet model. Each event receives its count divided by (N + 's'), where N is the total number of samples, and the remaining mass 's' / (N + 's') is assigned to the entire frame.
+        For more information on 'idm', see:
+        P. Walley (1996), "Inferences from multinomial data: learning about a bag of marbles",
+        Journal of the Royal Statistical Society. Series B (Methodological), 3-57.
+        
+        'maxbel': Maximize the total belief by solving a linear program. (Attention: this becomes very computationally expensive
         for larger numbers of events.)
         
-        'ordered': Similar to 'default' except that the events are assumed to have a natural order (e.g., intervals), in which case
+        'maxbel-ordered': Similar to 'maxbel' except that the events are assumed to have a natural order (e.g., intervals), in which case
         the mass function can be computed analytically and thus much faster.
         
-        For more information on 'default' and 'ordered', see:
+        For more information on 'maxbel' and 'maxbel-ordered', see:
         T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions",
         International Journal of Approximate Reasoning 42, 228-252.
         
-        'consonant': Compute the least committed consonant mass function whose pignistic transformation lies within the confidence interval
-        induced by 'alpha'. Like 'default', it is based on solving a linear program and quickly becomes computationally expensive.
+        'mcd': Compute the least committed consonant mass function whose pignistic transformation lies within the confidence interval
+        induced by 'alpha'. Like 'maxbel', it is based on solving a linear program and quickly becomes computationally expensive.
         
-        'consonant-approximate': An approximation of 'consonant' that can be computed much more efficiently.
+        'mcd-approximate': An approximation of 'mcd' that can be computed much more efficiently.
         
-        For more information on these two modes, see:
+        For more information on these two methods, see:
         A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence sets of pignistic probabilities",
         International Journal of Approximate Reasoning 49, 575-594.
         
-        'bayesian': Disregard the number of samples and assume the true probability distribution is equal to the empirical one.
-        
+        'bayesian': Construct a Bayesian mass function from the relative frequencies with additive smoothing controlled by 's' (s=0 gives the maximum likelihood estimate, s=1 gives Laplace smoothing).
         
         In case the sample number is 0, returns a vacuous mass function (or uniform distribution for 'bayesian').
         
@@ -943,24 +947,40 @@ def from_samples(histogram, alpha=0.05, mode='default'):
             return MassFunction()
         if sum(histogram.values()) == 0: # return vacuous/uniform belief if there are no samples
             vac = MassFunction({tuple(histogram.keys()):1})
-            if mode == 'bayesian':
+            if method == 'bayesian':
                 return vac.pignistic()
             else:
                 return vac
-        if mode == 'bayesian':
-            return MassFunction({(h,):v for h, v in histogram.items()}).normalize()
-        elif mode == 'default':
-            return MassFunction._from_samples(histogram, alpha)
-        elif mode == 'ordered':
-            return MassFunction._from_samples(histogram, alpha, ordered=True)
-        elif mode == 'consonant':
-            return MassFunction._from_samples_consonant(histogram, alpha)
-        elif mode == 'consonant-approximate':
-            return MassFunction._from_samples_consonant(histogram, alpha, approximate=True)
-        raise ValueError('unknown mode: %s' % mode)
+        if method == 'bayesian':
+            return MassFunction({(h,):v + s for h, v in histogram.items()}).normalize()
+        elif method == 'idm':
+            return MassFunction._from_samples_idm(histogram, s)
+        elif method == 'maxbel':
+            return MassFunction._from_samples_maxbel(histogram, alpha)
+        elif method == 'maxbel-ordered':
+            return MassFunction._from_samples_maxbel(histogram, alpha, ordered=True)
+        elif method == 'mcd':
+            return MassFunction._from_samples_mcd(histogram, alpha)
+        elif method == 'mcd-approximate':
+            return MassFunction._from_samples_mcd(histogram, alpha, approximate=True)
+        raise ValueError('unknown method: %s' % method)
+    
+    @staticmethod
+    def _from_samples_idm(histogram, s):
+        """
+        Reference:
+        P. Walley (1996), "Inferences from multinomial data: learning about a bag of marbles",
+        Journal of the Royal Statistical Society. Series B (Methodological), 3-57.
+        """
+        total = sum(histogram.values())
+        m = MassFunction()
+        for h, c in histogram.items():
+            m[(h,)] = float(c) / (total + s)
+        m[MassFunction._convert(histogram.keys())] = float(s) / (total + s)
+        return m
     
     @staticmethod
-    def _from_samples(histogram, alpha, ordered=False):
+    def _from_samples_maxbel(histogram, alpha, ordered=False):
         """
         Reference:
         T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions",
@@ -1011,7 +1031,7 @@ def p_lower_set(hs):
             return MassFunction.from_array(m_optimal, H)
         
     @staticmethod
-    def _from_samples_consonant(histogram, alpha, approximate=False):
+    def _from_samples_mcd(histogram, alpha, approximate=False):
         """
         Reference:
         A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence
diff --git a/src/pyds_test.py b/src/pyds_test.py
index 1954ed1..2035de4 100644
--- a/src/pyds_test.py
+++ b/src/pyds_test.py
@@ -465,19 +465,19 @@ def test_confidence_intervals(self):
     
     def test_from_samples(self):
         """
-        Example 1 (default) and example 7 (ordered) from:
+        Example 1 (maxbel) and example 7 (maxbel-ordered) from:
         T. Denoeux (2006), "Constructing belief functions from sample data using multinomial confidence regions",
         International Journal of Approximate Reasoning 42, 228-252.
         
-        Example 6 (consonant) from:
+        Example 6 (mcd) from:
         A. Aregui, T. Denoeux (2008), "Constructing consonant belief functions from sample data using confidence sets of pignistic probabilities",
         International Journal of Approximate Reasoning 49, 575-594.
         """
         precipitation_data = {1:48, 2:17, 3:19, 4:11, 5:6, 6:9}
         failure_mode_data = {1:5, 2:11, 3:19, 4:30, 5:58, 6:67, 7:92, 8:118, 9:173, 10:297}
         psych_data = {1:91, 2:49, 3:37, 4:43}
-        # default
-        m = MassFunction.from_samples(psych_data, 0.05, mode='default')
+        # maxbel
+        m = MassFunction.from_samples(psych_data, method='maxbel', alpha=0.05)
         p_lower, p_upper = MassFunction._confidence_intervals(psych_data, 0.05)
         def p_lower_set(hs):
             l = u = 0
@@ -495,8 +495,8 @@ def p_lower_set(hs):
         self.assertEqual(1, sum(m.values())) # constraint (25)
         self.assertGreaterEqual(min(m.values()), 0) # constraint (26)
         self.assertGreaterEqual(bel_sum, 6.23) # optimization criterion
-        # ordered
-        m = MassFunction.from_samples(precipitation_data, 0.05, mode='ordered')
+        # maxbel-ordered
+        m = MassFunction.from_samples(precipitation_data, method='maxbel-ordered', alpha=0.05)
         self.assertAlmostEqual(0.32,  m[(1,)], 2)
         self.assertAlmostEqual(0.085, m[(2,)], 3)
         self.assertAlmostEqual(0.098, m[(3,)], 3)
@@ -507,18 +507,23 @@ def p_lower_set(hs):
         self.assertAlmostEqual(0.11,  m[range(1, 6)], 2)
         self.assertAlmostEqual(0.012, m[range(2, 6)], 2)
         self.assertAlmostEqual(0.14,  m[range(2, 7)], 2)
-        # consonant
+        # mcd
         poss = {1: 0.171, 2: 0.258, 3: 0.353, 4: 0.462, 5: 0.688, 6: 0.735, 7: 0.804, 8: 0.867, 9: 0.935, 10: 1.0} # 8: 0.873
-        m = MassFunction.from_samples(failure_mode_data, 0.1, mode='consonant') 
+        m = MassFunction.from_samples(failure_mode_data, method='mcd', alpha=0.1) 
         self._assert_equal_belief(MassFunction.from_possibility(poss), m, 1)
-        # consonant-approximate
-        m = MassFunction.from_samples(failure_mode_data, 0.1, mode='consonant-approximate')
+        # mcd-approximate
+        m = MassFunction.from_samples(failure_mode_data, method='mcd-approximate', alpha=0.1)
         poss = {1: 0.171, 2: 0.258, 3: 0.353, 4: 0.462, 5: 0.688, 6: 0.747, 7: 0.875, 8: 0.973, 9: 1.0, 10: 1.0} 
         self._assert_equal_belief(MassFunction.from_possibility(poss), m, 2)
         # bayesian
-        m = MassFunction.from_samples(precipitation_data, 0.05, mode='bayesian')
+        m = MassFunction.from_samples(precipitation_data, method='bayesian', s=0)
         for e, n in precipitation_data.items():
-            self.assertEqual(n / float(sum(precipitation_data.values())), m[(e,)]) 
+            self.assertEqual(n / float(sum(precipitation_data.values())), m[(e,)])
+        # idm
+        m = MassFunction.from_samples(precipitation_data, method='idm', s=1)
+        self.assertAlmostEqual(1. / float(sum(precipitation_data.values()) + 1), m[MassFunction._convert(precipitation_data.keys())])
+        for e, n in precipitation_data.items():
+            self.assertAlmostEqual(n / float(sum(precipitation_data.values()) + 1), m[(e,)])
     
     def test_powerset(self):
         s = range(2)