pyro-ppl · fritzo · Apr 23, 2024 · Apr 21, 2024 · Apr 21, 2024 · Apr 22, 2024
diff --git a/pyro/ops/stats.py b/pyro/ops/stats.py
@@ -508,3 +508,56 @@ def crps_empirical(pred, truth):
     weight = weight.reshape(weight.shape + (1,) * (diff.dim() - 1))
 
     return (pred - truth).abs().mean(0) - (diff * weight).sum(0) / num_samples**2
+
+
+def energy_score_empirical(pred: torch.Tensor, truth: torch.Tensor) -> torch.Tensor:
+    r"""
+    Computes negative Energy Score ES* (see equation 22 in [1]) between a
+    set of multivariate samples ``pred`` and a true data vector ``truth``. Running time
+    is quadratic in the number of samples ``n``. In case of univariate samples
+    the output coincides with the CRPS::
+
+        ES* = E|pred - truth| - 1/2 E|pred - pred'|
+
+    Note that for a single sample this reduces to the Euclidean norm of the difference between
+    the sample ``pred`` and the ``truth``.
+
+    This is a strictly proper score so that for ``pred`` distributed according to a
+    distribution :math:`P` and ``truth`` distributed according to a distribution :math:`Q`
+    we have :math:`ES^{*}(P,Q) \ge ES^{*}(Q,Q)` with equality holding if and only if :math:`P=Q`, i.e.
+    if :math:`P` and :math:`Q` have the same multivariate distribution (it is not sufficient for
+    :math:`P` and :math:`Q` to have the same marginals in order for equality to hold).
+
+    **References**
+
+    [1] Tilmann Gneiting, Adrian E. Raftery (2007)
+        `Strictly Proper Scoring Rules, Prediction, and Estimation`
+        https://www.stat.washington.edu/raftery/Research/PDF/Gneiting2007jasa.pdf
+
+    :param torch.Tensor pred: A set of sample predictions batched on the second rightmost dim.
+        The rightmost dim is that of the multivariate sample.
+    :param torch.Tensor truth: A tensor of true observations with same shape as ``pred`` except
+        for the second rightmost dim which can have any value or be omitted.
+    :return: A tensor of shape ``truth.shape[:-1]`` (the rightmost, multivariate dim is reduced).
+    :rtype: torch.Tensor
+    :raises ValueError: If ``pred.dim()`` is neither ``truth.dim()`` nor ``truth.dim() + 1``.
+    """
+    extra_dims = pred.dim() - truth.dim()
+    if extra_dims not in (0, 1):
+        raise ValueError(
+            "Expected pred to have at most one extra dim versus truth. "
+            "Actual shapes: {} versus {}".format(pred.shape, truth.shape)
+        )
+    truth_is_single_vector = extra_dims == 1
+    if truth_is_single_vector:
+        # Insert a singleton second rightmost dim so that cdist can pair pred with truth.
+        truth = truth[..., None, :]
+
+    # Mean pred-to-truth distance minus half the mean pairwise distance among pred samples.
+    retval = (
+        torch.cdist(pred, truth).mean(dim=-2)
+        - 0.5 * torch.cdist(pred, pred).mean(dim=[-1, -2])[..., None]
+    )
+
+    # Drop the singleton dim again when it was inserted above.
+    return retval[..., 0] if truth_is_single_vector else retval
diff --git a/tests/ops/test_stats.py b/tests/ops/test_stats.py
@@ -12,6 +12,7 @@
     autocovariance,
     crps_empirical,
     effective_sample_size,
+    energy_score_empirical,
     fit_generalized_pareto,
     gelman_rubin,
     hpdi,
@@ -313,7 +314,7 @@ def test_fit_generalized_pareto(k, sigma, n_samples=5000):
 
 @pytest.mark.parametrize("event_shape", [(), (4,), (3, 2)])
 @pytest.mark.parametrize("num_samples", [1, 2, 3, 4, 10])
-def test_crps_empirical(num_samples, event_shape):
+def test_crps_univariate_energy_score_empirical(num_samples, event_shape):
     truth = torch.randn(event_shape)
     pred = truth + 0.1 * torch.randn((num_samples,) + event_shape)
 
@@ -324,3 +325,33 @@ def test_crps_empirical(num_samples, event_shape):
         pred - pred.unsqueeze(1)
     ).abs().mean([0, 1])
     assert_close(actual, expected)
+
+    expected = energy_score_empirical(
+        pred[..., None].swapaxes(0, -1)[0, ..., None], truth[..., None]
+    )
+    assert_close(actual, expected)
+
+
+@pytest.mark.parametrize("sample_dim", [3, 10, 30, 100])
+def test_multivariate_energy_score(sample_dim, num_samples=10000):
+    """Check the score of a correlated forecast and that an uncorrelated one scores worse."""
+    # Forecast whose coordinates are drawn independently.
+    independent = torch.randn(num_samples, sample_dim)
+    full_shape = independent.shape
+    # Forecast and truth each repeat a single draw across every coordinate.
+    correlated = torch.randn(num_samples, 1).expand(full_shape)
+    truth = torch.randn(num_samples, 1).expand(full_shape)
+
+    score_correlated = energy_score_empirical(correlated, truth).mean()
+    score_independent = energy_score_empirical(independent, truth).mean()
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=RuntimeWarning)
+        from scipy.stats import chi
+
+    # Reference value for the correlated forecast, built from the mean of a chi(1) variable.
+    half_chi_mean = 0.5 * chi(1).mean()
+    target = torch.tensor(half_chi_mean * (2 * sample_dim) ** 0.5)
+    assert_close(score_correlated, target, rtol=0.02)
+    # The independent forecast must score worse by a margin of more than 2%.
+    assert score_correlated * 1.02 < score_independent