From abbd9779192757e50268e463d10011294403a76e Mon Sep 17 00:00:00 2001
From: Christina Ovezik <20790332+LadyChristina@users.noreply.github.com>
Date: Tue, 16 Jan 2024 17:25:23 +0000
Subject: [PATCH] Add theil, mpr and tau to metrics (#138)

---
 config.yaml                                   |  4 ++
 consensus_decentralization/analyze.py         |  3 +
 .../metrics/max_power_ratio.py                | 11 +++
 .../metrics/nakamoto_coefficient.py           | 19 ++----
 .../metrics/tau_index.py                      | 19 ++++++
 .../metrics/theil_index.py                    | 21 ++++++
 docs/metrics.md                               | 32 ++++++---
 tests/test_metrics.py                         | 68 ++++++++++++++++++-
 8 files changed, 151 insertions(+), 26 deletions(-)
 create mode 100644 consensus_decentralization/metrics/max_power_ratio.py
 create mode 100644 consensus_decentralization/metrics/tau_index.py
 create mode 100644 consensus_decentralization/metrics/theil_index.py

diff --git a/config.yaml b/config.yaml
index d2d16ed..6b36a4f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -6,6 +6,10 @@ metrics:
   gini:
   hhi:
   nakamoto_coefficient:
+  theil_index:
+  max_power_ratio:
+  tau_index:
+    threshold: 0.66
 
 default_timeframe:
   start_date: 2010-01-01
diff --git a/consensus_decentralization/analyze.py b/consensus_decentralization/analyze.py
index 706e1c4..ae62450 100644
--- a/consensus_decentralization/analyze.py
+++ b/consensus_decentralization/analyze.py
@@ -5,6 +5,9 @@
 from consensus_decentralization.metrics.nakamoto_coefficient import compute_nakamoto_coefficient  # noqa: F401
 from consensus_decentralization.metrics.entropy import compute_entropy, compute_entropy_percentage  # noqa: F401
 from consensus_decentralization.metrics.herfindahl_hirschman_index import compute_hhi  # noqa: F401
+from consensus_decentralization.metrics.theil_index import compute_theil_index  # noqa: F401
+from consensus_decentralization.metrics.max_power_ratio import compute_max_power_ratio  # noqa: F401
+from consensus_decentralization.metrics.tau_index import compute_tau_index  # noqa: F401
 
 
 def analyze(projects, aggregated_data_filename, output_dir):
diff --git a/consensus_decentralization/metrics/max_power_ratio.py b/consensus_decentralization/metrics/max_power_ratio.py
new file mode 100644
index 0000000..2259058
--- /dev/null
+++ b/consensus_decentralization/metrics/max_power_ratio.py
@@ -0,0 +1,11 @@
+def compute_max_power_ratio(blocks_per_entity):
+    """
+    Calculates the maximum power ratio of a distribution of blocks to entities
+    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :returns: float, the share of all blocks produced by the largest producer (0 if there are no entities or blocks)
+    """
+    block_counts = blocks_per_entity.values()
+    overall = sum(block_counts)
+    if not blocks_per_entity or overall <= 0:
+        return 0
+    return max(block_counts) / overall
diff --git a/consensus_decentralization/metrics/nakamoto_coefficient.py b/consensus_decentralization/metrics/nakamoto_coefficient.py
index 4b4328e..ef4b15c 100644
--- a/consensus_decentralization/metrics/nakamoto_coefficient.py
+++ b/consensus_decentralization/metrics/nakamoto_coefficient.py
@@ -1,19 +1,10 @@
+from consensus_decentralization.metrics.tau_index import compute_tau_index
+
+
 def compute_nakamoto_coefficient(blocks_per_entity):
     """
     Calculates the Nakamoto coefficient of a distribution of blocks to entities
     :param blocks_per_entity: a dictionary with entities and the blocks they have produced
-    :returns: int that represents the Nakamoto coefficient of the given distribution or None if the data is empty
+    :returns: int, the Nakamoto coefficient (tau index for threshold 0.5), or None if there were no blocks
     """
-    total_blocks = sum(blocks_per_entity.values())
-    if total_blocks == 0:
-        return None
-    nc, power_percentage, top_entities = 0, 0, set()
-    while power_percentage < 50:
-        current_max_name = None
-        for (name, blocks) in blocks_per_entity.items():
-            if current_max_name is None or (blocks >= blocks_per_entity[current_max_name] and name not in top_entities):
-                current_max_name = name
-        nc += 1
-        power_percentage += 100 * blocks_per_entity[current_max_name] / total_blocks
-        top_entities.add(current_max_name)
-    return nc
+    return compute_tau_index(blocks_per_entity=blocks_per_entity, threshold=0.5)
diff --git a/consensus_decentralization/metrics/tau_index.py b/consensus_decentralization/metrics/tau_index.py
new file mode 100644
index 0000000..50fbe89
--- /dev/null
+++ b/consensus_decentralization/metrics/tau_index.py
@@ -0,0 +1,19 @@
+def compute_tau_index(blocks_per_entity, threshold):
+    """
+    Calculates the tau-decentralization index of a distribution of blocks
+    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :param threshold: float, the parameter of the tau-decentralization index, i.e. the threshold for the power
+    ratio that is captured by the index (e.g. 0.66 for 66%)
+    :returns: int that corresponds to the tau index of the given distribution, or None if there were no blocks;
+    if float rounding (or a threshold above 1) keeps the covered ratio short, the number of all entities is returned
+    """
+    total_blocks = sum(blocks_per_entity.values())
+    if total_blocks == 0:
+        return None
+    tau_index, power_ratio_covered = 0, 0
+    for nblocks in sorted(blocks_per_entity.values(), reverse=True):  # largest producers first, sorted only once
+        if power_ratio_covered >= threshold:
+            break
+        tau_index += 1
+        power_ratio_covered += nblocks / total_blocks
+    return tau_index
diff --git a/consensus_decentralization/metrics/theil_index.py b/consensus_decentralization/metrics/theil_index.py
new file mode 100644
index 0000000..687834c
--- /dev/null
+++ b/consensus_decentralization/metrics/theil_index.py
@@ -0,0 +1,21 @@
+from math import log
+
+
+def compute_theil_index(blocks_per_entity):
+    """
+    Calculates the Theil index of a distribution of blocks to entities
+    :param blocks_per_entity: a dictionary with entities and the blocks they have produced
+    :returns: float that represents the Theil index of the given distribution (0 if there are no entities or blocks)
+    """
+    n = len(blocks_per_entity)
+    total_blocks = sum(blocks_per_entity.values())
+    if n == 0 or total_blocks == 0:
+        return 0  # nothing to measure; also avoids dividing by a zero mean below
+    mu = total_blocks / n
+    theil = 0
+    for nblocks in blocks_per_entity.values():
+        x = nblocks / mu
+        if x > 0:  # x * log(x) tends to 0 as x -> 0, so entities without blocks contribute nothing
+            theil += x * log(x)
+    theil /= n
+    return theil
diff --git a/docs/metrics.md b/docs/metrics.md
index dd8a16d..25f29d7 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -4,27 +4,37 @@ A metric gets the aggregated data (see [Aggregator](aggregator.md)) and outputs
 The metrics that have been implemented so far are the following:
 
 1. **Nakamoto coefficient**: The Nakamoto coefficient represents the minimum number of entities that
-collectively produce more than 50% of the total blocks within a given timeframe. The output of the metric is an
-integer.
+   collectively produce more than 50% of the total blocks within a given timeframe. The output of the metric is an
+   integer.
 2. **Gini coefficient**: The Gini coefficient represents the degree of inequality in block production. The
-output of the metric is a decimal number in [0,1]. Values close to 0 indicate equality (all entities in
-the system produce the same number of blocks) and values close to 1 indicate inequality (one entity
-produces most or all blocks).
+   output of the metric is a decimal number in [0,1]. Values close to 0 indicate equality (all entities in
+   the system produce the same number of blocks) and values close to 1 indicate inequality (one entity
+   produces most or all blocks).
 3. **Entropy**: Entropy represents the expected amount of information in the distribution of blocks across entities.
-The output of the metric is a real number. Typically, a higher value of entropy indicates higher decentralization
-(lower predictability). Entropy is parameterized by a base rate α, which defines different types of entropy:
+   The output of the metric is a real number. Typically, a higher value of entropy indicates higher decentralization
+   (lower predictability). Entropy is parameterized by a base rate α, which defines different types of entropy:
     - α = -1: min entropy
     - α = 0: Hartley entropy
     - α = 1: Shannon entropy (this is used by default)
     - α = 2: collision entropy
 4. **HHI**: The Herfindahl-Hirschman Index (HHI) is a measure of market concentration. It is defined as the sum of the
-squares of the market shares (as whole numbers, e.g. 40 for 40%) of the entities in the system. The output of the metric
-is a real number in (0, 10000]. Values close to 0 indicate low concentration (many entities produce a similar number of
-blocks) and values close to 1 indicate high concentration (one entity produces most or all blocks). 
-The U.S. Department of Justice has set the following thresholds for interpreting HHI values (in traditional markets):
+   squares of the market shares (as whole numbers, e.g. 40 for 40%) of the entities in the system. The output of the
+   metric is a real number in (0, 10000]. Values close to 0 indicate low concentration (many entities produce a similar
+   number of blocks) and values close to 10000 indicate high concentration (one entity produces most or all blocks).
+   The U.S. Department of Justice has set the following thresholds for interpreting HHI values (in traditional markets):
     - (0, 1500): Competitive market
     - [1500, 2500]: Moderately concentrated market
     - (2500, 10000]: Highly concentrated market
+5. **Theil index**: The Theil index is another measure of entropy which is intended to capture the lack of diversity,
+   or the redundancy, in a population. In practice, it is calculated as the maximum possible entropy minus the observed
+   entropy. The output is a real number. Values close to 0 indicate equality and values towards infinity indicate
+   inequality. Therefore, a high Theil index suggests a population that is highly centralized.
+6. **Max power ratio**: The max power ratio represents the share of blocks that are produced by the most "powerful"
+   entity, i.e. the entity that produces the most blocks. The output of the metric is a decimal number in [0,1].
+7. **Tau-decentralization index**: The tau-decentralization index is a generalization of the Nakamoto coefficient.
+   It is defined as the minimum number of entities that collectively produce more than a given threshold of the total
+   blocks within a given timeframe. The threshold parameter is a decimal in [0, 1] (0.66 by default) and the output of
+   the metric is an integer.
 
 Each metric is implemented in a separate Python script in the folder `metrics`. 
 Each script defines a function named `compute_<metric_name>`, which takes as input a dictionary of the form
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index dca1eae..546180d 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -1,4 +1,5 @@
-from consensus_decentralization.metrics import entropy, gini, nakamoto_coefficient, herfindahl_hirschman_index
+from consensus_decentralization.metrics import (entropy, gini, nakamoto_coefficient, herfindahl_hirschman_index,
+                                                theil_index, max_power_ratio, tau_index)
 import numpy as np
 
 
@@ -111,3 +112,68 @@ def test_hhi():
 
     hhi5 = herfindahl_hirschman_index.compute_hhi(blocks_per_entity={'a': 0, 'b': 0})
     assert hhi5 is None
+
+
+def test_compute_theil_index():
+    """
+    Check compute_theil_index against values obtained from online calculators,
+    such as: http://www.poorcity.richcity.org/calculator/
+    """
+    decimals = 3
+
+    # (distribution, Theil index rounded to `decimals` places)
+    rounded_cases = [
+        ({'a': 3, 'b': 2, 'c': 1}, 0.087),
+        ({'a': 3, 'b': 2, 'c': 1, 'd': 1, 'e': 1, 'f': 1}, 0.115),
+        ({'a': 432, 'b': 0, 'c': 0, 'd': 0}, 1.386),
+        ({'a': 432}, 0),
+    ]
+    for blocks_per_entity, expected in rounded_cases:
+        theil_t = theil_index.compute_theil_index(blocks_per_entity=blocks_per_entity)
+        assert round(theil_t, decimals) == expected
+
+    # An empty distribution must yield exactly 0 (no rounding involved)
+    theil_t = theil_index.compute_theil_index(blocks_per_entity={})
+    assert theil_t == 0
+
+
+def test_compute_max_power_ratio():
+    """
+    Check the max power ratio on skewed, uniform, single-entity and empty distributions
+    """
+    # (distribution, share of blocks held by the largest producer)
+    expected_ratios = [
+        ({'a': 3, 'b': 2, 'c': 1}, 0.5),
+        ({'a': 3, 'b': 2, 'c': 1, 'd': 1, 'e': 1, 'f': 1}, 1 / 3),
+        ({'a': 1}, 1),
+        ({'a': 1, 'b': 1, 'c': 1}, 1 / 3),
+        ({}, 0),
+    ]
+    for blocks_per_entity, expected in expected_ratios:
+        max_mpr = max_power_ratio.compute_max_power_ratio(blocks_per_entity=blocks_per_entity)
+        assert max_mpr == expected
+
+
+def test_tau_33():
+    """
+    Check the tau index for a threshold of 33%, including the empty distribution
+    """
+    threshold = 0.33
+    six_entities = {'a': 3, 'b': 2, 'c': 1, 'd': 1, 'e': 1, 'f': 1}
+    for blocks_per_entity in [{'a': 3, 'b': 2, 'c': 1}, six_entities, {'a': 1}]:
+        tau_idx = tau_index.compute_tau_index(blocks_per_entity=blocks_per_entity, threshold=threshold)
+        assert tau_idx == 1
+
+    tau_idx = tau_index.compute_tau_index(blocks_per_entity={}, threshold=threshold)
+    assert tau_idx is None
+
+
+def test_tau_66():
+    """
+    Check the tau index for a threshold of 66%
+    """
+    six_entities = {'a': 3, 'b': 2, 'c': 1, 'd': 1, 'e': 1, 'f': 1}
+    expected_indices = [({'a': 3, 'b': 2, 'c': 1}, 2), (six_entities, 3), ({'a': 1}, 1)]
+    for blocks_per_entity, expected in expected_indices:
+        tau_idx = tau_index.compute_tau_index(blocks_per_entity=blocks_per_entity, threshold=0.66)
+        assert tau_idx == expected