BLEU variable references: fix for empty hypotheses (#134)
* Allow variable number of references for BLEU via API

This allows BLEU to use a variable number of references (a different
number of references for each sentence). If a sentence has fewer
than the maximum number of references, None is used to fill the
remaining reference streams (see the usage sketch after the file
summary below).

* Add more tests

* Update credits

Co-authored-by: Ozan Caglayan <ozancag@gmail.com>
Co-authored-by: Matt Post <post@cs.jhu.edu>
3 people authored Jan 20, 2021
1 parent a5e1137 commit 7bd5d88
Showing 4 changed files with 46 additions and 15 deletions.
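A minimal usage sketch of the variable-reference API described in the commit message; the sentences are illustrative, and the None-padding convention is the one the message states:

```python
import sacrebleu

hyps = ['The dog bit the man.', 'It was not unexpected.']

# Two parallel reference streams, each aligned with `hyps`. The second
# sentence has only one reference, so the second stream carries None at
# that position (the padding convention described above).
refs = [
    ['The dog bit the man.', 'It was not unexpected.'],
    ['The dog had bit the man.', None],
]

bleu = sacrebleu.corpus_bleu(hyps, refs)
print(bleu.score)
```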
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -11,7 +11,8 @@
   to make two methods behave the same.
 - Add smoothing value to BLEU signature (#98)
 - dataset: Fix IWSLT links (#128)
-- Allow variable number of references for BLEU (only via API) (#130)
+- Allow variable number of references for BLEU (only via API) (#130).
+  Thanks to Ondrej Dusek (@tuetschek)
 
 - 1.4.14 (2020-09-13)
   - Added character-based tokenization (`-tok char`).
4 changes: 3 additions & 1 deletion README.md
@@ -12,6 +12,8 @@ Why use this version of BLEU?
 - It produces the same values as official script (`mteval-v13a.pl`) used by WMT
 - It outputs the BLEU score without the comma, so you don't have to remove it with `sed` (Looking at you, `multi-bleu.perl`)
 
+The official version is hosted at <https://github.com/mjpost/sacrebleu>.
+
 # QUICK START
 
 Install the Python module (Python 3 only)
@@ -111,7 +113,7 @@ SacreBLEU is licensed under the Apache 2.0 License.
 
 This was all Rico Sennrich's idea.
 Originally written by Matt Post.
-The official version can be found at <https://github.com/mjpost/sacrebleu>.
+New features and ongoing support provided by Martin Popel (@martinpopel) and Ozan Caglayan (@ozancaglayan).
 
 If you use SacreBLEU, please cite the following:
 
10 changes: 5 additions & 5 deletions sacrebleu/metrics/bleu.py
@@ -278,11 +278,11 @@ def corpus_score(self, sys_stream: Union[str, Iterable[str]],
         if any(line is None for line in sys_stream):
             raise EOFError("Undefined line in system stream!")
 
-        fhs = [sys_stream] + ref_streams
-        for lines in zip(*fhs):
-            # remove undefined references (i.e. we have fewer references for this particular sentence)
-            lines = [x for x in lines if x is not None and x != ""]
-            if len(lines) < 2:  # we need at least system + 1 defined reference
+        for output, *refs in zip(sys_stream, *ref_streams):
+            # remove undefined/empty references (i.e. we have fewer references for this particular sentence)
+            # but keep empty hypothesis (it's always defined thanks to the sanity check above)
+            lines = [output] + [x for x in refs if x is not None and x != ""]
+            if len(lines) < 2:  # we need at least hypothesis + 1 defined & non-empty reference
                 raise EOFError("No valid references for a sentence!")
 
         if self.lc:
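To see why the rewrite matters, here is an illustrative sketch (not part of the diff; the strings are made up) contrasting the old and new filtering:

```python
# Old behavior: the hypothesis went through the same filter as the
# references, so an empty hypothesis vanished and a reference slid into
# its place.
old_lines = [x for x in ('', 'ref A', None) if x is not None and x != ""]
assert old_lines == ['ref A']  # the empty hypothesis was lost

# New behavior: the hypothesis is kept unconditionally (the sanity check
# above guarantees it is defined) and only the references are filtered.
output, refs = '', ('ref A', None)
new_lines = [output] + [x for x in refs if x is not None and x != ""]
assert new_lines == ['', 'ref A']  # the empty hypothesis is preserved
```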
44 changes: 36 additions & 8 deletions test/test_bleu.py
@@ -5,7 +5,7 @@
 # is located at
 #
 # http://aws.amazon.com/apache2.0/
-# 
+#
 # or in the "license" file accompanying this file. This file is distributed on
 # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 # express or implied. See the License for the specific language governing
@@ -19,9 +19,30 @@
 
 Statistics = namedtuple('Statistics', ['common', 'total'])
 
-test_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963),
-              (["this is a test"], ["this is a test"], 1.0),
-              (["this is a fest"], ["this is a test"], 0.223606797749979)]
+test_raw_bleu_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963),
+                       (["this is a test"], ["this is a test"], 1.0),
+                       (["this is a fest"], ["this is a test"], 0.223606797749979)]
+
+# test for README example with empty hypothesis strings check
+_refs = [
+    ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
+    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
+]
+
+_hyps = [
+    'The dog bit the man.',
+    "It wasn't surprising.",
+    'The man had just bitten him.',
+]
+
+test_corpus_bleu_cases = [
+    (_hyps, _refs, {}, 48.530827),  # test for default BLEU settings
+    (('', '', ''), _refs, {}, 0.0),  # ensure that empty hypotheses are not removed
+    (_hyps, _refs, {'tokenize': 'none'}, 49.1919566),
+    (_hyps, _refs, {'tokenize': '13a'}, 48.530827),
+    (_hyps, _refs, {'tokenize': 'intl'}, 43.91623493),
+    (_hyps, _refs, {'smooth_method': 'none'}, 48.530827),
+]
 
 test_case_offset = [("am I am a character sequence", "I am a symbol string sequence a a", 0.1555722182, 0)]
 
@@ -37,8 +58,8 @@
 test_case_scoring = [((Statistics([9, 7, 5, 3], [10, 8, 6, 4]), 11, 11), 0.8375922397)]
 
 test_case_effective_order = [(["test"], ["a test"], 0.3678794411714425),
-                            (["a test"], ["a test"], 1.0),
-                            (["a little test"], ["a test"], 0.03218297948685433)]
+                             (["a test"], ["a test"], 1.0),
+                             (["a little test"], ["a test"], 0.03218297948685433)]
 
 
 # testing that right score is returned for null statistics and different offsets
@@ -51,12 +72,18 @@
                               ((Statistics([0, 0, 0, 0], [0, 0, 0, 0]), 1, 5), 0.01, 0.0)]
 
 
-@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_cases)
-def test_bleu(hypotheses, references, expected_bleu):
+@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_raw_bleu_cases)
+def test_raw_bleu(hypotheses, references, expected_bleu):
     bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100
     assert abs(bleu - expected_bleu) < EPSILON
 
 
+@pytest.mark.parametrize("hypotheses, references, kwargs, expected_bleu", test_corpus_bleu_cases)
+def test_corpus_bleu(hypotheses, references, kwargs, expected_bleu):
+    bleu = sacrebleu.corpus_bleu(hypotheses, references, **kwargs).score
+    assert abs(bleu - expected_bleu) < EPSILON
+
+
 @pytest.mark.parametrize("hypotheses, references, expected_bleu", test_case_effective_order)
 def test_effective_order(hypotheses, references, expected_bleu):
     bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100
@@ -90,6 +117,7 @@ def test_offset(hypothesis, reference, expected_with_offset, expected_without_offset):
     score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference).score / 100
     assert abs(expected_with_offset - score_with_offset) < EPSILON
 
+
 @pytest.mark.parametrize("statistics, offset, expected_score", test_case_degenerate_stats)
 def test_degenerate_statistics(statistics, offset, expected_score):
     score = sacrebleu.compute_bleu(statistics[0].common, statistics[0].total, statistics[1], statistics[2], smooth_method='floor', smooth_value=offset).score / 100
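The `(('', '', ''), _refs, {}, 0.0)` case above pins down the headline fix. An end-to-end sketch of the same check, reusing the reference data from the new tests (the 0.0 expectation comes from the test case, not from a fresh derivation):

```python
import sacrebleu

refs = [
    ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
]

# Three empty hypotheses against two reference streams. Before this fix,
# empty hypotheses were filtered out like undefined references; now they
# are kept and scored.
bleu = sacrebleu.corpus_bleu(['', '', ''], refs)
print(bleu.score)  # expected: 0.0, per test_corpus_bleu_cases
```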
