BLEU variable references: fix for empty hypotheses (#134)
* Allow variable number of references for BLEU via API

This allows BLEU to use a variable number of references (a different
number of references for each sentence). If a sentence has fewer
than the maximum number of references, None is used to fill the
remaining reference streams (see the usage sketch after the file
summary below).

* Add more tests

* Update credits

Co-authored-by: Ozan Caglayan <ozancag@gmail.com>
Co-authored-by: Matt Post <post@cs.jhu.edu>
3 people authored Jan 20, 2021
1 parent a5e1137 commit 7bd5d88
Showing 4 changed files with 46 additions and 15 deletions.
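A minimal usage sketch of the variable-reference API described in the commit message; the sentences are illustrative, and the None-padding convention is the one the message states:

```python
import sacrebleu

hyps = ['The dog bit the man.', 'It was not unexpected.']

# Two parallel reference streams, each aligned with `hyps`. The second
# sentence has only one reference, so the second stream carries None at
# that position (the padding convention described above).
refs = [
    ['The dog bit the man.', 'It was not unexpected.'],
    ['The dog had bit the man.', None],
]

bleu = sacrebleu.corpus_bleu(hyps, refs)
print(bleu.score)
```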
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -11,7 +11,8 @@
   to make two methods behave the same.
 - Add smoothing value to BLEU signature (#98)
 - dataset: Fix IWSLT links (#128)
-- Allow variable number of references for BLEU (only via API) (#130)
+- Allow variable number of references for BLEU (only via API) (#130).
+  Thanks to Ondrej Dusek (@tuetschek)
 
 - 1.4.14 (2020-09-13)
   - Added character-based tokenization (`-tok char`).
4 changes: 3 additions & 1 deletion README.md
@@ -12,6 +12,8 @@ Why use this version of BLEU?
 - It produces the same values as official script (`mteval-v13a.pl`) used by WMT
 - It outputs the BLEU score without the comma, so you don't have to remove it with `sed` (Looking at you, `multi-bleu.perl`)
 
+The official version is hosted at <https://github.com/mjpost/sacrebleu>.
+
 # QUICK START
 
 Install the Python module (Python 3 only)
@@ -111,7 +113,7 @@ SacreBLEU is licensed under the Apache 2.0 License.
 
 This was all Rico Sennrich's idea.
 Originally written by Matt Post.
-The official version can be found at <https://github.com/mjpost/sacrebleu>.
+New features and ongoing support provided by Martin Popel (@martinpopel) and Ozan Caglayan (@ozancaglayan).
 
 If you use SacreBLEU, please cite the following:
 
10 changes: 5 additions & 5 deletions sacrebleu/metrics/bleu.py
@@ -278,11 +278,11 @@ def corpus_score(self, sys_stream: Union[str, Iterable[str]],
         if any(line is None for line in sys_stream):
             raise EOFError("Undefined line in system stream!")
 
-        fhs = [sys_stream] + ref_streams
-        for lines in zip(*fhs):
-            # remove undefined references (i.e. we have fewer references for this particular sentence)
-            lines = [x for x in lines if x is not None and x != ""]
-            if len(lines) < 2:  # we need at least system + 1 defined reference
+        for output, *refs in zip(sys_stream, *ref_streams):
+            # remove undefined/empty references (i.e. we have fewer references for this particular sentence)
+            # but keep empty hypothesis (it's always defined thanks to the sanity check above)
+            lines = [output] + [x for x in refs if x is not None and x != ""]
+            if len(lines) < 2:  # we need at least hypothesis + 1 defined & non-empty reference
                 raise EOFError("No valid references for a sentence!")
 
         if self.lc:
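To see why the rewrite matters, here is an illustrative sketch (not part of the diff; the strings are made up) contrasting the old and new filtering:

```python
# Old behavior: the hypothesis went through the same filter as the
# references, so an empty hypothesis vanished and a reference slid into
# its place.
old_lines = [x for x in ('', 'ref A', None) if x is not None and x != ""]
assert old_lines == ['ref A']  # the empty hypothesis was lost

# New behavior: the hypothesis is kept unconditionally (the sanity check
# above guarantees it is defined) and only the references are filtered.
output, refs = '', ('ref A', None)
new_lines = [output] + [x for x in refs if x is not None and x != ""]
assert new_lines == ['', 'ref A']  # the empty hypothesis is preserved
```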
44 changes: 36 additions & 8 deletions test/test_bleu.py
@@ -5,7 +5,7 @@
 # is located at
 #
 # http://aws.amazon.com/apache2.0/
-# 
+#
 # or in the "license" file accompanying this file. This file is distributed on
 # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 # express or implied. See the License for the specific language governing
@@ -19,9 +19,30 @@
 
 Statistics = namedtuple('Statistics', ['common', 'total'])
 
-test_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963),
-              (["this is a test"], ["this is a test"], 1.0),
-              (["this is a fest"], ["this is a test"], 0.223606797749979)]
+test_raw_bleu_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963),
+                       (["this is a test"], ["this is a test"], 1.0),
+                       (["this is a fest"], ["this is a test"], 0.223606797749979)]
+
+# test for README example with empty hypothesis strings check
+_refs = [
+    ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
+    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
+]
+
+_hyps = [
+    'The dog bit the man.',
+    "It wasn't surprising.",
+    'The man had just bitten him.',
+]
+
+test_corpus_bleu_cases = [
+    (_hyps, _refs, {}, 48.530827),  # test for default BLEU settings
+    (('', '', ''), _refs, {}, 0.0),  # ensure that empty hypotheses are not removed
+    (_hyps, _refs, {'tokenize': 'none'}, 49.1919566),
+    (_hyps, _refs, {'tokenize': '13a'}, 48.530827),
+    (_hyps, _refs, {'tokenize': 'intl'}, 43.91623493),
+    (_hyps, _refs, {'smooth_method': 'none'}, 48.530827),
+]
 
 test_case_offset = [("am I am a character sequence", "I am a symbol string sequence a a", 0.1555722182, 0)]
 
@@ -37,8 +58,8 @@
 test_case_scoring = [((Statistics([9, 7, 5, 3], [10, 8, 6, 4]), 11, 11), 0.8375922397)]
 
 test_case_effective_order = [(["test"], ["a test"], 0.3678794411714425),
-                            (["a test"], ["a test"], 1.0),
-                            (["a little test"], ["a test"], 0.03218297948685433)]
+                             (["a test"], ["a test"], 1.0),
+                             (["a little test"], ["a test"], 0.03218297948685433)]
 
 
 # testing that right score is returned for null statistics and different offsets
@@ -51,12 +72,18 @@
                               ((Statistics([0, 0, 0, 0], [0, 0, 0, 0]), 1, 5), 0.01, 0.0)]
 
 
-@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_cases)
-def test_bleu(hypotheses, references, expected_bleu):
+@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_raw_bleu_cases)
+def test_raw_bleu(hypotheses, references, expected_bleu):
     bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100
     assert abs(bleu - expected_bleu) < EPSILON
 
 
+@pytest.mark.parametrize("hypotheses, references, kwargs, expected_bleu", test_corpus_bleu_cases)
+def test_corpus_bleu(hypotheses, references, kwargs, expected_bleu):
+    bleu = sacrebleu.corpus_bleu(hypotheses, references, **kwargs).score
+    assert abs(bleu - expected_bleu) < EPSILON
+
+
 @pytest.mark.parametrize("hypotheses, references, expected_bleu", test_case_effective_order)
 def test_effective_order(hypotheses, references, expected_bleu):
     bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100
@@ -90,6 +117,7 @@ def test_offset(hypothesis, reference, expected_with_offset, expected_without_offset):
     score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference).score / 100
     assert abs(expected_with_offset - score_with_offset) < EPSILON
 
+
 @pytest.mark.parametrize("statistics, offset, expected_score", test_case_degenerate_stats)
 def test_degenerate_statistics(statistics, offset, expected_score):
     score = sacrebleu.compute_bleu(statistics[0].common, statistics[0].total, statistics[1], statistics[2], smooth_method='floor', smooth_value=offset).score / 100
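The `(('', '', ''), _refs, {}, 0.0)` case above pins down the headline fix. An end-to-end sketch of the same check, reusing the reference data from the new tests (the 0.0 expectation comes from the test case, not from a fresh derivation):

```python
import sacrebleu

refs = [
    ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'],
    ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'],
]

# Three empty hypotheses against two reference streams. Before this fix,
# empty hypotheses were filtered out like undefined references; now they
# are kept and scored.
bleu = sacrebleu.corpus_bleu(['', '', ''], refs)
print(bleu.score)  # expected: 0.0, per test_corpus_bleu_cases
```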
