diff --git a/CHANGELOG.md b/CHANGELOG.md index f95eeab5..928390b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,8 @@ to make two methods behave the same. - Add smoothing value to BLEU signature (#98) - dataset: Fix IWSLT links (#128) - - Allow variable number of references for BLEU (only via API) (#130) + - Allow variable number of references for BLEU (only via API) (#130). + Thanks to Ondrej Dusek (@tuetschek) - 1.4.14 (2020-09-13) - Added character-based tokenization (`-tok char`). diff --git a/README.md b/README.md index 84a81929..68cfc34f 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ Why use this version of BLEU? - It produces the same values as official script (`mteval-v13a.pl`) used by WMT - It outputs the BLEU score without the comma, so you don't have to remove it with `sed` (Looking at you, `multi-bleu.perl`) +The official version is hosted at <https://github.com/mjpost/sacrebleu>. + # QUICK START Install the Python module (Python 3 only) @@ -111,7 +113,7 @@ SacreBLEU is licensed under the Apache 2.0 License. This was all Rico Sennrich's idea. Originally written by Matt Post. -The official version can be found at <https://github.com/mjpost/sacrebleu>. +New features and ongoing support provided by Martin Popel (@martinpopel) and Ozan Caglayan (@ozancaglayan). If you use SacreBLEU, please cite the following: diff --git a/sacrebleu/metrics/bleu.py b/sacrebleu/metrics/bleu.py index 95ef120e..631d61e3 100644 --- a/sacrebleu/metrics/bleu.py +++ b/sacrebleu/metrics/bleu.py @@ -278,11 +278,11 @@ def corpus_score(self, sys_stream: Union[str, Iterable[str]], if any(line is None for line in sys_stream): raise EOFError("Undefined line in system stream!") - fhs = [sys_stream] + ref_streams - for lines in zip(*fhs): - # remove undefined references (i.e. 
we have fewer references for this particular sentence) - lines = [x for x in lines if x is not None and x != ""] - if len(lines) < 2: # we need at least system + 1 defined reference + for output, *refs in zip(sys_stream, *ref_streams): + # remove undefined/empty references (i.e. we have fewer references for this particular sentence) + # but keep empty hypothesis (it's always defined thanks to the sanity check above) + lines = [output] + [x for x in refs if x is not None and x != ""] + if len(lines) < 2: # we need at least hypothesis + 1 defined & non-empty reference raise EOFError("No valid references for a sentence!") if self.lc: diff --git a/test/test_bleu.py b/test/test_bleu.py index b5a0d160..eaceab20 100644 --- a/test/test_bleu.py +++ b/test/test_bleu.py @@ -5,7 +5,7 @@ # is located at # # http://aws.amazon.com/apache2.0/ -# +# # or in the "license" file accompanying this file. This file is distributed on # an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either # express or implied. 
See the License for the specific language governing @@ -19,9 +19,30 @@ Statistics = namedtuple('Statistics', ['common', 'total']) -test_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963), - (["this is a test"], ["this is a test"], 1.0), - (["this is a fest"], ["this is a test"], 0.223606797749979)] +test_raw_bleu_cases = [(["this is a test", "another test"], ["ref1", "ref2"], 0.003799178428257963), + (["this is a test"], ["this is a test"], 1.0), + (["this is a fest"], ["this is a test"], 0.223606797749979)] + +# test for README example with empty hypothesis strings check +_refs = [ + ['The dog bit the man.', 'It was not unexpected.', 'The man bit him first.'], + ['The dog had bit the man.', 'No one was surprised.', 'The man had bitten the dog.'], +] + +_hyps = [ + 'The dog bit the man.', + "It wasn't surprising.", + 'The man had just bitten him.', +] + +test_corpus_bleu_cases = [ + (_hyps, _refs, {}, 48.530827), # test for default BLEU settings + (('', '', ''), _refs, {}, 0.0), # ensure that empty hypotheses are not removed + (_hyps, _refs, {'tokenize': 'none'}, 49.1919566), + (_hyps, _refs, {'tokenize': '13a'}, 48.530827), + (_hyps, _refs, {'tokenize': 'intl'}, 43.91623493), + (_hyps, _refs, {'smooth_method': 'none'}, 48.530827), +] test_case_offset = [("am I am a character sequence", "I am a symbol string sequence a a", 0.1555722182, 0)] @@ -37,8 +58,8 @@ test_case_scoring = [((Statistics([9, 7, 5, 3], [10, 8, 6, 4]), 11, 11), 0.8375922397)] test_case_effective_order = [(["test"], ["a test"], 0.3678794411714425), - (["a test"], ["a test"], 1.0), - (["a little test"], ["a test"], 0.03218297948685433)] + (["a test"], ["a test"], 1.0), + (["a little test"], ["a test"], 0.03218297948685433)] # testing that right score is returned for null statistics and different offsets @@ -51,12 +72,18 @@ ((Statistics([0, 0, 0, 0], [0, 0, 0, 0]), 1, 5), 0.01, 0.0)] -@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_cases) 
-def test_bleu(hypotheses, references, expected_bleu): +@pytest.mark.parametrize("hypotheses, references, expected_bleu", test_raw_bleu_cases) +def test_raw_bleu(hypotheses, references, expected_bleu): bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100 assert abs(bleu - expected_bleu) < EPSILON +@pytest.mark.parametrize("hypotheses, references, kwargs, expected_bleu", test_corpus_bleu_cases) +def test_corpus_bleu(hypotheses, references, kwargs, expected_bleu): + bleu = sacrebleu.corpus_bleu(hypotheses, references, **kwargs).score + assert abs(bleu - expected_bleu) < EPSILON + + @pytest.mark.parametrize("hypotheses, references, expected_bleu", test_case_effective_order) def test_effective_order(hypotheses, references, expected_bleu): bleu = sacrebleu.raw_corpus_bleu(hypotheses, [references], .01).score / 100 @@ -90,6 +117,7 @@ def test_offset(hypothesis, reference, expected_with_offset, expected_without_of score_with_offset = sacrebleu.raw_corpus_bleu(hypothesis, reference).score / 100 assert abs(expected_with_offset - score_with_offset) < EPSILON + @pytest.mark.parametrize("statistics, offset, expected_score", test_case_degenerate_stats) def test_degenerate_statistics(statistics, offset, expected_score): score = sacrebleu.compute_bleu(statistics[0].common, statistics[0].total, statistics[1], statistics[2], smooth_method='floor', smooth_value=offset).score / 100