diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml
index 881091a3..c546c0e1 100644
--- a/.github/workflows/check-build.yml
+++ b/.github/workflows/check-build.yml
@@ -1,7 +1,9 @@
 name: check-build
 
 on:
-  pull_request
+  push:
+  pull_request:
+  workflow_dispatch:
 
 env:
   PYTHONUTF8: "1"
@@ -27,8 +29,6 @@ jobs:
           - os: ubuntu-20.04
             python-version: '3.6'  # test Python 3.6 on older Ubuntu instead
     steps:
-      # - name: update
-      #   run: sudo apt-get -y update
       - uses: actions/checkout@v3
       - name: Setup Python ${{ matrix.python-version }}
         uses: actions/setup-python@v3
@@ -42,12 +42,18 @@ jobs:
         run: choco install wget unzip
       - name: Install python dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install pytest
-          pip install .[ja]
-          pip install .[ko]
+          python3 -m pip install --upgrade pip
+          pip3 install .[dev]
+          pip3 install .[ja]
+          pip3 install .[ko]
+      - name: Lint with Mypy
+        run: mypy sacrebleu scripts test
+      - name: Lint with Ruff
+        uses: chartboost/ruff-action@v1
       - name: Python pytest test suite
         run: python3 -m pytest
       - name: CLI bash test suite
         shell: bash
         run: ./test.sh
+      - name: Build
+        run: python3 setup.py sdist bdist_wheel
diff --git a/.gitignore b/.gitignore
index 6bf040a7..e2d4f533 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ __pycache__
 sacrebleu.egg-info
 .sacrebleu
 *~
-.DS_Store
\ No newline at end of file
+.DS_Store
+.idea/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 14f74df1..d7196284 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,14 @@
 # Release Notes
-
-- 2.4.0 (2023-11-07)
+- 2.4.0 (2023-12-11)
   Added:
   - WMT23 test sets (test set `wmt23`)
 
+- 2.3.3 (2023-11-28)
+  Fixed:
+  - Typing issues (#249, #250)
+  - Improved builds (#252)
+
 - 2.3.2 (2023-11-06)
   Fixed:
   - Special treatment of empty references in TER (#232)
diff --git a/Makefile b/Makefile
index f6f82360..6b378c70 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 .PHONY: test
 test:
+	mypy sacrebleu scripts test
 	python3 -m pytest
 	bash test.sh
diff --git a/mypy.ini b/mypy.ini
index 7207d687..26ff0958 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -18,3 +18,12 @@ ignore_missing_imports = True
 
 [mypy-MeCab.*]
 ignore_missing_imports = True
+
+[mypy-mecab_ko.*]
+ignore_missing_imports = True
+
+[mypy-mecab_ko_dic.*]
+ignore_missing_imports = True
+
+[mypy-sentencepiece.*]
+ignore_missing_imports = True
diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index c3405280..4a3bcab7 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -18,14 +18,26 @@
 __description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'
 
-from .utils import smart_open, SACREBLEU_DIR, download_test_set  # noqa: F401
-from .utils import get_source_file, get_reference_files  # noqa: F401
-from .utils import get_available_testsets, get_langpairs_for_testset  # noqa: F401
-from .metrics.helpers import extract_word_ngrams, extract_char_ngrams  # noqa: F401
-from .dataset import DATASETS  # noqa: F401
-from .metrics import BLEU, CHRF, TER  # noqa: F401
+from .utils import smart_open, SACREBLEU_DIR, download_test_set
+from .utils import get_source_file, get_reference_files
+from .utils import get_available_testsets, get_langpairs_for_testset
+from .metrics.helpers import extract_word_ngrams, extract_char_ngrams
+from .dataset import DATASETS
+from .metrics import BLEU, CHRF, TER
 
 # Backward compatibility functions for old style API access (<= 1.4.10)
-from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu  # noqa: F401
-from .compat import corpus_chrf, sentence_chrf  # noqa: F401
-from .compat import corpus_ter, sentence_ter  # noqa: F401
+from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu
+from .compat import corpus_chrf, sentence_chrf
+from .compat import corpus_ter, sentence_ter
+
+__all__ = [
+    'smart_open', 'SACREBLEU_DIR', 'download_test_set',
+    'get_source_file', 'get_reference_files',
+    'get_available_testsets', 'get_langpairs_for_testset',
+    'extract_word_ngrams', 'extract_char_ngrams',
+    'DATASETS',
+    'BLEU', 'CHRF', 'TER',
+    'corpus_bleu', 'raw_corpus_bleu', 'sentence_bleu',
+    'corpus_chrf', 'sentence_chrf',
+    'corpus_ter', 'sentence_ter'
+]
diff --git a/sacrebleu/compat.py b/sacrebleu/compat.py
index cce90e9f..57359603 100644
--- a/sacrebleu/compat.py
+++ b/sacrebleu/compat.py
@@ -64,7 +64,7 @@ def raw_corpus_bleu(hypotheses: Sequence[str],
 def sentence_bleu(hypothesis: str,
                   references: Sequence[str],
                   smooth_method: str = 'exp',
-                  smooth_value: float = None,
+                  smooth_value: Optional[float] = None,
                   lowercase: bool = False,
                   tokenize=BLEU.TOKENIZER_DEFAULT,
                   use_effective_order: bool = True) -> BLEUScore:
diff --git a/sacrebleu/dataset/__init__.py b/sacrebleu/dataset/__init__.py
index b44a1a70..6ec8aad8 100644
--- a/sacrebleu/dataset/__init__.py
+++ b/sacrebleu/dataset/__init__.py
@@ -14,6 +14,19 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
+
+# This defines data locations.
+# Right below are test sets.
+# Beneath each test set, we define the location to download the test data.
+# The other keys are each language pair contained in the tarball, and the respective locations of the source and reference data within each.
+# Many of these are *.sgm files, which are processed to produced plain text that can be used by this script.
+# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
+from .fake_sgml import FakeSGMLDataset, WMTAdditionDataset
+from .iwslt_xml import IWSLTXMLDataset
+from .plain_text import PlainTextDataset
+from .tsv import TSVDataset
+from .wmt_xml import WMTXMLDataset
+
 # Detailed document metadata annotation in form DocumentID -> CountryCode - Domain - OptionalFinegrainedCountryCode
 # While the annotation is subjective with many unclear cases, it may provide useful insights
 # when applied on large data (TODO: annotate all documents from recent WMT years, at least for origlang=en, consider renaming "world" to "other").
@@ -59,19 +72,6 @@
 COUNTRIES = sorted(list({v.split("-")[0] for v in SUBSETS["wmt19"].values()}))
 DOMAINS = sorted(list({v.split("-")[1] for v in SUBSETS["wmt19"].values()}))
 
-
-# This defines data locations.
-# At the top level are test sets.
-# Beneath each test set, we define the location to download the test data.
-# The other keys are each language pair contained in the tarball, and the respective locations of the source and reference data within each.
-# Many of these are *.sgm files, which are processed to produced plain text that can be used by this script.
-# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
-from .fake_sgml import FakeSGMLDataset, WMTAdditionDataset
-from .iwslt_xml import IWSLTXMLDataset
-from .plain_text import PlainTextDataset
-from .tsv import TSVDataset
-from .wmt_xml import WMTXMLDataset
-
 DATASETS = {
     # wmt
     "wmt23": WMTXMLDataset(
@@ -180,7 +180,7 @@
     ),
     "wmt21": WMTXMLDataset(
         "wmt21",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -210,7 +210,7 @@
     ),
     "wmt21/B": WMTXMLDataset(
         "wmt21/B",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with reference B.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -226,7 +226,7 @@
     ),
     "wmt21/AB": WMTXMLDataset(
         "wmt21/AB",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with references A and B.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -243,7 +243,7 @@
     ),
     "wmt21/C": WMTXMLDataset(
         "wmt21/C",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with reference C",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -254,7 +254,7 @@
     ),
     "wmt21/AC": WMTXMLDataset(
         "wmt21/AC",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with references A and C",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -265,7 +265,7 @@
     ),
     "wmt21/D": WMTXMLDataset(
         "wmt21/D",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
        description="Official evaluation data for WMT21 with reference D",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -276,7 +276,7 @@
     ),
     "wmt21/dev": WMTXMLDataset(
         "wmt21/dev",
-        data=["http://data.statmt.org/wmt21/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/dev.tgz"],
         description="Development data for WMT21,if multiple references are available, the first one is used.",
         md5=["165da59ac8dfb5b7cafd7e90b1cac672"],
         langpairs={
@@ -290,7 +290,7 @@
     ),
     "wmt20/tworefs": FakeSGMLDataset(
         "wmt20/tworefs",
-        data=["http://data.statmt.org/wmt20/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/test.tgz"],
         description="WMT20 news test sets with two references",
         md5=["3b1f777cfd2fb15ccf66e9bfdb2b1699"],
         langpairs={
@@ -323,7 +323,7 @@
     ),
     "wmt20": FakeSGMLDataset(
         "wmt20",
-        data=["http://data.statmt.org/wmt20/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/test.tgz"],
         description="Official evaluation data for WMT20",
         md5=["3b1f777cfd2fb15ccf66e9bfdb2b1699"],
         langpairs={
@@ -419,7 +419,7 @@
     ),
     "wmt20/dev": FakeSGMLDataset(
         "wmt20/dev",
-        data=["http://data.statmt.org/wmt20/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/dev.tgz"],
         description="Development data for tasks new to 2020.",
         md5=["037f2b37aab74febbb1b2307dc2afb54"],
         langpairs={
@@ -459,7 +459,7 @@
     ),
     "wmt20/robust/set1": PlainTextDataset(
         "wmt20/robust/set1",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 1",
         langpairs={
@@ -475,7 +475,7 @@
     ),
     "wmt20/robust/set2": PlainTextDataset(
         "wmt20/robust/set2",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 2",
         langpairs={
@@ -491,7 +491,7 @@
     ),
     "wmt20/robust/set3": PlainTextDataset(
         "wmt20/robust/set3",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 3",
         langpairs={
@@ -503,7 +503,7 @@
     ),
     "wmt19": FakeSGMLDataset(
         "wmt19",
-        data=["http://data.statmt.org/wmt19/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt19/translation-task/test.tgz"],
         description="Official evaluation data.",
         md5=["84de7162d158e28403103b01aeefc39a"],
         citation=r"""@proceedings{ws-2019-machine,
@@ -612,7 +612,7 @@
     ),
     "wmt19/dev": FakeSGMLDataset(
         "wmt19/dev",
-        data=["http://data.statmt.org/wmt19/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt19/translation-task/dev.tgz"],
         description="Development data for tasks new to 2019.",
         md5=["f2ec7af5947c19e0cacb3882eb208002"],
         langpairs={
@@ -645,7 +645,7 @@
     "wmt19/google/ar": WMTAdditionDataset(
         "wmt19/google/ar",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-ar.ref",
         ],
         description="Additional high-quality reference for WMT19/en-de.",
@@ -658,7 +658,7 @@
     "wmt19/google/arp": WMTAdditionDataset(
         "wmt19/google/arp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-arp.ref",
         ],
         description="Additional paraphrase of wmt19/google/ar.",
@@ -671,7 +671,7 @@
     "wmt19/google/wmtp": WMTAdditionDataset(
         "wmt19/google/wmtp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-wmtp.ref",
         ],
         description="Additional paraphrase of the official WMT19 reference.",
@@ -684,7 +684,7 @@
     "wmt19/google/hqr": WMTAdditionDataset(
         "wmt19/google/hqr",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqr.ref",
         ],
         description="Best human selected-reference between wmt19 and wmt19/google/ar.",
@@ -697,7 +697,7 @@
     "wmt19/google/hqp": WMTAdditionDataset(
         "wmt19/google/hqp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqp.ref",
         ],
         description="Best human-selected reference between wmt19/google/arp and wmt19/google/wmtp.",
@@ -710,7 +710,7 @@
     "wmt19/google/hqall": WMTAdditionDataset(
         "wmt19/google/hqall",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqall.ref",
         ],
         description="Best human-selected reference among original official reference and the Google reference and paraphrases.",
@@ -722,7 +722,7 @@
     ),
     "wmt18": FakeSGMLDataset(
         "wmt18",
-        data=["http://data.statmt.org/wmt18/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/test.tgz"],
         md5=["f996c245ecffea23d0006fa4c34e9064"],
         description="Official evaluation data.",
         citation='@inproceedings{bojar-etal-2018-findings,\n title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",\n author = "Bojar, Ond{\v{r}}ej and\n Federmann, Christian and\n Fishel, Mark and\n Graham, Yvette and\n Haddow, Barry and\n Koehn, Philipp and\n Monz, Christof",\n booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",\n month = oct,\n year = "2018",\n address = "Belgium, Brussels",\n publisher = "Association for Computational Linguistics",\n url = "https://www.aclweb.org/anthology/W18-6401",\n pages = "272--303",\n}',
@@ -787,7 +787,7 @@
     ),
     "wmt18/test-ts": FakeSGMLDataset(
         "wmt18/test-ts",
-        data=["http://data.statmt.org/wmt18/translation-task/test-ts.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/test-ts.tgz"],
         md5=["5c621a34d512cc2dd74162ae7d00b320"],
         description="Official evaluation sources with extra test sets interleaved.",
         langpairs={
@@ -809,7 +809,7 @@
     ),
     "wmt18/dev": FakeSGMLDataset(
         "wmt18/dev",
-        data=["http://data.statmt.org/wmt18/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/dev.tgz"],
         md5=["486f391da54a7a3247f02ebd25996f24"],
         description="Development data (Estonian<>English).",
         langpairs={
@@ -825,7 +825,7 @@
     ),
     "wmt17": FakeSGMLDataset(
         "wmt17",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2017:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huang, Shujian and Huck, Matthias and Koehn, Philipp and Liu, Qun and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Rubino, Raphael and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2017 Conference on Machine Translation (WMT17)},\n booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},\n month = {September},\n year = {2017},\n address = {Copenhagen, Denmark},\n publisher = {Association for Computational Linguistics},\n pages = {169--214},\n url = {http://www.aclweb.org/anthology/W17-4717}\n}",
@@ -890,7 +890,7 @@
     ),
     "wmt17/B": FakeSGMLDataset(
         "wmt17/B",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Additional reference for EN-FI and FI-EN.",
         langpairs={
@@ -902,7 +902,7 @@
     ),
     "wmt17/tworefs": FakeSGMLDataset(
         "wmt17/tworefs",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Systems with two references.",
         langpairs={
@@ -915,7 +915,7 @@
     ),
     "wmt17/improved": FakeSGMLDataset(
         "wmt17/improved",
-        data=["http://data.statmt.org/wmt17/translation-task/test-update-1.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test-update-1.tgz"],
         md5=["91dbfd5af99bc6891a637a68e04dfd41"],
         description="Improved zh-en and en-zh translations.",
         langpairs={
@@ -925,7 +925,7 @@
     ),
     "wmt17/dev": FakeSGMLDataset(
         "wmt17/dev",
-        data=["http://data.statmt.org/wmt17/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/dev.tgz"],
         md5=["9b1aa63c1cf49dccdd20b962fe313989"],
         description="Development sets released for new languages in 2017.",
         langpairs={
@@ -951,7 +951,7 @@
         "wmt17/ms",
         data=[
             "https://github.com/MicrosoftTranslator/Translator-HumanParityData/archive/master.zip",
-            "http://data.statmt.org/wmt17/translation-task/test-update-1.tgz",
+            "https://data.statmt.org/wmt17/translation-task/test-update-1.tgz",
         ],
         md5=["18fdaa7a3c84cf6ef688da1f6a5fa96f", "91dbfd5af99bc6891a637a68e04dfd41"],
         description="Additional Chinese-English references from Microsoft Research.",
@@ -967,7 +967,7 @@
     ),
     "wmt16": FakeSGMLDataset(
         "wmt16",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2016:WMT1,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Neveol, Aurelie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},\n title = {Findings of the 2016 Conference on Machine Translation},\n booktitle = {Proceedings of the First Conference on Machine Translation},\n month = {August},\n year = {2016},\n address = {Berlin, Germany},\n publisher = {Association for Computational Linguistics},\n pages = {131--198},\n url = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}",
@@ -1024,7 +1024,7 @@
     ),
     "wmt16/B": FakeSGMLDataset(
         "wmt16/B",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="Additional reference for EN-FI.",
         langpairs={
@@ -1036,7 +1036,7 @@
     ),
     "wmt16/tworefs": FakeSGMLDataset(
         "wmt16/tworefs",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="EN-FI with two references.",
         langpairs={
@@ -1049,7 +1049,7 @@
     ),
     "wmt16/dev": FakeSGMLDataset(
         "wmt16/dev",
-        data=["http://data.statmt.org/wmt16/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/dev.tgz"],
         md5=["4a3dc2760bb077f4308cce96b06e6af6"],
         description="Development sets released for new languages in 2016.",
         langpairs={
@@ -1073,7 +1073,7 @@
     ),
     "wmt15": FakeSGMLDataset(
         "wmt15",
-        data=["http://statmt.org/wmt15/test.tgz"],
+        data=["https://statmt.org/wmt15/test.tgz"],
         md5=["67e3beca15e69fe3d36de149da0a96df"],
         description="Official evaluation data.",
citation="@InProceedings{bojar-EtAl:2015:WMT,\n author = {Bojar, Ond\\v{r}ej and Chatterjee, Rajen and Federmann, Christian and Haddow, Barry and Huck, Matthias and Hokamp, Chris and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Scarton, Carolina and Specia, Lucia and Turchi, Marco},\n title = {Findings of the 2015 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation},\n month = {September},\n year = {2015},\n address = {Lisbon, Portugal},\n publisher = {Association for Computational Linguistics},\n pages = {1--46},\n url = {http://aclweb.org/anthology/W15-3001}\n}", @@ -1122,7 +1122,7 @@ ), "wmt14": FakeSGMLDataset( "wmt14", - data=["http://statmt.org/wmt14/test-filtered.tgz"], + data=["https://statmt.org/wmt14/test-filtered.tgz"], md5=["84c597844c1542e29c2aff23aaee4310"], description="Official evaluation data.", citation="@InProceedings{bojar-EtAl:2014:W14-33,\n author = {Bojar, Ondrej and Buck, Christian and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Leveling, Johannes and Monz, Christof and Pecina, Pavel and Post, Matt and Saint-Amand, Herve and Soricut, Radu and Specia, Lucia and Tamchyna, Ale\\v{s}},\n title = {Findings of the 2014 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},\n month = {June},\n year = {2014},\n address = {Baltimore, Maryland, USA},\n publisher = {Association for Computational Linguistics},\n pages = {12--58},\n url = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}", @@ -1171,7 +1171,7 @@ ), "wmt14/full": FakeSGMLDataset( "wmt14/full", - data=["http://statmt.org/wmt14/test-full.tgz"], + data=["https://statmt.org/wmt14/test-full.tgz"], md5=["a8cd784e006feb32ac6f3d9ec7eb389a"], description="Evaluation data released after official evaluation for further research.", langpairs={ @@ -1219,7 +1219,7 @@ ), "wmt13": FakeSGMLDataset( "wmt13", - data=["http://statmt.org/wmt13/test.tgz"], + data=["https://statmt.org/wmt13/test.tgz"], md5=["48eca5d02f637af44e85186847141f67"], description="Official evaluation data.", citation="@InProceedings{bojar-EtAl:2013:WMT,\n author = {Bojar, Ond\\v{r}ej and Buck, Christian and Callison-Burch, Chris and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2013 {Workshop on Statistical Machine Translation}},\n booktitle = {Proceedings of the Eighth Workshop on Statistical Machine Translation},\n month = {August},\n year = {2013},\n address = {Sofia, Bulgaria},\n publisher = {Association for Computational Linguistics},\n pages = {1--44},\n url = {http://www.aclweb.org/anthology/W13-2201}\n}", @@ -1238,7 +1238,7 @@ ), "wmt12": FakeSGMLDataset( "wmt12", - data=["http://statmt.org/wmt12/test.tgz"], + data=["https://statmt.org/wmt12/test.tgz"], md5=["608232d34ebc4ba2ff70fead45674e47"], description="Official evaluation data.", citation="@InProceedings{callisonburch-EtAl:2012:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Post, Matt and Soricut, Radu and Specia, Lucia},\n title = {Findings of the 2012 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},\n month = {June},\n year = {2012},\n address = {Montr{'e}al, Canada},\n publisher = {Association for Computational Linguistics},\n pages = {10--51},\n url 
= {http://www.aclweb.org/anthology/W12-3102}\n}", @@ -1255,7 +1255,7 @@ ), "wmt11": FakeSGMLDataset( "wmt11", - data=["http://statmt.org/wmt11/test.tgz"], + data=["https://statmt.org/wmt11/test.tgz"], md5=["b0c9680adf32d394aefc2b24e3a5937e"], description="Official evaluation data.", citation="@InProceedings{callisonburch-EtAl:2011:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Zaidan, Omar},\n title = {Findings of the 2011 Workshop on Statistical Machine Translation},\n booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n month = {July},\n year = {2011},\n address = {Edinburgh, Scotland},\n publisher = {Association for Computational Linguistics},\n pages = {22--64},\n url = {http://www.aclweb.org/anthology/W11-2103}\n}", @@ -1272,7 +1272,7 @@ ), "wmt10": FakeSGMLDataset( "wmt10", - data=["http://statmt.org/wmt10/test.tgz"], + data=["https://statmt.org/wmt10/test.tgz"], md5=["491cb885a355da5a23ea66e7b3024d5c"], description="Official evaluation data.", citation="@InProceedings{callisonburch-EtAl:2010:WMT,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Peterson, Kay and Przybocki, Mark and Zaidan, Omar},\n title = {Findings of the 2010 Joint Workshop on Statistical Machine Translation and Metrics for Machine Translation},\n booktitle = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and MetricsMATR},\n month = {July},\n year = {2010},\n address = {Uppsala, Sweden},\n publisher = {Association for Computational Linguistics},\n pages = {17--53},\n note = {Revised August 2010},\n url = {http://www.aclweb.org/anthology/W10-1703}\n}", @@ -1289,7 +1289,7 @@ ), "wmt09": FakeSGMLDataset( "wmt09", - data=["http://statmt.org/wmt09/test.tgz"], + data=["https://statmt.org/wmt09/test.tgz"], md5=["da227abfbd7b666ec175b742a0d27b37"], description="Official evaluation data.", citation="@InProceedings{callisonburch-EtAl:2009:WMT-09,\n author = {Callison-Burch, Chris and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Findings of the 2009 {W}orkshop on {S}tatistical {M}achine {T}ranslation},\n booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation},\n month = {March},\n year = {2009},\n address = {Athens, Greece},\n publisher = {Association for Computational Linguistics},\n pages = {1--28},\n url = {http://www.aclweb.org/anthology/W/W09/W09-0401}\n}", @@ -1310,7 +1310,7 @@ ), "wmt08": FakeSGMLDataset( "wmt08", - data=["http://statmt.org/wmt08/test.tgz"], + data=["https://statmt.org/wmt08/test.tgz"], md5=["0582e4e894a3342044059c894e1aea3d"], description="Official evaluation data.", citation="@InProceedings{callisonburch-EtAl:2008:WMT,\n author = {Callison-Burch, Chris and Fordyce, Cameron and Koehn, Philipp and Monz, Christof and Schroeder, Josh},\n title = {Further Meta-Evaluation of Machine Translation},\n booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation},\n month = {June},\n year = {2008},\n address = {Columbus, Ohio},\n publisher = {Association for Computational Linguistics},\n pages = {70--106},\n url = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}", @@ -1329,7 +1329,7 @@ ), "wmt08/nc": FakeSGMLDataset( "wmt08/nc", - data=["http://statmt.org/wmt08/test.tgz"], + data=["https://statmt.org/wmt08/test.tgz"], md5=["0582e4e894a3342044059c894e1aea3d"], description="Official evaluation data (news commentary).", langpairs={ @@ -1339,7 +1339,7 @@ ), "wmt08/europarl": FakeSGMLDataset( "wmt08/europarl", - 
data=["http://statmt.org/wmt08/test.tgz"], + data=["https://statmt.org/wmt08/test.tgz"], md5=["0582e4e894a3342044059c894e1aea3d"], description="Official evaluation data (Europarl).", langpairs={ diff --git a/sacrebleu/dataset/__main__.py b/sacrebleu/dataset/__main__.py index 22954920..5b13d59a 100644 --- a/sacrebleu/dataset/__main__.py +++ b/sacrebleu/dataset/__main__.py @@ -16,6 +16,8 @@ for item in DATASETS.values(): if item.md5 is not None: + assert item.data + assert item.md5 assert len(item.data) == len(item.md5) pairs = zip(item.data, item.md5) for url, md5_hash in pairs: diff --git a/sacrebleu/dataset/base.py b/sacrebleu/dataset/base.py index ba6e65ba..cf3c092f 100644 --- a/sacrebleu/dataset/base.py +++ b/sacrebleu/dataset/base.py @@ -4,7 +4,7 @@ import os import re from abc import ABCMeta, abstractmethod -from typing import Dict, List +from typing import Dict, List, Optional from ..utils import SACREBLEU_DIR, download_file, smart_open @@ -13,10 +13,10 @@ class Dataset(metaclass=ABCMeta): def __init__( self, name: str, - data: List[str] = None, - description: str = None, - citation: str = None, - md5: List[str] = None, + data: Optional[List[str]] = None, + description: Optional[str] = None, + citation: Optional[str] = None, + md5: Optional[List[str]] = None, langpairs=Dict[str, List[str]], **kwargs, ): diff --git a/sacrebleu/dataset/wmt_xml.py b/sacrebleu/dataset/wmt_xml.py index 92c96d57..d5eb5d86 100644 --- a/sacrebleu/dataset/wmt_xml.py +++ b/sacrebleu/dataset/wmt_xml.py @@ -76,7 +76,7 @@ def _unwrap_wmt21_or_later(raw_file): def get_sents(doc): return { int(seg.get("id")): seg.text if seg.text else "" - for seg in doc.findall(f".//seg") + for seg in doc.findall(".//seg") } ref_docs = doc.findall(".//ref") @@ -114,7 +114,7 @@ def _get_langpair_path(self, langpair): in order to allow for overriding which test set to use. """ langpair_data = self._get_langpair_metadata(langpair)[langpair] - rel_path = langpair_data["path"] if type(langpair_data) == dict else langpair_data[0] + rel_path = langpair_data["path"] if isinstance(langpair_data, dict) else langpair_data[0] return os.path.join(self._rawdir, rel_path) def process_to_text(self, langpair=None): @@ -156,7 +156,7 @@ def _get_langpair_allowed_refs(self, langpair): """ defaults = self.kwargs.get("refs", []) langpair_data = self._get_langpair_metadata(langpair)[langpair] - if type(langpair_data) == dict: + if isinstance(langpair_data, dict): allowed_refs = langpair_data.get("refs", defaults) else: allowed_refs = defaults diff --git a/sacrebleu/sacrebleu.py b/sacrebleu/sacrebleu.py index 7edbe3ac..d778e1db 100755 --- a/sacrebleu/sacrebleu.py +++ b/sacrebleu/sacrebleu.py @@ -50,7 +50,7 @@ try: # SIGPIPE is not available on Windows machines, throwing an exception. - from signal import SIGPIPE + from signal import SIGPIPE # type: ignore # If SIGPIPE is available, change behaviour to default instead of ignore. from signal import signal, SIG_DFL diff --git a/sacrebleu/significance.py b/sacrebleu/significance.py index b39e0a59..a9c71d0a 100644 --- a/sacrebleu/significance.py +++ b/sacrebleu/significance.py @@ -1,7 +1,7 @@ import os import logging import multiprocessing as mp -from typing import Sequence, Dict, Optional, Tuple, List, Union, Any +from typing import Sequence, Dict, Optional, Tuple, List, Union, Any, Mapping import numpy as np @@ -77,11 +77,11 @@ def _bootstrap_resample(stats: List[List[Union[int, float]]], idxs = rng.choice(len(stats), size=(n_samples, len(stats)), replace=True) # convert to numpy array. 
float32 is more efficient - stats = np.array(stats, dtype='float32') + stats_np = np.array(stats, dtype='float32') # recompute scores for all resamples scores = [ - metric._compute_score_from_stats(_s.sum(0)) for _s in stats[idxs]] + metric._compute_score_from_stats(_s.sum(0)) for _s in stats_np[idxs]] return str(seed).lower(), scores @@ -98,7 +98,7 @@ def _compute_p_value(stats: np.ndarray, real_difference: float) -> float: # "the != is important. if we want to score the same system against itself # having a zero difference should not be attributed to chance." - c = np.sum(stats > real_difference) + c = np.sum(stats > real_difference).item() # "+1 applies here, though it only matters for small numbers of shufflings, # which we typically never do. it's necessary to ensure the probability of @@ -186,8 +186,9 @@ def _paired_ar_test(baseline_info: Dict[str, Tuple[np.ndarray, Result]], sacrelogger.info(f' > Performing bootstrap resampling for confidence interval (# resamples: {n_ar_confidence})') sys_stats = np.array(sys_stats, dtype='float32') # recompute scores for all resamples - sys_scores = [ - metric._compute_score_from_stats(_s.sum(0)).score for _s in sys_stats[bs_idxs]] + sys_scores = np.array([ + metric._compute_score_from_stats(_s.sum(0)).score for _s in sys_stats[bs_idxs] + ]) res.mean, res.ci = estimate_ci(sys_scores) # Store the result @@ -300,7 +301,7 @@ class PairedTest: } def __init__(self, named_systems: List[Tuple[str, Sequence[str]]], - metrics: Dict[str, Metric], + metrics: Mapping[str, Metric], references: Optional[Sequence[Sequence[str]]], test_type: str = 'ar', n_samples: int = 0, diff --git a/sacrebleu/tokenizers/tokenizer_spm.py b/sacrebleu/tokenizers/tokenizer_spm.py index a50d0fb8..92729b5b 100644 --- a/sacrebleu/tokenizers/tokenizer_spm.py +++ b/sacrebleu/tokenizers/tokenizer_spm.py @@ -2,7 +2,6 @@ import os import logging -import urllib.request from functools import lru_cache from ..utils import SACREBLEU_DIR, download_file diff --git a/sacrebleu/utils.py b/sacrebleu/utils.py index 6187e3b3..56e6fcab 100644 --- a/sacrebleu/utils.py +++ b/sacrebleu/utils.py @@ -423,9 +423,7 @@ def download_file(source_path, dest_path, extract_to=None, expected_md5=None): with portalocker.Lock(lockfile, timeout=60): if not os.path.exists(dest_path) or os.path.getsize(dest_path) == 0: - sacrelogger.info(f"Downloading {source_path} to {dest_path}") - md5 = hashlib.md5() try: with urllib.request.urlopen(source_path) as f, open(dest_path, 'wb') as out: @@ -441,7 +439,7 @@ def download_file(source_path, dest_path, extract_to=None, expected_md5=None): if cur_md5 != expected_md5: sacrelogger.error(f'Fatal: MD5 sum of downloaded file was incorrect (got {cur_md5}, expected {expected_md5}).') sacrelogger.error(f'Please manually delete {dest_path!r} and rerun the command.') - sacrelogger.error(f'If the problem persists, the tarball may have changed, in which case, please contact the SacreBLEU maintainer.') + sacrelogger.error('If the problem persists, the tarball may have changed, in which case, please contact the SacreBLEU maintainer.') sys.exit(1) # Extract the tarball @@ -594,4 +592,4 @@ def print_subset_results(metrics, full_system, full_refs, args): print(f'{key}: sentences={n_system:<6} {score.name:<{max_metric_width}} = {score.score:.{w}f}') # import at the end to avoid circular import -from .dataset import DATASETS, SUBSETS, DOMAINS, COUNTRIES +from .dataset import DATASETS, SUBSETS, DOMAINS, COUNTRIES # noqa: E402 diff --git a/scripts/perf_test.py b/scripts/perf_test.py index 
1cf2b484..f2812db5 100644 --- a/scripts/perf_test.py +++ b/scripts/perf_test.py @@ -5,8 +5,8 @@ sys.path.insert(0, '.') -import sacrebleu -from sacrebleu.metrics import BLEU, CHRF +import sacrebleu # noqa: E402 +from sacrebleu.metrics import BLEU, CHRF # noqa: E402 N_REPEATS = 5 diff --git a/setup.cfg b/setup.cfg index 2e0f031c..a3fd11de 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,3 @@ [metadata] -description-file = README.md -license_file = LICENSE.txt +description_file = README.md +license_files = LICENSE.txt diff --git a/setup.py b/setup.py index 00c70029..f104799e 100755 --- a/setup.py +++ b/setup.py @@ -130,6 +130,9 @@ def get_long_description(): # Specify the Python versions you support here. In particular, ensure # that you indicate whether you support Python 2, Python 3 or both. 'Programming Language :: Python :: 3 :: Only', + + # Indicate that type hints are provided + 'Typing :: Typed' ], # What does your project relate to? @@ -151,7 +154,8 @@ def get_long_description(): # dependencies). You can install these using the following syntax, # for example: # $ pip install -e .[dev,test] - extras_require={'ja': ['mecab-python3>=1.0.5,<=1.0.6', 'ipadic>=1.0,<2.0'], + extras_require={'dev': ['wheel', 'pytest', 'mypy', 'types-tabulate', 'lxml-stubs'], + 'ja': ['mecab-python3>=1.0.5,<=1.0.6', 'ipadic>=1.0,<2.0'], 'ko': ['mecab-ko>=1.0.0,<=1.0.1', 'mecab-ko-dic>=1.0,<2.0']}, # To provide executable scripts, use entry points in preference to the diff --git a/test.sh b/test.sh index ee1657f6..1bb5720a 100755 --- a/test.sh +++ b/test.sh @@ -96,7 +96,7 @@ cd data if [[ ! -d wmt17-submitted-data ]]; then echo "Downloading and unpacking WMT'17 system submissions (46 MB)..." - wget -q http://data.statmt.org/wmt17/translation-task/wmt17-submitted-data-v1.0.tgz + wget -q https://data.statmt.org/wmt17/translation-task/wmt17-submitted-data-v1.0.tgz tar xzf wmt17-submitted-data-v1.0.tgz fi diff --git a/test/test_api.py b/test/test_api.py index 511b4c78..02ac9f03 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -44,7 +44,7 @@ def test_api_get_available_testsets(): the test sets found. """ available = get_available_testsets() - assert type(available) is list + assert isinstance(available, list) assert "wmt19" in available assert "wmt05" not in available @@ -59,12 +59,12 @@ def test_api_get_available_testsets_for_langpair(): the test sets found. 
""" available = get_available_testsets_for_langpair('en-it') - assert type(available) is list + assert isinstance(available, list) assert "wmt09" in available assert "wmt15" not in available available = get_available_testsets_for_langpair('en-fr') - assert type(available) is list + assert isinstance(available, list) assert "wmt11" in available assert "mtedx/test" in available assert "wmt20" not in available @@ -77,7 +77,7 @@ def test_api_get_langpairs_for_testset(): """ for testset in DATASETS.keys(): available = get_langpairs_for_testset(testset) - assert type(available) is list + assert isinstance(available, list) for langpair in DATASETS[testset].langpairs.keys(): # skip non-language keys if "-" not in langpair: diff --git a/test/test_dataset.py b/test/test_dataset.py index cff796dd..d3ece8cb 100644 --- a/test/test_dataset.py +++ b/test/test_dataset.py @@ -106,7 +106,7 @@ def test_wmt22_references(): # and that ref:A is the default for all languages where it wasn't overridden for langpair, langpair_data in wmt22.langpairs.items(): - if type(langpair_data) == dict: + if isinstance(langpair_data, dict): assert wmt22._get_langpair_allowed_refs(langpair) != ["ref:A"] else: assert wmt22._get_langpair_allowed_refs(langpair) == ["ref:A"] diff --git a/test/test_significance.py b/test/test_significance.py index 46679ac4..f7098328 100644 --- a/test/test_significance.py +++ b/test/test_significance.py @@ -1,9 +1,10 @@ import os from collections import defaultdict +from typing import DefaultDict from sacrebleu.metrics import BLEU -from sacrebleu.significance import PairedTest +from sacrebleu.significance import PairedTest, Result import pytest @@ -57,8 +58,8 @@ def _read_pickle_file(): } -SACREBLEU_BS_P_VALS = defaultdict(float) -SACREBLEU_AR_P_VALS = defaultdict(float) +SACREBLEU_BS_P_VALS: DefaultDict[str, float] = defaultdict(float) +SACREBLEU_AR_P_VALS: DefaultDict[str, float] = defaultdict(float) # Load data from pickled file to not bother with WMT17 downloading named_systems = _read_pickle_file() @@ -75,7 +76,9 @@ def _read_pickle_file(): test_type='bs', n_samples=2000)()[1] for name, result in zip(bs_scores['System'], bs_scores['BLEU']): + assert isinstance(result, Result) if result.p_value is not None: + assert isinstance(name, str) SACREBLEU_BS_P_VALS[name] += result.p_value @@ -87,7 +90,9 @@ def _read_pickle_file(): test_type='ar', n_samples=10000)()[1] for name, result in zip(ar_scores['System'], ar_scores['BLEU']): + assert isinstance(result, Result) if result.p_value is not None: + assert isinstance(name, str) SACREBLEU_AR_P_VALS[name] += result.p_value