From 3150f2b489eb74c1763d23e48a6cb20bc1d19f43 Mon Sep 17 00:00:00 2001
From: Dmitry Ustalov <dmitry.ustalov@gmail.com>
Date: Wed, 22 Nov 2023 09:30:33 +0100
Subject: [PATCH 1/4] Explicitly list the exported symbols (#249)

---
 sacrebleu/__init__.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index e7342fc..ea73221 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -29,3 +29,15 @@
 from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu  # noqa: F401
 from .compat import corpus_chrf, sentence_chrf  # noqa: F401
 from .compat import corpus_ter, sentence_ter  # noqa: F401
+
+__all__ = [
+    'smart_open', 'SACREBLEU_DIR', 'download_test_set',
+    'get_source_file', 'get_reference_files',
+    'get_available_testsets', 'get_langpairs_for_testset',
+    'extract_word_ngrams', 'extract_char_ngrams',
+    'DATASETS',
+    'BLEU', 'CHRF', 'TER',
+    'corpus_bleu', 'raw_corpus_bleu', 'sentence_bleu',
+    'corpus_chrf', 'sentence_chrf',
+    'corpus_ter', 'sentence_ter'
+]

From 275e2920af0f73767597e43acf1c1d05e97227f3 Mon Sep 17 00:00:00 2001
From: Dmitry Ustalov <dmitry.ustalov@jetbrains.com>
Date: Tue, 28 Nov 2023 16:40:56 +0100
Subject: [PATCH 2/4] Update check-build.yml (#252)

* Update check-build.yml to run tests on every push and every pull request
---
 .github/workflows/check-build.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml
index 881091a..d9c113b 100644
--- a/.github/workflows/check-build.yml
+++ b/.github/workflows/check-build.yml
@@ -1,7 +1,9 @@
 name: check-build
 
 on:
-  pull_request
+  push:
+  pull_request:
+  workflow_dispatch:
 
 env:
   PYTHONUTF8: "1"

From 9e57a5176ba8f010f03873478528837ea8bf6ee2 Mon Sep 17 00:00:00 2001
From: Dmitry Ustalov <dmitry.ustalov@jetbrains.com>
Date: Tue, 28 Nov 2023 16:44:25 +0100
Subject: [PATCH 3/4] Use more linters, better (#250)

* Use more linters, better

* Rely on Ruff during CI only due to the outdated Python versions

* Run on Python 3.12, too, and fix typing on Windows

* Do not test on Python 3.12 as mecab-python3 is not available yet

* Build (but not publish) during CI

* Add wheel

* Update check-build.yml

* Update the dataset URLs to use HTTPS
---
 .github/workflows/check-build.yml     |  16 ++--
 .gitignore                            |   3 +-
 Makefile                              |   1 +
 mypy.ini                              |   9 ++
 sacrebleu/__init__.py                 |  18 ++--
 sacrebleu/compat.py                   |   2 +-
 sacrebleu/dataset/__init__.py         | 116 +++++++++++++-------------
 sacrebleu/dataset/__main__.py         |   2 +
 sacrebleu/dataset/base.py             |  10 +--
 sacrebleu/dataset/wmt_xml.py          |   6 +-
 sacrebleu/sacrebleu.py                |   2 +-
 sacrebleu/significance.py             |  15 ++--
 sacrebleu/tokenizers/tokenizer_spm.py |   1 -
 sacrebleu/utils.py                    |   6 +-
 scripts/perf_test.py                  |   4 +-
 setup.cfg                             |   4 +-
 setup.py                              |   6 +-
 test.sh                               |   2 +-
 test/test_api.py                      |   8 +-
 test/test_dataset.py                  |   2 +-
 test/test_significance.py             |  11 ++-
 21 files changed, 134 insertions(+), 110 deletions(-)

diff --git a/.github/workflows/check-build.yml b/.github/workflows/check-build.yml
index d9c113b..c546c0e 100644
--- a/.github/workflows/check-build.yml
+++ b/.github/workflows/check-build.yml
@@ -29,8 +29,6 @@ jobs:
           - os: ubuntu-20.04
             python-version: '3.6'   # test Python 3.6 on older Ubuntu instead
     steps:
-      # - name: update
-      #  run: sudo apt-get -y update
       - uses: actions/checkout@v3
       - name: Setup Python ${{ matrix.python-version }}
         uses: actions/setup-python@v3
@@ -44,12 +42,18 @@ jobs:
         run: choco install wget unzip
       - name: Install python dependencies
         run: |
-          python -m pip install --upgrade pip
-          pip install pytest
-          pip install .[ja]
-          pip install .[ko]
+          python3 -m pip install --upgrade pip
+          pip3 install .[dev]
+          pip3 install .[ja]
+          pip3 install .[ko]
+      - name: Lint with Mypy
+        run: mypy sacrebleu scripts test
+      - name: Lint with Ruff
+        uses: chartboost/ruff-action@v1
       - name: Python pytest test suite
         run: python3 -m pytest
       - name: CLI bash test suite
         shell: bash
         run: ./test.sh
+      - name: Build
+        run: python3 setup.py sdist bdist_wheel
diff --git a/.gitignore b/.gitignore
index 6bf040a..e2d4f53 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@ __pycache__
 sacrebleu.egg-info
 .sacrebleu
 *~
-.DS_Store
\ No newline at end of file
+.DS_Store
+.idea/
diff --git a/Makefile b/Makefile
index f6f8236..6b378c7 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 .PHONY: test
 test:
+	mypy sacrebleu scripts test
 	python3 -m pytest
 	bash test.sh
 
diff --git a/mypy.ini b/mypy.ini
index 7207d68..26ff095 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -18,3 +18,12 @@ ignore_missing_imports = True
 
 [mypy-MeCab.*]
 ignore_missing_imports = True
+
+[mypy-mecab_ko.*]
+ignore_missing_imports = True
+
+[mypy-mecab_ko_dic.*]
+ignore_missing_imports = True
+
+[mypy-sentencepiece.*]
+ignore_missing_imports = True
diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index ea73221..19f7059 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -18,17 +18,17 @@
 __description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'
 
 
-from .utils import smart_open, SACREBLEU_DIR, download_test_set  # noqa: F401
-from .utils import get_source_file, get_reference_files  # noqa: F401
-from .utils import get_available_testsets, get_langpairs_for_testset  # noqa: F401
-from .metrics.helpers import extract_word_ngrams, extract_char_ngrams  # noqa: F401
-from .dataset import DATASETS  # noqa: F401
-from .metrics import BLEU, CHRF, TER  # noqa: F401
+from .utils import smart_open, SACREBLEU_DIR, download_test_set
+from .utils import get_source_file, get_reference_files
+from .utils import get_available_testsets, get_langpairs_for_testset
+from .metrics.helpers import extract_word_ngrams, extract_char_ngrams
+from .dataset import DATASETS
+from .metrics import BLEU, CHRF, TER
 
 # Backward compatibility functions for old style API access (<= 1.4.10)
-from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu  # noqa: F401
-from .compat import corpus_chrf, sentence_chrf  # noqa: F401
-from .compat import corpus_ter, sentence_ter  # noqa: F401
+from .compat import corpus_bleu, raw_corpus_bleu, sentence_bleu
+from .compat import corpus_chrf, sentence_chrf
+from .compat import corpus_ter, sentence_ter
 
 __all__ = [
     'smart_open', 'SACREBLEU_DIR', 'download_test_set',
diff --git a/sacrebleu/compat.py b/sacrebleu/compat.py
index cce90e9..5735960 100644
--- a/sacrebleu/compat.py
+++ b/sacrebleu/compat.py
@@ -64,7 +64,7 @@ def raw_corpus_bleu(hypotheses: Sequence[str],
 def sentence_bleu(hypothesis: str,
                   references: Sequence[str],
                   smooth_method: str = 'exp',
-                  smooth_value: float = None,
+                  smooth_value: Optional[float] = None,
                   lowercase: bool = False,
                   tokenize=BLEU.TOKENIZER_DEFAULT,
                   use_effective_order: bool = True) -> BLEUScore:
diff --git a/sacrebleu/dataset/__init__.py b/sacrebleu/dataset/__init__.py
index 19f4f16..037c8f6 100644
--- a/sacrebleu/dataset/__init__.py
+++ b/sacrebleu/dataset/__init__.py
@@ -14,6 +14,19 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
+
+# This defines data locations.
+# Right below are test sets.
+# Beneath each test set, we define the location to download the test data.
+# The other keys are each language pair contained in the tarball, and the respective locations of the source and reference data within each.
+# Many of these are *.sgm files, which are processed to produce plain text that can be used by this script.
+# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
+from .fake_sgml import FakeSGMLDataset, WMTAdditionDataset
+from .iwslt_xml import IWSLTXMLDataset
+from .plain_text import PlainTextDataset
+from .tsv import TSVDataset
+from .wmt_xml import WMTXMLDataset
+
 # Detailed document metadata annotation in form DocumentID -> CountryCode - Domain - OptionalFinegrainedCountryCode
 # While the annotation is subjective with many unclear cases, it may provide useful insights
 # when applied on large data (TODO: annotate all documents from recent WMT years, at least for origlang=en, consider renaming "world" to "other").
@@ -59,19 +72,6 @@
 COUNTRIES = sorted(list({v.split("-")[0] for v in SUBSETS["wmt19"].values()}))
 DOMAINS = sorted(list({v.split("-")[1] for v in SUBSETS["wmt19"].values()}))
 
-
-# This defines data locations.
-# At the top level are test sets.
-# Beneath each test set, we define the location to download the test data.
-# The other keys are each language pair contained in the tarball, and the respective locations of the source and reference data within each.
-# Many of these are *.sgm files, which are processed to produced plain text that can be used by this script.
-# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
-from .fake_sgml import FakeSGMLDataset, WMTAdditionDataset
-from .iwslt_xml import IWSLTXMLDataset
-from .plain_text import PlainTextDataset
-from .tsv import TSVDataset
-from .wmt_xml import WMTXMLDataset
-
 DATASETS = {
     # wmt
     "wmt22": WMTXMLDataset(
@@ -151,7 +151,7 @@
     ),
     "wmt21": WMTXMLDataset(
         "wmt21",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -181,7 +181,7 @@
     ),
     "wmt21/B": WMTXMLDataset(
         "wmt21/B",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with reference B.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -197,7 +197,7 @@
     ),
     "wmt21/AB": WMTXMLDataset(
         "wmt21/AB",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with references A and B.",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -214,7 +214,7 @@
     ),
     "wmt21/C": WMTXMLDataset(
         "wmt21/C",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with reference C",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -225,7 +225,7 @@
     ),
     "wmt21/AC": WMTXMLDataset(
         "wmt21/AC",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with references A and C",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -236,7 +236,7 @@
     ),
     "wmt21/D": WMTXMLDataset(
         "wmt21/D",
-        data=["http://data.statmt.org/wmt21/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/test.tgz"],
         description="Official evaluation data for WMT21 with reference D",
         md5=["32e7ab995bc318414375d60f0269af92"],
         langpairs={
@@ -247,7 +247,7 @@
     ),
     "wmt21/dev": WMTXMLDataset(
         "wmt21/dev",
-        data=["http://data.statmt.org/wmt21/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt21/translation-task/dev.tgz"],
         description="Development data for WMT21，if multiple references are available, the first one is used.",
         md5=["165da59ac8dfb5b7cafd7e90b1cac672"],
         langpairs={
@@ -261,7 +261,7 @@
     ),
     "wmt20/tworefs": FakeSGMLDataset(
         "wmt20/tworefs",
-        data=["http://data.statmt.org/wmt20/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/test.tgz"],
         description="WMT20 news test sets with two references",
         md5=["3b1f777cfd2fb15ccf66e9bfdb2b1699"],
         langpairs={
@@ -294,7 +294,7 @@
     ),
     "wmt20": FakeSGMLDataset(
         "wmt20",
-        data=["http://data.statmt.org/wmt20/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/test.tgz"],
         description="Official evaluation data for WMT20",
         md5=["3b1f777cfd2fb15ccf66e9bfdb2b1699"],
         langpairs={
@@ -390,7 +390,7 @@
     ),
     "wmt20/dev": FakeSGMLDataset(
         "wmt20/dev",
-        data=["http://data.statmt.org/wmt20/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt20/translation-task/dev.tgz"],
         description="Development data for tasks new to 2020.",
         md5=["037f2b37aab74febbb1b2307dc2afb54"],
         langpairs={
@@ -430,7 +430,7 @@
     ),
     "wmt20/robust/set1": PlainTextDataset(
         "wmt20/robust/set1",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 1",
         langpairs={
@@ -446,7 +446,7 @@
     ),
     "wmt20/robust/set2": PlainTextDataset(
         "wmt20/robust/set2",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 2",
         langpairs={
@@ -462,7 +462,7 @@
     ),
     "wmt20/robust/set3": PlainTextDataset(
         "wmt20/robust/set3",
-        data=["http://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
+        data=["https://data.statmt.org/wmt20/robustness-task/robustness20-3-sets.zip"],
         md5=["a12ac9ebe89b72195041518dffc4a9d5"],
         description="WMT20 robustness task, set 3",
         langpairs={
@@ -474,7 +474,7 @@
     ),
     "wmt19": FakeSGMLDataset(
         "wmt19",
-        data=["http://data.statmt.org/wmt19/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt19/translation-task/test.tgz"],
         description="Official evaluation data.",
         md5=["84de7162d158e28403103b01aeefc39a"],
         citation=r"""@proceedings{ws-2019-machine,
@@ -583,7 +583,7 @@
     ),
     "wmt19/dev": FakeSGMLDataset(
         "wmt19/dev",
-        data=["http://data.statmt.org/wmt19/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt19/translation-task/dev.tgz"],
         description="Development data for tasks new to 2019.",
         md5=["f2ec7af5947c19e0cacb3882eb208002"],
         langpairs={
@@ -616,7 +616,7 @@
     "wmt19/google/ar": WMTAdditionDataset(
         "wmt19/google/ar",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-ar.ref",
         ],
         description="Additional high-quality reference for WMT19/en-de.",
@@ -629,7 +629,7 @@
     "wmt19/google/arp": WMTAdditionDataset(
         "wmt19/google/arp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-arp.ref",
         ],
         description="Additional paraphrase of wmt19/google/ar.",
@@ -642,7 +642,7 @@
     "wmt19/google/wmtp": WMTAdditionDataset(
         "wmt19/google/wmtp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-wmtp.ref",
         ],
         description="Additional paraphrase of the official WMT19 reference.",
@@ -655,7 +655,7 @@
     "wmt19/google/hqr": WMTAdditionDataset(
         "wmt19/google/hqr",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqr.ref",
         ],
         description="Best human selected-reference between wmt19 and wmt19/google/ar.",
@@ -668,7 +668,7 @@
     "wmt19/google/hqp": WMTAdditionDataset(
         "wmt19/google/hqp",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqp.ref",
         ],
         description="Best human-selected reference between wmt19/google/arp and wmt19/google/wmtp.",
@@ -681,7 +681,7 @@
     "wmt19/google/hqall": WMTAdditionDataset(
         "wmt19/google/hqall",
         data=[
-            "http://data.statmt.org/wmt19/translation-task/test.tgz",
+            "https://data.statmt.org/wmt19/translation-task/test.tgz",
             "https://raw.githubusercontent.com/google/wmt19-paraphrased-references/master/wmt19/ende/wmt19-ende-hqall.ref",
         ],
         description="Best human-selected reference among original official reference and the Google reference and paraphrases.",
@@ -693,7 +693,7 @@
     ),
     "wmt18": FakeSGMLDataset(
         "wmt18",
-        data=["http://data.statmt.org/wmt18/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/test.tgz"],
         md5=["f996c245ecffea23d0006fa4c34e9064"],
         description="Official evaluation data.",
         citation='@inproceedings{bojar-etal-2018-findings,\n    title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",\n    author = "Bojar, Ond{\v{r}}ej  and\n      Federmann, Christian  and\n      Fishel, Mark  and\n      Graham, Yvette  and\n      Haddow, Barry  and\n      Koehn, Philipp  and\n      Monz, Christof",\n    booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",\n    month = oct,\n    year = "2018",\n    address = "Belgium, Brussels",\n    publisher = "Association for Computational Linguistics",\n    url = "https://www.aclweb.org/anthology/W18-6401",\n    pages = "272--303",\n}',
@@ -758,7 +758,7 @@
     ),
     "wmt18/test-ts": FakeSGMLDataset(
         "wmt18/test-ts",
-        data=["http://data.statmt.org/wmt18/translation-task/test-ts.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/test-ts.tgz"],
         md5=["5c621a34d512cc2dd74162ae7d00b320"],
         description="Official evaluation sources with extra test sets interleaved.",
         langpairs={
@@ -780,7 +780,7 @@
     ),
     "wmt18/dev": FakeSGMLDataset(
         "wmt18/dev",
-        data=["http://data.statmt.org/wmt18/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt18/translation-task/dev.tgz"],
         md5=["486f391da54a7a3247f02ebd25996f24"],
         description="Development data (Estonian<>English).",
         langpairs={
@@ -796,7 +796,7 @@
     ),
     "wmt17": FakeSGMLDataset(
         "wmt17",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2017:WMT1,\n  author    = {Bojar, Ond\\v{r}ej  and  Chatterjee, Rajen  and  Federmann, Christian  and  Graham, Yvette  and  Haddow, Barry  and  Huang, Shujian  and  Huck, Matthias  and  Koehn, Philipp  and  Liu, Qun  and  Logacheva, Varvara  and  Monz, Christof  and  Negri, Matteo  and  Post, Matt  and  Rubino, Raphael  and  Specia, Lucia  and  Turchi, Marco},\n  title     = {Findings of the 2017 Conference on Machine Translation (WMT17)},\n  booktitle = {Proceedings of the Second Conference on Machine Translation, Volume 2: Shared Task Papers},\n  month     = {September},\n  year      = {2017},\n  address   = {Copenhagen, Denmark},\n  publisher = {Association for Computational Linguistics},\n  pages     = {169--214},\n  url       = {http://www.aclweb.org/anthology/W17-4717}\n}",
@@ -861,7 +861,7 @@
     ),
     "wmt17/B": FakeSGMLDataset(
         "wmt17/B",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Additional reference for EN-FI and FI-EN.",
         langpairs={
@@ -873,7 +873,7 @@
     ),
     "wmt17/tworefs": FakeSGMLDataset(
         "wmt17/tworefs",
-        data=["http://data.statmt.org/wmt17/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test.tgz"],
         md5=["86a1724c276004aa25455ae2a04cef26"],
         description="Systems with two references.",
         langpairs={
@@ -886,7 +886,7 @@
     ),
     "wmt17/improved": FakeSGMLDataset(
         "wmt17/improved",
-        data=["http://data.statmt.org/wmt17/translation-task/test-update-1.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/test-update-1.tgz"],
         md5=["91dbfd5af99bc6891a637a68e04dfd41"],
         description="Improved zh-en and en-zh translations.",
         langpairs={
@@ -896,7 +896,7 @@
     ),
     "wmt17/dev": FakeSGMLDataset(
         "wmt17/dev",
-        data=["http://data.statmt.org/wmt17/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt17/translation-task/dev.tgz"],
         md5=["9b1aa63c1cf49dccdd20b962fe313989"],
         description="Development sets released for new languages in 2017.",
         langpairs={
@@ -922,7 +922,7 @@
         "wmt17/ms",
         data=[
             "https://github.com/MicrosoftTranslator/Translator-HumanParityData/archive/master.zip",
-            "http://data.statmt.org/wmt17/translation-task/test-update-1.tgz",
+            "https://data.statmt.org/wmt17/translation-task/test-update-1.tgz",
         ],
         md5=["18fdaa7a3c84cf6ef688da1f6a5fa96f", "91dbfd5af99bc6891a637a68e04dfd41"],
         description="Additional Chinese-English references from Microsoft Research.",
@@ -938,7 +938,7 @@
     ),
     "wmt16": FakeSGMLDataset(
         "wmt16",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2016:WMT1,\n  author    = {Bojar, Ond\\v{r}ej  and  Chatterjee, Rajen  and  Federmann, Christian  and  Graham, Yvette  and  Haddow, Barry  and  Huck, Matthias  and  Jimeno Yepes, Antonio  and  Koehn, Philipp  and  Logacheva, Varvara  and  Monz, Christof  and  Negri, Matteo  and  Neveol, Aurelie  and  Neves, Mariana  and  Popel, Martin  and  Post, Matt  and  Rubino, Raphael  and  Scarton, Carolina  and  Specia, Lucia  and  Turchi, Marco  and  Verspoor, Karin  and  Zampieri, Marcos},\n  title     = {Findings of the 2016 Conference on Machine Translation},\n  booktitle = {Proceedings of the First Conference on Machine Translation},\n  month     = {August},\n  year      = {2016},\n  address   = {Berlin, Germany},\n  publisher = {Association for Computational Linguistics},\n  pages     = {131--198},\n  url       = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}",
@@ -995,7 +995,7 @@
     ),
     "wmt16/B": FakeSGMLDataset(
         "wmt16/B",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="Additional reference for EN-FI.",
         langpairs={
@@ -1007,7 +1007,7 @@
     ),
     "wmt16/tworefs": FakeSGMLDataset(
         "wmt16/tworefs",
-        data=["http://data.statmt.org/wmt16/translation-task/test.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/test.tgz"],
         md5=["3d809cd0c2c86adb2c67034d15c4e446"],
         description="EN-FI with two references.",
         langpairs={
@@ -1020,7 +1020,7 @@
     ),
     "wmt16/dev": FakeSGMLDataset(
         "wmt16/dev",
-        data=["http://data.statmt.org/wmt16/translation-task/dev.tgz"],
+        data=["https://data.statmt.org/wmt16/translation-task/dev.tgz"],
         md5=["4a3dc2760bb077f4308cce96b06e6af6"],
         description="Development sets released for new languages in 2016.",
         langpairs={
@@ -1044,7 +1044,7 @@
     ),
     "wmt15": FakeSGMLDataset(
         "wmt15",
-        data=["http://statmt.org/wmt15/test.tgz"],
+        data=["https://statmt.org/wmt15/test.tgz"],
         md5=["67e3beca15e69fe3d36de149da0a96df"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2015:WMT,\n  author    = {Bojar, Ond\\v{r}ej  and  Chatterjee, Rajen  and  Federmann, Christian  and  Haddow, Barry  and  Huck, Matthias  and  Hokamp, Chris  and  Koehn, Philipp  and  Logacheva, Varvara  and  Monz, Christof  and  Negri, Matteo  and  Post, Matt  and  Scarton, Carolina  and  Specia, Lucia  and  Turchi, Marco},\n  title     = {Findings of the 2015 Workshop on Statistical Machine Translation},\n  booktitle = {Proceedings of the Tenth Workshop on Statistical Machine Translation},\n  month     = {September},\n  year      = {2015},\n  address   = {Lisbon, Portugal},\n  publisher = {Association for Computational Linguistics},\n  pages     = {1--46},\n  url       = {http://aclweb.org/anthology/W15-3001}\n}",
@@ -1093,7 +1093,7 @@
     ),
     "wmt14": FakeSGMLDataset(
         "wmt14",
-        data=["http://statmt.org/wmt14/test-filtered.tgz"],
+        data=["https://statmt.org/wmt14/test-filtered.tgz"],
         md5=["84c597844c1542e29c2aff23aaee4310"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2014:W14-33,\n  author    = {Bojar, Ondrej  and  Buck, Christian  and  Federmann, Christian  and  Haddow, Barry  and  Koehn, Philipp  and  Leveling, Johannes  and  Monz, Christof  and  Pecina, Pavel  and  Post, Matt  and  Saint-Amand, Herve  and  Soricut, Radu  and  Specia, Lucia  and  Tamchyna, Ale\\v{s}},\n  title     = {Findings of the 2014 Workshop on Statistical Machine Translation},\n  booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},\n  month     = {June},\n  year      = {2014},\n  address   = {Baltimore, Maryland, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {12--58},\n  url       = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}",
@@ -1142,7 +1142,7 @@
     ),
     "wmt14/full": FakeSGMLDataset(
         "wmt14/full",
-        data=["http://statmt.org/wmt14/test-full.tgz"],
+        data=["https://statmt.org/wmt14/test-full.tgz"],
         md5=["a8cd784e006feb32ac6f3d9ec7eb389a"],
         description="Evaluation data released after official evaluation for further research.",
         langpairs={
@@ -1190,7 +1190,7 @@
     ),
     "wmt13": FakeSGMLDataset(
         "wmt13",
-        data=["http://statmt.org/wmt13/test.tgz"],
+        data=["https://statmt.org/wmt13/test.tgz"],
         md5=["48eca5d02f637af44e85186847141f67"],
         description="Official evaluation data.",
         citation="@InProceedings{bojar-EtAl:2013:WMT,\n  author    = {Bojar, Ond\\v{r}ej  and  Buck, Christian  and  Callison-Burch, Chris  and  Federmann, Christian  and  Haddow, Barry  and  Koehn, Philipp  and  Monz, Christof  and  Post, Matt  and  Soricut, Radu  and  Specia, Lucia},\n  title     = {Findings of the 2013 {Workshop on Statistical Machine Translation}},\n  booktitle = {Proceedings of the Eighth Workshop on Statistical Machine Translation},\n  month     = {August},\n  year      = {2013},\n  address   = {Sofia, Bulgaria},\n  publisher = {Association for Computational Linguistics},\n  pages     = {1--44},\n  url       = {http://www.aclweb.org/anthology/W13-2201}\n}",
@@ -1209,7 +1209,7 @@
     ),
     "wmt12": FakeSGMLDataset(
         "wmt12",
-        data=["http://statmt.org/wmt12/test.tgz"],
+        data=["https://statmt.org/wmt12/test.tgz"],
         md5=["608232d34ebc4ba2ff70fead45674e47"],
         description="Official evaluation data.",
         citation="@InProceedings{callisonburch-EtAl:2012:WMT,\n  author    = {Callison-Burch, Chris  and  Koehn, Philipp  and  Monz, Christof  and  Post, Matt  and  Soricut, Radu  and  Specia, Lucia},\n  title     = {Findings of the 2012 Workshop on Statistical Machine Translation},\n  booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},\n  month     = {June},\n  year      = {2012},\n  address   = {Montr{'e}al, Canada},\n  publisher = {Association for Computational Linguistics},\n  pages     = {10--51},\n  url       = {http://www.aclweb.org/anthology/W12-3102}\n}",
@@ -1226,7 +1226,7 @@
     ),
     "wmt11": FakeSGMLDataset(
         "wmt11",
-        data=["http://statmt.org/wmt11/test.tgz"],
+        data=["https://statmt.org/wmt11/test.tgz"],
         md5=["b0c9680adf32d394aefc2b24e3a5937e"],
         description="Official evaluation data.",
         citation="@InProceedings{callisonburch-EtAl:2011:WMT,\n  author    = {Callison-Burch, Chris  and  Koehn, Philipp  and  Monz, Christof  and  Zaidan, Omar},\n  title     = {Findings of the 2011 Workshop on Statistical Machine Translation},\n  booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n  month     = {July},\n  year      = {2011},\n  address   = {Edinburgh, Scotland},\n  publisher = {Association for Computational Linguistics},\n  pages     = {22--64},\n  url       = {http://www.aclweb.org/anthology/W11-2103}\n}",
@@ -1243,7 +1243,7 @@
     ),
     "wmt10": FakeSGMLDataset(
         "wmt10",
-        data=["http://statmt.org/wmt10/test.tgz"],
+        data=["https://statmt.org/wmt10/test.tgz"],
         md5=["491cb885a355da5a23ea66e7b3024d5c"],
         description="Official evaluation data.",
         citation="@InProceedings{callisonburch-EtAl:2010:WMT,\n  author    = {Callison-Burch, Chris  and  Koehn, Philipp  and  Monz, Christof  and  Peterson, Kay  and  Przybocki, Mark  and  Zaidan, Omar},\n  title     = {Findings of the 2010 Joint Workshop on Statistical Machine Translation and Metrics for Machine Translation},\n  booktitle = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and MetricsMATR},\n  month     = {July},\n  year      = {2010},\n  address   = {Uppsala, Sweden},\n  publisher = {Association for Computational Linguistics},\n  pages     = {17--53},\n  note      = {Revised August 2010},\n  url       = {http://www.aclweb.org/anthology/W10-1703}\n}",
@@ -1260,7 +1260,7 @@
     ),
     "wmt09": FakeSGMLDataset(
         "wmt09",
-        data=["http://statmt.org/wmt09/test.tgz"],
+        data=["https://statmt.org/wmt09/test.tgz"],
         md5=["da227abfbd7b666ec175b742a0d27b37"],
         description="Official evaluation data.",
         citation="@InProceedings{callisonburch-EtAl:2009:WMT-09,\n  author    = {Callison-Burch, Chris  and  Koehn, Philipp  and  Monz, Christof  and  Schroeder, Josh},\n  title     = {Findings of the 2009 {W}orkshop on {S}tatistical {M}achine {T}ranslation},\n  booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation},\n  month     = {March},\n  year      = {2009},\n  address   = {Athens, Greece},\n  publisher = {Association for Computational Linguistics},\n  pages     = {1--28},\n  url       = {http://www.aclweb.org/anthology/W/W09/W09-0401}\n}",
@@ -1281,7 +1281,7 @@
     ),
     "wmt08": FakeSGMLDataset(
         "wmt08",
-        data=["http://statmt.org/wmt08/test.tgz"],
+        data=["https://statmt.org/wmt08/test.tgz"],
         md5=["0582e4e894a3342044059c894e1aea3d"],
         description="Official evaluation data.",
         citation="@InProceedings{callisonburch-EtAl:2008:WMT,\n  author    = {Callison-Burch, Chris  and  Fordyce, Cameron  and  Koehn, Philipp  and  Monz, Christof  and  Schroeder, Josh},\n  title     = {Further Meta-Evaluation of Machine Translation},\n  booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation},\n  month     = {June},\n  year      = {2008},\n  address   = {Columbus, Ohio},\n  publisher = {Association for Computational Linguistics},\n  pages     = {70--106},\n  url       = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}",
@@ -1300,7 +1300,7 @@
     ),
     "wmt08/nc": FakeSGMLDataset(
         "wmt08/nc",
-        data=["http://statmt.org/wmt08/test.tgz"],
+        data=["https://statmt.org/wmt08/test.tgz"],
         md5=["0582e4e894a3342044059c894e1aea3d"],
         description="Official evaluation data (news commentary).",
         langpairs={
@@ -1310,7 +1310,7 @@
     ),
     "wmt08/europarl": FakeSGMLDataset(
         "wmt08/europarl",
-        data=["http://statmt.org/wmt08/test.tgz"],
+        data=["https://statmt.org/wmt08/test.tgz"],
         md5=["0582e4e894a3342044059c894e1aea3d"],
         description="Official evaluation data (Europarl).",
         langpairs={
diff --git a/sacrebleu/dataset/__main__.py b/sacrebleu/dataset/__main__.py
index 2295492..5b13d59 100644
--- a/sacrebleu/dataset/__main__.py
+++ b/sacrebleu/dataset/__main__.py
@@ -16,6 +16,8 @@
 
     for item in DATASETS.values():
         if item.md5 is not None:
+            assert item.data
+            assert item.md5
             assert len(item.data) == len(item.md5)
             pairs = zip(item.data, item.md5)
             for url, md5_hash in pairs:
diff --git a/sacrebleu/dataset/base.py b/sacrebleu/dataset/base.py
index ba6e65b..cf3c092 100644
--- a/sacrebleu/dataset/base.py
+++ b/sacrebleu/dataset/base.py
@@ -4,7 +4,7 @@
 import os
 import re
 from abc import ABCMeta, abstractmethod
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from ..utils import SACREBLEU_DIR, download_file, smart_open
 
@@ -13,10 +13,10 @@ class Dataset(metaclass=ABCMeta):
     def __init__(
         self,
         name: str,
-        data: List[str] = None,
-        description: str = None,
-        citation: str = None,
-        md5: List[str] = None,
+        data: Optional[List[str]] = None,
+        description: Optional[str] = None,
+        citation: Optional[str] = None,
+        md5: Optional[List[str]] = None,
         langpairs=Dict[str, List[str]],
         **kwargs,
     ):
diff --git a/sacrebleu/dataset/wmt_xml.py b/sacrebleu/dataset/wmt_xml.py
index 92c96d5..d5eb5d8 100644
--- a/sacrebleu/dataset/wmt_xml.py
+++ b/sacrebleu/dataset/wmt_xml.py
@@ -76,7 +76,7 @@ def _unwrap_wmt21_or_later(raw_file):
             def get_sents(doc):
                 return {
                     int(seg.get("id")): seg.text if seg.text else ""
-                    for seg in doc.findall(f".//seg")
+                    for seg in doc.findall(".//seg")
                 }
 
             ref_docs = doc.findall(".//ref")
@@ -114,7 +114,7 @@ def _get_langpair_path(self, langpair):
         in order to allow for overriding which test set to use.
         """
         langpair_data = self._get_langpair_metadata(langpair)[langpair]
-        rel_path = langpair_data["path"] if type(langpair_data) == dict else langpair_data[0]
+        rel_path = langpair_data["path"] if isinstance(langpair_data, dict) else langpair_data[0]
         return os.path.join(self._rawdir, rel_path)
 
     def process_to_text(self, langpair=None):
@@ -156,7 +156,7 @@ def _get_langpair_allowed_refs(self, langpair):
         """
         defaults = self.kwargs.get("refs", [])
         langpair_data = self._get_langpair_metadata(langpair)[langpair]
-        if type(langpair_data) == dict:
+        if isinstance(langpair_data, dict):
             allowed_refs = langpair_data.get("refs", defaults)
         else:
             allowed_refs = defaults
diff --git a/sacrebleu/sacrebleu.py b/sacrebleu/sacrebleu.py
index 7edbe3a..d778e1d 100755
--- a/sacrebleu/sacrebleu.py
+++ b/sacrebleu/sacrebleu.py
@@ -50,7 +50,7 @@
 
 try:
     # SIGPIPE is not available on Windows machines, throwing an exception.
-    from signal import SIGPIPE
+    from signal import SIGPIPE  # type: ignore
 
     # If SIGPIPE is available, change behaviour to default instead of ignore.
     from signal import signal, SIG_DFL
diff --git a/sacrebleu/significance.py b/sacrebleu/significance.py
index b39e0a5..a9c71d0 100644
--- a/sacrebleu/significance.py
+++ b/sacrebleu/significance.py
@@ -1,7 +1,7 @@
 import os
 import logging
 import multiprocessing as mp
-from typing import Sequence, Dict, Optional, Tuple, List, Union, Any
+from typing import Sequence, Dict, Optional, Tuple, List, Union, Any, Mapping
 
 import numpy as np
 
@@ -77,11 +77,11 @@ def _bootstrap_resample(stats: List[List[Union[int, float]]],
     idxs = rng.choice(len(stats), size=(n_samples, len(stats)), replace=True)
 
     # convert to numpy array. float32 is more efficient
-    stats = np.array(stats, dtype='float32')
+    stats_np = np.array(stats, dtype='float32')
 
     # recompute scores for all resamples
     scores = [
-        metric._compute_score_from_stats(_s.sum(0)) for _s in stats[idxs]]
+        metric._compute_score_from_stats(_s.sum(0)) for _s in stats_np[idxs]]
 
     return str(seed).lower(), scores
 
@@ -98,7 +98,7 @@ def _compute_p_value(stats: np.ndarray, real_difference: float) -> float:
     # "the != is important. if we want to score the same system against itself
     # having a zero difference should not be attributed to chance."
 
-    c = np.sum(stats > real_difference)
+    c = np.sum(stats > real_difference).item()
 
     # "+1 applies here, though it only matters for small numbers of shufflings,
     # which we typically never do. it's necessary to ensure the probability of
@@ -186,8 +186,9 @@ def _paired_ar_test(baseline_info: Dict[str, Tuple[np.ndarray, Result]],
             sacrelogger.info(f' > Performing bootstrap resampling for confidence interval (# resamples: {n_ar_confidence})')
             sys_stats = np.array(sys_stats, dtype='float32')
             # recompute scores for all resamples
-            sys_scores = [
-                metric._compute_score_from_stats(_s.sum(0)).score for _s in sys_stats[bs_idxs]]
+            sys_scores = np.array([
+                metric._compute_score_from_stats(_s.sum(0)).score for _s in sys_stats[bs_idxs]
+            ])
             res.mean, res.ci = estimate_ci(sys_scores)
 
         # Store the result
@@ -300,7 +301,7 @@ class PairedTest:
     }
 
     def __init__(self, named_systems: List[Tuple[str, Sequence[str]]],
-                 metrics: Dict[str, Metric],
+                 metrics: Mapping[str, Metric],
                  references: Optional[Sequence[Sequence[str]]],
                  test_type: str = 'ar',
                  n_samples: int = 0,
diff --git a/sacrebleu/tokenizers/tokenizer_spm.py b/sacrebleu/tokenizers/tokenizer_spm.py
index a50d0fb..92729b5 100644
--- a/sacrebleu/tokenizers/tokenizer_spm.py
+++ b/sacrebleu/tokenizers/tokenizer_spm.py
@@ -2,7 +2,6 @@
 
 import os
 import logging
-import urllib.request
 
 from functools import lru_cache
 from ..utils import SACREBLEU_DIR, download_file
diff --git a/sacrebleu/utils.py b/sacrebleu/utils.py
index 6187e3b..56e6fca 100644
--- a/sacrebleu/utils.py
+++ b/sacrebleu/utils.py
@@ -423,9 +423,7 @@ def download_file(source_path, dest_path, extract_to=None, expected_md5=None):
     with portalocker.Lock(lockfile, timeout=60):
 
         if not os.path.exists(dest_path) or os.path.getsize(dest_path) == 0:
-
             sacrelogger.info(f"Downloading {source_path} to {dest_path}")
-            md5 = hashlib.md5()
 
             try:
                 with urllib.request.urlopen(source_path) as f, open(dest_path, 'wb') as out:
@@ -441,7 +439,7 @@ def download_file(source_path, dest_path, extract_to=None, expected_md5=None):
                 if cur_md5 != expected_md5:
                     sacrelogger.error(f'Fatal: MD5 sum of downloaded file was incorrect (got {cur_md5}, expected {expected_md5}).')
                     sacrelogger.error(f'Please manually delete {dest_path!r} and rerun the command.')
-                    sacrelogger.error(f'If the problem persists, the tarball may have changed, in which case, please contact the SacreBLEU maintainer.')
+                    sacrelogger.error('If the problem persists, the tarball may have changed, in which case, please contact the SacreBLEU maintainer.')
                     sys.exit(1)
 
             # Extract the tarball
@@ -594,4 +592,4 @@ def print_subset_results(metrics, full_system, full_refs, args):
             print(f'{key}: sentences={n_system:<6} {score.name:<{max_metric_width}} = {score.score:.{w}f}')
 
 # import at the end to avoid circular import
-from .dataset import DATASETS, SUBSETS, DOMAINS, COUNTRIES
+from .dataset import DATASETS, SUBSETS, DOMAINS, COUNTRIES  # noqa: E402
diff --git a/scripts/perf_test.py b/scripts/perf_test.py
index 1cf2b48..f2812db 100644
--- a/scripts/perf_test.py
+++ b/scripts/perf_test.py
@@ -5,8 +5,8 @@
 
 sys.path.insert(0, '.')
 
-import sacrebleu
-from sacrebleu.metrics import BLEU, CHRF
+import sacrebleu  # noqa: E402
+from sacrebleu.metrics import BLEU, CHRF  # noqa: E402
 
 
 N_REPEATS = 5
diff --git a/setup.cfg b/setup.cfg
index 2e0f031..a3fd11d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
 [metadata]
-description-file = README.md
-license_file = LICENSE.txt
+description_file = README.md
+license_files = LICENSE.txt
diff --git a/setup.py b/setup.py
index 00c7002..f104799 100755
--- a/setup.py
+++ b/setup.py
@@ -130,6 +130,9 @@ def get_long_description():
         # Specify the Python versions you support here. In particular, ensure
         # that you indicate whether you support Python 2, Python 3 or both.
         'Programming Language :: Python :: 3 :: Only',
+
+        # Indicate that type hints are provided
+        'Typing :: Typed'
     ],
 
     # What does your project relate to?
@@ -151,7 +154,8 @@ def get_long_description():
     # dependencies). You can install these using the following syntax,
     # for example:
     # $ pip install -e .[dev,test]
-    extras_require={'ja': ['mecab-python3>=1.0.5,<=1.0.6', 'ipadic>=1.0,<2.0'],
+    extras_require={'dev': ['wheel', 'pytest', 'mypy', 'types-tabulate', 'lxml-stubs'],
+                    'ja': ['mecab-python3>=1.0.5,<=1.0.6', 'ipadic>=1.0,<2.0'],
                     'ko': ['mecab-ko>=1.0.0,<=1.0.1', 'mecab-ko-dic>=1.0,<2.0']},
 
     # To provide executable scripts, use entry points in preference to the
diff --git a/test.sh b/test.sh
index ee1657f..1bb5720 100755
--- a/test.sh
+++ b/test.sh
@@ -96,7 +96,7 @@ cd data
 
 if [[ ! -d wmt17-submitted-data ]]; then
    echo "Downloading and unpacking WMT'17 system submissions (46 MB)..."
-   wget -q http://data.statmt.org/wmt17/translation-task/wmt17-submitted-data-v1.0.tgz
+   wget -q https://data.statmt.org/wmt17/translation-task/wmt17-submitted-data-v1.0.tgz
    tar xzf wmt17-submitted-data-v1.0.tgz
 fi
 
diff --git a/test/test_api.py b/test/test_api.py
index 511b4c7..02ac9f0 100644
--- a/test/test_api.py
+++ b/test/test_api.py
@@ -44,7 +44,7 @@ def test_api_get_available_testsets():
     the test sets found.
     """
     available = get_available_testsets()
-    assert type(available) is list
+    assert isinstance(available, list)
     assert "wmt19" in available
     assert "wmt05" not in available
 
@@ -59,12 +59,12 @@ def test_api_get_available_testsets_for_langpair():
     the test sets found.
     """
     available = get_available_testsets_for_langpair('en-it')
-    assert type(available) is list
+    assert isinstance(available, list)
     assert "wmt09" in available
     assert "wmt15" not in available
 
     available = get_available_testsets_for_langpair('en-fr')
-    assert type(available) is list
+    assert isinstance(available, list)
     assert "wmt11" in available
     assert "mtedx/test" in available
     assert "wmt20" not in available
@@ -77,7 +77,7 @@ def test_api_get_langpairs_for_testset():
     """
     for testset in DATASETS.keys():
         available = get_langpairs_for_testset(testset)
-        assert type(available) is list
+        assert isinstance(available, list)
         for langpair in DATASETS[testset].langpairs.keys():
             # skip non-language keys
             if "-" not in langpair:
diff --git a/test/test_dataset.py b/test/test_dataset.py
index cff796d..d3ece8c 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -106,7 +106,7 @@ def test_wmt22_references():
 
     # and that ref:A is the default for all languages where it wasn't overridden
     for langpair, langpair_data in wmt22.langpairs.items():
-        if type(langpair_data) == dict:
+        if isinstance(langpair_data, dict):
             assert wmt22._get_langpair_allowed_refs(langpair) != ["ref:A"]
         else:
             assert wmt22._get_langpair_allowed_refs(langpair) == ["ref:A"]
diff --git a/test/test_significance.py b/test/test_significance.py
index 46679ac..f709832 100644
--- a/test/test_significance.py
+++ b/test/test_significance.py
@@ -1,9 +1,10 @@
 import os
 
 from collections import defaultdict
+from typing import DefaultDict
 
 from sacrebleu.metrics import BLEU
-from sacrebleu.significance import PairedTest
+from sacrebleu.significance import PairedTest, Result
 
 import pytest
 
@@ -57,8 +58,8 @@ def _read_pickle_file():
 }
 
 
-SACREBLEU_BS_P_VALS = defaultdict(float)
-SACREBLEU_AR_P_VALS = defaultdict(float)
+SACREBLEU_BS_P_VALS: DefaultDict[str, float] = defaultdict(float)
+SACREBLEU_AR_P_VALS: DefaultDict[str, float] = defaultdict(float)
 
 # Load data from pickled file to not bother with WMT17 downloading
 named_systems = _read_pickle_file()
@@ -75,7 +76,9 @@ def _read_pickle_file():
     test_type='bs', n_samples=2000)()[1]
 
 for name, result in zip(bs_scores['System'], bs_scores['BLEU']):
+    assert isinstance(result, Result)
     if result.p_value is not None:
+        assert isinstance(name, str)
         SACREBLEU_BS_P_VALS[name] += result.p_value
 
 
@@ -87,7 +90,9 @@ def _read_pickle_file():
                        test_type='ar', n_samples=10000)()[1]
 
 for name, result in zip(ar_scores['System'], ar_scores['BLEU']):
+    assert isinstance(result, Result)
     if result.p_value is not None:
+        assert isinstance(name, str)
         SACREBLEU_AR_P_VALS[name] += result.p_value
 
 

From b0ad2cb29c87f51a6553b4d6caaff23d6bb7708c Mon Sep 17 00:00:00 2001
From: Dmitry Ustalov <dmitry.ustalov@jetbrains.com>
Date: Wed, 29 Nov 2023 21:25:09 +0100
Subject: [PATCH 4/4] Version bump to 2.3.3 (#253)

---
 CHANGELOG.md          | 5 +++++
 sacrebleu/__init__.py | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 042a992..e21bd54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Release Notes
 
+- 2.3.3 (2023-11-28)
+  Fixed:
+  - Typing issues (#249, #250)
+  - Build checks now run on every push and pull request (#252)
+
 - 2.3.2 (2023-11-06)
   Fixed:
   - Special treatment of empty references in TER (#232)
diff --git a/sacrebleu/__init__.py b/sacrebleu/__init__.py
index 19f7059..e7ddc63 100644
--- a/sacrebleu/__init__.py
+++ b/sacrebleu/__init__.py
@@ -14,7 +14,7 @@
 # express or implied. See the License for the specific language governing
 # permissions and limitations under the License.
 
-__version__ = '2.3.2'
+__version__ = '2.3.3'
 __description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores'