
Commit

KennethEnevoldsen committed Mar 21, 2024
2 parents 364be7f + dd5d617 commit b42abe4
Showing 233 changed files with 7,196 additions and 3,658 deletions.
28 changes: 28 additions & 0 deletions .github/disabled_workflows/lint.yml
@@ -0,0 +1,28 @@
# GitHub action to run linting

name: run-linting

on:
pull_request:
branches: [main]
push:
branches: [main]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3

- uses: actions/setup-python@v4
with:
python-version: "3.8"
cache: "pip"

- name: Install dependencies
run: make install

- name: Lint
id: lint
run: |
make lint
50 changes: 50 additions & 0 deletions .github/disabled_workflows/release.yml
@@ -0,0 +1,50 @@
# This workflow will
# - Find the latest version tag based on the commit history
# - Create a git tag for the new version
# - Update the version number in pyproject.toml based on the commit history
# - Upload the package to PyPI
# - Create a release on GitHub

# This workflow requires the following secrets to be set:
# - a GitHub personal access token with the `repo` scope, stored as `RELEASE`
# - trusted publishing set up on PyPI as described here: https://blog.pypi.org/posts/2023-04-20-introducing-trusted-publishers/

name: Release
on:
workflow_run:
workflows: ["tests"]
types:
- completed
jobs:
release:
runs-on: ubuntu-latest
concurrency: release
permissions:
id-token: write # IMPORTANT: this permission is mandatory for trusted publishing using PyPI


if: ${{ github.ref == 'refs/heads/main' && github.event.workflow_run.conclusion == 'success'}}
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 0
token: ${{ secrets.RELEASE }}

- name: Python Semantic Release
id: release
uses: python-semantic-release/python-semantic-release@v8.0.4
with:
github_token: ${{ secrets.RELEASE }}

- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
if: steps.release.outputs.released == 'true'
# This action supports PyPI's trusted publishing implementation, which allows authentication to PyPI without a manually
# configured API token or username/password combination. To perform trusted publishing with this action, your project's
# publisher must already be configured on PyPI.

- name: Publish package distributions to GitHub Releases
uses: python-semantic-release/upload-to-gh-release@main
if: steps.release.outputs.released == 'true'
with:
github_token: ${{ secrets.RELEASE }}
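
As background for the release steps above: by default, python-semantic-release infers the next version from Conventional Commits-style messages (fix → patch, feat → minor, breaking change → major). The sketch below is a simplified illustration of that rule, not the tool's actual implementation:

# Illustrative only; the real logic lives in python-semantic-release.
def bump_kind(commit_messages: list[str]) -> str:
    """Return 'major', 'minor', 'patch', or 'none' from conventional-commit headers."""
    kind = "none"
    for msg in commit_messages:
        header = msg.splitlines()[0]
        if "BREAKING CHANGE" in msg or header.split(":")[0].endswith("!"):
            return "major"
        if header.startswith("feat"):
            kind = "minor"
        elif header.startswith("fix") and kind != "minor":
            kind = "patch"
    return kind

print(bump_kind(["fix: handle empty dataset", "feat: add a new task"]))  # -> "minor"
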
36 changes: 0 additions & 36 deletions .github/workflows/python-package.yml

This file was deleted.

43 changes: 43 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,43 @@
# This workflow will:
# 1) install Python dependencies
# 2) run make test


name: Tests
on:
push:
branches: [main]
pull_request:
branches: [main]

jobs:
pytest:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest] #, macos-latest, windows-latest]
python-version: ["3.8", "3.9", "3.10"]

# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
group: "${{ github.workflow }}-${{ matrix.python-version}}-${{ matrix.os }} @ ${{ github.ref }}"
cancel-in-progress: true

steps:
- uses: actions/checkout@v3

- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: "pip"

- name: Install dependencies
shell: bash
run: |
make install
- name: Run tests
shell: bash
run: |
make test
3 changes: 3 additions & 0 deletions .gitignore
@@ -133,3 +133,6 @@ dmypy.json

# error logs
error_logs.txt

# tests
tests/results
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -4,5 +4,5 @@
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true,
"editor.defaultFormatter": "ms-python.black-formatter"
"editor.defaultFormatter": "charliermarsh.ruff",
}
48 changes: 8 additions & 40 deletions Makefile
@@ -1,49 +1,17 @@
.PHONY: modified_only_fixup quality style fixup tests

check_dirs := tests mteb scripts

modified_only_fixup:
$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
@if test -n "$(modified_py_files)"; then \
echo "Checking/fixing $(modified_py_files)"; \
black --preview $(modified_py_files); \
isort $(modified_py_files); \
flake8 $(modified_py_files); \
else \
echo "No library .py files were modified"; \
fi

# Super fast fix and check target that only works on relevant modified files since the branch was made
fixup: modified_only_fixup


# This installs all the required dependencies
install:
@echo "--- 🚀 Installing project dependencies ---"
pip install -e ".[dev]"

# this target runs checks on all files
quality:
black --check --preview $(check_dirs)
isort --check-only $(check_dirs)
flake8 $(check_dirs)


# this target runs checks on all files and potentially modifies some of them
style:
black --preview $(check_dirs)
isort $(check_dirs)

# runs the same lints as the github actions
lint:
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
@echo "--- 🧹 Running linters ---"
ruff format . # running ruff formatting
ruff check . --fix # running ruff linting

# Run tests for the library
test:
@echo "--- 🧪 Running tests ---"
pytest

# add parallel test for faster execution (can sometimes cause issues with some tests)
test-parallel:
pytest -n auto --dist=loadfile -s -v
@echo "--- 🧪 Running tests ---"
@echo "Note that parallel tests can sometimes cause issues with some tests."
pytest -n auto --dist=loadfile -s -v
2 changes: 0 additions & 2 deletions README.md
@@ -222,8 +222,6 @@ evaluation.run(model)
| 📈 [Leaderboard] | The interactive leaderboard of the benchmark |
| 🤖 [Adding a model] | Information related to how to submit a model to the leaderboard |
| 🤝 [Contributing] | How to contribute to MTEB and set it up for development |
|


[Tasks]: docs/tasks.md
[Contributing]: docs/contributing.md
2 changes: 1 addition & 1 deletion docs/tasks.md
@@ -1,7 +1,7 @@
## Available tasks
The following tables give you an overview of the tasks in MTEB.

<!-- The following table is auto-generated: -->
<!-- This allows the table to be autogenerated in the future: -->
<!-- TABLE START -->

## Available tasks
8 changes: 6 additions & 2 deletions mteb/__init__.py
@@ -1,7 +1,11 @@
__version__ = "1.2.1.dev0"
from __future__ import annotations

from importlib.metadata import version

from mteb.evaluation import *

__version__ = version("mteb") # fetch version from install metadata


MTEB_MAIN_EN = [
"AmazonCounterfactualClassification",
@@ -70,5 +74,5 @@
"TweetSentimentExtractionClassification",
"TwentyNewsgroupsClustering",
"TwitterSemEval2015",
"TwitterURLCorpus"
"TwitterURLCorpus",
]
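
For context on the `__version__` change above: `importlib.metadata.version` reads the version recorded when the package was installed, so it stays in sync with `pyproject.toml` without a hand-maintained string. A minimal sketch of the pattern (the fallback branch is added here for illustration and is not part of this diff):

from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("mteb")  # resolved from the installed distribution's metadata
except PackageNotFoundError:
    # Fallback for a source checkout that was never pip-installed (illustrative only).
    __version__ = "0.0.0"
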
10 changes: 7 additions & 3 deletions mteb/abstasks/AbsTask.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import random
from abc import ABC, abstractmethod

@@ -24,17 +26,19 @@ def load_data(self, **kwargs):
"""
Load dataset from HuggingFace hub
"""
if self.data_loaded: return
if self.data_loaded:
return

# TODO: add split argument
self.dataset = datasets.load_dataset(
self.description["hf_hub_name"], revision=self.description.get("revision", None)
self.metadata_dict["hf_hub_name"],
revision=self.metadata_dict.get("revision", None),
)
self.data_loaded = True

@property
@abstractmethod
def description(self):
def metadata_dict(self) -> dict[str, str]:
"""
Returns a description of the task. Should contain the following fields:
name: Name of the task (usually equal to the class name; should be a valid name for a path on disk)
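
To make the renamed contract concrete, a minimal, hypothetical subclass might look like the sketch below (class name, dataset id, and metadata values are placeholders; only `metadata_dict` is needed for the inherited `load_data` to resolve `hf_hub_name` and `revision`):

from mteb.abstasks.AbsTask import AbsTask


class ToyTask(AbsTask):  # hypothetical task, for illustration only
    @property
    def metadata_dict(self) -> dict:
        return {
            "name": "ToyTask",
            "hf_hub_name": "username/toy-dataset",  # made-up dataset id
            "revision": "main",
            "eval_splits": ["test"],
            "main_score": "accuracy",
        }

    def evaluate(self, model, split="test", **kwargs):
        self.load_data()  # inherited: calls datasets.load_dataset(hf_hub_name, revision=...)
        ...  # task-specific scoring goes here
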
14 changes: 8 additions & 6 deletions mteb/abstasks/AbsTaskBitextMining.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import logging

from ..evaluation.evaluators import BitextMiningEvaluator
@@ -11,7 +13,7 @@ class AbsTaskBitextMining(AbsTask):
Abstract class for BitextMining tasks
The similarity is computed between pairs and the results are ranked.
self.load_data() must generate a huggingface dataset with a split matching self.description["eval_splits"], and assign it to self.dataset. It must contain the following columns:
self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
id: str
sentence1: str
sentence2: str
@@ -28,13 +30,13 @@ def evaluate(self, model, split, **kwargs):
scores = {}
for lang in self.dataset:
logger.info(
f"\nTask: {self.description['name']}, split: {split}, language: {lang}. Running..."
f"\nTask: {self.metadata_dict['name']}, split: {split}, language: {lang}. Running..."
)
data_split = self.dataset[lang][split]
scores[lang] = self._evaluate_split(model, data_split, **kwargs)
else:
logger.info(
f"\nTask: {self.description['name']}, split: {split}. Running..."
f"\nTask: {self.metadata_dict['name']}, split: {split}. Running..."
)
data_split = self.dataset[split]
scores = self._evaluate_split(model, data_split, **kwargs)
@@ -72,9 +74,9 @@ def _evaluate_split(self, model, data_split, **kwargs):
return metrics

def _add_main_score(self, scores):
if self.description["main_score"] in scores:
scores["main_score"] = scores[self.description["main_score"]]
if self.metadata_dict["main_score"] in scores:
scores["main_score"] = scores[self.metadata_dict["main_score"]]
else:
logger.warn(
f"main score {self.description['main_score']} not found in scores {scores.keys()}"
f"main score {self.metadata_dict['main_score']} not found in scores {scores.keys()}"
)
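
For reference, a dataset matching the columns this abstract class expects (`id`, `sentence1`, `sentence2`) can be built with the `datasets` library; the values below are placeholders, and the split name should match `metadata_dict["eval_splits"]`:

from datasets import Dataset, DatasetDict

# Placeholder sentence pairs; a real task would load these from the Hub instead.
test_split = Dataset.from_dict(
    {
        "id": ["0", "1"],
        "sentence1": ["Hello world", "Good morning"],
        "sentence2": ["Hallo Welt", "Guten Morgen"],
    }
)
dataset = DatasetDict({"test": test_split})  # assign to self.dataset in load_data()
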