From 8483255757597102c767c8efc11c095148daf59e Mon Sep 17 00:00:00 2001 From: edknv <109497216+edknv@users.noreply.github.com> Date: Thu, 2 Nov 2023 18:26:26 -0700 Subject: [PATCH] add lint workflow (#22) * add lint workflow * lint * fix type check bug * skip test data for spell check * add codespell rule to pre commit config * fix spelling --- .github/workflows/lint.yaml | 14 ++++ .pre-commit-config.yaml | 65 +++++-------------- ci/ignore_codespell_words.txt | 1 + crossfit/__init__.py | 37 ++++++----- crossfit/backend/__init__.py | 5 +- crossfit/backend/cudf/array.py | 2 +- crossfit/backend/cudf/dataframe.py | 3 +- crossfit/backend/cudf/series.py | 2 +- crossfit/backend/cupy/array.py | 2 +- crossfit/backend/dask/aggregate.py | 2 +- crossfit/backend/dask/cluster.py | 13 ++-- crossfit/backend/dask/dataframe.py | 2 +- crossfit/backend/numpy/sparse.py | 6 +- crossfit/backend/pandas/dataframe.py | 4 +- crossfit/backend/torch/array.py | 2 +- crossfit/backend/torch/hf/model.py | 23 +++---- crossfit/backend/torch/loader.py | 45 ++++--------- crossfit/backend/torch/op/embed.py | 6 +- crossfit/calculate/aggregate.py | 3 +- crossfit/calculate/module.py | 5 +- crossfit/data/__init__.py | 11 +--- crossfit/data/array/conversion.py | 2 +- crossfit/data/array/decorator.py | 48 +++++--------- crossfit/data/array/ops.py | 2 +- crossfit/data/dataframe/core.py | 5 +- crossfit/data/dataframe/dispatch.py | 7 +- crossfit/data/sparse/core.py | 3 +- crossfit/data/sparse/ranking.py | 57 +++++++++------- crossfit/dataset/base.py | 3 - crossfit/dataset/beir/raw.py | 1 + crossfit/dataset/home.py | 1 - crossfit/metric/__init__.py | 7 +- crossfit/metric/base.py | 2 +- crossfit/metric/categorical/value_counts.py | 2 +- crossfit/metric/continuous/mean.py | 1 - crossfit/metric/ranking/__init__.py | 17 +++-- crossfit/metric/ranking/base.py | 4 +- crossfit/metric/ranking/f1.py | 2 - crossfit/metric/ranking/ndcg.py | 4 +- crossfit/metric/ranking/precision.py | 9 +-- crossfit/metric/ranking/rank.py | 2 +- crossfit/metric/ranking/recall.py | 5 +- crossfit/op/__init__.py | 8 ++- crossfit/op/base.py | 2 +- crossfit/report/beir/embed.py | 12 ++-- crossfit/report/beir/report.py | 28 ++++---- crossfit/report/data_overview/report.py | 16 ++--- .../data_overview/visualization/facets.py | 3 +- crossfit/utils/np_utils.py | 6 +- examples/crossarray-tf.ipynb | 4 +- examples/dask_compute_bench.py | 8 +-- pyproject.toml | 17 ++++- requirements/dev.txt | 34 +--------- setup.cfg | 4 +- tests/backend/test_sklearn.py | 4 +- tests/data/array/test_conversion.py | 2 +- tests/data/array/test_decorator.py | 4 +- tests/dataset/test_load.py | 7 +- tests/metrics/ranking/test_f1.py | 4 +- tests/metrics/ranking/test_hitrate.py | 4 +- tests/metrics/ranking/test_ndcg.py | 26 ++++---- tests/metrics/ranking/test_precision.py | 21 ++---- tests/metrics/ranking/test_rank.py | 17 ++--- tests/metrics/ranking/test_recall.py | 15 ++--- tests/report/beir/test_embed.py | 10 +-- tests/report/beir/test_report.py | 20 ++---- 66 files changed, 297 insertions(+), 416 deletions(-) create mode 100644 .github/workflows/lint.yaml create mode 100644 ci/ignore_codespell_words.txt diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 0000000..e82186e --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,14 @@ +name: lint + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - uses: pre-commit/action@v3.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ca0fcc9..096ae39 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,54 +5,25 @@ repos: hooks: - id: absolufy-imports - repo: https://github.com/python/black - rev: 22.10.0 + rev: 23.10.1 hooks: - id: black - # - repo: https://github.com/timothycrosley/isort - # rev: 5.10.1 - # hooks: - # - id: isort - # additional_dependencies: [toml] - # exclude: examples/* - # types - # - repo: https://github.com/pre-commit/mirrors-mypy - # rev: 'v0.940' - # hooks: - # - id: mypy - # language_version: python3 - # args: [--no-strict-optional, --ignore-missing-imports, --show-traceback, --install-types, --non-interactive] - # exclude: docs/* - # code style - # - repo: https://github.com/pycqa/pylint - # rev: pylint-2.7.4 - # hooks: - # - id: pylint - # exlude: notebooks/* - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.2 + - repo: https://github.com/timothycrosley/isort + rev: 5.12.0 + hooks: + - id: isort + - repo: https://github.com/pycqa/flake8 + rev: 6.1.0 hooks: - id: flake8 - exlude: notebooks/* - # notebooks - # - repo: https://github.com/s-weigand/flake8-nb - # rev: v0.3.0 - # hooks: - # - id: flake8-nb - # files: \.ipynb$ - # documentation - # - repo: https://github.com/econchick/interrogate - # rev: 1.5.0 - # hooks: - # - id: interrogate - # exclude: ^(build|docs|merlin/io|tests|setup.py|versioneer.py) - # args: [--config=pyproject.toml] - # - repo: https://github.com/codespell-project/codespell - # rev: v2.1.0 - # hooks: - # - id: codespell - # # security - # - repo: https://github.com/PyCQA/bandit - # rev: 1.7.0 - # hooks: - # - id: bandit - # args: [--verbose, -ll, -x, tests,examples,bench] \ No newline at end of file + - repo: https://github.com/codespell-project/codespell + rev: v2.2.6 + hooks: + - id: codespell + exclude: tests/testdata + # security + - repo: https://github.com/PyCQA/bandit + rev: 1.7.0 + hooks: + - id: bandit + args: [--verbose, -ll, -x, tests,examples,bench] diff --git a/ci/ignore_codespell_words.txt b/ci/ignore_codespell_words.txt new file mode 100644 index 0000000..2222453 --- /dev/null +++ b/ci/ignore_codespell_words.txt @@ -0,0 +1 @@ +nin diff --git a/crossfit/__init__.py b/crossfit/__init__.py index 7ff9648..f5e3ab0 100644 --- a/crossfit/__init__.py +++ b/crossfit/__init__.py @@ -1,3 +1,5 @@ +# flake8: noqa + from crossfit import backend, metric, op from crossfit.backend.dask.cluster import Distributed, Serial from crossfit.calculate.aggregate import Aggregator @@ -6,9 +8,8 @@ from crossfit.data.array.dispatch import crossarray from crossfit.data.dataframe.core import FrameBackend from crossfit.data.dataframe.dispatch import CrossFrame -from crossfit.metric import * # noqa -from crossfit.op import * # noqa - +from crossfit.metric import * +from crossfit.op import * __all__ = [ "Aggregator", @@ -27,21 +28,23 @@ try: - from crossfit.backend.torch import SentenceTransformerModel, TorchExactSearch, HFModel + from crossfit.backend.torch import HFModel, SentenceTransformerModel, TorchExactSearch + from crossfit.dataset.base import IRDataset, MultiDataset + from crossfit.dataset.load import load_dataset from crossfit.report.beir.embed import embed from crossfit.report.beir.report import beir_report - from crossfit.dataset.load import load_dataset - from crossfit.dataset.base import IRDataset, MultiDataset - - __all__.extend([ - "embed", - "beir_report", - "load_dataset", - "TorchExactSearch", - "SentenceTransformerModel", - "HFModel", - "MultiDataset", - "IRDataset", - ]) + + __all__.extend( + [ + "embed", + "beir_report", + "load_dataset", + "TorchExactSearch", + "SentenceTransformerModel", + "HFModel", + "MultiDataset", + "IRDataset", + ] + ) except ImportError as e: pass diff --git a/crossfit/backend/__init__.py b/crossfit/backend/__init__.py index 3370763..acdfe44 100644 --- a/crossfit/backend/__init__.py +++ b/crossfit/backend/__init__.py @@ -1,9 +1,10 @@ -from crossfit.backend.numpy.sparse import * +# flake8: noqa + from crossfit.backend.dask.dataframe import * +from crossfit.backend.numpy.sparse import * from crossfit.backend.pandas.array import * from crossfit.backend.pandas.dataframe import * - try: from crossfit.backend.cudf.array import * from crossfit.backend.cudf.dataframe import * diff --git a/crossfit/backend/cudf/array.py b/crossfit/backend/cudf/array.py index 3b04ab2..de1983f 100644 --- a/crossfit/backend/cudf/array.py +++ b/crossfit/backend/cudf/array.py @@ -1,7 +1,7 @@ import logging from crossfit.data.array import conversion -from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend +from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch @np_backend_dispatch.register_lazy("cudf") diff --git a/crossfit/backend/cudf/dataframe.py b/crossfit/backend/cudf/dataframe.py index 60801e5..5f8ee81 100644 --- a/crossfit/backend/cudf/dataframe.py +++ b/crossfit/backend/cudf/dataframe.py @@ -1,9 +1,8 @@ from typing import Callable - +from crossfit.backend.pandas.dataframe import PandasDataFrame from crossfit.data.array.dispatch import crossarray from crossfit.data.dataframe.dispatch import CrossFrame -from crossfit.backend.pandas.dataframe import PandasDataFrame class CudfDataFrame(PandasDataFrame): diff --git a/crossfit/backend/cudf/series.py b/crossfit/backend/cudf/series.py index e406085..7f1c678 100644 --- a/crossfit/backend/cudf/series.py +++ b/crossfit/backend/cudf/series.py @@ -1,5 +1,5 @@ -import cupy as cp import cudf +import cupy as cp from cudf.core.column import as_column diff --git a/crossfit/backend/cupy/array.py b/crossfit/backend/cupy/array.py index 2991561..d81ac80 100644 --- a/crossfit/backend/cupy/array.py +++ b/crossfit/backend/cupy/array.py @@ -1,7 +1,7 @@ import logging from crossfit.data.array import conversion -from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend +from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch @np_backend_dispatch.register_lazy("cupy") diff --git a/crossfit/backend/dask/aggregate.py b/crossfit/backend/dask/aggregate.py index 0d0d782..aa211b1 100644 --- a/crossfit/backend/dask/aggregate.py +++ b/crossfit/backend/dask/aggregate.py @@ -1,8 +1,8 @@ from functools import partial +import dask.dataframe as dd from dask.delayed import Delayed from dask.highlevelgraph import HighLevelGraph -import dask.dataframe as dd from crossfit.calculate.aggregate import Aggregator from crossfit.data.dataframe.dispatch import CrossFrame diff --git a/crossfit/backend/dask/cluster.py b/crossfit/backend/dask/cluster.py index ae7d9f6..3bcca3f 100644 --- a/crossfit/backend/dask/cluster.py +++ b/crossfit/backend/dask/cluster.py @@ -1,17 +1,16 @@ -from typing import Callable, Optional, Any -from contextvars import ContextVar -import importlib import gc +import importlib import warnings +from contextvars import ContextVar +from typing import Any, Callable, Optional import dask -from dask.distributed import Client, get_client -from dask.dataframe.optimize import optimize as dd_optimize import distributed +from dask.dataframe.optimize import optimize as dd_optimize +from dask.distributed import Client, get_client from crossfit.backend.gpu import HAS_GPU - _crossfit_dask_client = ContextVar("_crossfit_dask_client", default="auto") @@ -174,7 +173,7 @@ class only supports the automatic generation of The easiest way to use `Distributed` is within a conventional `with` statement:: - from merlin.core.utils import Disributed + from merlin.core.utils import Disrtibuted workflow = nvt.Workflow(["col"] >> ops.Normalize()) dataset = nvt.Dataset(...) diff --git a/crossfit/backend/dask/dataframe.py b/crossfit/backend/dask/dataframe.py index 4024301..ca0be91 100644 --- a/crossfit/backend/dask/dataframe.py +++ b/crossfit/backend/dask/dataframe.py @@ -3,10 +3,10 @@ from typing import Callable, List import dask.dataframe as dd + from crossfit.data.dataframe.core import FrameBackend from crossfit.data.dataframe.dispatch import CrossFrame - # @CrossFrame.register_lazy("dask") # def register_dask_backend(): # import dask.dataframe as dd diff --git a/crossfit/backend/numpy/sparse.py b/crossfit/backend/numpy/sparse.py index 28acad7..e8d562c 100644 --- a/crossfit/backend/numpy/sparse.py +++ b/crossfit/backend/numpy/sparse.py @@ -1,12 +1,12 @@ import itertools -from crossfit.data.array.masked import MaskedArray +import numba import numpy as np import scipy.sparse as sp -import numba -from crossfit.data.sparse.dispatch import CrossSparse +from crossfit.data.array.masked import MaskedArray from crossfit.data.sparse.core import SparseMatrixBackend +from crossfit.data.sparse.dispatch import CrossSparse class NPSparseMatrixBackend(SparseMatrixBackend): diff --git a/crossfit/backend/pandas/dataframe.py b/crossfit/backend/pandas/dataframe.py index 6d48016..bb21c00 100644 --- a/crossfit/backend/pandas/dataframe.py +++ b/crossfit/backend/pandas/dataframe.py @@ -36,7 +36,7 @@ def concat(cls, frames: List[FrameBackend], axis: int = 0): if len(frames) == 0: raise TypeError(f"Expected non-empty list, got {frames}") for frame in frames: - if type(frame) != cls: + if type(frame) is not cls: raise TypeError(f"All frames should be type {cls}, got {type(frame)}") return cls( @@ -103,8 +103,8 @@ def groupby_indices(self, by: list) -> dict: @CrossFrame.register_lazy("numpy") def register_numpy_backend(): try: - import pandas as pd import numpy as np + import pandas as pd @CrossFrame.register(np.ndarray) def _numpy_to_pandas(data, name="data"): diff --git a/crossfit/backend/torch/array.py b/crossfit/backend/torch/array.py index 5322770..508a05c 100644 --- a/crossfit/backend/torch/array.py +++ b/crossfit/backend/torch/array.py @@ -1,7 +1,7 @@ import logging from crossfit.data.array import conversion -from crossfit.data.array.dispatch import np_backend_dispatch, ArrayBackend +from crossfit.data.array.dispatch import ArrayBackend, np_backend_dispatch try: import torch diff --git a/crossfit/backend/torch/hf/model.py b/crossfit/backend/torch/hf/model.py index d8ba542..be71951 100644 --- a/crossfit/backend/torch/hf/model.py +++ b/crossfit/backend/torch/hf/model.py @@ -1,15 +1,16 @@ -from functools import lru_cache import gc import os -from crossfit.dataset.home import CF_HOME -import joblib +from functools import lru_cache +import joblib import numpy as np import torch +from sklearn.linear_model import LinearRegression from tqdm import tqdm from transformers import AutoConfig, AutoModel, AutoTokenizer -from sklearn.linear_model import LinearRegression + from crossfit.backend.torch.model import Model +from crossfit.dataset.home import CF_HOME class HFModel(Model): @@ -68,19 +69,13 @@ def fit_memory_estimate_curve(self, model=None): torch.cuda.reset_peak_memory_stats() batch = { - "input_ids": torch.randint(1, 501, (batch_size, seq_len)).to( - device=device - ), - "attention_mask": torch.ones((batch_size, seq_len)).to( - device=device - ), + "input_ids": torch.randint(1, 501, (batch_size, seq_len)).to(device=device), + "attention_mask": torch.ones((batch_size, seq_len)).to(device=device), } try: - outputs = model(batch) - memory_used = torch.cuda.max_memory_allocated() / ( - 1024**2 - ) # Convert to MB + _ = model(batch) + memory_used = torch.cuda.max_memory_allocated() / (1024**2) # Convert to MB X.append([batch_size, seq_len, seq_len**2]) y.append(memory_used) diff --git a/crossfit/backend/torch/loader.py b/crossfit/backend/torch/loader.py index 2563600..158b8f3 100644 --- a/crossfit/backend/torch/loader.py +++ b/crossfit/backend/torch/loader.py @@ -1,23 +1,20 @@ -from typing import Dict, overload -from itertools import islice import warnings +from itertools import islice +from typing import Dict, overload import torch from crossfit.backend.torch.model import Model -from crossfit.data.dataframe.dispatch import CrossFrame -from crossfit.data.array.dispatch import crossarray from crossfit.data.array.conversion import convert_array - +from crossfit.data.array.dispatch import crossarray +from crossfit.data.dataframe.dispatch import CrossFrame DEFAULT_BATCH_SIZE = 512 class InMemoryLoader: @overload - def __init__( - self, data: Dict[str, torch.Tensor], batch_size: int, progress_bar=None - ): + def __init__(self, data: Dict[str, torch.Tensor], batch_size: int, progress_bar=None): ... @overload @@ -52,9 +49,7 @@ def __next__(self): batch_size = self.batch_size end = batch_size + self.current_idx - batch = { - key: val[self.current_idx : end] for key, val in self.tensor_dict.items() - } + batch = {key: val[self.current_idx : end] for key, val in self.tensor_dict.items()} if self.max_seq_len is not None: batch = {key: val[:, : self.max_seq_len] for key, val in batch.items()} @@ -128,19 +123,17 @@ def __next__(self): for key, val in self.tensor_dict.items() if key not in self.to_ignore } - clip_len = min( - max(_tokens[start], _tokens[end - 1]), self.model.max_seq_length() - ) + clip_len = min(max(_tokens[start], _tokens[end - 1]), self.model.max_seq_length()) batch = {key: val[:, :clip_len] for key, val in batch.items()} for fn in self._to_map: batch = fn(batch) break - except torch.cuda.OutOfMemoryError as e: + except torch.cuda.OutOfMemoryError: mid = start + (end - start) // 2 warnings.warn( - f"Not enough memeory for a batch size of {end - start}. " + f"Not enough memory for a batch size of {end - start}. " f"Retrying with a new batch size of {mid - start}. " f"Consider setting initial batch size to {mid - start}." ) @@ -166,20 +159,14 @@ def _find_optimal_splits(self): max_seq_length = self.model.max_seq_length() while i < len(num_tokens): - best_fit_e_ind = ( - i + self.batch_size - ) # Initialize to at least initial_batch_size + best_fit_e_ind = i + self.batch_size # Initialize to at least initial_batch_size # Try aggressive doubling first for doubling_i in range(max_doubling_attempts): - tentative_e_ind = ( - i + best_fit_e_ind * doubling_factor - ) # Double the last best fit + tentative_e_ind = i + best_fit_e_ind * doubling_factor # Double the last best fit tentative_e_ind = min(tentative_e_ind, len(num_tokens)) max_token = int(num_tokens[tentative_e_ind - 1]) - est_memory = self.model.estimate_memory( - max_token, int(tentative_e_ind - i) - ) + est_memory = self.model.estimate_memory(max_token, int(tentative_e_ind - i)) if est_memory <= self.model.max_mem_gb: best_fit_e_ind = tentative_e_ind @@ -188,15 +175,11 @@ def _find_optimal_splits(self): break # Exit loop if we exceed memory limit for _ in range(max_steps): - tentative_e_ind = ( - best_fit_e_ind + dynamic_step_size - ) # Add dynamic step size + tentative_e_ind = best_fit_e_ind + dynamic_step_size # Add dynamic step size tentative_e_ind = min(tentative_e_ind, len(num_tokens)) max_token = int(num_tokens[tentative_e_ind - 1]) - est_memory = self.model.estimate_memory( - max_token, int(tentative_e_ind - i) - ) + est_memory = self.model.estimate_memory(max_token, int(tentative_e_ind - i)) # The closer we are to the end, the more we penalize the batch size penalty_factor = 1 + 5.0 * ((max_token / max_seq_length) ** 2) est_memory *= penalty_factor diff --git a/crossfit/backend/torch/op/embed.py b/crossfit/backend/torch/op/embed.py index d2cff25..6f05820 100644 --- a/crossfit/backend/torch/op/embed.py +++ b/crossfit/backend/torch/op/embed.py @@ -1,13 +1,13 @@ import gc -import cupy as cp import cudf +import cupy as cp import torch -from crossfit.op.base import Op from crossfit.backend.cudf.series import create_list_series_from_2d_ar +from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE, InMemoryLoader, SortedSeqLoader from crossfit.backend.torch.model import Model -from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE, SortedSeqLoader, InMemoryLoader +from crossfit.op.base import Op class Embedder(Op): diff --git a/crossfit/calculate/aggregate.py b/crossfit/calculate/aggregate.py index ef9abca..3c22899 100644 --- a/crossfit/calculate/aggregate.py +++ b/crossfit/calculate/aggregate.py @@ -3,8 +3,9 @@ from functools import wraps import numpy as np -from crossfit.data.dataframe.core import FrameBackend + from crossfit.data.array.conversion import convert_array +from crossfit.data.dataframe.core import FrameBackend def pre_processing(func): diff --git a/crossfit/calculate/module.py b/crossfit/calculate/module.py index 1276e53..a28bf9e 100644 --- a/crossfit/calculate/module.py +++ b/crossfit/calculate/module.py @@ -1,7 +1,6 @@ -from typing import Dict, List -from dataclasses import field, MISSING, Field from copy import deepcopy - +from dataclasses import MISSING, Field, field +from typing import Dict, List from typing_utils import get_origin diff --git a/crossfit/data/__init__.py b/crossfit/data/__init__.py index 07175b2..1103d49 100644 --- a/crossfit/data/__init__.py +++ b/crossfit/data/__init__.py @@ -1,15 +1,8 @@ -from crossfit.data.array.dispatch import ( - crossarray, - numpy, - ArrayBackend, - np_backend_dispatch, -) from crossfit.data.array import conversion from crossfit.data.array.conversion import convert_array - -from crossfit.data.dataframe.dispatch import CrossFrame +from crossfit.data.array.dispatch import ArrayBackend, crossarray, np_backend_dispatch, numpy from crossfit.data.dataframe.core import FrameBackend - +from crossfit.data.dataframe.dispatch import CrossFrame __all__ = [ "crossarray", diff --git a/crossfit/data/array/conversion.py b/crossfit/data/array/conversion.py index 827c9ad..9ba7bbb 100644 --- a/crossfit/data/array/conversion.py +++ b/crossfit/data/array/conversion.py @@ -1,5 +1,5 @@ -from typing import Any, Type, TypeVar from itertools import product +from typing import Any, Type, TypeVar import numpy as np from dask.utils import Dispatch diff --git a/crossfit/data/array/decorator.py b/crossfit/data/array/decorator.py index 5609919..36144a9 100644 --- a/crossfit/data/array/decorator.py +++ b/crossfit/data/array/decorator.py @@ -1,18 +1,18 @@ -from typing import Set, TypeVar, Union, Optional, List -from copy import deepcopy -import sys import ast -from pathlib import Path import functools import inspect +import sys import types +from copy import deepcopy from itertools import zip_longest +from pathlib import Path +from typing import List, Optional, Set, TypeVar, Union import astunparse import numpy as np -from crossfit.array import numpy as cnp, np_backend_dispatch - +from crossfit.array import np_backend_dispatch +from crossfit.array import numpy as cnp _CALL_HANDLER_ID = "__crossfit_call_handler__" _CLOSURE_WRAPPER_ID = "__crossfit_closure_wrapper__" @@ -58,9 +58,7 @@ def crossnp(func: FuncType, with_cache=True, validate_array_type=None) -> FuncTy if isinstance(func, np.ufunc) or func.__module__ == "numpy": return getattr(cnp, func.__name__) - cross_func = _compiler( - func, with_cache=with_cache, validate_array_type=validate_array_type - ) + cross_func = _compiler(func, with_cache=with_cache, validate_array_type=validate_array_type) if func == cross_func: func.__np__ = func @@ -94,9 +92,7 @@ class _AstModule(ast.NodeTransformer): from this AST. Default is None. """ - def __init__( - self, to_check: str, module: types.ModuleType, node: Optional[ast.AST] = None - ): + def __init__(self, to_check: str, module: types.ModuleType, node: Optional[ast.AST] = None): self.to_check = to_check self.module = module self.aliases: Set[str] = set() @@ -118,9 +114,7 @@ def __contains__(self, node: ast.Call) -> bool: bool True if the node is a call to the module, False otherwise. """ - if isinstance(node.func, ast.Attribute) and isinstance( - node.func.value, ast.Name - ): + if isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name): if node.func.value.id in self.aliases: return True @@ -213,9 +207,7 @@ def __init__( self.file_ast = ast.parse(open(inspect.getsourcefile(py_module)).read()) self.numpy_module = _AstModule("numpy", self.py_module, node=self.file_ast) self.output = output - self.output_name = ( - getattr(output.names[0], "asname", None) or output.names[0].name - ) + self.output_name = getattr(output.names[0], "asname", None) or output.names[0].name def __call__( self, node_or_fn: Union[ast.AST, types.FunctionType], validate_array_type=None @@ -249,9 +241,7 @@ def __call__( self._validate_array_type = _validate_temp if not _compare_ast(orig, output): - output.body[0].name = _cross_np_name( - output.body[0].name, self.py_module.__name__ - ) + output.body[0].name = _cross_np_name(output.body[0].name, self.py_module.__name__) output.body[0].body.insert(0, self.output) output = ast.fix_missing_locations(output) @@ -300,9 +290,7 @@ def visit_Call(self, node: ast.Call) -> ast.Call: if fn_name not in backend: # TODO: Show how to add the function to the backend framework = array_type.__module__.split(".")[0] - raise ValueError( - f"Function {fn_name} is not supported by {framework}" - ) + raise ValueError(f"Function {fn_name} is not supported by {framework}") else: if isinstance(node.func, ast.Name): _maybe_transformed = self._maybe_compile(node.func.id) @@ -431,12 +419,10 @@ def __call__( self.non_np.add(fn) - # TODO: Should we throw an ecxeption here? + # TODO: Should we throw an exception here? return fn - def to_crossnp_fn( - self, fn: types.FunctionType, ast_node: ast.AST - ) -> types.FunctionType: + def to_crossnp_fn(self, fn: types.FunctionType, ast_node: ast.AST) -> types.FunctionType: """ Compile a transformed node to a function. @@ -621,11 +607,7 @@ def fn(...): # Or some expression involving a lambda. def _find_function_code(code: types.CodeType, fn_name: str): """Finds the code object within `code` corresponding to `fn_name`.""" - code = [ - const - for const in code.co_consts - if inspect.iscode(const) and const.co_name == fn_name - ] + code = [const for const in code.co_consts if inspect.iscode(const) and const.co_name == fn_name] assert len(code) == 1, f"Couldn't find function code for {fn_name!r}." return code[0] diff --git a/crossfit/data/array/ops.py b/crossfit/data/array/ops.py index 9e9f273..76ee2e1 100644 --- a/crossfit/data/array/ops.py +++ b/crossfit/data/array/ops.py @@ -1,6 +1,6 @@ import numpy as np -from crossfit.data.array.dispatch import with_dispatch, np_backend_dispatch +from crossfit.data.array.dispatch import np_backend_dispatch, with_dispatch dtype = with_dispatch(np.dtype) errstate = np.errstate diff --git a/crossfit/data/dataframe/core.py b/crossfit/data/dataframe/core.py index f89cade..974bc6c 100644 --- a/crossfit/data/dataframe/core.py +++ b/crossfit/data/dataframe/core.py @@ -366,7 +366,7 @@ def concat( if axis == 0: columns = frames[0].columns for frame in frames: - if type(frame) != cls: + if type(frame) is not cls: raise TypeError(f"All frames should be type {cls}, got {type(frame)}") if columns != frame.columns: raise TypeError("Cannot concatenat misaligned columns") @@ -379,7 +379,7 @@ def concat( columns = set() combined = {} for frame in frames: - if type(frame) != cls: + if type(frame) is not cls: raise TypeError(f"All frames should be type {cls}, got {type(frame)}") _columns = set(frame.columns) if _columns.intersection(columns): @@ -434,7 +434,6 @@ def take(self, indices, axis=0): return self.__class__({k: np.take(v, indices, axis=axis) for k, v in self.data.items()}) def groupby_indices(self, by: list) -> dict: - if isinstance(by, (str, int, tuple)): by = [by] diff --git a/crossfit/data/dataframe/dispatch.py b/crossfit/data/dataframe/dispatch.py index c4d777a..3f71dc7 100644 --- a/crossfit/data/dataframe/dispatch.py +++ b/crossfit/data/dataframe/dispatch.py @@ -9,20 +9,19 @@ def __call__(self, data, *args, **kwargs): return data # TODO: Fix this - from crossfit.backend.pandas.dataframe import PandasDataFrame from crossfit.backend.dask.dataframe import DaskDataFrame - + from crossfit.backend.pandas.dataframe import PandasDataFrame + backends = [PandasDataFrame, DaskDataFrame] - try: from crossfit.backend.cudf.dataframe import CudfDataFrame + CudfDataFrame._lib() backends.append(CudfDataFrame) except ImportError: pass - for backend in backends: if isinstance(data, getattr(backend._lib(), "DataFrame")): return backend(data, *args, **kwargs) diff --git a/crossfit/data/sparse/core.py b/crossfit/data/sparse/core.py index 09f361e..dca7746 100644 --- a/crossfit/data/sparse/core.py +++ b/crossfit/data/sparse/core.py @@ -74,7 +74,8 @@ def contains_inf(self) -> bool: class SparseMatrixBackend: """ - Stores sparse matrix data in unsorted CSR format (i.e., column indices in each row are unsorted). + Stores sparse matrix data in unsorted CSR format (i.e., column indices in each row are + unsorted). """ def __init__(self, idx_ptr, col_idx, data, shape=None): diff --git a/crossfit/data/sparse/ranking.py b/crossfit/data/sparse/ranking.py index 929ff10..d410b98 100644 --- a/crossfit/data/sparse/ranking.py +++ b/crossfit/data/sparse/ranking.py @@ -1,9 +1,10 @@ import warnings -from crossfit.data.array.masked import MaskedArray + import numpy as np -from crossfit.data.sparse.dispatch import CrossSparse, SparseMatrixProtocol from crossfit.data.array.dispatch import crossarray +from crossfit.data.array.masked import MaskedArray +from crossfit.data.sparse.dispatch import CrossSparse, SparseMatrixProtocol class SparseLabels: @@ -42,21 +43,23 @@ def labels(self) -> SparseMatrixProtocol: @classmethod def from_positive_indices(cls, indices): """ - Construct a binary labels instance from sparse data where only positive items are specified. + Construct a binary labels instance from sparse data where only positive items are specified Parameters ---------- indices : array_like, one row per context (e.g., user or query) - Specifies positive indices for each sample. Must be 1D or 2D, but row lengths can differ. + Specifies positive indices for each sample. Must be 1D or 2D, but row lengths can + differ. Raises ------ ValueError - if `indices` is of invalid shape, type or contains duplicate, negative or non-integer indices. + if `indices` is of invalid shape, type or contains duplicate, negative or non-integer + indices. Examples -------- - >>> BinaryLabels.from_positive_indices([[1,2], [2]]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> BinaryLabels.from_positive_indices([[1,2], [2]]) """ return cls(CrossSparse.from_nonzero_indices(indices)) @@ -64,21 +67,22 @@ def from_positive_indices(cls, indices): @classmethod def from_matrix(cls, labels): """ - Construct a binary labels instance from dense or sparse matrix where each item's label is specified. + Construct a binary labels instance from dense or sparse matrix where each item's label is + specified. Parameters ---------- labels : 1D or 2D array, one row per context (e.g., user or query) - Contains binary labels for each item. Labels must be in {0, 1}. + Contains binary labels for each item. Labels must be in {0, 1}. Raises ------ ValueError - if `labels` is of invalid shape, type or non-binary. + if `labels` is of invalid shape, type or non-binary. Examples -------- - >>> BinaryLabels.from_matrix([[0, 1, 1], [0, 0, 1]]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> BinaryLabels.from_matrix([[0, 1, 1], [0, 0, 1]]) """ return cls(CrossSparse.from_matrix(labels)) @@ -154,7 +158,8 @@ def _verify_input(cls, arr, dtype=np.floating): if np.issubdtype(dtype, np.floating): if not np.all(np.isfinite(arr)): warnings.warn( - "Input contains NaN or Inf entries which will be ignored.", InvalidValuesWarning + "Input contains NaN or Inf entries which will be ignored.", + InvalidValuesWarning, ) arr[~np.isfinite(arr)] = np.NINF elif not np.issubdtype(dtype, np.integer): @@ -175,7 +180,12 @@ def from_ranked_indices(cls, indices, valid_items=None, invalid_items=None): @classmethod def from_scores( - cls, raw_scores, valid_items=None, invalid_items=None, warn_empty=True, k_max=None + cls, + raw_scores, + valid_items=None, + invalid_items=None, + warn_empty=True, + k_max=None, ): raw_scores = cls._verify_input(raw_scores, dtype=np.floating) @@ -226,7 +236,8 @@ def __init__(self, indices, valid_items=None, invalid_items=None, warn_empty=Tru indices.difference(invalid_items) if not indices.isfinite(): warnings.warn( - "Input contains NaN or Inf entries which will be ignored.", InvalidValuesWarning + "Input contains NaN or Inf entries which will be ignored.", + InvalidValuesWarning, ) indices.remove_infinite() n_empty_rows = indices.count_empty_rows() @@ -247,18 +258,19 @@ def from_ranked_indices(cls, indices, valid_items=None, invalid_items=None): Parameters ---------- indices : array_like, one row per ranking - Indices of items after ranking. Must be 1D or 2D, but row lengths can differ. + Indices of items after ranking. Must be 1D or 2D, but row lengths can differ. valid_items : array_like, one row per ranking - Indices of valid items (e.g., candidate set). Invalid items will be discarded from ranking. + Indices of valid items (e.g., candidate set). Invalid items will be discarded from + ranking. Raises ------ ValueError - if `indices` or `valid_items` of invalid shape or type. + if `indices` or `valid_items` of invalid shape or type. Examples -------- - >>> Rankings.from_ranked_indices([[5, 2], [4, 3, 1]]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> Rankings.from_ranked_indices([[5, 2], [4, 3, 1]]) """ indices = CrossSparse.from_lil(indices) @@ -273,23 +285,24 @@ def from_scores(cls, raw_scores, valid_items=None, invalid_items=None, warn_empt Parameters ---------- raw_scores : array_like, one row per ranking - Contains raw scores for each item. Must be 1D or 2D, but row lengths can differ. + Contains raw scores for each item. Must be 1D or 2D, but row lengths can differ. valid_items : array_like, one row per ranking - Indices of valid items (e.g., candidate set). Invalid items will be discarded from ranking. + Indices of valid items (e.g., candidate set). Invalid items will be discarded from + ranking. Raises ------ ValueError - if `raw_scores` or `valid_items` of invalid shape or type. + if `raw_scores` or `valid_items` of invalid shape or type. Warns ------ InvalidValuesWarning - if `raw_scores` contains non-finite values. + if `raw_scores` contains non-finite values. Examples -------- - >>> Rankings.from_scores([[0.1, 0.5, 0.2], [0.4, 0.2, 0.5]]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE + >>> Rankings.from_scores([[0.1, 0.5, 0.2], [0.4, 0.2, 0.5]]) """ indices = CrossSparse.from_values(raw_scores, keep_zeros=True) diff --git a/crossfit/dataset/base.py b/crossfit/dataset/base.py index cc3a126..5eec712 100644 --- a/crossfit/dataset/base.py +++ b/crossfit/dataset/base.py @@ -3,9 +3,6 @@ from typing import Dict, Optional, Union import dask_cudf -from cuml.dask.neighbors import NearestNeighbors - -from crossfit.backend.cudf.series import create_list_series_from_2d_ar _SPLIT_ALIASES = { "val": ["validation", "valid", "dev"], diff --git a/crossfit/dataset/beir/raw.py b/crossfit/dataset/beir/raw.py index 3a819c2..753c214 100644 --- a/crossfit/dataset/beir/raw.py +++ b/crossfit/dataset/beir/raw.py @@ -4,6 +4,7 @@ from typing import Dict, List, Union from beir import util + from crossfit.dataset.home import CF_HOME diff --git a/crossfit/dataset/home.py b/crossfit/dataset/home.py index 8cc21fb..157a651 100644 --- a/crossfit/dataset/home.py +++ b/crossfit/dataset/home.py @@ -1,4 +1,3 @@ import os - CF_HOME = os.environ.get("CF_HOME", os.path.join(os.path.expanduser("~"), ".cf")) diff --git a/crossfit/metric/__init__.py b/crossfit/metric/__init__.py index 87e9cc4..3d6900a 100644 --- a/crossfit/metric/__init__.py +++ b/crossfit/metric/__init__.py @@ -1,11 +1,10 @@ +from crossfit.metric.categorical.str_len import MeanStrLength +from crossfit.metric.categorical.value_counts import ValueCounts +from crossfit.metric.continuous.max import Max from crossfit.metric.continuous.mean import Mean, create_mean_metric from crossfit.metric.continuous.min import Min -from crossfit.metric.continuous.max import Max from crossfit.metric.continuous.sum import Sum -from crossfit.metric.categorical.value_counts import ValueCounts -from crossfit.metric.categorical.str_len import MeanStrLength - __all__ = [ "create_mean_metric", "Mean", diff --git a/crossfit/metric/base.py b/crossfit/metric/base.py index 6d7dc4e..a907cea 100644 --- a/crossfit/metric/base.py +++ b/crossfit/metric/base.py @@ -3,9 +3,9 @@ import numpy as np +from crossfit.calculate.aggregate import Aggregator from crossfit.calculate.module import CrossModule, state from crossfit.data import crossarray -from crossfit.calculate.aggregate import Aggregator class CrossMetric(CrossModule, abc.ABC): diff --git a/crossfit/metric/categorical/value_counts.py b/crossfit/metric/categorical/value_counts.py index 40fc6f5..621b24e 100644 --- a/crossfit/metric/categorical/value_counts.py +++ b/crossfit/metric/categorical/value_counts.py @@ -1,5 +1,5 @@ -import pandas as pd import numpy as np +import pandas as pd from crossfit.metric.base import CrossMetric, state diff --git a/crossfit/metric/continuous/mean.py b/crossfit/metric/continuous/mean.py index cb48e41..e19b5e6 100644 --- a/crossfit/metric/continuous/mean.py +++ b/crossfit/metric/continuous/mean.py @@ -1,6 +1,5 @@ import functools as ft - from crossfit.metric.base import CrossMetric, state diff --git a/crossfit/metric/ranking/__init__.py b/crossfit/metric/ranking/__init__.py index 1749fce..18d169b 100644 --- a/crossfit/metric/ranking/__init__.py +++ b/crossfit/metric/ranking/__init__.py @@ -1,17 +1,16 @@ -from crossfit.metric.ranking.f1 import F1 -from crossfit.metric.ranking.hitrate import HitRate -from crossfit.metric.ranking.ndcg import DCG, NDCG -from crossfit.metric.ranking.precision import Precision, AP -from crossfit.metric.ranking.rank import FirstRelevantRank, MeanRanks, ReciprocalRank -from crossfit.metric.ranking.recall import Recall from crossfit.data.sparse.ranking import ( - SparseLabels, + Rankings, SparseBinaryLabels, + SparseLabels, SparseNumericLabels, SparseRankings, - Rankings, ) - +from crossfit.metric.ranking.f1 import F1 +from crossfit.metric.ranking.hitrate import HitRate +from crossfit.metric.ranking.ndcg import DCG, NDCG +from crossfit.metric.ranking.precision import AP, Precision +from crossfit.metric.ranking.rank import FirstRelevantRank, MeanRanks, ReciprocalRank +from crossfit.metric.ranking.recall import Recall __all__ = [ "AP", diff --git a/crossfit/metric/ranking/base.py b/crossfit/metric/ranking/base.py index 3896e98..147eed3 100644 --- a/crossfit/metric/ranking/base.py +++ b/crossfit/metric/ranking/base.py @@ -1,9 +1,9 @@ -from crossfit.data.array.dispatch import crossarray import numpy as np +from crossfit.data.array.dispatch import crossarray from crossfit.data.array.masked import MaskedArray +from crossfit.data.sparse.ranking import Rankings, SparseBinaryLabels, SparseLabels from crossfit.metric.continuous.mean import Mean -from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseLabels, Rankings, SparseRankings class RankingMetric(Mean): diff --git a/crossfit/metric/ranking/f1.py b/crossfit/metric/ranking/f1.py index e707ccc..7353029 100644 --- a/crossfit/metric/ranking/f1.py +++ b/crossfit/metric/ranking/f1.py @@ -1,9 +1,7 @@ import numpy as np -from crossfit.metric.ranking.base import SparseBinaryLabels from crossfit.metric.ranking.precision import Precision from crossfit.metric.ranking.recall import Recall -from crossfit.data.array.masked import MaskedArray class F1(Precision, Recall): diff --git a/crossfit/metric/ranking/ndcg.py b/crossfit/metric/ranking/ndcg.py index 66b9c02..872d5c5 100644 --- a/crossfit/metric/ranking/ndcg.py +++ b/crossfit/metric/ranking/ndcg.py @@ -1,8 +1,8 @@ import numpy as np -from crossfit.metric.ranking.base import RankingMetric, SparseLabels, SparseRankings -from crossfit.data.array.masked import MaskedArray from crossfit.data.array.conversion import convert_array +from crossfit.data.array.masked import MaskedArray +from crossfit.metric.ranking.base import RankingMetric, SparseLabels class DCG(RankingMetric): diff --git a/crossfit/metric/ranking/precision.py b/crossfit/metric/ranking/precision.py index 5436258..220951d 100644 --- a/crossfit/metric/ranking/precision.py +++ b/crossfit/metric/ranking/precision.py @@ -2,11 +2,7 @@ from crossfit.data.array.conversion import convert_array from crossfit.data.array.masked import MaskedArray -from crossfit.metric.ranking.base import ( - BinaryRankingMetric, - SparseBinaryLabels, - SparseLabels, -) +from crossfit.metric.ranking.base import BinaryRankingMetric, SparseLabels class Precision(BinaryRankingMetric): @@ -17,8 +13,7 @@ def __init__(self, k, truncated=False): def _precision(self, y_true: SparseLabels, y_pred_labels: MaskedArray): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) n_relevant = np.sum( - (y_pred_labels.data[:, : self._k] == 1) - & (~y_pred_labels.mask[:, : self._k]), + (y_pred_labels.data[:, : self._k] == 1) & (~y_pred_labels.mask[:, : self._k]), axis=-1, ) diff --git a/crossfit/metric/ranking/rank.py b/crossfit/metric/ranking/rank.py index af71b08..e7c2be9 100644 --- a/crossfit/metric/ranking/rank.py +++ b/crossfit/metric/ranking/rank.py @@ -1,7 +1,7 @@ import numpy as np -from crossfit.metric.ranking.base import BinaryRankingMetric, SparseBinaryLabels from crossfit.data.array.masked import MaskedArray +from crossfit.metric.ranking.base import BinaryRankingMetric, SparseBinaryLabels class ReciprocalRank(BinaryRankingMetric): diff --git a/crossfit/metric/ranking/recall.py b/crossfit/metric/ranking/recall.py index c003a56..6c8384c 100644 --- a/crossfit/metric/ranking/recall.py +++ b/crossfit/metric/ranking/recall.py @@ -1,7 +1,7 @@ import numpy as np -from crossfit.metric.ranking.base import BinaryRankingMetric, SparseLabels from crossfit.data.array.masked import MaskedArray +from crossfit.metric.ranking.base import BinaryRankingMetric, SparseLabels class Recall(BinaryRankingMetric): @@ -12,7 +12,8 @@ def __init__(self, k, truncated=False): def _recall(self, y_true: SparseLabels, y_pred_labels: MaskedArray): n_pos = y_true.get_n_positives(y_pred_labels.shape[0]) n_relevant = np.sum( - (y_pred_labels.data[:, : self._k] >= 1) & (~y_pred_labels.mask[:, : self._k]), axis=-1 + (y_pred_labels.data[:, : self._k] >= 1) & (~y_pred_labels.mask[:, : self._k]), + axis=-1, ) scores = np.NaN * np.zeros_like(n_relevant, dtype=float) diff --git a/crossfit/op/__init__.py b/crossfit/op/__init__.py index 09dc4c1..76e95e3 100644 --- a/crossfit/op/__init__.py +++ b/crossfit/op/__init__.py @@ -1,7 +1,8 @@ +# flake8: noqa + from crossfit.op.base import Op from crossfit.op.combinators import Sequential - __all__ = [ "Op", "Sequential", @@ -9,7 +10,7 @@ try: from crossfit.backend.torch.op.embed import Embedder - + __all__.append("Embedder") except ImportError: pass @@ -17,7 +18,7 @@ try: from crossfit.op.tokenize import Tokenizer - + __all__.append("Tokenizer") except ImportError: pass @@ -25,6 +26,7 @@ try: from crossfit.op.vector_search import CuMLANNSearch, CuMLExactSearch, RaftExactSearch + __all__.extend(["CuMLANNSearch", "CuMLExactSearch", "RaftExactSearch"]) except ImportError: pass diff --git a/crossfit/op/base.py b/crossfit/op/base.py index 2caa9c7..a82458f 100644 --- a/crossfit/op/base.py +++ b/crossfit/op/base.py @@ -2,8 +2,8 @@ import uuid import dask.dataframe as dd -from tqdm.auto import tqdm from dask.distributed import get_worker +from tqdm.auto import tqdm class Op: diff --git a/crossfit/report/beir/embed.py b/crossfit/report/beir/embed.py index 0ab1180..b8ab1cb 100644 --- a/crossfit/report/beir/embed.py +++ b/crossfit/report/beir/embed.py @@ -3,12 +3,12 @@ from typing import Optional from crossfit import op +from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE +from crossfit.backend.torch.model import Model from crossfit.dataset.base import Dataset, EmbeddingDatataset, IRDataset from crossfit.dataset.home import CF_HOME from crossfit.dataset.load import load_dataset from crossfit.op.vector_search import VectorSearchOp -from crossfit.backend.torch.model import Model -from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE def embed( @@ -28,9 +28,7 @@ def embed( out_dir = out_dir or CF_HOME processed_name = "processed-test" if tiny_sample else "processed" - emb_dir = os.path.join( - out_dir, processed_name, "beir", dataset_name, "emb", model.path_or_name - ) + emb_dir = os.path.join(out_dir, processed_name, "beir", dataset_name, "emb", model.path_or_name) if os.path.exists(emb_dir): if overwrite: @@ -57,9 +55,7 @@ def embed( pipe = op.Sequential( op.Tokenizer(model, cols=["text"]), - op.Embedder( - model, sorted_data_loader=sorted_data_loader, batch_size=batch_size - ), + op.Embedder(model, sorted_data_loader=sorted_data_loader, batch_size=batch_size), repartition=partitions, keep_cols=["index", "_id"], ) diff --git a/crossfit/report/beir/report.py b/crossfit/report/beir/report.py index aa6919e..35db591 100644 --- a/crossfit/report/beir/report.py +++ b/crossfit/report/beir/report.py @@ -4,22 +4,20 @@ import cudf import cupy as cp import dask_cudf -from cuml.preprocessing import LabelEncoder -import numpy as np import torch +from cuml.preprocessing import LabelEncoder from crossfit.backend.dask.aggregate import aggregate -from crossfit.data.sparse.dispatch import CrossSparse +from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE +from crossfit.backend.torch.model import Model +from crossfit.calculate.aggregate import Aggregator from crossfit.data.array.dispatch import crossarray from crossfit.dataset.base import EmbeddingDatataset -from crossfit.report.beir.embed import embed -from crossfit.calculate.aggregate import Aggregator from crossfit.metric.continuous.mean import Mean -from crossfit.metric.ranking import AP, NDCG, Precision, Recall, SparseBinaryLabels, SparseNumericLabels, SparseRankings -from crossfit.report.base import Report +from crossfit.metric.ranking import AP, NDCG, Precision, Recall, SparseNumericLabels, SparseRankings from crossfit.op.vector_search import VectorSearchOp -from crossfit.backend.torch.model import Model -from crossfit.backend.torch.loader import DEFAULT_BATCH_SIZE +from crossfit.report.base import Report +from crossfit.report.beir.embed import embed class BeirMetricAggregator(Aggregator): @@ -81,9 +79,7 @@ def create_csr_matrix(ids, scores, label_encoder: LabelEncoder): values = scores.list.leaves.values.astype(cp.float32) indices = label_encoder.transform(ids.list.leaves).values indptr = scores.list._column.offsets.values - sparse_matrix = cp.sparse.csr_matrix( - (values, indices, indptr), shape=(num_rows, num_columns) - ) + sparse_matrix = cp.sparse.csr_matrix((values, indices, indptr), shape=(num_rows, num_columns)) return sparse_matrix @@ -109,7 +105,11 @@ def join_predictions(data, predictions): predictions = predictions.set_index("query-index") merged = observed.merge( - predictions, left_index=True, right_index=True, how="left", suffixes=("-obs", "-pred") + predictions, + left_index=True, + right_index=True, + how="left", + suffixes=("-obs", "-pred"), ).rename(columns={"split-obs": "split"}) output = merged.reset_index() @@ -133,7 +133,7 @@ def console(self): console.print(self.result_df) for i in range(len(self.result_df)): - console.rule(f": ".join(self.result_df.index[i])) + console.rule(": ".join(self.result_df.index[i])) grouped_columns = {} for col in self.result_df.columns: metric_type = col.split("@")[0] if "@" in col else col diff --git a/crossfit/report/data_overview/report.py b/crossfit/report/data_overview/report.py index 4b23409..4bb9f25 100644 --- a/crossfit/report/data_overview/report.py +++ b/crossfit/report/data_overview/report.py @@ -2,19 +2,15 @@ import numpy as np -from crossfit.report.base import Report +from crossfit.backend.dask.aggregate import aggregate from crossfit.calculate.aggregate import Aggregator -from crossfit.metric.common import CommonStats -from crossfit.metric.continuous.range import Range -from crossfit.metric.continuous.moments import Moments from crossfit.metric.categorical.str_len import MeanStrLength from crossfit.metric.categorical.value_counts import ValueCounts -from crossfit.report.data_overview.visualization.facets import ( - visualize, - FacetsOverview, -) - -from crossfit.backend.dask.aggregate import aggregate +from crossfit.metric.common import CommonStats +from crossfit.metric.continuous.moments import Moments +from crossfit.metric.continuous.range import Range +from crossfit.report.base import Report +from crossfit.report.data_overview.visualization.facets import FacetsOverview, visualize class ContinuousMetrics(Aggregator): diff --git a/crossfit/report/data_overview/visualization/facets.py b/crossfit/report/data_overview/visualization/facets.py index c527ae7..b950b46 100644 --- a/crossfit/report/data_overview/visualization/facets.py +++ b/crossfit/report/data_overview/visualization/facets.py @@ -3,12 +3,11 @@ from tensorflow_metadata.proto.v0 import statistics_pb2 - STATS_FILE_NAME = "stats.pb" def _maybe_to_pandas(data): - # Utility to covert cudf data to pandas (for now) + # Utility to convert cudf data to pandas (for now) if hasattr(data, "to_pandas"): return data.to_pandas() return data diff --git a/crossfit/utils/np_utils.py b/crossfit/utils/np_utils.py index c2d418b..67670bb 100644 --- a/crossfit/utils/np_utils.py +++ b/crossfit/utils/np_utils.py @@ -1,4 +1,5 @@ import inspect + import numpy as np @@ -34,10 +35,7 @@ def names_from_num(prefix, n): ("extobj", None), ] params = [] - params += [ - inspect.Parameter(name, inspect.Parameter.POSITIONAL_ONLY) - for name in input_names - ] + params += [inspect.Parameter(name, inspect.Parameter.POSITIONAL_ONLY) for name in input_names] if f.nout > 1: params += [ inspect.Parameter(name, inspect.Parameter.POSITIONAL_ONLY, default=None) diff --git a/examples/crossarray-tf.ipynb b/examples/crossarray-tf.ipynb index e2582de..50c775a 100644 --- a/examples/crossarray-tf.ipynb +++ b/examples/crossarray-tf.ipynb @@ -116,7 +116,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Tensorflow is definetely faster (since it leverages the GPU). *Note, the API is slightly different: `tf.matmul` vs `np.dot`*. \n", + "Tensorflow is definitely faster (since it leverages the GPU). *Note, the API is slightly different: `tf.matmul` vs `np.dot`*. \n", "\n", "This is where crossfit comes in! You can write your code using numpy, and crossfit takes care of running it in various supported backends including pytorch/jax/tensorflow." ] @@ -134,7 +134,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Crossfit enables writing your code using numpy & run it in a variaty of different backends. \n", + "Crossfit enables writing your code using numpy & run it in a variety of different backends. \n", "\n", "`crossarray` can be be used as a decorator or as a context-manager" ] diff --git a/examples/dask_compute_bench.py b/examples/dask_compute_bench.py index 65ec812..51063a7 100644 --- a/examples/dask_compute_bench.py +++ b/examples/dask_compute_bench.py @@ -1,14 +1,13 @@ import time import dask +from dask_cuda import LocalCUDACluster +from distributed import Client, LocalCluster from crossfit.calculate.frame import MetricFrame from crossfit.dask.calculate import calculate_per_col as calculate_dask from crossfit.stats.continuous.stats import ContinuousStats -from dask_cuda import LocalCUDACluster -from distributed import Client, LocalCluster - # Benchmark assumes Criteo dataset. # Low-cardinality columns: # {C6:4, C9:64, C13:11, C16:155, C17:4, C19:15, C25:109, C26:37} @@ -25,13 +24,12 @@ # Set Dask backend dask.config.set({"dataframe.backend": backend}) if backend == "cudf": - # For older dask versions, backend config wont work + # For older dask versions, backend config won't work import dask_cudf as dd else: import dask.dataframe as dd if __name__ == "__main__": - if use_cluster: # Spin up cluster cluster_type = LocalCUDACluster if backend == "cudf" else LocalCluster diff --git a/pyproject.toml b/pyproject.toml index 38429a9..89709d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,4 +3,19 @@ markers = [ "tensorflow", "pytorch", "jax", -] \ No newline at end of file +] + +[tool.black] +line-length = 100 + +[tool.isort] +use_parentheses = true +multi_line_output = 3 +include_trailing_comma = true +force_grid_wrap = 0 +ensure_newline_before_comments = true +line_length = 100 +balanced_wrapping = true +indent = " " +known_third_party = ["cudf", "cupy", "dask", "dask_cuda", "dask_cudf", "numba", "numpy", "pytest", "torch", "rmm", "tensorflow"] +skip = ["build", ".eggs"] diff --git a/requirements/dev.txt b/requirements/dev.txt index 2d4d825..a70ac66 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -1,34 +1,6 @@ -black==22.3.0 -click<8.1.0 -flake8==3.9.2 -ipython_genutils -isort==5.9.3 -nbsphinx>=0.6 -pylint==2.7.4 +black==23.10.1 +flake8==6.1.0 +isort==5.12.0 bandit==1.7.0 -flake8-nb==0.3.0 pytest>=5 pytest-cov>=2 -pytest-xdist - -moto>=2 -cpplint>=1.5 -codespell -interrogate==1.5.0 - -# docs -Sphinx<3.6 -jinja2<3.1 -markupsafe==2.0.1 -sphinx_markdown_tables==0.0.15 -sphinx-multiversion@git+https://github.com/mikemckiernan/sphinx-multiversion.git -sphinxcontrib-copydirs@git+https://github.com/mikemckiernan/sphinxcontrib-copydirs.git -sphinx-external-toc<0.4 -sphinx_rtd_theme -natsort<8.2 -myst-nb<0.14 -linkify-it-py<1.1 - -# needed to avoid bug in sphinx-markdown-tables -# https://github.com/ryanfox/sphinx-markdown-tables/issues/36 -markdown==3.3.7 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 51fb869..c24288f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -50,7 +50,7 @@ plugins = ignore = D100,D102,D103,D104,D105,D107,D203,D205,D211,D212,D213,D400,D401,D413,D415 [codespell] -skip = .*pb2.py,./.git,./.github,./bench,./dist,./docs/build,.*egg-info.*,versioneer.py,*.csv,*.parquet,./.mypy_cache +skip = .*pb2.py,./.git,./.github,./bench,./dist,./docs/build,.*egg-info.*,versioneer.py,*.csv,*.parquet,./.mypy_cache,./tests/testdata ignore-words = ./ci/ignore_codespell_words.txt count = -quiet-level = 3 \ No newline at end of file +quiet-level = 3 diff --git a/tests/backend/test_sklearn.py b/tests/backend/test_sklearn.py index 6992a4a..609c758 100644 --- a/tests/backend/test_sklearn.py +++ b/tests/backend/test_sklearn.py @@ -11,9 +11,7 @@ tensor_types = [ - m - for m in np_backend_dispatch.supports - if not m.__module__.startswith(("cupy", "cudf")) + m for m in np_backend_dispatch.supports if not m.__module__.startswith(("cupy", "cudf")) ] diff --git a/tests/data/array/test_conversion.py b/tests/data/array/test_conversion.py index 5a9240a..29782fc 100644 --- a/tests/data/array/test_conversion.py +++ b/tests/data/array/test_conversion.py @@ -17,6 +17,6 @@ def test_convert_roundtrip(to_type): assert isinstance(converted, to_type) orig = convert_array(converted, np.ndarray) - assert type(orig) == np.ndarray + assert type(orig) is np.ndarray assert np.all(from_array == orig) diff --git a/tests/data/array/test_decorator.py b/tests/data/array/test_decorator.py index 06dfa39..446cf68 100644 --- a/tests/data/array/test_decorator.py +++ b/tests/data/array/test_decorator.py @@ -13,9 +13,7 @@ def nesting_test(x, y): return test_utils.min_test(x, y) + max_test(x, y) -@pytest.mark.parametrize( - "fn", [np.all, np.sum, np.mean, np.std, np.var, np.any, np.prod] -) +@pytest.mark.parametrize("fn", [np.all, np.sum, np.mean, np.std, np.var, np.any, np.prod]) def test_simple_numpy_function_crossnp(fn): crossfn = crossarray(fn) diff --git a/tests/dataset/test_load.py b/tests/dataset/test_load.py index f688d7f..8b1a516 100644 --- a/tests/dataset/test_load.py +++ b/tests/dataset/test_load.py @@ -2,11 +2,8 @@ beir = pytest.importorskip("beir") -import os -import random - -import crossfit as cf -from crossfit.dataset.beir.raw import BEIR_DATASETS +import crossfit as cf # noqa: E402 +from crossfit.dataset.beir.raw import BEIR_DATASETS # noqa: E402 DATASETS = set(BEIR_DATASETS.keys()) DATASETS.discard("cqadupstack") diff --git a/tests/metrics/ranking/test_f1.py b/tests/metrics/ranking/test_f1.py index 5e2159a..b515d5b 100644 --- a/tests/metrics/ranking/test_f1.py +++ b/tests/metrics/ranking/test_f1.py @@ -2,8 +2,8 @@ pytest.importorskip("cupy") -from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings -from crossfit.metric.ranking import F1 +from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import F1 # noqa: E402 y1 = [0, 5] y2 = [8, 9] diff --git a/tests/metrics/ranking/test_hitrate.py b/tests/metrics/ranking/test_hitrate.py index a5e086c..556094e 100644 --- a/tests/metrics/ranking/test_hitrate.py +++ b/tests/metrics/ranking/test_hitrate.py @@ -2,8 +2,8 @@ pytest.importorskip("cupy") -from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings -from crossfit.metric.ranking import HitRate +from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import HitRate # noqa: E402 y1 = [0, 5] y2 = [8, 9] diff --git a/tests/metrics/ranking/test_ndcg.py b/tests/metrics/ranking/test_ndcg.py index d91bea3..0ff2933 100644 --- a/tests/metrics/ranking/test_ndcg.py +++ b/tests/metrics/ranking/test_ndcg.py @@ -2,13 +2,17 @@ pytest.importorskip("cupy") -import numpy as np -from pytrec_eval import RelevanceEvaluator - -from crossfit.data.sparse.ranking import (Rankings, SparseBinaryLabels, - SparseNumericLabels, SparseRankings) -from crossfit.metric.ranking import DCG, NDCG -from tests.pytrec_utils import create_qrel, create_results, create_run +import numpy as np # noqa: E402 +from pytrec_eval import RelevanceEvaluator # noqa: E402 + +from crossfit.data.sparse.ranking import ( # noqa: E402 + Rankings, + SparseBinaryLabels, + SparseNumericLabels, + SparseRankings, +) +from crossfit.metric.ranking import DCG, NDCG # noqa: E402 +from tests.pytrec_utils import create_qrel, create_results, create_run # noqa: E402 y1 = [0, 5] y2 = [8, 9] @@ -83,9 +87,7 @@ def test_numeric_score(self, y_gold, y_pred, expect, params): else: y_pred = SparseRankings.from_ranked_indices(y_pred) pred = ( - NDCG(3, **params, log_base="e") - .score(y_gold, y_pred, nan_handling="propagate") - .tolist() + NDCG(3, **params, log_base="e").score(y_gold, y_pred, nan_handling="propagate").tolist() ) assert pred == pytest.approx(expect, nan_ok=True) @@ -176,6 +178,4 @@ def test_pytrec_eval(self, obs, scores): for query_id, metrics in results.items(): for metric_name, value in metrics.items(): - assert value == pytest.approx( - pytrec_result[query_id][metric_name], rel=1e-3 - ) + assert value == pytest.approx(pytrec_result[query_id][metric_name], rel=1e-3) diff --git a/tests/metrics/ranking/test_precision.py b/tests/metrics/ranking/test_precision.py index e4647ee..86b3442 100644 --- a/tests/metrics/ranking/test_precision.py +++ b/tests/metrics/ranking/test_precision.py @@ -2,13 +2,12 @@ pytest.importorskip("cupy") -import numpy as np -from pytrec_eval import RelevanceEvaluator +import numpy as np # noqa: E402 +from pytrec_eval import RelevanceEvaluator # noqa: E402 -from crossfit.data.sparse.ranking import (Rankings, SparseBinaryLabels, - SparseRankings) -from crossfit.metric.ranking import AP, Precision -from tests.pytrec_utils import create_qrel, create_results, create_run +from crossfit.data.sparse.ranking import Rankings, SparseBinaryLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import AP, Precision # noqa: E402 +from tests.pytrec_utils import create_qrel, create_results, create_run # noqa: E402 y1 = [0, 5] y2 = [8, 9] @@ -104,9 +103,7 @@ def test_pytrec_eval(self, obs, scores): for query_id, metrics in results.items(): for metric_name, value in metrics.items(): - assert value == pytest.approx( - pytrec_result[query_id][metric_name], rel=1e-3 - ) + assert value == pytest.approx(pytrec_result[query_id][metric_name], rel=1e-3) class TestTruncatedPrecision: @@ -134,11 +131,7 @@ def test_score(self, k, y_gold, y_pred, expect): y_pred = SparseRankings.from_ranked_indices(y_pred) else: y_pred = SparseRankings.from_ranked_indices(y_pred) - pred = ( - Precision(k, truncated=True) - .score(y_gold, y_pred, nan_handling="propagate") - .tolist() - ) + pred = Precision(k, truncated=True).score(y_gold, y_pred, nan_handling="propagate").tolist() assert pred == pytest.approx(expect, nan_ok=True) diff --git a/tests/metrics/ranking/test_rank.py b/tests/metrics/ranking/test_rank.py index 3307a4a..2c0861d 100644 --- a/tests/metrics/ranking/test_rank.py +++ b/tests/metrics/ranking/test_rank.py @@ -2,9 +2,8 @@ pytest.importorskip("cupy") -from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings -from crossfit.metric.ranking import (FirstRelevantRank, MeanRanks, - ReciprocalRank) +from crossfit.data.sparse.ranking import SparseBinaryLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import FirstRelevantRank, MeanRanks, ReciprocalRank # noqa: E402 y1 = [0, 5] y2 = [8, 9] @@ -34,9 +33,7 @@ class TestReciprocalRank: def test_masked_score(self, k, y_gold, y_pred, valid, expect): y_gold = SparseBinaryLabels.from_positive_indices(y_gold) y_pred = SparseRankings.from_ranked_indices(y_pred, valid_items=valid) - pred = ( - ReciprocalRank(k).score(y_gold, y_pred, nan_handling="propagate").tolist() - ) + pred = ReciprocalRank(k).score(y_gold, y_pred, nan_handling="propagate").tolist() assert pred == pytest.approx(expect, nan_ok=True) @@ -63,9 +60,7 @@ def test_score(self, k, y_gold, y_pred, expect): y_pred = SparseRankings.from_ranked_indices(y_pred) else: y_pred = SparseRankings.from_ranked_indices(y_pred) - pred = ( - ReciprocalRank(k).score(y_gold, y_pred, nan_handling="propagate").tolist() - ) + pred = ReciprocalRank(k).score(y_gold, y_pred, nan_handling="propagate").tolist() assert pred == pytest.approx(expect, nan_ok=True) @@ -111,8 +106,6 @@ def test_score(self, y_gold, y_pred, expect): y_pred = SparseRankings.from_ranked_indices(y_pred) else: y_pred = SparseRankings.from_ranked_indices(y_pred) - pred = ( - FirstRelevantRank().score(y_gold, y_pred, nan_handling="propagate").tolist() - ) + pred = FirstRelevantRank().score(y_gold, y_pred, nan_handling="propagate").tolist() assert pred == pytest.approx(expect, nan_ok=True) diff --git a/tests/metrics/ranking/test_recall.py b/tests/metrics/ranking/test_recall.py index 59c8171..2421d5a 100644 --- a/tests/metrics/ranking/test_recall.py +++ b/tests/metrics/ranking/test_recall.py @@ -2,13 +2,12 @@ pytest.importorskip("cupy") -import numpy as np -from pytrec_eval import RelevanceEvaluator +import numpy as np # noqa: E402 +from pytrec_eval import RelevanceEvaluator # noqa: E402 -from crossfit.data.sparse.ranking import (Rankings, SparseBinaryLabels, - SparseRankings) -from crossfit.metric.ranking import Recall -from tests.pytrec_utils import create_qrel, create_results, create_run +from crossfit.data.sparse.ranking import Rankings, SparseBinaryLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import Recall # noqa: E402 +from tests.pytrec_utils import create_qrel, create_results, create_run # noqa: E402 y1 = [0, 5] y2 = [8, 9] @@ -120,6 +119,4 @@ def test_pytrec_eval(self, obs, scores): for query_id, metrics in results.items(): for metric_name, value in metrics.items(): - assert value == pytest.approx( - pytrec_result[query_id][metric_name], rel=1e-3 - ) + assert value == pytest.approx(pytrec_result[query_id][metric_name], rel=1e-3) diff --git a/tests/report/beir/test_embed.py b/tests/report/beir/test_embed.py index bee2d2b..90764b3 100644 --- a/tests/report/beir/test_embed.py +++ b/tests/report/beir/test_embed.py @@ -2,11 +2,7 @@ cp = pytest.importorskip("cupy") -import random - -import numpy as np - -import crossfit as cf +import crossfit as cf # noqa: E402 @pytest.mark.singlegpu @@ -29,7 +25,5 @@ def test_embed_multi_gpu( ) embeds = embeds.predictions.ddf().compute().to_pandas() - assert set(embeds.columns) == set( - ["corpus-index", "score", "query-id", "query-index"] - ) + assert set(embeds.columns) == set(["corpus-index", "score", "query-id", "query-index"]) assert embeds["query-index"].nunique() == embeds["query-id"].nunique() diff --git a/tests/report/beir/test_report.py b/tests/report/beir/test_report.py index 40f789e..baa3d28 100644 --- a/tests/report/beir/test_report.py +++ b/tests/report/beir/test_report.py @@ -3,12 +3,12 @@ pytest.importorskip("cupy") beir = pytest.importorskip("beir") -import numpy as np +import numpy as np # noqa: E402 -import crossfit as cf -from crossfit.data.sparse.ranking import SparseNumericLabels, SparseRankings -from crossfit.metric.ranking import NDCG -from crossfit.report.beir.report import ( +import crossfit as cf # noqa: E402 +from crossfit.data.sparse.ranking import SparseNumericLabels, SparseRankings # noqa: E402 +from crossfit.metric.ranking import NDCG # noqa: E402 +from crossfit.report.beir.report import ( # noqa: E402 create_csr_matrix, create_label_encoder, join_predictions, @@ -35,15 +35,9 @@ def test_beir_report( ) expected_columns = [ - f"{metric}@{k}" - for metric in ["NDCG", "Recall", "Precision", "AP"] - for k in [1, 3, 5, 10] - ] - expected_indices = [ - ("split", "test"), - ("split", "train"), - ("split", "val"), + f"{metric}@{k}" for metric in ["NDCG", "Recall", "Precision", "AP"] for k in [1, 3, 5, 10] ] + assert sorted(report.result_df.columns.tolist()) == sorted(expected_columns) assert ("split", "test") in report.result_df.index.values.tolist() for col in expected_columns: