From 13924936f10b6108edd795e0752da21734508457 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 6 Sep 2022 09:18:19 -0500 Subject: [PATCH 01/77] FEAT-#4931: Create a query compiler that can connect to a service Signed-off-by: Devin Petersohn --- modin/config/envvars.py | 22 +- modin/core/execution/client/io.py | 46 ++ modin/core/execution/client/query_compiler.py | 686 ++++++++++++++++++ .../dispatching/factories/factories.py | 10 + modin/pandas/__init__.py | 8 +- modin/utils.py | 2 +- 6 files changed, 769 insertions(+), 5 deletions(-) create mode 100644 modin/core/execution/client/io.py create mode 100644 modin/core/execution/client/query_compiler.py diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 72c74258d94..69752d294d1 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -21,7 +21,7 @@ import secrets from .pubsub import Parameter, _TYPE_PARAMS, ExactStr, ValueSource -from typing import Optional +from typing import Optional, Any class EnvironmentVariable(Parameter, type=str, abstract=True): @@ -75,7 +75,23 @@ class Engine(EnvironmentVariable, type=str): """Distribution engine to run queries by.""" varname = "MODIN_ENGINE" - choices = ("Ray", "Dask", "Python", "Native") + choices = ("Ray", "Dask", "Python", "Native", "Client") + + @classmethod + def put(cls, value: Any) -> None: + """ + Set config value. + + Parameters + ---------- + value : Any + Config value to set. + """ + if cls._value_source == ValueSource.SET_BY_USER: + cls._check_callbacks(cls._put_nocallback(value)) + else: + cls._value = value + cls._value_source = ValueSource.SET_BY_USER @classmethod def _get_default(cls) -> str: @@ -141,7 +157,7 @@ class StorageFormat(EnvironmentVariable, type=str): varname = "MODIN_STORAGE_FORMAT" default = "Pandas" - choices = ("Pandas", "Hdk", "Pyarrow", "Cudf") + choices = ("Pandas", "OmniSci", "Pyarrow", "Cudf", "") class IsExperimental(EnvironmentVariable, type=bool): diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py new file mode 100644 index 00000000000..9e6163a596f --- /dev/null +++ b/modin/core/execution/client/io.py @@ -0,0 +1,46 @@ +from modin.core.io.io import BaseIO +import os +from .query_compiler import ClientQueryCompiler + + +class ClientIO(BaseIO): + _server_conn = None + _data_conn = None + + @classmethod + def set_server_connection(cls, conn): + cls._server_conn = conn + + @classmethod + def set_data_connection(cls, conn): + cls._data_conn = conn + + @classmethod + def read_csv(cls, filepath_or_buffer, **kwargs): + if isinstance(filepath_or_buffer, str): + filepath_or_buffer = os.path.abspath(filepath_or_buffer) + else: + raise NotImplementedError("Only filepaths are supported for read_csv") + if cls._server_conn is None: + raise ConnectionError( + "Missing server connection, did you initialize the connection?" + ) + return ClientQueryCompiler( + cls._server_conn.read_csv(cls._data_conn, filepath_or_buffer, **kwargs) + ) + + @classmethod + def read_sql(cls, sql, con, **kwargs): + if isinstance(con, str) and con.lower() == "auto" and cls._data_conn is None: + raise ConnectionError( + "Cannot connect with parameter 'auto' because connection is not set. Did you initialize it?" + ) + if cls._data_conn is None: + cls._data_conn = con + if cls._server_conn is None: + raise ConnectionError( + "Missing server connection, did you initialize the connection?" 
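+                # ``cls._server_conn`` is populated by ``ClientIO.set_server_connection``;
+                # without it there is no service to forward the query to.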
+ ) + return ClientQueryCompiler( + cls._server_conn.read_sql(sql, cls._data_conn, **kwargs) + ) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py new file mode 100644 index 00000000000..6e35a083746 --- /dev/null +++ b/modin/core/execution/client/query_compiler.py @@ -0,0 +1,686 @@ +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +import numpy as np +import pickle +import inspect +from pandas.api.types import is_list_like +from pandas.core.computation.parsing import tokenize_string + + +class ClientQueryCompiler(BaseQueryCompiler): + @classmethod + def set_server_connection(cls, conn): + cls._service = conn + + @classmethod + def create_table(cls, table_name): + return cls(cls._service.create_query_compiler(table_name)) + + def __init__(self, id): + assert ( + id is not None + ), "Make sure the client is properly connected and returns and ID" + self._id = id + + def _set_columns(self, new_columns): + self._id = self._service.rename(self._id, new_col_labels=new_columns) + + def _get_columns(self): + return self._service.columns(self._id) + + def _set_index(self, new_index): + self._id = self._service.rename(self._id, new_row_labels=new_index) + + def _get_index(self): + return self._service.index(self._id) + + columns = property(_get_columns, _set_columns) + index = property(_get_index, _set_index) + _dtypes_cache = None + + @property + def dtypes(self): + if self._dtypes_cache is None: + ref = self._service.dtypes(self._id) + self._dtypes_cache = pickle.loads(pickle.dumps(ref)) + return self._dtypes_cache + + @classmethod + def from_pandas(cls, df, data_cls): + raise NotImplementedError + + def to_pandas(self): + remote_obj = self._service.to_pandas(self._id) + return pickle.loads(pickle.dumps(remote_obj)) + + def default_to_pandas(self, pandas_op, *args, **kwargs): + raise NotImplementedError + + def columnarize(self): + return self.__constructor__(self._service.columnarize(self._id)) + + def transpose(self): + return self.__constructor__(self._service.transpose(self._id)) + + def copy(self): + return self.__constructor__(self._id) + + def insert(self, loc, column, value): + if isinstance(value, ClientQueryCompiler): + value = value._id + is_qc = True + else: + is_qc = False + return self.__constructor__( + self._service.insert(self._id, loc, column, value, is_qc) + ) + + def setitem(self, axis, key, value): + if isinstance(value, ClientQueryCompiler): + value = value._id + is_qc = True + else: + is_qc = False + return self.__constructor__( + self._service.setitem(self._id, axis, key, value, is_qc) + ) + + def getitem_array(self, key): + if isinstance(key, ClientQueryCompiler): + key = key._id + is_qc = True + else: + is_qc = False + return self.__constructor__(self._service.getitem_array(self._id, key, is_qc)) + + def getitem_column_array(self, key, numeric=False): + return self.__constructor__( + self._service.getitem_column_array(self._id, key, numeric) + ) + + def getitem_row_labels_array(self, labels): + return self.__constructor__( + self._service.getitem_row_labels_array(self._id, labels) + ) + + def getitem_row_array(self, key): + return self.__constructor__(self._service.getitem_row_array(self._id, key)) + + def pivot(self, index, columns, values): + return self.__constructor__( + self._service.pivot(self._id, index, columns, values) + ) + + def get_dummies(self, columns, **kwargs): + return self.__constructor__( + self._service.get_dummies(self._id, columns, **kwargs) + ) + + def view(self, 
index=None, columns=None): + return self.__constructor__(self._service.view(self._id, index, columns)) + + take_2d = view + + def drop(self, index=None, columns=None): + return self.__constructor__(self._service.drop(self._id, index, columns)) + + def isna(self): + return self.__constructor__(self._service.isna(self._id)) + + def notna(self): + return self.__constructor__(self._service.notna(self._id)) + + def fillna( + self, + squeeze_self, + squeeze_value, + value=None, + method=None, + axis=None, + inplace=False, + limit=None, + downcast=None, + ): + if isinstance(value, ClientQueryCompiler): + is_qc = True + else: + is_qc = False + return self.__constructor__( + self._service.fillna( + self._id, + squeeze_self, + squeeze_value, + value, + method, + axis, + inplace, + limit, + downcast, + is_qc, + ) + ) + + def dropna(self, **kwargs): + return self.__constructor__(self._service.dropna(self._id, **kwargs)) + + def sum(self, **kwargs): + return self.__constructor__(self._service.sum(self._id, **kwargs)) + + def prod(self, **kwargs): + return self.__constructor__(self._service.prod(self._id, **kwargs)) + + def count(self, **kwargs): + return self.__constructor__(self._service.count(self._id, **kwargs)) + + def mean(self, **kwargs): + return self.__constructor__(self._service.mean(self._id, **kwargs)) + + def median(self, **kwargs): + return self.__constructor__(self._service.median(self._id, **kwargs)) + + def std(self, **kwargs): + return self.__constructor__(self._service.std(self._id, **kwargs)) + + def min(self, **kwargs): + return self.__constructor__(self._service.min(self._id, **kwargs)) + + def max(self, **kwargs): + return self.__constructor__(self._service.max(self._id, **kwargs)) + + def any(self, **kwargs): + return self.__constructor__(self._service.any(self._id, **kwargs)) + + def all(self, **kwargs): + return self.__constructor__(self._service.all(self._id, **kwargs)) + + def quantile_for_single_value(self, **kwargs): + return self.__constructor__( + self._service.quantile_for_single_value(self._id, **kwargs) + ) + + def quantile_for_list_of_values(self, **kwargs): + return self.__constructor__( + self._service.quantile_for_list_of_values(self._id, **kwargs) + ) + + def describe(self, **kwargs): + return self.__constructor__(self._service.describe(self._id, **kwargs)) + + def set_index_from_columns(self, keys, drop: bool = True, append: bool = False): + return self.__constructor__( + self._service.set_index_from_columns(self._id, keys, drop, append) + ) + + def reset_index(self, **kwargs): + return self.__constructor__(self._service.reset_index(self._id, **kwargs)) + + def concat(self, axis, other, **kwargs): + if is_list_like(other): + other = [o._id for o in other] + else: + other = [other._id] + return self.__constructor__( + self._service.concat(self._id, axis, other, **kwargs) + ) + + def eq(self, other, **kwargs): + if isinstance(other, ClientQueryCompiler): + other = other._id + is_qc = True + else: + is_qc = False + return self.__constructor__(self._service.eq(self._id, other, is_qc, **kwargs)) + + def lt(self, other, **kwargs): + if isinstance(other, ClientQueryCompiler): + other = other._id + is_qc = True + else: + is_qc = False + return self.__constructor__(self._service.lt(self._id, other, is_qc, **kwargs)) + + def le(self, other, **kwargs): + if isinstance(other, ClientQueryCompiler): + other = other._id + is_qc = True + else: + is_qc = False + return self.__constructor__(self._service.le(self._id, other, is_qc, **kwargs)) + + def gt(self, other, **kwargs): + if 
isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.gt(self._id, other, is_qc, **kwargs))
+
+    def ge(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.ge(self._id, other, is_qc, **kwargs))
+
+    def ne(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.ne(self._id, other, is_qc, **kwargs))
+
+    def __and__(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.__and__(self._id, other, is_qc, **kwargs)
+        )
+
+    def __or__(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.__or__(self._id, other, is_qc, **kwargs)
+        )
+
+    def add(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.add(self._id, other, is_qc, **kwargs))
+
+    def radd(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.radd(self._id, other, is_qc, **kwargs)
+        )
+
+    def truediv(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.truediv(self._id, other, is_qc, **kwargs)
+        )
+
+    def mod(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.mod(self._id, other, is_qc, **kwargs))
+
+    def rmod(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.rmod(self._id, other, is_qc, **kwargs)
+        )
+
+    def sub(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.sub(self._id, other, is_qc, **kwargs)
+        )
+
+    def rsub(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.rsub(self._id, other, is_qc, **kwargs)
+        )
+
+    def mul(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(self._service.mul(self._id, other, is_qc, **kwargs))
+
+    def rmul(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.rmul(self._id, other, is_qc, **kwargs)
+        )
+
+    def floordiv(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return self.__constructor__(
+            self._service.floordiv(self._id, other, is_qc, **kwargs)
+        )
+
+    def rfloordiv(self, other, **kwargs):
+        if isinstance(other, ClientQueryCompiler):
+            other = other._id
+            is_qc = True
+        else:
+            is_qc = False
+        return
self.__constructor__( + self._service.rfloordiv(self._id, other, is_qc, **kwargs) + ) + + def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): + return self.__constructor__( + self._service.sort_rows_by_column_values( + self._id, columns, ascending=ascending, **kwargs + ) + ) + + def sort_index(self, **kwargs): + return self.__constructor__(self._service.sort_index(self._id, **kwargs)) + + def str_capitalize(self): + return self.__constructor__(self._service.str_capitalize(self._id)) + + def str_isalnum(self): + return self.__constructor__(self._service.str_isalnum(self._id)) + + def str_isalpha(self): + return self.__constructor__(self._service.str_isalpha(self._id)) + + def str_isdecimal(self): + return self.__constructor__(self._service.str_isdecimal(self._id)) + + def str_isdigit(self): + return self.__constructor__(self._service.str_isdigit(self._id)) + + def str_islower(self): + return self.__constructor__(self._service.str_islower(self._id)) + + def str_isnumeric(self): + return self.__constructor__(self._service.str_isnumeric(self._id)) + + def str_isspace(self): + return self.__constructor__(self._service.str_isspace(self._id)) + + def str_istitle(self): + return self.__constructor__(self._service.str_istitle(self._id)) + + def str_isupper(self): + return self.__constructor__(self._service.str_isupper(self._id)) + + def str_len(self): + return self.__constructor__(self._service.str_len(self._id)) + + def str_lower(self): + return self.__constructor__(self._service.str_lower(self._id)) + + def str_title(self): + return self.__constructor__(self._service.str_title(self._id)) + + def str_upper(self): + return self.__constructor__(self._service.str_upper(self._id)) + + def str_center(self, width, fillchar=" "): + return self.__constructor__(self._service.str_center(self._id, width, fillchar)) + + def str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): + return self.__constructor__( + self._service.str_contains(self._id, pat, case, flags, na, regex) + ) + + def str_count(self, pat, flags=0, **kwargs): + return self.__constructor__( + self._service.str_count(self._id, pat, flags, **kwargs) + ) + + def str_endswith(self, pat, na=np.nan): + return self.__constructor__(self._service.str_endswith(self._id, pat, na)) + + def str_find(self, sub, start=0, end=None): + return self.__constructor__(self._service.str_find(self._id, sub, start, end)) + + def str_findall(self, pat, flags=0, **kwargs): + return self.__constructor__( + self._service.str_findall(self._id, pat, flags, **kwargs) + ) + + def str_get(self, i): + return self.__constructor__(self._service.str_get(self._id, i)) + + str_index = str_find + + def str_join(self, sep): + return self.__constructor__(self._service.str_join(self._id, sep)) + + def str_lstrip(self, to_strip=None): + return self.__constructor__(self._service.str_lstrip(self._id, to_strip)) + + def str_ljust(self, width, fillchar=" "): + return self.__constructor__(self._service.str_ljust(self._id, width, fillchar)) + + def str_match(self, pat, case=True, flags=0, na=np.nan): + return self.__constructor__( + self._service.str_match(self._id, pat, case, flags, na) + ) + + def str_pad(self, width, side="left", fillchar=" "): + return self.__constructor__( + self._service.str_pad(self._id, width, side, fillchar) + ) + + def str_repeat(self, repeats): + return self.__constructor__(self._service.str_repeat(self._id, repeats)) + + def str_rsplit(self, pat=None, n=-1, expand=False): + return 
self.__constructor__(self._service.str_rsplit(self._id, pat, n, expand)) + + def str_rstrip(self, to_strip=None): + return self.__constructor__(self._service.str_rstrip(self._id, to_strip)) + + def str_slice(self, start=None, stop=None, step=None): + return self.__constructor__( + self._service.str_slice(self._id, start, stop, step) + ) + + def str_slice_replace(self, start=None, stop=None, repl=None): + return self.__constructor__( + self._service.str_slice_replace(self._id, start, stop, repl) + ) + + def str_startswith(self, pat, na=np.nan): + return self.__constructor__(self._service.str_startswith(self._id, pat, na)) + + def str_strip(self, to_strip=None): + return self.__constructor__(self._service.str_strip(self._id, to_strip)) + + def str_zfill(self, width): + return self.__constructor__(self._service.str_zfill(self._id, width)) + + def merge(self, right, **kwargs): + return self.__constructor__(self._service.merge(self._id, right._id, **kwargs)) + + def groupby_mean( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + return self.__constructor__( + self._service.groupby_mean( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def groupby_count( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + return self.__constructor__( + self._service.groupby_count( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def groupby_max( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + return self.__constructor__( + self._service.groupby_max( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def groupby_min( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + return self.__constructor__( + self._service.groupby_min( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def groupby_sum( + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + return self.__constructor__( + self._service.groupby_sum( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def cummax(self, fold_axis, skipna, *args, **kwargs): + return self.__constructor__( + self._service.cummax(self._id, fold_axis, skipna, *args, **kwargs) + ) + + def cummin(self, fold_axis, skipna, *args, **kwargs): + return self.__constructor__( + self._service.cummin(self._id, fold_axis, skipna, *args, **kwargs) + ) + + def cumsum(self, fold_axis, skipna, *args, **kwargs): + return self.__constructor__( + self._service.cumsum(self._id, fold_axis, skipna, *args, **kwargs) + ) + + def cumprod(self, fold_axis, skipna, *args, **kwargs): + return self.__constructor__( + self._service.cumprod(self._id, fold_axis, skipna, *args, **kwargs) + ) + + def get_index_names(self, axis=0): + if axis == 0: + return self.index.names + else: + return self.columns.names + + def is_monotonic_increasing(self): + return self.__constructor__(self._service.is_monotonic_increasing(self._id)) + + def is_monotonic_decreasing(self): + return self.__constructor__(self._service.is_monotonic_decreasing(self._id)) + + def idxmin(self, **kwargs): + return self.__constructor__(self._service.idxmin(self._id, **kwargs)) + + def idxmax(self, **kwargs): + return self.__constructor__(self._service.idxmax(self._id, **kwargs)) + + def query(self, expr, **kwargs): + is_variable = False + variable_list = [] + for k, v in tokenize_string(expr): + if v == "" or v == " ": + continue 
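+            # A token that follows "@" names a variable in some calling frame;
+            # walk up the stack and inline its value so that the service
+            # receives a fully self-contained expression string.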
+ if is_variable: + frame = inspect.currentframe() + identified = False + while frame: + if v in frame.f_locals: + value = frame.f_locals[v] + if isinstance(value, list): + value = tuple(value) + variable_list.append(str(value)) + identified = True + break + frame = frame.f_back + if not identified: + # TODO this error does not quite match pandas + raise ValueError(f"{v} not found") + is_variable = False + elif v == "@": + is_variable = True + continue + else: + variable_list.append(v) + expr = " ".join(variable_list) + return self.__constructor__(self._service.query(self._id, expr, **kwargs)) + + def finalize(self): + raise NotImplementedError + + def free(self): + raise NotImplementedError + + @classmethod + def from_arrow(cls, at, data_cls): + raise NotImplementedError + + @classmethod + def from_dataframe(cls, df, data_cls): + raise NotImplementedError + + def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): + raise NotImplementedError diff --git a/modin/core/execution/dispatching/factories/factories.py b/modin/core/execution/dispatching/factories/factories.py index 686832fead0..8c2aa2c5798 100644 --- a/modin/core/execution/dispatching/factories/factories.py +++ b/modin/core/execution/dispatching/factories/factories.py @@ -469,6 +469,16 @@ def prepare(cls): cls.io_cls = PandasOnDaskIO +@doc(_doc_factory_class, execution_name="Client") +class ClientFactory(BaseFactory): + @classmethod + @doc(_doc_factory_prepare_method, io_module_name="`Client`") + def prepare(cls): + from modin.core.execution.client.io import ClientIO + + cls.io_cls = ClientIO + + @doc(_doc_abstract_factory_class, role="experimental") class ExperimentalBaseFactory(BaseFactory): @classmethod diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 1305ff828ab..c4f8b4dccbf 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -104,11 +104,12 @@ _is_first_update = {} _NOINIT_ENGINES = { "Python", + "Client", } # engines that don't require initialization, useful for unit tests def _update_engine(publisher: Parameter): - from modin.config import StorageFormat, CpuCount + from modin.config import StorageFormat, CpuCount, Engine from modin.config.envvars import IsExperimental from modin.config.pubsub import ValueSource @@ -129,6 +130,11 @@ def _update_engine(publisher: Parameter): else: is_hdk = False + if Engine.get() == "Client": + if publisher.get_value_source() == ValueSource.DEFAULT: + StorageFormat.put("") + return + if is_hdk and publisher.get_value_source() == ValueSource.DEFAULT: publisher.put("Native") IsExperimental.put(True) diff --git a/modin/utils.py b/modin/utils.py index 7f039172cf9..1e6acf72f51 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -593,7 +593,7 @@ def get_current_execution() -> str: str Returns On-like string. 
""" - return f"{'Experimental' if IsExperimental.get() else ''}{StorageFormat.get()}On{Engine.get()}" + return f"{'Experimental' if IsExperimental.get() else ''}{StorageFormat.get()}{'On' if StorageFormat.get() != '' else ''}{Engine.get()}" def instancer(_class: Callable[[], T]) -> T: From 3797403fcced3450505eacff39d7c8ee742c2fc4 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Tue, 6 Sep 2022 14:52:07 -0500 Subject: [PATCH 02/77] Fixes to pass CI + docs for io.py --- modin/config/envvars.py | 18 +---- modin/core/execution/client/io.py | 69 +++++++++++++++++++ modin/core/execution/client/query_compiler.py | 13 ++++ 3 files changed, 83 insertions(+), 17 deletions(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 69752d294d1..b61925a825b 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -21,7 +21,7 @@ import secrets from .pubsub import Parameter, _TYPE_PARAMS, ExactStr, ValueSource -from typing import Optional, Any +from typing import Optional class EnvironmentVariable(Parameter, type=str, abstract=True): @@ -77,22 +77,6 @@ class Engine(EnvironmentVariable, type=str): varname = "MODIN_ENGINE" choices = ("Ray", "Dask", "Python", "Native", "Client") - @classmethod - def put(cls, value: Any) -> None: - """ - Set config value. - - Parameters - ---------- - value : Any - Config value to set. - """ - if cls._value_source == ValueSource.SET_BY_USER: - cls._check_callbacks(cls._put_nocallback(value)) - else: - cls._value = value - cls._value_source = ValueSource.SET_BY_USER - @classmethod def _get_default(cls) -> str: """ diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 9e6163a596f..a72924e0cc9 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -1,22 +1,70 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""The module holds the factory which performs I/O using pandas on a Client.""" + from modin.core.io.io import BaseIO import os from .query_compiler import ClientQueryCompiler class ClientIO(BaseIO): + """Factory providing methods for performing I/O operations using a given Client as the execution engine.""" + _server_conn = None _data_conn = None @classmethod def set_server_connection(cls, conn): + """ + Set the server connection for the I/O object. + + Parameters + ---------- + conn : Any + Connection object that implements various methods. + """ cls._server_conn = conn @classmethod def set_data_connection(cls, conn): + """ + Set the data connection for the I/O object. + + Parameters + ---------- + conn : Any + Connection object that is implementation specific. + """ cls._data_conn = conn @classmethod def read_csv(cls, filepath_or_buffer, **kwargs): + """ + Read CSV data from given filepath or buffer. 
+ + Parameters + ---------- + filepath_or_buffer : str, path object or file-like object + `filepath_or_buffer` parameter of read functions. + **kwargs : dict + Parameters of ``read_csv`` function. + + Returns + ------- + ClientQueryCompiler + Query compiler with CSV data read in. + """ if isinstance(filepath_or_buffer, str): filepath_or_buffer = os.path.abspath(filepath_or_buffer) else: @@ -31,6 +79,27 @@ def read_csv(cls, filepath_or_buffer, **kwargs): @classmethod def read_sql(cls, sql, con, **kwargs): + """ + Read data from a SQL connection. + + Parameters + ---------- + sql : str or SQLAlchemy Selectable (select or text object) + SQL query to be executed or a table name. + con : SQLAlchemy connectable, str, or sqlite3 connection + Using SQLAlchemy makes it possible to use any DB supported by that + library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible + for engine disposal and connection closure for the SQLAlchemy + connectable; str connections are closed automatically. See + `here `_. + **kwargs : dict + Parameters of ``read_sql`` function. + + Returns + ------- + ClientQueryCompiler + Query compiler with data read in from SQL connection. + """ if isinstance(con, str) and con.lower() == "auto" and cls._data_conn is None: raise ConnectionError( "Cannot connect with parameter 'auto' because connection is not set. Did you initialize it?" diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 6e35a083746..4045daa0472 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -1,3 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
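+
+"""
+The module holds ``ClientQueryCompiler``, which forwards every query compiler
+operation to an external service and tracks each intermediate result as an
+opaque ID handed back by that service.
+
+A minimal wiring sketch, assuming ``conn`` and ``data_conn`` are user-provided
+connection objects implementing the service protocol (neither ships with
+Modin)::
+
+    import modin.config as cfg
+    cfg.Engine.put("Client")
+    from modin.core.execution.client.io import ClientIO
+    from modin.core.execution.client.query_compiler import ClientQueryCompiler
+
+    ClientQueryCompiler.set_server_connection(conn)
+    ClientIO.set_server_connection(conn)
+    ClientIO.set_data_connection(data_conn)
+
+    import modin.pandas as pd
+    df = pd.read_csv("data.csv")  # dispatched to ``conn.read_csv(...)``
+"""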
+ from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler import numpy as np import pickle From dd0e7a528f5ec17270ca3131aa4b44afe2ce2d19 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 6 Sep 2022 15:58:39 -0500 Subject: [PATCH 03/77] Update implementation Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 4045daa0472..80dc9eddf0d 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -24,10 +24,6 @@ class ClientQueryCompiler(BaseQueryCompiler): def set_server_connection(cls, conn): cls._service = conn - @classmethod - def create_table(cls, table_name): - return cls(cls._service.create_query_compiler(table_name)) - def __init__(self, id): assert ( id is not None @@ -38,7 +34,9 @@ def _set_columns(self, new_columns): self._id = self._service.rename(self._id, new_col_labels=new_columns) def _get_columns(self): - return self._service.columns(self._id) + if self._columns_cache is None: + self._columns_cache = pickle.loads(pickle.dumps(self._service.columns(self._id))) + return self._columns_cache def _set_index(self, new_index): self._id = self._service.rename(self._id, new_row_labels=new_index) @@ -47,6 +45,7 @@ def _get_index(self): return self._service.index(self._id) columns = property(_get_columns, _set_columns) + _columns_cache = None index = property(_get_index, _set_index) _dtypes_cache = None From 026a91c6ade1b9a1408ecbbafaf1c1890c80da51 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 6 Sep 2022 17:29:02 -0500 Subject: [PATCH 04/77] Fix some things Signed-off-by: Devin Petersohn --- modin/pandas/base.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index d318334179e..3162a208e41 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -147,7 +147,7 @@ def _build_repr_df(self, num_rows, num_cols): """ # Fast track for empty dataframe. 
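+        # ``_query_compiler.columns`` is available for both DataFrame and
+        # Series, so no ``hasattr`` check on the frontend object is needed.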
if len(self.index) == 0 or ( - hasattr(self, "columns") and len(self.columns) == 0 + len(self._query_compiler.columns) == 0 ): return pandas.DataFrame( index=self.index, @@ -172,26 +172,23 @@ def _build_repr_df(self, num_rows, num_cols): if num_rows_for_tail is not None else [] ) - if hasattr(self, "columns"): - if len(self.columns) <= num_cols: - col_indexer = slice(None) - else: - num_cols_for_front = num_cols // 2 + 1 - num_cols_for_back = ( - num_cols_for_front - if len(self.columns) > num_cols - else len(self.columns) - num_cols_for_front - if len(self.columns) - num_cols_for_front >= 0 - else None - ) - col_indexer = list(range(len(self.columns))[:num_cols_for_front]) + ( - list(range(len(self.columns))[-num_cols_for_back:]) - if num_cols_for_back is not None - else [] - ) - indexer = row_indexer, col_indexer + if len(self._query_compiler.columns) <= num_cols: + col_indexer = slice(None) else: - indexer = row_indexer + num_cols_for_front = num_cols // 2 + 1 + num_cols_for_back = ( + num_cols_for_front + if len(self.columns) > num_cols + else len(self.columns) - num_cols_for_front + if len(self.columns) - num_cols_for_front >= 0 + else None + ) + col_indexer = list(range(len(self.columns))[:num_cols_for_front]) + ( + list(range(len(self.columns))[-num_cols_for_back:]) + if num_cols_for_back is not None + else [] + ) + indexer = row_indexer, col_indexer return self.iloc[indexer]._query_compiler.to_pandas() def _update_inplace(self, new_query_compiler): From ea0ac1db4e7feb712e3bcc348fef4b31bc373558 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Tue, 6 Sep 2022 20:22:46 -0500 Subject: [PATCH 05/77] Lint fixes --- modin/core/execution/client/query_compiler.py | 4 +++- modin/pandas/base.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 80dc9eddf0d..726595186b8 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -35,7 +35,9 @@ def _set_columns(self, new_columns): def _get_columns(self): if self._columns_cache is None: - self._columns_cache = pickle.loads(pickle.dumps(self._service.columns(self._id))) + self._columns_cache = pickle.loads( + pickle.dumps(self._service.columns(self._id)) + ) return self._columns_cache def _set_index(self, new_index): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 3162a208e41..46539e209b0 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -146,9 +146,7 @@ def _build_repr_df(self, num_rows, num_cols): A pandas dataset with `num_rows` or fewer rows and `num_cols` or fewer columns. """ # Fast track for empty dataframe. 
- if len(self.index) == 0 or ( - len(self._query_compiler.columns) == 0 - ): + if len(self.index) == 0 or (len(self._query_compiler.columns) == 0): return pandas.DataFrame( index=self.index, columns=self.columns if hasattr(self, "columns") else None, From c18342edc798d8c1e05272bdcb1907f86e85cfa4 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Tue, 6 Sep 2022 20:58:56 -0500 Subject: [PATCH 06/77] Fix put Signed-off-by: Devin Petersohn --- modin/config/envvars.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index b61925a825b..69752d294d1 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -21,7 +21,7 @@ import secrets from .pubsub import Parameter, _TYPE_PARAMS, ExactStr, ValueSource -from typing import Optional +from typing import Optional, Any class EnvironmentVariable(Parameter, type=str, abstract=True): @@ -77,6 +77,22 @@ class Engine(EnvironmentVariable, type=str): varname = "MODIN_ENGINE" choices = ("Ray", "Dask", "Python", "Native", "Client") + @classmethod + def put(cls, value: Any) -> None: + """ + Set config value. + + Parameters + ---------- + value : Any + Config value to set. + """ + if cls._value_source == ValueSource.SET_BY_USER: + cls._check_callbacks(cls._put_nocallback(value)) + else: + cls._value = value + cls._value_source = ValueSource.SET_BY_USER + @classmethod def _get_default(cls) -> str: """ From 711c819d604e3873dd3948ba58573c26f0bc9dd6 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 15 Sep 2022 15:05:31 -0500 Subject: [PATCH 07/77] Clean up and add new details Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 11 +++-------- modin/pandas/base.py | 5 +++++ modin/pandas/series.py | 4 +++- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 726595186b8..969a504f92b 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -13,7 +13,6 @@ from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler import numpy as np -import pickle import inspect from pandas.api.types import is_list_like from pandas.core.computation.parsing import tokenize_string @@ -35,9 +34,7 @@ def _set_columns(self, new_columns): def _get_columns(self): if self._columns_cache is None: - self._columns_cache = pickle.loads( - pickle.dumps(self._service.columns(self._id)) - ) + self._columns_cache = self._service.columns(self._id) return self._columns_cache def _set_index(self, new_index): @@ -54,8 +51,7 @@ def _get_index(self): @property def dtypes(self): if self._dtypes_cache is None: - ref = self._service.dtypes(self._id) - self._dtypes_cache = pickle.loads(pickle.dumps(ref)) + self._dtypes_cache = self._service.dtypes(self._id) return self._dtypes_cache @classmethod @@ -63,8 +59,7 @@ def from_pandas(cls, df, data_cls): raise NotImplementedError def to_pandas(self): - remote_obj = self._service.to_pandas(self._id) - return pickle.loads(pickle.dumps(remote_obj)) + return self._service.to_pandas(self._id) def default_to_pandas(self, pandas_op, *args, **kwargs): raise NotImplementedError diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 46539e209b0..d37a51b730b 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3179,6 +3179,11 @@ def __getitem__(self, key): """ if not self._query_compiler.lazy_execution and len(self) == 0: return 
self._default_to_pandas("__getitem__", key) + # fastpath for common case + if isinstance(key, str) and key in self._query_compiler.columns: + return self._getitem(key) + elif is_list_like(key) and all(k in self._query_compiler.columns for k in key): + return self._getitem(key) # see if we can slice the rows # This lets us reuse code in pandas to error check indexer = None diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 0e8ff29a920..2169e3deb33 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -311,7 +311,9 @@ def __getattr__(self, key): try: return object.__getattribute__(self, key) except AttributeError as err: - if key not in _ATTRS_NO_LOOKUP and key in self.index: + if not self._query_compiler.lazy_execution and ( + key not in _ATTRS_NO_LOOKUP and key in self.index + ): return self[key] raise err From e5c5f61ed36b03aa6759e4c3640fa606e397e875 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 15 Sep 2022 21:53:19 -0500 Subject: [PATCH 08/77] Use fsspec to get full path and allow URLs Signed-off-by: Devin Petersohn --- modin/core/execution/client/io.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index a72924e0cc9..8ba76b56f7c 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -15,6 +15,7 @@ from modin.core.io.io import BaseIO import os +import fsspec from .query_compiler import ClientQueryCompiler @@ -66,7 +67,11 @@ def read_csv(cls, filepath_or_buffer, **kwargs): Query compiler with CSV data read in. """ if isinstance(filepath_or_buffer, str): - filepath_or_buffer = os.path.abspath(filepath_or_buffer) + filepath_or_buffer = fsspec.open(filepath_or_buffer).full_name + if filepath_or_buffer.startswith("file://"): + # We will do this so that the backend can know whether this + # is a path or a URL. 
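+                # len("file://") == 7, so slicing at index 7 drops exactly the scheme.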
+ filepath_or_buffer = filepath_or_buffer[7:] else: raise NotImplementedError("Only filepaths are supported for read_csv") if cls._server_conn is None: From 538dd5404b5884f99cb7eee8ec028f10d2030af7 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Fri, 16 Sep 2022 12:17:46 -0500 Subject: [PATCH 09/77] Add lazy loc Signed-off-by: Devin Petersohn --- modin/pandas/indexing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 16b3003f0f9..374e5295069 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -677,7 +677,10 @@ def __getitem__(self, key): if isinstance(row_loc, Series) and is_boolean_array(row_loc): return self._handle_boolean_masking(row_loc, col_loc) - + if self.qc.lazy_execution: + # Since we don't know if the row labels are present or not in lazy evaluation, + # immediately hand off computation to the engine + return type(self.df)(query_compiler=self.qc.getitem_row_labels_array(row_loc).getitem_column_array(col_loc)) row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) result = self._getitem_positional( From 4c3dec6164a3d9b7b112627f8655b1b402a30582 Mon Sep 17 00:00:00 2001 From: Bala Atur Date: Mon, 19 Sep 2022 18:24:38 -0700 Subject: [PATCH 10/77] fixes for tests --- modin/core/execution/client/query_compiler.py | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 969a504f92b..c0220ff76f1 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -73,6 +73,12 @@ def transpose(self): def copy(self): return self.__constructor__(self._id) + def add_prefix(self, prefix, axis=1): + return self.__constructor__(self._service.add_prefix(self._id, prefix, axis)) + + def add_suffix(self, suffix, axis=1): + return self.__constructor__(self._service.add_prefix(self._id, suffix, axis)) + def insert(self, loc, column, value): if isinstance(value, ClientQueryCompiler): value = value._id @@ -535,13 +541,7 @@ def merge(self, right, **kwargs): return self.__constructor__(self._service.merge(self._id, right._id, **kwargs)) def groupby_mean( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.__constructor__( self._service.groupby_mean( @@ -550,13 +550,7 @@ def groupby_mean( ) def groupby_count( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.__constructor__( self._service.groupby_count( @@ -565,13 +559,7 @@ def groupby_count( ) def groupby_max( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.__constructor__( self._service.groupby_max( @@ -580,13 +568,7 @@ def groupby_max( ) def groupby_min( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, ): return self.__constructor__( self._service.groupby_min( @@ -595,17 +577,36 @@ def groupby_min( ) def groupby_sum( + self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + ): + return self.__constructor__( + self._service.groupby_sum( + self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + ) + + def 
groupby_agg( self, by, + agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, + how="axis_wise", drop=False, ): return self.__constructor__( - self._service.groupby_sum( - self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + self._service.groupby_agg( + self._id, + by._id, + agg_func, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + how, + drop, ) ) From 1f9797cfe859845dd9a4f3639c3f365f87cbe307 Mon Sep 17 00:00:00 2001 From: Bala Atur Date: Tue, 20 Sep 2022 19:58:35 -0700 Subject: [PATCH 11/77] porting more tests --- modin/core/execution/client/io.py | 4 +++ modin/core/execution/client/query_compiler.py | 28 ++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 8ba76b56f7c..462acdea0d6 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -118,3 +118,7 @@ def read_sql(cls, sql, con, **kwargs): return ClientQueryCompiler( cls._server_conn.read_sql(sql, cls._data_conn, **kwargs) ) + + @classmethod + def to_sql(cls, qc, **kwargs): + qc.to_sql(**kwargs) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index c0220ff76f1..526221f1f0f 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -58,7 +58,33 @@ def dtypes(self): def from_pandas(cls, df, data_cls): raise NotImplementedError + def to_sql( + self, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ): + return self._service.to_sql( + self._id, + name, + con, + schema, + if_exists, + index, + index_label, + chunksize, + dtype, + method, + ) + def to_pandas(self): + print("calling to_pandas in server") return self._service.to_pandas(self._id) def default_to_pandas(self, pandas_op, *args, **kwargs): @@ -77,7 +103,7 @@ def add_prefix(self, prefix, axis=1): return self.__constructor__(self._service.add_prefix(self._id, prefix, axis)) def add_suffix(self, suffix, axis=1): - return self.__constructor__(self._service.add_prefix(self._id, suffix, axis)) + return self.__constructor__(self._service.add_suffix(self._id, suffix, axis)) def insert(self, loc, column, value): if isinstance(value, ClientQueryCompiler): From 26d0ddca3de545d73ff39bebf85d5b852f9592cd Mon Sep 17 00:00:00 2001 From: Bala Atur Date: Wed, 21 Sep 2022 08:13:02 -0700 Subject: [PATCH 12/77] more fixes --- modin/core/execution/client/query_compiler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 526221f1f0f..358632646cd 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -84,7 +84,6 @@ def to_sql( ) def to_pandas(self): - print("calling to_pandas in server") return self._service.to_pandas(self._id) def default_to_pandas(self, pandas_op, *args, **kwargs): From 2489b33c0d0589f878c79f6b95fed7f8e0304db9 Mon Sep 17 00:00:00 2001 From: Bala Atur Date: Wed, 21 Sep 2022 11:00:26 -0700 Subject: [PATCH 13/77] moar fixes --- modin/core/execution/client/query_compiler.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 358632646cd..b553b6d983e 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -505,6 +505,9 @@ 
def str_endswith(self, pat, na=np.nan): def str_find(self, sub, start=0, end=None): return self.__constructor__(self._service.str_find(self._id, sub, start, end)) + def str_rfind(self, sub, start=0, end=None): + return self.__constructor__(self._service.str_rfind(self._id, sub, start, end)) + def str_findall(self, pat, flags=0, **kwargs): return self.__constructor__( self._service.str_findall(self._id, pat, flags, **kwargs) @@ -524,6 +527,9 @@ def str_lstrip(self, to_strip=None): def str_ljust(self, width, fillchar=" "): return self.__constructor__(self._service.str_ljust(self._id, width, fillchar)) + def str_rjust(self, width, fillchar=" "): + return self.__constructor__(self._service.str_rjust(self._id, width, fillchar)) + def str_match(self, pat, case=True, flags=0, na=np.nan): return self.__constructor__( self._service.str_match(self._id, pat, case, flags, na) @@ -537,6 +543,9 @@ def str_pad(self, width, side="left", fillchar=" "): def str_repeat(self, repeats): return self.__constructor__(self._service.str_repeat(self._id, repeats)) + def str_split(self, pat=None, n=-1, expand=False): + return self.__constructor__(self._service.str_split(self._id, pat, n, expand)) + def str_rsplit(self, pat=None, n=-1, expand=False): return self.__constructor__(self._service.str_rsplit(self._id, pat, n, expand)) From 3699df42518934cd600e35899c97ac4295370684 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 22 Sep 2022 14:24:47 -0500 Subject: [PATCH 14/77] Raise exception Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index b553b6d983e..75edfa5f23d 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -27,6 +27,8 @@ def __init__(self, id): assert ( id is not None ), "Make sure the client is properly connected and returns and ID" + if isinstance(id, Exception): + raise id self._id = id def _set_columns(self, new_columns): From c399ce26f2eada39525a5d4c0b9717972ba81e72 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Thu, 22 Sep 2022 16:45:07 -0500 Subject: [PATCH 15/77] Lint fixes --- modin/core/execution/client/io.py | 1 - modin/core/execution/client/query_compiler.py | 40 ++++++++++++++++--- modin/pandas/indexing.py | 6 ++- modin/pandas/series.py | 2 +- 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 462acdea0d6..f7c6aec4974 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -14,7 +14,6 @@ """The module holds the factory which performs I/O using pandas on a Client.""" from modin.core.io.io import BaseIO -import os import fsspec from .query_compiler import ClientQueryCompiler diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 75edfa5f23d..2af90f45221 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -577,7 +577,13 @@ def merge(self, right, **kwargs): return self.__constructor__(self._service.merge(self._id, right._id, **kwargs)) def groupby_mean( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, ): return self.__constructor__( self._service.groupby_mean( @@ -586,7 +592,13 @@ def groupby_mean( ) def 
groupby_count( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, ): return self.__constructor__( self._service.groupby_count( @@ -595,7 +607,13 @@ def groupby_count( ) def groupby_max( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, ): return self.__constructor__( self._service.groupby_max( @@ -604,7 +622,13 @@ def groupby_max( ) def groupby_min( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, ): return self.__constructor__( self._service.groupby_min( @@ -613,7 +637,13 @@ def groupby_min( ) def groupby_sum( - self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=False, + self, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, ): return self.__constructor__( self._service.groupby_sum( diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 374e5295069..20b5952b6f5 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -680,7 +680,11 @@ def __getitem__(self, key): if self.qc.lazy_execution: # Since we don't know if the row labels are present or not in lazy evaluation, # immediately hand off computation to the engine - return type(self.df)(query_compiler=self.qc.getitem_row_labels_array(row_loc).getitem_column_array(col_loc)) + return type(self.df)( + query_compiler=self.qc.getitem_row_labels_array( + row_loc + ).getitem_column_array(col_loc) + ) row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) result = self._getitem_positional( diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 2169e3deb33..023b76ed120 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -312,7 +312,7 @@ def __getattr__(self, key): return object.__getattribute__(self, key) except AttributeError as err: if not self._query_compiler.lazy_execution and ( - key not in _ATTRS_NO_LOOKUP and key in self.index + key not in _ATTRS_NO_LOOKUP and key in self.index ): return self[key] raise err From c7858103fefc39b7619226478baaba959f79e854 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Fri, 23 Sep 2022 09:40:14 -0500 Subject: [PATCH 16/77] Return Python as the default modin engine --- modin/config/envvars.py | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 69752d294d1..7c7b8a3708d 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -21,7 +21,7 @@ import secrets from .pubsub import Parameter, _TYPE_PARAMS, ExactStr, ValueSource -from typing import Optional, Any +from typing import Optional class EnvironmentVariable(Parameter, type=str, abstract=True): @@ -77,22 +77,6 @@ class Engine(EnvironmentVariable, type=str): varname = "MODIN_ENGINE" choices = ("Ray", "Dask", "Python", "Native", "Client") - @classmethod - def put(cls, value: Any) -> None: - """ - Set config value. - - Parameters - ---------- - value : Any - Config value to set. 
- """ - if cls._value_source == ValueSource.SET_BY_USER: - cls._check_callbacks(cls._put_nocallback(value)) - else: - cls._value = value - cls._value_source = ValueSource.SET_BY_USER - @classmethod def _get_default(cls) -> str: """ @@ -147,9 +131,9 @@ def _get_default(cls) -> str: pass else: return "Native" - raise ImportError( - "Please refer to installation documentation page to install an engine" - ) + + warnings.warn("No other engine was found so defaulting backend to Python.") + return "Python" class StorageFormat(EnvironmentVariable, type=str): @@ -157,7 +141,7 @@ class StorageFormat(EnvironmentVariable, type=str): varname = "MODIN_STORAGE_FORMAT" default = "Pandas" - choices = ("Pandas", "OmniSci", "Pyarrow", "Cudf", "") + choices = ("Pandas", "Hdk", "Pyarrow", "Cudf", "") class IsExperimental(EnvironmentVariable, type=bool): From 3e09a7ff4850a1b1f869051d5466507ee21aead1 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Fri, 23 Sep 2022 10:11:24 -0500 Subject: [PATCH 17/77] Handle indexing case for client qc --- modin/pandas/indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 20b5952b6f5..d51b749e10b 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -35,6 +35,7 @@ from pandas.api.types import is_list_like, is_bool from pandas.core.dtypes.common import is_integer, is_bool_dtype, is_integer_dtype from pandas.core.indexing import IndexingError +from modin.core.execution.client.query_compiler import ClientQueryCompiler from modin.error_message import ErrorMessage from modin.logging import ClassLogger @@ -677,7 +678,7 @@ def __getitem__(self, key): if isinstance(row_loc, Series) and is_boolean_array(row_loc): return self._handle_boolean_masking(row_loc, col_loc) - if self.qc.lazy_execution: + if isinstance(self.qc, ClientQueryCompiler) and self.qc.lazy_execution: # Since we don't know if the row labels are present or not in lazy evaluation, # immediately hand off computation to the engine return type(self.df)( From ad0bc7be9ba7e6f81897f10d3f778243f1c5c782 Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Fri, 23 Sep 2022 11:15:47 -0500 Subject: [PATCH 18/77] Call fast path for __getitem__ if not lazy --- modin/pandas/base.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index d37a51b730b..f6bbcd1310d 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -3177,13 +3177,16 @@ def __getitem__(self, key): BasePandasDataset Located dataset. 
""" - if not self._query_compiler.lazy_execution and len(self) == 0: - return self._default_to_pandas("__getitem__", key) - # fastpath for common case - if isinstance(key, str) and key in self._query_compiler.columns: - return self._getitem(key) - elif is_list_like(key) and all(k in self._query_compiler.columns for k in key): - return self._getitem(key) + if not self._query_compiler.lazy_execution: + if len(self) == 0: + return self._default_to_pandas("__getitem__", key) + # fastpath for common case + if isinstance(key, str) and key in self._query_compiler.columns: + return self._getitem(key) + elif is_list_like(key) and all( + k in self._query_compiler.columns for k in key + ): + return self._getitem(key) # see if we can slice the rows # This lets us reuse code in pandas to error check indexer = None From 2f4fbf0d6f0b52a5be3c3c0b1ec0fb3441c6381e Mon Sep 17 00:00:00 2001 From: Karthik Velayutham Date: Mon, 26 Sep 2022 09:51:28 -0500 Subject: [PATCH 19/77] Remove user warning for Python-engine fall back --- modin/config/envvars.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 7c7b8a3708d..c77792cc61a 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -132,7 +132,8 @@ def _get_default(cls) -> str: else: return "Native" - warnings.warn("No other engine was found so defaulting backend to Python.") + # If we can't import any other engines we should go ahead and default to Python being + # the default backend engine. return "Python" From 4b613742880959c34183cdba0b0c8ca493cf5271 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Sat, 24 Sep 2022 14:42:18 -0500 Subject: [PATCH 20/77] Add init Signed-off-by: Devin Petersohn --- modin/core/execution/client/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 modin/core/execution/client/__init__.py diff --git a/modin/core/execution/client/__init__.py b/modin/core/execution/client/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 485793c87ee4d9055542309ce9805f2075f4c460 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 26 Sep 2022 09:30:59 -0500 Subject: [PATCH 21/77] Implement free as a no-op Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 2af90f45221..4375b68f394 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -748,7 +748,7 @@ def finalize(self): raise NotImplementedError def free(self): - raise NotImplementedError + return @classmethod def from_arrow(cls, at, data_cls): From 5d5a617cb2b69bcaa8c793a6d3a6582ff1b8c5ea Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 23 Sep 2022 17:30:28 -0700 Subject: [PATCH 22/77] Add support for replace - client side --- modin/core/execution/client/query_compiler.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 4375b68f394..d1337edfd5c 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -14,6 +14,7 @@ from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler import numpy as np import inspect +from pandas._libs.lib import no_default, NoDefault from pandas.api.types import is_list_like from 
pandas.core.computation.parsing import tokenize_string @@ -171,6 +172,37 @@ def isna(self): def notna(self): return self.__constructor__(self._service.notna(self._id)) + def replace( + self, + to_replace=None, + value=no_default, + inplace=False, + limit=None, + regex=False, + method: "str | NoDefault" = no_default, + ): + if isinstance(to_replace, ClientQueryCompiler): + is_to_replace_qc = True + else: + is_to_replace_qc = False + if isinstance(regex, ClientQueryCompiler): + is_regex_qc = True + else: + is_regex_qc = False + return self.__constructor__( + self._service.replace( + self._id, + to_replace, + value, + inplace, + limit, + regex, + method, + is_to_replace_qc, + is_regex_qc, + ) + ) + def fillna( self, squeeze_self, From 8b169880a1c2f2b8312e76757118e4cc829c0663 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 26 Sep 2022 10:33:13 -0500 Subject: [PATCH 23/77] Fix a couple of issues with Client Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 16 ++++++++-------- modin/pandas/base.py | 4 ++++ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index d1337edfd5c..30a01e23527 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -708,24 +708,24 @@ def groupby_agg( ) ) - def cummax(self, fold_axis, skipna, *args, **kwargs): + def cummax(self, fold_axis, axis, skipna, *args, **kwargs): return self.__constructor__( - self._service.cummax(self._id, fold_axis, skipna, *args, **kwargs) + self._service.cummax(self._id, fold_axis, axis, skipna, *args, **kwargs) ) - def cummin(self, fold_axis, skipna, *args, **kwargs): + def cummin(self, fold_axis, axis, skipna, *args, **kwargs): return self.__constructor__( - self._service.cummin(self._id, fold_axis, skipna, *args, **kwargs) + self._service.cummin(self._id, fold_axis, axis, skipna, *args, **kwargs) ) - def cumsum(self, fold_axis, skipna, *args, **kwargs): + def cumsum(self, fold_axis, axis, skipna, *args, **kwargs): return self.__constructor__( - self._service.cumsum(self._id, fold_axis, skipna, *args, **kwargs) + self._service.cumsum(self._id, fold_axis, axis, skipna, *args, **kwargs) ) - def cumprod(self, fold_axis, skipna, *args, **kwargs): + def cumprod(self, fold_axis, axis, skipna, *args, **kwargs): return self.__constructor__( - self._service.cumprod(self._id, fold_axis, skipna, *args, **kwargs) + self._service.cumprod(self._id, fold_axis, axis, skipna, *args, **kwargs) ) def get_index_names(self, axis=0): diff --git a/modin/pandas/base.py b/modin/pandas/base.py index f6bbcd1310d..6022956e2e0 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1227,6 +1227,10 @@ def drop( elif axes[axis] is not None: if not is_list_like(axes[axis]): axes[axis] = [axes[axis]] + # In case of lazy execution we should bypass these error checking components + # because they can force the materialization of the row or column labels. 
+ if self._query_compiler.lazy_execution: + continue if errors == "raise": non_existent = pandas.Index(axes[axis]).difference( getattr(self, axis) From 4485cc8881de7d10c27869b53bd6054ab3254fdc Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Mon, 26 Sep 2022 13:24:05 -0500 Subject: [PATCH 24/77] Throw errors on to_pandas Signed-off-by: Devin Petersohn --- modin/core/execution/client/query_compiler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 30a01e23527..59100df1fe4 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -87,7 +87,10 @@ def to_sql( ) def to_pandas(self): - return self._service.to_pandas(self._id) + value = self._service.to_pandas(self._id) + if isinstance(value, Exception): + raise value + return value def default_to_pandas(self, pandas_op, *args, **kwargs): raise NotImplementedError From 7fd51b2d770d32ecf79fd002e7554efa1372948d Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Mon, 26 Sep 2022 21:17:46 -0700 Subject: [PATCH 25/77] Do not default to pandas for str_repeat --- modin/pandas/series_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modin/pandas/series_utils.py b/modin/pandas/series_utils.py index ae01e884835..5bbb85cf2ae 100644 --- a/modin/pandas/series_utils.py +++ b/modin/pandas/series_utils.py @@ -331,7 +331,9 @@ def partition(self, sep=" ", expand=True): ) def repeat(self, repeats): - return self._default_to_pandas(pandas.Series.str.repeat, repeats) + return Series( + query_compiler=self._query_compiler.str_repeat(repeats) + ) def rpartition(self, sep=" ", expand=True): if sep is not None and len(sep) == 0: From a12fb00bec9ccd70800babef1869a6cdd57f6766 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Fri, 30 Sep 2022 10:22:05 -0700 Subject: [PATCH 26/77] Add support for 18 datetime functions/properties --- modin/core/execution/client/query_compiler.py | 54 +++++++++++++++++++ .../storage_formats/base/query_compiler.py | 4 ++ 2 files changed, 58 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 59100df1fe4..65a01787bfd 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -481,6 +481,60 @@ def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): def sort_index(self, **kwargs): return self.__constructor__(self._service.sort_index(self._id, **kwargs)) + def dt_nanosecond(self): + return self.__constructor__(self._service.dt_nanosecond(self._id)) + + def dt_microsecond(self): + return self.__constructor__(self._service.dt_microsecond(self._id)) + + def dt_second(self): + return self.__constructor__(self._service.dt_second(self._id)) + + def dt_minute(self): + return self.__constructor__(self._service.dt_minute(self._id)) + + def dt_hour(self): + return self.__constructor__(self._service.dt_hour(self._id)) + + def dt_day(self): + return self.__constructor__(self._service.dt_day(self._id)) + + def dt_dayofweek(self): + return self.__constructor__(self._service.dt_dayofweek(self._id)) + + def dt_day_of_week(self): + return self.__constructor__(self._service.dt_day_of_week(self._id)) + + def dt_weekday(self): + return self.__constructor__(self._service.dt_weekday(self._id)) + + def dt_day_name(self): + return self.__constructor__(self._service.dt_day_name(self._id)) + + def dt_dayofyear(self): 
+ return self.__constructor__(self._service.dt_dayofyear(self._id)) + + def dt_day_of_year(self): + return self.__constructor__(self._service.dt_day_of_year(self._id)) + + def dt_week(self): + return self.__constructor__(self._service.dt_week(self._id)) + + def dt_weekofyear(self): + return self.__constructor__(self._service.dt_weekofyear(self._id)) + + def dt_month(self): + return self.__constructor__(self._service.dt_month(self._id)) + + def dt_month_name(self): + return self.__constructor__(self._service.dt_month_name(self._id)) + + def dt_quarter(self): + return self.__constructor__(self._service.dt_quarter(self._id)) + + def dt_year(self): + return self.__constructor__(self._service.dt_year(self._id)) + def str_capitalize(self): return self.__constructor__(self._service.str_capitalize(self._id)) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index a0e05638a8e..d17d1e32b85 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4066,6 +4066,10 @@ def resample_var(self, resample_kwargs, ddof, *args, **kwargs): # End of Resample methods + @doc_utils.doc_str_method(refer_to="capitalize", params="") + def str_capitalize(self): + return StrDefault.register(pandas.Series.str.capitalize)(self) + # Str methods @doc_utils.doc_str_method(refer_to="capitalize", params="") From 613ba25e3537b5d9134584e1ff44aa9032de2717 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Tue, 4 Oct 2022 20:31:31 -0700 Subject: [PATCH 27/77] Fix columns caching when renaming columns --- modin/core/execution/client/query_compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 65a01787bfd..adf451a8519 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -34,6 +34,7 @@ def __init__(self, id): def _set_columns(self, new_columns): self._id = self._service.rename(self._id, new_col_labels=new_columns) + self._columns_cache = self._service.columns(self._id) def _get_columns(self): if self._columns_cache is None: From 0450f7c479f774b4df819a0896116885a8df9633 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Wed, 5 Oct 2022 14:02:12 -0700 Subject: [PATCH 28/77] Fix test_query: put backticks back for col names --- modin/core/execution/client/query_compiler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index adf451a8519..0b1fc5161c0 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -830,6 +830,8 @@ def query(self, expr, **kwargs): is_variable = True continue else: + if v in self.columns: + v = f'`{v}`' variable_list.append(v) expr = " ".join(variable_list) return self.__constructor__(self._service.query(self._id, expr, **kwargs)) From 679813cd45dcb88706e4d72e7446cd080c3727f9 Mon Sep 17 00:00:00 2001 From: Hazem Elmeleegy Date: Mon, 17 Oct 2022 05:47:17 -0700 Subject: [PATCH 29/77] Add support for astype -- client side --- modin/core/execution/client/query_compiler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 0b1fc5161c0..217476793cd 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -176,6 
+176,9 @@ def isna(self): def notna(self): return self.__constructor__(self._service.notna(self._id)) + def astype(self, col_dtypes, **kwargs): + return self.__constructor__(self._service.astype(self._id, col_dtypes, **kwargs)) + def replace( self, to_replace=None, From dff3d54ebf631d0114337e80d123e67549d8e29d Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 25 Oct 2022 12:23:31 -0500 Subject: [PATCH 30/77] Make client query compiler consistent with other query compiler. consistency check passes. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 13 ++++- modin/core/execution/client/query_compiler.py | 48 +++---------------- .../storage_formats/base/query_compiler.py | 13 ++++- .../storage_formats/pandas/query_compiler.py | 9 +++- modin/test/test_executions_api.py | 19 ++++++-- 5 files changed, 51 insertions(+), 51 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index f7c6aec4974..34a244d946d 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -120,4 +120,15 @@ def read_sql(cls, sql, con, **kwargs): @classmethod def to_sql(cls, qc, **kwargs): - qc.to_sql(**kwargs) + self._server_conn.to_sql( + qc._id, + name, + con, + schema, + if_exists, + index, + index_label, + chunksize, + dtype, + method, + ) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 217476793cd..d07168078af 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -62,31 +62,6 @@ def dtypes(self): def from_pandas(cls, df, data_cls): raise NotImplementedError - def to_sql( - self, - name, - con, - schema=None, - if_exists="fail", - index=True, - index_label=None, - chunksize=None, - dtype=None, - method=None, - ): - return self._service.to_sql( - self._id, - name, - con, - schema, - if_exists, - index, - index_label, - chunksize, - dtype, - method, - ) - def to_pandas(self): value = self._service.to_pandas(self._id) if isinstance(value, Exception): @@ -144,14 +119,11 @@ def getitem_column_array(self, key, numeric=False): self._service.getitem_column_array(self._id, key, numeric) ) - def getitem_row_labels_array(self, labels): + def getitem_row_array(self, key, numeric=False): return self.__constructor__( - self._service.getitem_row_labels_array(self._id, labels) + self._service.getitem_row_array(self._id, key, numeric) ) - def getitem_row_array(self, key): - return self.__constructor__(self._service.getitem_row_array(self._id, key)) - def pivot(self, index, columns, values): return self.__constructor__( self._service.pivot(self._id, index, columns, values) @@ -162,11 +134,9 @@ def get_dummies(self, columns, **kwargs): self._service.get_dummies(self._id, columns, **kwargs) ) - def view(self, index=None, columns=None): + def take_2d(self, index=None, columns=None): return self.__constructor__(self._service.view(self._id, index, columns)) - take_2d = view - def drop(self, index=None, columns=None): return self.__constructor__(self._service.drop(self._id, index, columns)) @@ -177,7 +147,9 @@ def notna(self): return self.__constructor__(self._service.notna(self._id)) def astype(self, col_dtypes, **kwargs): - return self.__constructor__(self._service.astype(self._id, col_dtypes, **kwargs)) + return self.__constructor__( + self._service.astype(self._id, col_dtypes, **kwargs) + ) def replace( self, @@ -506,9 +478,6 @@ def dt_day(self): def dt_dayofweek(self): return 
self.__constructor__(self._service.dt_dayofweek(self._id)) - def dt_day_of_week(self): - return self.__constructor__(self._service.dt_day_of_week(self._id)) - def dt_weekday(self): return self.__constructor__(self._service.dt_weekday(self._id)) @@ -518,9 +487,6 @@ def dt_day_name(self): def dt_dayofyear(self): return self.__constructor__(self._service.dt_dayofyear(self._id)) - def dt_day_of_year(self): - return self.__constructor__(self._service.dt_day_of_year(self._id)) - def dt_week(self): return self.__constructor__(self._service.dt_week(self._id)) @@ -834,7 +800,7 @@ def query(self, expr, **kwargs): continue else: if v in self.columns: - v = f'`{v}`' + v = f"`{v}`" variable_list.append(v) expr = " ".join(variable_list) return self.__constructor__(self._service.query(self._id, expr, **kwargs)) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index d17d1e32b85..8dbdaf99586 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -529,6 +529,11 @@ def mod(self, other, **kwargs): # noqa: PR02 def mul(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.mul)(self, other=other, **kwargs) + @doc_utils.doc_binary_method(operation="multiplication", sign="*", self_on_right=True) + def rmul(self, other, **kwargs): # noqa: PR02 + return BinaryDefault.register(pandas.DataFrame.rmul)(self, other=other, **kwargs) + + @doc_utils.add_refer_to("DataFrame.corr") def corr(self, **kwargs): # noqa: PR02 """ @@ -2143,7 +2148,7 @@ def get_column(df, key): return DataFrameDefault.register(get_column)(self, key=key) - def getitem_row_array(self, key): + def getitem_row_array(self, key: List[Hashable], numeric: bool = False): """ Get row data for target indices. @@ -2151,6 +2156,7 @@ def getitem_row_array(self, key): ---------- key : list-like Numeric indices of the rows to pick. 
+ numeric : bool, default: False Returns ------- @@ -2159,7 +2165,10 @@ def getitem_row_array(self, key): """ def get_row(df, key): - return df.iloc[key] + if numeric: + return df.iloc[key] + else: + return df.loc[key] return DataFrameDefault.register(get_row)(self, key=key) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 0e4a4481193..2f944c26801 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -404,6 +404,7 @@ def to_numpy(self, **kwargs): lt = Binary.register(pandas.DataFrame.lt) mod = Binary.register(pandas.DataFrame.mod) mul = Binary.register(pandas.DataFrame.mul) + rmul = Binary.register(pandas.DataFrame.rmul) ne = Binary.register(pandas.DataFrame.ne) pow = Binary.register(pandas.DataFrame.pow) radd = Binary.register(pandas.DataFrame.radd) @@ -2246,9 +2247,13 @@ def getitem_column_array(self, key, numeric=False): ) return self.__constructor__(new_modin_frame) - def getitem_row_array(self, key): + def getitem_row_array(self, key: List[Hashable], numeric: bool = False): + if numeric: + kwargs = {"row_positions": key} + else: + kwargs = {"row_labels": key} return self.__constructor__( - self._modin_frame.take_2d_labels_or_positional(row_positions=key) + self._modin_frame.take_2d_labels_or_positional(**kwargs) ) def setitem(self, axis, key, value): diff --git a/modin/test/test_executions_api.py b/modin/test/test_executions_api.py index 949834ba9e6..f109801d084 100644 --- a/modin/test/test_executions_api.py +++ b/modin/test/test_executions_api.py @@ -13,6 +13,7 @@ import pytest +from modin.core.execution.client.query_compiler import ClientQueryCompiler from modin.core.storage_formats import ( BaseQueryCompiler, PandasQueryCompiler, @@ -21,7 +22,7 @@ BASE_EXECUTION = BaseQueryCompiler -EXECUTIONS = [PandasQueryCompiler, PyarrowQueryCompiler] +EXECUTIONS = [PandasQueryCompiler, PyarrowQueryCompiler, ClientQueryCompiler] def test_base_abstract_methods(): @@ -50,15 +51,23 @@ def test_base_abstract_methods(): ), f"{BASE_EXECUTION} has not implemented abstract methods: {not_implemented_methods}" -@pytest.mark.parametrize("execution", EXECUTIONS) -def test_api_consistent(execution): +@pytest.mark.parametrize( + "execution,expected_extra_methods", + [ + (PandasQueryCompiler, set()), + (PyarrowQueryCompiler, set()), + # client query compiler exposes set_server_connection, + # which the other compilers should not + (ClientQueryCompiler, {"set_server_connection"}), + ], +) +def test_api_consistent(execution, expected_extra_methods): base_methods = set(BASE_EXECUTION.__dict__) custom_methods = set( [key for key in execution.__dict__.keys() if not key.startswith("_")] ) extra_methods = custom_methods.difference(base_methods) - # checking that custom execution do not implements extra api methods assert ( - len(extra_methods) == 0 + extra_methods == expected_extra_methods ), f"{execution} implement these extra methods: {extra_methods}" From 18cf7254b949bfe5dec98eb3899e7d93b40a3a5e Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 25 Oct 2022 12:59:39 -0500 Subject: [PATCH 31/77] Fix black. 
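The `rmul` registration added in the previous commit does not satisfy
black: the decorator call and the return expression both run past the
default 88-character line length, so black reflows them. A minimal,
runnable illustration of the reflow (the names here are illustrative,
not code from this repo):

    # Just over black's default 88-character limit, so black splits the call:
    multiplication_doc_settings = dict(operation="multiplication", sign="*", self_on_right=True)

    # What black produces: the arguments move onto their own indented line.
    multiplication_doc_settings = dict(
        operation="multiplication", sign="*", self_on_right=True
    )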
Signed-off-by: mvashishtha --- modin/core/storage_formats/base/query_compiler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 8dbdaf99586..bc9e349db2c 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -529,10 +529,13 @@ def mod(self, other, **kwargs): # noqa: PR02 def mul(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.mul)(self, other=other, **kwargs) - @doc_utils.doc_binary_method(operation="multiplication", sign="*", self_on_right=True) + @doc_utils.doc_binary_method( + operation="multiplication", sign="*", self_on_right=True + ) def rmul(self, other, **kwargs): # noqa: PR02 - return BinaryDefault.register(pandas.DataFrame.rmul)(self, other=other, **kwargs) - + return BinaryDefault.register(pandas.DataFrame.rmul)( + self, other=other, **kwargs + ) @doc_utils.add_refer_to("DataFrame.corr") def corr(self, **kwargs): # noqa: PR02 From ea5dc77284d0c2b0e7097dea330cfd89d9bcbf60 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 25 Oct 2022 15:23:54 -0500 Subject: [PATCH 32/77] Fix black and flake8. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 13 +------------ modin/core/storage_formats/base/query_compiler.py | 4 ---- modin/pandas/series_utils.py | 4 +--- 3 files changed, 2 insertions(+), 19 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 34a244d946d..689ef92c6ad 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -120,15 +120,4 @@ def read_sql(cls, sql, con, **kwargs): @classmethod def to_sql(cls, qc, **kwargs): - self._server_conn.to_sql( - qc._id, - name, - con, - schema, - if_exists, - index, - index_label, - chunksize, - dtype, - method, - ) + cls._server_conn.to_sql(qc._id, **kwargs) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index bc9e349db2c..54b57bfa921 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -4078,10 +4078,6 @@ def resample_var(self, resample_kwargs, ddof, *args, **kwargs): # End of Resample methods - @doc_utils.doc_str_method(refer_to="capitalize", params="") - def str_capitalize(self): - return StrDefault.register(pandas.Series.str.capitalize)(self) - # Str methods @doc_utils.doc_str_method(refer_to="capitalize", params="") diff --git a/modin/pandas/series_utils.py b/modin/pandas/series_utils.py index 5bbb85cf2ae..368f77684fd 100644 --- a/modin/pandas/series_utils.py +++ b/modin/pandas/series_utils.py @@ -331,9 +331,7 @@ def partition(self, sep=" ", expand=True): ) def repeat(self, repeats): - return Series( - query_compiler=self._query_compiler.str_repeat(repeats) - ) + return Series(query_compiler=self._query_compiler.str_repeat(repeats)) def rpartition(self, sep=" ", expand=True): if sep is not None and len(sep) == 0: From 773eff09a80dc52d07e112b7d5e1334ac09f4289 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 26 Oct 2022 13:55:33 -0500 Subject: [PATCH 33/77] Hook up IO and test query compiler, but service missing methods that take id. 
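The test service added to conftest.py stands in for a real server: it
owns the concrete query compilers and hands the client opaque UUIDs, so
the client query compiler only ever stores an id and forwards every
operation by name. A minimal sketch of that pattern (illustrative names,
not part of this patch):

    from uuid import uuid4

    class MinimalService:
        # Holds the real query compilers; clients hold only the ids.
        def __init__(self):
            self._qc_by_id = {}

        def add_query_compiler(self, qc):
            new_id = uuid4()
            self._qc_by_id[new_id] = qc
            return new_id

        def transpose(self, id):
            # Run the operation server-side, register the result under a
            # fresh id, and return that id for the client to wrap.
            return self.add_query_compiler(self._qc_by_id[id].transpose())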
Signed-off-by: mvashishtha --- modin/conftest.py | 87 ++++++++++++++++--- modin/core/execution/client/query_compiler.py | 2 + 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/modin/conftest.py b/modin/conftest.py index 4a9687a2f7f..2fb32457ee1 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -21,7 +21,8 @@ from pandas.util._decorators import doc import numpy as np import shutil -from typing import Optional +from typing import Any, NamedTuple, Optional +from uuid import uuid4, UUID assert ( "modin.utils" not in sys.modules @@ -46,12 +47,16 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url): import modin # noqa: E402 import modin.config # noqa: E402 from modin.config import IsExperimental, TestRayClient # noqa: E402 -import uuid # noqa: E402 from modin.core.storage_formats import ( # noqa: E402 PandasQueryCompiler, BaseQueryCompiler, ) +from modin.core.execution.client.io import ClientIO # noqa: E402 +from modin.core.execution.client.query_compiler import ClientQueryCompiler # noqa: E402 +from modin.core.execution.python.implementations.pandas_on_python.dataframe.dataframe import ( # noqa: E402 + PandasOnPythonDataframe, +) from modin.core.execution.python.implementations.pandas_on_python.io import ( # noqa: E402 PandasOnPythonIO, ) @@ -223,9 +228,6 @@ def __iter__(self): os.environ = orig_env -BASE_EXECUTION_NAME = "BaseOnPython" - - class TestQC(BaseQueryCompiler): def __init__(self, modin_frame): self._modin_frame = modin_frame @@ -269,16 +271,77 @@ def prepare(cls): cls.io_cls = BaseOnPythonIO -def set_base_execution(name=BASE_EXECUTION_NAME): - setattr(factories, f"{name}Factory", BaseOnPythonFactory) - modin.set_execution(engine="python", storage_format=name.split("On")[0]) +def set_base_on_python_execution(): + factories.BaseOnPythonFactory = BaseOnPythonFactory + modin.set_execution(engine="python", storage_format="Base") + + +class BaseExecutionService: + class DefaultToPandasResult(NamedTuple): + result: Optional[Any] + result_is_qc_id: bool + + def __init__(self): + self._base_query_compiler_by_id = {} + + def add_query_compiler(self, qc) -> UUID: + id = self._generate_id() + self._base_query_compiler_by_id[self._generate_id()] = qc + return id + + def default_to_pandas( + self, id: UUID, pandas_op, *args, **kwargs + ) -> DefaultToPandasResult: + result = self._base_query_compiler_by_id[id].default_to_pandas( + pandas_op, *args, **kwargs + ) + result_is_qc_id = isinstance(result, BaseQueryCompiler) + if result_is_qc_id: + new_id = self._generate_id() + self._base_query_compiler_by_id[new_id] = result + result = new_id + return self.DefaultToPandasResult(result=result, result_is_qc_id=False) + + def _generate_id(self): + id = uuid4() + while id in self._base_query_compiler_by_id: + id = uuid4() + return id + + +class TestClientQueryCompiler(ClientQueryCompiler): + @classmethod + def from_pandas(cls, df, data_cls): + return cls(cls._service.add_query_compiler(TestQC.from_pandas(df, data_cls))) + + def default_to_pandas(self, pandas_op, *args, **kwargs): + result = self._service.default_to_pandas(self._id, pandas_op, *args, **kwargs) + if result.result_is_qc_id: + return self.__constructor__(result.result) + return result.result + + +class ClientFactory(factories.BaseFactory): + @classmethod + def prepare(cls): + cls.io_cls = ClientIO + + +def set_client_execution(): + service = BaseExecutionService() + ClientQueryCompiler.set_server_connection(service) + ClientIO.query_compiler_cls = TestClientQueryCompiler + 
ClientIO.set_server_connection(service) + ClientIO.frame_cls = PandasOnPythonDataframe + factories.ClientFactory = ClientFactory + modin.set_execution(engine="Client", storage_format="") @pytest.fixture(scope="function") def get_unique_base_execution(): """Setup unique execution for a single function and yield its QueryCompiler that's suitable for inplace modifications.""" # It's better to use decimal IDs rather than hex ones due to factory names formatting - execution_id = int(uuid.uuid4().hex, 16) + execution_id = int(uuid4().hex, 16) format_name = f"Base{execution_id}" engine_name = "Python" execution_name = f"{format_name}On{engine_name}" @@ -319,11 +382,13 @@ def pytest_configure(config): if execution is None: return - if execution == BASE_EXECUTION_NAME: - set_base_execution(BASE_EXECUTION_NAME) + if execution == "BaseOnPython": + set_base_on_python_execution() config.addinivalue_line( "filterwarnings", "default:.*defaulting to pandas.*:UserWarning" ) + elif execution == "Client": + set_client_execution() else: partition, engine = execution.split("On") modin.set_execution(engine=engine, storage_format=partition) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index d07168078af..e8fc0ec20c2 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -20,6 +20,8 @@ class ClientQueryCompiler(BaseQueryCompiler): + lazy_execution = True + @classmethod def set_server_connection(cls, conn): cls._service = conn From 87699b864d47725a005837e91f1aafecc49ea5fd Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 26 Oct 2022 15:36:16 -0500 Subject: [PATCH 34/77] Fix up the service and test_general passes with execution 'client'. Signed-off-by: mvashishtha --- modin/conftest.py | 48 ++++-------------- modin/core/execution/client/query_compiler.py | 50 +++++++++---------- 2 files changed, 34 insertions(+), 64 deletions(-) diff --git a/modin/conftest.py b/modin/conftest.py index 2fb32457ee1..58b8c04aa70 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -21,8 +21,8 @@ from pandas.util._decorators import doc import numpy as np import shutil -from typing import Any, NamedTuple, Optional -from uuid import uuid4, UUID +from typing import Optional +from uuid import uuid4 assert ( "modin.utils" not in sys.modules @@ -54,6 +54,9 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url): ) from modin.core.execution.client.io import ClientIO # noqa: E402 from modin.core.execution.client.query_compiler import ClientQueryCompiler # noqa: E402 +from modin.core.execution.client.service import ( # noqa: E402 + ForwardingQueryCompilerService, +) from modin.core.execution.python.implementations.pandas_on_python.dataframe.dataframe import ( # noqa: E402 PandasOnPythonDataframe, ) @@ -68,6 +71,7 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url): make_default_file, teardown_test_files, NROWS, + default_to_pandas_ignore_string, ) @@ -276,39 +280,6 @@ def set_base_on_python_execution(): modin.set_execution(engine="python", storage_format="Base") -class BaseExecutionService: - class DefaultToPandasResult(NamedTuple): - result: Optional[Any] - result_is_qc_id: bool - - def __init__(self): - self._base_query_compiler_by_id = {} - - def add_query_compiler(self, qc) -> UUID: - id = self._generate_id() - self._base_query_compiler_by_id[self._generate_id()] = qc - return id - - def default_to_pandas( - self, id: UUID, pandas_op, *args, **kwargs - ) -> 
DefaultToPandasResult: - result = self._base_query_compiler_by_id[id].default_to_pandas( - pandas_op, *args, **kwargs - ) - result_is_qc_id = isinstance(result, BaseQueryCompiler) - if result_is_qc_id: - new_id = self._generate_id() - self._base_query_compiler_by_id[new_id] = result - result = new_id - return self.DefaultToPandasResult(result=result, result_is_qc_id=False) - - def _generate_id(self): - id = uuid4() - while id in self._base_query_compiler_by_id: - id = uuid4() - return id - - class TestClientQueryCompiler(ClientQueryCompiler): @classmethod def from_pandas(cls, df, data_cls): @@ -328,7 +299,7 @@ def prepare(cls): def set_client_execution(): - service = BaseExecutionService() + service = ForwardingQueryCompilerService(BaseQueryCompiler) ClientQueryCompiler.set_server_connection(service) ClientIO.query_compiler_cls = TestClientQueryCompiler ClientIO.set_server_connection(service) @@ -384,10 +355,9 @@ def pytest_configure(config): if execution == "BaseOnPython": set_base_on_python_execution() - config.addinivalue_line( - "filterwarnings", "default:.*defaulting to pandas.*:UserWarning" - ) + config.addinivalue_line("filterwarnings", default_to_pandas_ignore_string) elif execution == "Client": + config.addinivalue_line("filterwarnings", default_to_pandas_ignore_string) set_client_execution() else: partition, engine = execution.split("On") diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index e8fc0ec20c2..2d35762f55e 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -89,7 +89,7 @@ def add_suffix(self, suffix, axis=1): return self.__constructor__(self._service.add_suffix(self._id, suffix, axis)) def insert(self, loc, column, value): - if isinstance(value, ClientQueryCompiler): + if isinstance(value, type(self)): value = value._id is_qc = True else: @@ -99,7 +99,7 @@ def insert(self, loc, column, value): ) def setitem(self, axis, key, value): - if isinstance(value, ClientQueryCompiler): + if isinstance(value, type(self)): value = value._id is_qc = True else: @@ -109,7 +109,7 @@ def setitem(self, axis, key, value): ) def getitem_array(self, key): - if isinstance(key, ClientQueryCompiler): + if isinstance(key, type(self)): key = key._id is_qc = True else: @@ -162,11 +162,11 @@ def replace( regex=False, method: "str | NoDefault" = no_default, ): - if isinstance(to_replace, ClientQueryCompiler): + if isinstance(to_replace, type(self)): is_to_replace_qc = True else: is_to_replace_qc = False - if isinstance(regex, ClientQueryCompiler): + if isinstance(regex, type(self)): is_regex_qc = True else: is_regex_qc = False @@ -195,7 +195,7 @@ def fillna( limit=None, downcast=None, ): - if isinstance(value, ClientQueryCompiler): + if isinstance(value, type(self)): is_qc = True else: is_qc = False @@ -278,7 +278,7 @@ def concat(self, axis, other, **kwargs): ) def eq(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -286,7 +286,7 @@ def eq(self, other, **kwargs): return self.__constructor__(self._service.eq(self._id, other, is_qc, **kwargs)) def lt(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -294,7 +294,7 @@ def lt(self, other, **kwargs): return self.__constructor__(self._service.lt(self._id, other, is_qc, **kwargs)) def le(self, other, **kwargs): - if isinstance(other, 
ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -302,7 +302,7 @@ def le(self, other, **kwargs): return self.__constructor__(self._service.le(self._id, other, is_qc, **kwargs)) def gt(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -310,7 +310,7 @@ def gt(self, other, **kwargs): return self.__constructor__(self._service.gt(self._id, other, is_qc, **kwargs)) def ge(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -318,7 +318,7 @@ def ge(self, other, **kwargs): return self.__constructor__(self._service.ge(self._id, other, is_qc, **kwargs)) def ne(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -326,7 +326,7 @@ def ne(self, other, **kwargs): return self.__constructor__(self._service.ne(self._id, other, is_qc, **kwargs)) def __and__(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -336,7 +336,7 @@ def __and__(self, other, **kwargs): ) def __or__(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -346,7 +346,7 @@ def __or__(self, other, **kwargs): ) def add(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -354,7 +354,7 @@ def add(self, other, **kwargs): return self.__constructor__(self._service.add(self._id, other, is_qc, **kwargs)) def radd(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -364,7 +364,7 @@ def radd(self, other, **kwargs): ) def truediv(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -374,7 +374,7 @@ def truediv(self, other, **kwargs): ) def mod(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -382,7 +382,7 @@ def mod(self, other, **kwargs): return self.__constructor__(self._service.mod(self._id, other, is_qc, **kwargs)) def rmod(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -392,7 +392,7 @@ def rmod(self, other, **kwargs): ) def sub(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -402,7 +402,7 @@ def sub(self, other, **kwargs): ) def rsub(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -412,7 +412,7 @@ def rsub(self, other, **kwargs): ) def mul(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -420,7 +420,7 @@ def mul(self, other, **kwargs): return self.__constructor__(self._service.mul(self._id, other, is_qc, **kwargs)) def rmul(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -430,7 +430,7 @@ def 
rmul(self, other, **kwargs): ) def floordiv(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: @@ -440,7 +440,7 @@ def floordiv(self, other, **kwargs): ) def rfloordiv(self, other, **kwargs): - if isinstance(other, ClientQueryCompiler): + if isinstance(other, type(self)): other = other._id is_qc = True else: From d56db3f744ed9969491418004193707f27077e34 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 26 Oct 2022 19:25:08 -0500 Subject: [PATCH 35/77] got test_indexing.py to pass, going in order through test-defaults. Signed-off-by: mvashishtha --- modin/conftest.py | 2 +- modin/core/execution/client/io.py | 4 ++-- modin/core/execution/client/query_compiler.py | 13 ++++++++----- modin/pandas/indexing.py | 10 +--------- modin/pandas/test/dataframe/test_default.py | 10 +++++++--- modin/pandas/test/dataframe/test_indexing.py | 7 +++++++ 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/modin/conftest.py b/modin/conftest.py index 58b8c04aa70..f0cf0a49b26 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -299,7 +299,7 @@ def prepare(cls): def set_client_execution(): - service = ForwardingQueryCompilerService(BaseQueryCompiler) + service = ForwardingQueryCompilerService(BaseQueryCompiler, PandasOnPythonIO) ClientQueryCompiler.set_server_connection(service) ClientIO.query_compiler_cls = TestClientQueryCompiler ClientIO.set_server_connection(service) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 689ef92c6ad..866d016b13e 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -62,7 +62,7 @@ def read_csv(cls, filepath_or_buffer, **kwargs): Returns ------- - ClientQueryCompiler + query_compiler_cls Query compiler with CSV data read in. """ if isinstance(filepath_or_buffer, str): @@ -77,7 +77,7 @@ def read_csv(cls, filepath_or_buffer, **kwargs): raise ConnectionError( "Missing server connection, did you initialize the connection?" 
) - return ClientQueryCompiler( + return cls.query_compiler_cls( cls._server_conn.read_csv(cls._data_conn, filepath_or_buffer, **kwargs) ) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 2d35762f55e..3083110a47d 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -137,7 +137,7 @@ def get_dummies(self, columns, **kwargs): ) def take_2d(self, index=None, columns=None): - return self.__constructor__(self._service.view(self._id, index, columns)) + return self.__constructor__(self._service.take_2d(self._id, index, columns)) def drop(self, index=None, columns=None): return self.__constructor__(self._service.drop(self._id, index, columns)) @@ -397,9 +397,7 @@ def sub(self, other, **kwargs): is_qc = True else: is_qc = False - return self.__constructor__( - self._service.rsub(self._id, other, is_qc, **kwargs) - ) + return self.__constructor__(self._service.sub(self._id, other, is_qc, **kwargs)) def rsub(self, other, **kwargs): if isinstance(other, type(self)): @@ -706,9 +704,14 @@ def groupby_sum( agg_kwargs, drop=False, ): + if isinstance(by, type(self)): + by = by._id + is_qc = True + else: + is_qc = False return self.__constructor__( self._service.groupby_sum( - self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc ) ) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index d51b749e10b..16b3003f0f9 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -35,7 +35,6 @@ from pandas.api.types import is_list_like, is_bool from pandas.core.dtypes.common import is_integer, is_bool_dtype, is_integer_dtype from pandas.core.indexing import IndexingError -from modin.core.execution.client.query_compiler import ClientQueryCompiler from modin.error_message import ErrorMessage from modin.logging import ClassLogger @@ -678,14 +677,7 @@ def __getitem__(self, key): if isinstance(row_loc, Series) and is_boolean_array(row_loc): return self._handle_boolean_masking(row_loc, col_loc) - if isinstance(self.qc, ClientQueryCompiler) and self.qc.lazy_execution: - # Since we don't know if the row labels are present or not in lazy evaluation, - # immediately hand off computation to the engine - return type(self.df)( - query_compiler=self.qc.getitem_row_labels_array( - row_loc - ).getitem_column_array(col_loc) - ) + row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) result = self._getitem_positional( diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py index 710eb4b152a..14897291558 100644 --- a/modin/pandas/test/dataframe/test_default.py +++ b/modin/pandas/test/dataframe/test_default.py @@ -129,6 +129,10 @@ def test_to_numpy(data): assert_array_equal(modin_df.values, pandas_df.values) +@pytest.mark.skipif( + get_current_execution() == "Client", + reason="Client query compiler does not have partitions", +) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_partition_to_numpy(data): frame = pd.DataFrame(data) @@ -1200,9 +1204,9 @@ def test_setattr_axes(): # Test that setting .index or .columns does not warn df = pd.DataFrame([[1, 2], [3, 4]]) with warnings.catch_warnings(): - if get_current_execution() != "BaseOnPython": - # In BaseOnPython, setting columns raises a warning because get_axis - # defaults to pandas. 
+ if get_current_execution() not in ("BaseOnPython", "Client"): + # In BaseOnPython and Client executions, setting columns raises a + # warning because get_axis defaults to pandas. warnings.simplefilter("error") df.index = ["foo", "bar"] df.columns = [9, 10] diff --git a/modin/pandas/test/dataframe/test_indexing.py b/modin/pandas/test/dataframe/test_indexing.py index 88ab6f5a1cd..61492e400f3 100644 --- a/modin/pandas/test/dataframe/test_indexing.py +++ b/modin/pandas/test/dataframe/test_indexing.py @@ -1993,6 +1993,13 @@ def test___setitem__mask(): ) @pytest.mark.parametrize("convert_to_series", [False, True]) @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) +@pytest.mark.skipif( + condition=get_current_execution() == "Client", + reason=( + "client query compiler uses lazy execution, so we don't default " + + "to pandas for the empty frame because we don't check whether the frame is empty. we can't do the insertion correctly right now without defaulting to pandas." + ), +) def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id): pandas_df = pandas.DataFrame(data) modin_df = pd.DataFrame(data) From 3f9bf8c98b764a47e4738fe44e58898491bb25e7 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Thu, 27 Oct 2022 01:34:42 -0500 Subject: [PATCH 36/77] ci.yml tests pass through test_map_metadata. Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 6 +- .../storage_formats/base/query_compiler.py | 4 +- .../storage_formats/pandas/query_compiler.py | 6 +- modin/pandas/base.py | 2 +- modin/pandas/test/dataframe/test_iter.py | 11 +-- .../test/dataframe/test_map_metadata.py | 67 ++++++++++++++----- 6 files changed, 69 insertions(+), 27 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 3083110a47d..7dbccdd4124 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -139,8 +139,10 @@ def get_dummies(self, columns, **kwargs): def take_2d(self, index=None, columns=None): return self.__constructor__(self._service.take_2d(self._id, index, columns)) - def drop(self, index=None, columns=None): - return self.__constructor__(self._service.drop(self._id, index, columns)) + def drop(self, index=None, columns=None, errors: str = "raise"): + return self.__constructor__( + self._service.drop(self._id, index, columns, errors) + ) def isna(self): return self.__constructor__(self._service.isna(self._id)) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 54b57bfa921..47f08a8e616 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -2214,7 +2214,7 @@ def inserter(df, loc, column, value): # END Abstract insert # Abstract drop - def drop(self, index=None, columns=None): + def drop(self, index=None, columns=None, errors: str = "raise"): """ Drop specified rows or columns. 
@@ -2234,7 +2234,7 @@ def drop(self, index=None, columns=None):
             return self
         else:
             return DataFrameDefault.register(pandas.DataFrame.drop)(
-                self, index=index, columns=columns
+                self, index=index, columns=columns, errors=errors
             )

     # END drop
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 2f944c26801..3a8e78822d5 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2355,7 +2355,11 @@ def dropna(self, **kwargs):
             )
         )

-    def drop(self, index=None, columns=None):
+    def drop(self, index=None, columns=None, errors: str = "raise"):
+        # `errors` parameter needs to be part of the function signature because
+        # other query compilers may not take care of error handling at the API
+        # layer. This query compiler assumes there won't be any errors due to
+        # invalid keys.
         if index is not None:
             index = np.sort(self.index.get_indexer_for(self.index.difference(index)))
         if columns is not None:
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 6022956e2e0..b1609a35684 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -1246,7 +1246,7 @@ def drop(
             axes[axis] = None

         new_query_compiler = self._query_compiler.drop(
-            index=axes["index"], columns=axes["columns"]
+            index=axes["index"], columns=axes["columns"], errors=errors
         )
         return self._create_or_update_from_compiler(new_query_compiler, inplace)

diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index a9978fbc4a0..d678ee11efa 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -17,7 +17,7 @@
 import pandas
 import matplotlib
 import modin.pandas as pd
-import io
+from pandas._testing import ensure_clean
 import warnings

 from modin.pandas.test.utils import (
@@ -226,9 +226,12 @@ def test___repr__():
 "2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
 "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
-    pandas_df = pandas.read_csv(io.StringIO(string_data))
-    with warns_that_defaulting_to_pandas():
-        modin_df = pd.read_csv(io.StringIO(string_data))
+    with ensure_clean(".csv") as path:
+        with open(path, "w") as f:
+            f.write(string_data)
+        pandas_df = pandas.read_csv(path)
+        with warns_that_defaulting_to_pandas():
+            modin_df = pd.read_csv(path)
     assert repr(pandas_df) == repr(modin_df)

diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
index 8cdf61fb4c1..10fa9880b16 100644
--- a/modin/pandas/test/dataframe/test_map_metadata.py
+++ b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -162,17 +162,22 @@ def test_empty_df():
     assert len(df.index) == 0
     assert len(df.columns) == 0

-    df = pd.DataFrame()
-    pd_df = pandas.DataFrame()
-    df["a"] = [1, 2, 3, 4, 5]
-    pd_df["a"] = [1, 2, 3, 4, 5]
-    df_equals(df, pd_df)
-
-    df = pd.DataFrame()
-    pd_df = pandas.DataFrame()
-    df["a"] = list("ABCDEF")
-    pd_df["a"] = list("ABCDEF")
-    df_equals(df, pd_df)
+    # The client query compiler uses lazy execution, so we don't default to
+    # pandas for the empty frame because we don't check whether the frame is
+    # empty. We can't do the insertion correctly right now without defaulting
+    # to pandas.
+    if get_current_execution() != "Client":
+        df = pd.DataFrame()
+        pd_df = pandas.DataFrame()
+        df["a"] = [1, 2, 3, 4, 5]
+        pd_df["a"] = [1, 2, 3, 4, 5]
+        df_equals(df, pd_df)
+
+        df = pd.DataFrame()
+        pd_df = pandas.DataFrame()
+        df["a"] = list("ABCDEF")
+        pd_df["a"] = list("ABCDEF")
+        df_equals(df, pd_df)

     df = pd.DataFrame()
     pd_df = pandas.DataFrame()
@@ -293,7 +298,7 @@ def test_copy(data):
     new_modin_df = modin_df.copy()
     assert new_modin_df is not modin_df

-    if get_current_execution() != "BaseOnPython":
+    if get_current_execution() not in ("BaseOnPython", "Client"):
         assert np.array_equal(
             new_modin_df._query_compiler._modin_frame._partitions,
             modin_df._query_compiler._modin_frame._partitions,
@@ -628,6 +633,10 @@ def test_convert_dtypes_single_partition(
     get_current_execution() == "BaseOnPython",
     reason="BaseOnPython cannot not have multiple partitions.",
 )
+@pytest.mark.skipif(
+    get_current_execution() == "Client",
+    reason="Client query compiler doesn't have partitions at all.",
+)
 def test_convert_dtypes_multiple_row_partitions():
     # Column 0 should have string dtype
     modin_part1 = pd.DataFrame(["a"]).convert_dtypes()
@@ -708,12 +717,36 @@ def test_drop():
     df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
     df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])

-    pytest.raises(ValueError, modin_simple.drop, 5)
-    pytest.raises(ValueError, modin_simple.drop, "C", 1)
-    pytest.raises(ValueError, modin_simple.drop, [1, 5])
-    pytest.raises(ValueError, modin_simple.drop, ["A", "C"], 1)
+    # TODO(https://github.com/modin-project/modin/issues/5163): raise a
+    # KeyError like pandas does when a label is not found and lazy_execution
+    # is off. Also use df_equals instead of eval_general.
+    check_exception_type = modin_simple._query_compiler.lazy_execution
+    eval_general(
+        modin_simple,
+        simple,
+        lambda df: df.drop(5),
+        check_exception_type=check_exception_type,
+    )
+    eval_general(
+        modin_simple,
+        simple,
+        lambda df: df.drop("C", axis=1),
+        check_exception_type=check_exception_type,
+    )
+    eval_general(
+        modin_simple,
+        simple,
+        lambda df: df.drop([1, 5], axis=1),
+        check_exception_type=check_exception_type,
+    )
+    eval_general(
+        modin_simple,
+        simple,
+        lambda df: df.drop(["A", "C"], axis=1),
+        check_exception_type=check_exception_type,
+    )

-    # errors = 'ignore'
+    # test errors = 'ignore'
     df_equals(modin_simple.drop(5, errors="ignore"), simple)
     df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :])
     df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple)

From 86d489a5aa617f22f3473e94a18a1f49c94ead41 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Thu, 27 Oct 2022 01:43:24 -0500
Subject: [PATCH 37/77] Tests pass through test_reduce.
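groupby_count now follows the same marshalling convention as the binary
operators: when `by` is itself a query compiler, only its id crosses the
wire, along with an `is_qc` flag telling the service to swap the id back
for its own query compiler. A sketch of the receiving side, assuming a
service shaped like the one in conftest.py (illustrative, not part of
this patch):

    # A service-side method; self._qc_by_id maps ids to query compilers
    # and add_query_compiler registers a result and returns its new id.
    def groupby_count(
        self, id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc
    ):
        if is_qc:
            # The client sent an id rather than a real query compiler.
            by = self._qc_by_id[by]
        return self.add_query_compiler(
            self._qc_by_id[id].groupby_count(
                by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
            )
        )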
Signed-off-by: mvashishtha
---
 modin/core/execution/client/query_compiler.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py
index 7dbccdd4124..018e7cb0483 100644
--- a/modin/core/execution/client/query_compiler.py
+++ b/modin/core/execution/client/query_compiler.py
@@ -661,9 +661,14 @@ def groupby_count(
         agg_kwargs,
         drop=False,
     ):
+        if isinstance(by, type(self)):
+            by = by._id
+            is_qc = True
+        else:
+            is_qc = False
         return self.__constructor__(
             self._service.groupby_count(
-                self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+                self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc
             )
         )

From 6946ae806e5d4cf559e32e600935a2cd5da32b37 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Thu, 27 Oct 2022 10:39:30 -0500
Subject: [PATCH 38/77] pass through test_udf.py and enable another skipped test.

Signed-off-by: mvashishtha
---
 modin/core/execution/client/query_compiler.py | 31 -------------------
 modin/pandas/test/dataframe/test_default.py   | 11 ++++---
 .../test/dataframe/test_map_metadata.py       |  8 ++---
 modin/pandas/test/dataframe/test_udf.py       |  9 ++++--
 4 files changed, 15 insertions(+), 44 deletions(-)

diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py
index 018e7cb0483..193a6534f8c 100644
--- a/modin/core/execution/client/query_compiler.py
+++ b/modin/core/execution/client/query_compiler.py
@@ -13,10 +13,8 @@
 from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
 import numpy as np
-import inspect
 from pandas._libs.lib import no_default, NoDefault
 from pandas.api.types import is_list_like
-from pandas.core.computation.parsing import tokenize_string


 class ClientQueryCompiler(BaseQueryCompiler):
@@ -786,35 +784,6 @@ def idxmax(self, **kwargs):
         return self.__constructor__(self._service.idxmax(self._id, **kwargs))

     def query(self, expr, **kwargs):
-        is_variable = False
-        variable_list = []
-        for k, v in tokenize_string(expr):
-            if v == "" or v == " ":
-                continue
-            if is_variable:
-                frame = inspect.currentframe()
-                identified = False
-                while frame:
-                    if v in frame.f_locals:
-                        value = frame.f_locals[v]
-                        if isinstance(value, list):
-                            value = tuple(value)
-                        variable_list.append(str(value))
-                        identified = True
-                        break
-                    frame = frame.f_back
-                if not identified:
-                    # TODO this error does not quite match pandas
-                    raise ValueError(f"{v} not found")
-                is_variable = False
-            elif v == "@":
-                is_variable = True
-                continue
-            else:
-                if v in self.columns:
-                    v = f"`{v}`"
-                variable_list.append(v)
-        expr = " ".join(variable_list)
         return self.__constructor__(self._service.query(self._id, expr, **kwargs))

     def finalize(self):
diff --git a/modin/pandas/test/dataframe/test_default.py b/modin/pandas/test/dataframe/test_default.py
index 14897291558..f1b1293eea7 100644
--- a/modin/pandas/test/dataframe/test_default.py
+++ b/modin/pandas/test/dataframe/test_default.py
@@ -129,14 +129,15 @@ def test_to_numpy(data):
     assert_array_equal(modin_df.values, pandas_df.values)


-@pytest.mark.skipif(
-    get_current_execution() == "Client",
-    reason="Client query compiler does not have partitions",
-)
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 def test_partition_to_numpy(data):
     frame = pd.DataFrame(data)
-    for partition in frame._query_compiler._modin_frame._partitions.flatten().tolist():
+    qc = frame._query_compiler
+    if get_current_execution() == "Client":
+        modin_frame = qc._service._qc[qc._id]._modin_frame
+    else:
+        modin_frame = qc._modin_frame
+    for partition in modin_frame._partitions.flatten().tolist():
         assert_array_equal(partition.to_pandas().values, partition.to_numpy())

diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
index 10fa9880b16..37a3a1cd0d4 100644
--- a/modin/pandas/test/dataframe/test_map_metadata.py
+++ b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -630,12 +630,8 @@ def test_convert_dtypes_single_partition(


 @pytest.mark.skipif(
-    get_current_execution() == "BaseOnPython",
-    reason="BaseOnPython cannot not have multiple partitions.",
-)
-@pytest.mark.skipif(
-    get_current_execution() == "Client",
-    reason="Client query compiler doesn't have partitions at all.",
+    get_current_execution() in ("BaseOnPython", "Client"),
+    reason="These executions will not have multiple partitions.",
 )
 def test_convert_dtypes_multiple_row_partitions():
     # Column 0 should have string dtype
     modin_part1 = pd.DataFrame(["a"]).convert_dtypes()
diff --git a/modin/pandas/test/dataframe/test_udf.py b/modin/pandas/test/dataframe/test_udf.py
index adda540b248..13f7e59a5e1 100644
--- a/modin/pandas/test/dataframe/test_udf.py
+++ b/modin/pandas/test/dataframe/test_udf.py
@@ -360,7 +360,7 @@ def f(x, arg2=0, arg3=0):
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
 @pytest.mark.parametrize("funcs", query_func_values, ids=query_func_keys)
 def test_query(data, funcs):
-    if get_current_execution() == "BaseOnPython" and funcs != "col3 > col4":
+    if get_current_execution() in ("BaseOnPython", "Client") and funcs != "col3 > col4":
         pytest.xfail(
             reason="In this case, we are faced with the problem of handling empty data frames - #4934"
         )
@@ -374,8 +374,13 @@ def test_query(data, funcs):
         modin_df.query(funcs)
     else:
         modin_result = modin_df.query(funcs)
+        qc = modin_result._query_compiler
+        if get_current_execution() == "Client":
+            modin_frame = qc._service._qc[qc._id]._modin_frame
+        else:
+            modin_frame = qc._modin_frame
         # `dtypes` must be evaluated after `query` so we need to check cache
-        assert modin_result._query_compiler._modin_frame._dtypes is not None
+        assert modin_frame._dtypes is not None
         df_equals(modin_result, pandas_result)
         df_equals(modin_result.dtypes, pandas_result.dtypes)

From 4f4831be0988bef8fd4e43c70ef02f28e9657073 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Thu, 27 Oct 2022 11:33:30 -0500
Subject: [PATCH 39/77] Pass through test_series, skipping pickle.
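Series.__getitem__ now looks rows up by position: `Series._getitem`
passes `numeric=True` so that `getitem_row_array` uses `iloc` rather
than `loc`. The distinction the new flag encodes, in plain pandas terms
(a small illustrative example):

    import pandas as pd

    s = pd.Series([10, 20, 30], index=[2, 1, 0])
    print(s.iloc[[0]])  # positional: the first row, value 10 (label 2)
    print(s.loc[[0]])   # label-based: the row labelled 0, value 30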
Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 4 +++- .../storage_formats/base/query_compiler.py | 18 ++++++++++-------- modin/pandas/series.py | 6 ++---- modin/pandas/test/test_series.py | 4 ++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 193a6534f8c..8c174b30a67 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -196,6 +196,7 @@ def fillna( downcast=None, ): if isinstance(value, type(self)): + value = value._id is_qc = True else: is_qc = False @@ -577,7 +578,8 @@ def str_findall(self, pat, flags=0, **kwargs): def str_get(self, i): return self.__constructor__(self._service.str_get(self._id, i)) - str_index = str_find + def str_index(self, sub, start=0, end=None): + return self.__constructor__(self._service.str_index(self._id, sub, start, end)) def str_join(self, sep): return self.__constructor__(self._service.str_join(self._id, sep)) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 47f08a8e616..848e12e0560 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -1748,20 +1748,22 @@ def describe(self, **kwargs): # noqa: PR02 # data in the same place. @doc_utils.doc_cum_agg(method="sum", refer_to="cumsum") - def cumsum(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cumsum)(self, **kwargs) + def cumsum(self, fold_axis, *args, **kwargs): # noqa: PR02 + return DataFrameDefault.register(pandas.DataFrame.cumsum)(self, *args, **kwargs) @doc_utils.doc_cum_agg(method="maximum", refer_to="cummax") - def cummax(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cummax)(self, **kwargs) + def cummax(self, fold_axis, *args, **kwargs): # noqa: PR02 + return DataFrameDefault.register(pandas.DataFrame.cummax)(self, *args, **kwargs) @doc_utils.doc_cum_agg(method="minimum", refer_to="cummin") - def cummin(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cummin)(self, **kwargs) + def cummin(self, fold_axis, *args, **kwargs): # noqa: PR02 + return DataFrameDefault.register(pandas.DataFrame.cummin)(self, *args, **kwargs) @doc_utils.doc_cum_agg(method="product", refer_to="cumprod") - def cumprod(self, fold_axis, **kwargs): # noqa: PR02 - return DataFrameDefault.register(pandas.DataFrame.cumprod)(self, **kwargs) + def cumprod(self, fold_axis, *args, **kwargs): # noqa: PR02 + return DataFrameDefault.register(pandas.DataFrame.cumprod)( + self, *args, **kwargs + ) @doc_utils.add_refer_to("DataFrame.diff") def diff(self, fold_axis, **kwargs): # noqa: PR02 diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 023b76ed120..17ffa05310f 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -311,9 +311,7 @@ def __getattr__(self, key): try: return object.__getattribute__(self, key) except AttributeError as err: - if not self._query_compiler.lazy_execution and ( - key not in _ATTRS_NO_LOOKUP and key in self.index - ): + if key not in _ATTRS_NO_LOOKUP and key in self.index: return self[key] raise err @@ -2467,7 +2465,7 @@ def _getitem(self, key): row_positions = self.index.get_indexer_for(key) if is_indexer else key if not all(is_integer(x) for x in row_positions): raise KeyError(key[0] if reduce_dimension else key) - 
result = self._query_compiler.getitem_row_array(row_positions) + result = self._query_compiler.getitem_row_array(row_positions, numeric=True) if reduce_dimension: return self._reduce_dimension(result) diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 22e551d6a7a..61ff44770b4 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -519,7 +519,7 @@ def test___repr__(name, dt_index, data): ) pandas_series.index = modin_series.index = index - if get_current_execution() == "BaseOnPython" and data == "empty": + if get_current_execution() in ("BaseOnPython", "Client") and data == "empty": # TODO: Remove this when default `dtype` of empty Series will be `object` in pandas (see #3142). assert modin_series.dtype == np.object assert pandas_series.dtype == np.float64 @@ -1631,7 +1631,7 @@ def test_dropna_inplace(data): def test_dtype_empty(): modin_series, pandas_series = pd.Series(), pandas.Series() - if get_current_execution() == "BaseOnPython": + if get_current_execution() in ("BaseOnPython", "Client"): # TODO: Remove this when default `dtype` of empty Series will be `object` in pandas (see #3142). assert modin_series.dtype == np.object assert pandas_series.dtype == np.float64 From 577c9896182f350f24bc1765e3db10578c5fc732 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Thu, 27 Oct 2022 21:25:05 -0500 Subject: [PATCH 40/77] Tests pass through test_general. Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 36 ++++++++++++++++--- modin/pandas/general.py | 3 +- modin/pandas/groupby.py | 4 +-- modin/pandas/test/test_groupby.py | 23 +++++++++--- 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 8c174b30a67..df75d5d2ff3 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -646,9 +646,14 @@ def groupby_mean( agg_kwargs, drop=False, ): + if isinstance(by, type(self)): + by = by._id + is_qc = True + else: + is_qc = False return self.__constructor__( self._service.groupby_mean( - self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc ) ) @@ -681,9 +686,21 @@ def groupby_max( agg_kwargs, drop=False, ): + if isinstance(by, type(self)): + by = by._id + is_qc = True + else: + is_qc = False return self.__constructor__( self._service.groupby_max( - self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + self._id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop, + is_qc, ) ) @@ -696,9 +713,14 @@ def groupby_min( agg_kwargs, drop=False, ): + if isinstance(by, type(self)): + by = by._id + is_qc = True + else: + is_qc = False return self.__constructor__( self._service.groupby_min( - self._id, by._id, axis, groupby_kwargs, agg_args, agg_kwargs, drop + self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc ) ) @@ -733,10 +755,15 @@ def groupby_agg( how="axis_wise", drop=False, ): + if isinstance(by, type(self)): + by = by._id + is_qc = True + else: + is_qc = False return self.__constructor__( self._service.groupby_agg( self._id, - by._id, + by, agg_func, axis, groupby_kwargs, @@ -744,6 +771,7 @@ def groupby_agg( agg_kwargs, how, drop, + is_qc, ) ) diff --git a/modin/pandas/general.py b/modin/pandas/general.py index a34dac3a2e5..b2993b4c9bc 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -472,8 +472,7 @@ 
def concat( list_of_objs = [ obj._query_compiler for obj in list_of_objs - if (not obj._query_compiler.lazy_execution and len(obj.index)) - or len(obj.columns) + if len(obj.index) or len(obj.columns) ] if keys is not None: if all_series: diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index dcd990f13b2..fed97028282 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -918,7 +918,7 @@ def _iter(self): k, DataFrame( query_compiler=self._query_compiler.getitem_row_array( - indices[k] + indices[k], numeric=True ) ), ) @@ -1228,7 +1228,7 @@ def _iter(self): k, Series( query_compiler=self._query_compiler.getitem_row_array( - indices[k] + indices[k], numeric=True ) ), ) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index b805c8e4238..fdd09afe610 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -489,7 +489,7 @@ def maybe_get_columns(df, by): ) eval_fillna(modin_groupby, pandas_groupby) eval_count(modin_groupby, pandas_groupby) - if get_current_execution() != "BaseOnPython": + if get_current_execution() not in ("BaseOnPython", "Client"): eval_general( modin_groupby, pandas_groupby, @@ -1276,7 +1276,7 @@ def eval_shift(modin_groupby, pandas_groupby): # groupby.shift internally masks the source frame with a Series boolean mask, # doing so ends up in the `getitem_array` method, that is broken for `BaseOnPython`: # https://github.com/modin-project/modin/issues/3701 - if get_current_execution() != "BaseOnPython": + if get_current_execution() not in ("BaseOnPython", "Client"): if isinstance(pandas_groupby, pandas.core.groupby.DataFrameGroupBy): pandas_res = pandas_groupby.shift(axis=1, fill_value=777) modin_res = modin_groupby.shift(axis=1, fill_value=777) @@ -1441,7 +1441,7 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna): # https://github.com/modin-project/modin/issues/2912 # "BaseOnPython" tests are disabled because of the bug: # https://github.com/modin-project/modin/issues/3827 - if get_current_execution() != "BaseOnPython" and any( + if get_current_execution() not in ("BaseOnPython", "Client") and any( col in modin_df.columns for col in by_kwarg ): df_equals(md_grp.quantile(), pd_grp.quantile()) @@ -1555,7 +1555,7 @@ def test_agg_func_None_rename(by_and_agg_dict, as_index): pytest.param( False, marks=pytest.mark.xfail_executions( - ["BaseOnPython"], reason="See Pandas issue #39103" + ["BaseOnPython", "Client"], reason="See Pandas issue #39103" ), ), ], @@ -1910,6 +1910,16 @@ def test_multi_column_groupby_different_partitions( eval___getitem__(md_grp, pd_grp, [md_df.columns[1], md_df.columns[2]]) +# TODO(https://github.com/modin-project/modin/issues/5165): Consider +# making the dataframe not empty and fixing the resulting bugs. +@pytest.mark.skipif( + get_current_execution() == "Client", + reason=( + "Dataframe is empty, so other executions default to pandas and " + + "behave correctly, but Client execution has lazy_execution=True, so it " + + "doesn't default to pandas and it has bugs." 
+ ), +) @pytest.mark.parametrize( "by", [ @@ -2120,8 +2130,11 @@ def test_groupby_with_virtual_partitions(): PandasDataframeAxisPartition, ) else: + qc = big_modin_df._query_compiler + if get_current_execution() == "Client": + qc = qc._service._qc[qc._id] assert not issubclass( - type(big_modin_df._query_compiler._modin_frame._partitions[0][0]), + type(qc._modin_frame._partitions[0][0]), PandasDataframeAxisPartition, ) eval_general( From b74a95c0cdcf318864d22613580d5178786e3b6a Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 10:59:42 -0500 Subject: [PATCH 41/77] TestCsv and TestSql pass. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 11 +++++++++-- modin/core/io/io.py | 11 +++++++++++ modin/pandas/test/test_io.py | 29 +++++++++++++++++++++++++---- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 866d016b13e..9bcbd18111b 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -15,6 +15,8 @@ from modin.core.io.io import BaseIO import fsspec +import pandas + from .query_compiler import ClientQueryCompiler @@ -77,9 +79,14 @@ def read_csv(cls, filepath_or_buffer, **kwargs): raise ConnectionError( "Missing server connection, did you initialize the connection?" ) - return cls.query_compiler_cls( - cls._server_conn.read_csv(cls._data_conn, filepath_or_buffer, **kwargs) + server_result = cls._server_conn.read_csv( + cls._data_conn, filepath_or_buffer, **kwargs ) + # This happens when `read_csv` returns a TextFileReader object for + # iterating through, e.g. because iterator=True + if isinstance(server_result, pandas.io.parsers.TextFileReader): + return server_result + return cls.query_compiler_cls(server_result) @classmethod def read_sql(cls, sql, con, **kwargs): diff --git a/modin/core/io/io.py b/modin/core/io/io.py index b388a2f9e6d..77d988f39e7 100644 --- a/modin/core/io/io.py +++ b/modin/core/io/io.py @@ -138,6 +138,17 @@ def _read_csv( ErrorMessage.default_to_pandas("`read_csv`") return cls._read(filepath_or_buffer=filepath_or_buffer, **kwargs) + @classmethod + @_inherit_docstrings(pandas.read_sql, apilink="pandas.read_sql") + @doc( + _doc_default_io_method, + summary="Read SQL query or database table into query compiler", + returns=_doc_returns_qc_or_parser, + ) + def _read_sql(cls, sql, con, **kwargs): # noqa: PR01 + ErrorMessage.default_to_pandas("`read_sql`") + return cls.from_pandas(pandas.read_sql(sql, con, **kwargs)) + @classmethod def _read(cls, **kwargs): """ diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 768a3ca4ebf..2694b0ffceb 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -37,7 +37,7 @@ ReadSqlEngine, ) from modin._compat import PandasCompatVersion -from modin.utils import to_pandas +from modin.utils import get_current_execution, to_pandas from modin.pandas.utils import from_arrow from modin.test.test_utils import warns_that_defaulting_to_pandas import pyarrow as pa @@ -76,7 +76,7 @@ else: from .utils import eval_io -if StorageFormat.get() == "Pandas": +if StorageFormat.get() in ("Pandas", ""): import modin.pandas as pd else: import modin.experimental.pandas as pd @@ -1056,6 +1056,9 @@ def _has_pandas_fallback_reason(self): condition="config.getoption('--simulate-cloud').lower() != 'off'", reason="The reason of tests fail in `cloud` mode is unknown for now - issue #2340", ) + @pytest.mark.xfail_executions( + "Client", reason="Client cannot read from buffer", 
raises=NotImplementedError + ) def test_read_csv_default_to_pandas(self): if self._has_pandas_fallback_reason(): warning_suffix = "buffers" @@ -1251,6 +1254,9 @@ def wrapped_read_csv(file, method): ], ) @pytest.mark.parametrize("buffer_start_pos", [0, 10]) + @pytest.mark.xfail_executions( + "Client", reason="Client cannot read from buffer", raises=NotImplementedError + ) def test_read_csv_file_handle(self, read_mode, make_csv_file, buffer_start_pos): with ensure_clean() as unique_filename: make_csv_file(filename=unique_filename) @@ -1264,7 +1270,10 @@ def test_read_csv_file_handle(self, read_mode, make_csv_file, buffer_start_pos): def test_unnamed_index(self): def get_internal_df(df): - partition = read_df._query_compiler._modin_frame._partitions[0][0] + qc = read_df._query_compiler + if get_current_execution() == "Client": + qc = qc._service._qc[qc._id] + partition = qc._modin_frame._partitions[0][0] return partition.to_pandas() path = "modin/pandas/test/data/issue_3119.csv" @@ -2096,7 +2105,19 @@ class TestSql: condition="config.getoption('--simulate-cloud').lower() != 'off'", reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264", ) - @pytest.mark.parametrize("read_sql_engine", ["Pandas", "Connectorx"]) + @pytest.mark.parametrize( + "read_sql_engine", + [ + "Pandas", + pytest.param( + "Connectorx", + marks=pytest.mark.skipif( + get_current_execution() == "Client", + reason="Client execution uses pandas.read_sql, which can't read from connectorx connections", + ), + ), + ], + ) def test_read_sql(self, make_sql_connection, read_sql_engine): with ensure_clean_dir() as dirname: filename = get_unique_filename(".db") From 81583d2eccc93d9d9e41a0df0f276b1d7d3dc3e5 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 13:05:22 -0500 Subject: [PATCH 42/77] Fix pydocstyle for qc and io. Signed-off-by: mvashishtha --- .github/workflows/ci.yml | 1 + modin/core/execution/client/io.py | 23 +++-- modin/core/execution/client/query_compiler.py | 85 +++++++++++++++---- modin/core/storage_formats/base/doc_utils.py | 1 + .../storage_formats/base/query_compiler.py | 2 + 5 files changed, 90 insertions(+), 22 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7e03bf16cef..250e9fe1155 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,6 +129,7 @@ jobs: - run: python scripts/doc_checker.py modin/core/storage_formats/base - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow - run: python scripts/doc_checker.py modin/core/storage_formats/pandas + - run: python scripts/doc_checker.py modin/core/execution/client - run: | python scripts/doc_checker.py \ modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \ diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 9bcbd18111b..679ce7017ba 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -17,8 +17,6 @@ import fsspec import pandas -from .query_compiler import ClientQueryCompiler - class ClientIO(BaseIO): """Factory providing methods for performing I/O operations using a given Client as the execution engine.""" @@ -64,7 +62,7 @@ def read_csv(cls, filepath_or_buffer, **kwargs): Returns ------- - query_compiler_cls + self.query_compiler_cls Query compiler with CSV data read in. 
""" if isinstance(filepath_or_buffer, str): @@ -108,7 +106,7 @@ def read_sql(cls, sql, con, **kwargs): Returns ------- - ClientQueryCompiler + self.query_compiler_cls Query compiler with data read in from SQL connection. """ if isinstance(con, str) and con.lower() == "auto" and cls._data_conn is None: @@ -121,10 +119,23 @@ def read_sql(cls, sql, con, **kwargs): raise ConnectionError( "Missing server connection, did you initialize the connection?" ) - return ClientQueryCompiler( + return cls.query_compiler_cls( cls._server_conn.read_sql(sql, cls._data_conn, **kwargs) ) @classmethod - def to_sql(cls, qc, **kwargs): + def to_sql(cls, qc, **kwargs) -> None: + """ + Write records stored in a DataFrame to a SQL database. + + Databases supported by SQLAlchemy [1]_ are supported. Tables can be + newly created, appended to, or overwritten. + + Parameters + ---------- + qc : self.query_compiler_cls + Query compiler with data to write to SQL. + **kwargs : dict + Parameters of ``read_sql`` function. + """ cls._server_conn.to_sql(qc._id, **kwargs) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index df75d5d2ff3..635f9b00ad2 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -11,46 +11,99 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +"""Module contains ``ClientQueryCompiler`` class.""" + import numpy as np +import pandas from pandas._libs.lib import no_default, NoDefault from pandas.api.types import is_list_like +from typing import Any +import uuid + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler +from modin.utils import _inherit_docstrings +@_inherit_docstrings(BaseQueryCompiler) class ClientQueryCompiler(BaseQueryCompiler): - lazy_execution = True + """ + Query compiler for sending queries to a remote server. + + This class translates the query compiler API to function calls on a service + object, which may be a remote service. + + Parameters + ---------- + id : uuid.UUID + ID of this query compiler. + """ + + lazy_execution: bool = True + + def __init__(self, id: uuid.UUID): + self._id = id @classmethod - def set_server_connection(cls, conn): + def set_server_connection(cls, conn: Any): + """ + Set the connection to the service. + + Parameters + ---------- + conn : Any + Connection to the service. + """ cls._service = conn - def __init__(self, id): - assert ( - id is not None - ), "Make sure the client is properly connected and returns and ID" - if isinstance(id, Exception): - raise id - self._id = id + def _set_columns(self, new_columns: pandas.Index) -> None: + """ + Set this query compiler's columns. - def _set_columns(self, new_columns): + Parameters + ---------- + new_columns : pandas.Index + New columns to set. + """ self._id = self._service.rename(self._id, new_col_labels=new_columns) self._columns_cache = self._service.columns(self._id) - def _get_columns(self): + def _get_columns(self) -> pandas.Index: + """ + Get the columns of this query compiler. + + Returns + ------- + pandas.Index : The columns of this query compiler. + """ if self._columns_cache is None: self._columns_cache = self._service.columns(self._id) return self._columns_cache - def _set_index(self, new_index): + def _set_index(self, new_index: pandas.Index): + """ + Set this query compiler's index. 
+ + Parameters + ---------- + new_index : pandas.Index + New index to set. + """ self._id = self._service.rename(self._id, new_row_labels=new_index) - def _get_index(self): + def _get_index(self) -> pandas.Index: + """ + Get the index of this query compiler. + + Returns + ------- + pandas.Index : The index of this query compiler. + """ return self._service.index(self._id) columns = property(_get_columns, _set_columns) - _columns_cache = None + _columns_cache: pandas.Index = None index = property(_get_index, _set_index) - _dtypes_cache = None + _dtypes_cache: pandas.Index = None @property def dtypes(self): diff --git a/modin/core/storage_formats/base/doc_utils.py b/modin/core/storage_formats/base/doc_utils.py index b538c47c92b..3efe8fd2294 100644 --- a/modin/core/storage_formats/base/doc_utils.py +++ b/modin/core/storage_formats/base/doc_utils.py @@ -288,6 +288,7 @@ def doc_reduce_agg(method, refer_to, params=None, extra_params=None): ---------- fold_axis : {{0, 1}} skipna : bool + *args : iterable **kwargs : dict Serves the compatibility purpose. Does not affect the result. diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 848e12e0560..4d674d8c4d2 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -2226,6 +2226,8 @@ def drop(self, index=None, columns=None, errors: str = "raise"): Labels of rows to drop. columns : list of labels, optional Labels of columns to drop. + errors : str, default: "raise" + If 'ignore', suppress error and only existing labels are dropped. Returns ------- From 7dc093d4f9d69f3f4def579218a7e6eaa4c45c07 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 14:44:14 -0500 Subject: [PATCH 43/77] REFACTOR: Dedupe single ID service methods. 
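
Most of the service's methods share one shape: look up the stored query
compiler by its ID, forward the call, register the result under a fresh
UUID, and return that UUID to the client. `_set_forwarding_method_for_single_id`
now generates those methods in a loop instead of spelling each one out.
As a sketch, the generated method for "dropna" is equivalent to:

    def dropna(self, id, *args, **kwargs):
        # Resolve the ID to the stored compiler, forward the call,
        # and register the result under a new ID for the client.
        new_id = self._generate_id()
        self._qc[new_id] = self._qc[id].dropna(*args, **kwargs)
        return new_id
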
Signed-off-by: mvashishtha --- modin/core/execution/client/service.py | 566 +++++++++++++++++++++++++ 1 file changed, 566 insertions(+) create mode 100644 modin/core/execution/client/service.py diff --git a/modin/core/execution/client/service.py b/modin/core/execution/client/service.py new file mode 100644 index 00000000000..f376f334562 --- /dev/null +++ b/modin/core/execution/client/service.py @@ -0,0 +1,566 @@ +import numpy as np +import pickle +from typing import Any, NamedTuple, Optional +from uuid import UUID, uuid4 +from modin.core.io.io import BaseIO + +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler + + +class ForwardingQueryCompilerService: + def __init__(self, query_compiler_type: BaseQueryCompiler, io_type: BaseIO): + self._qc = {} + self._qc_type = query_compiler_type + self._io_type = io_type + + def _generate_id(self) -> UUID: + id = uuid4() + while id in self._qc: + id = uuid4() + return id + + def add_query_compiler(self, qc) -> UUID: + id = self._generate_id() + self._qc[id] = qc + return id + + def to_pandas(self, id): + return self._qc[id].to_pandas() + + class DefaultToPandasResult(NamedTuple): + result: Optional[Any] + result_is_qc_id: bool + + def default_to_pandas( + self, id: UUID, pandas_op, *args, **kwargs + ) -> DefaultToPandasResult: + result = self._qc[id].default_to_pandas(pandas_op, *args, **kwargs) + result_is_qc_id = isinstance(result, self._qc_type) + if result_is_qc_id: + new_id = self._generate_id() + self._qc[new_id] = result + result = new_id + return self.DefaultToPandasResult( + result=result, result_is_qc_id=result_is_qc_id + ) + + def rename(self, id, new_col_labels=None, new_row_labels=None): + new_id = self._generate_id() + new_qc = self._qc[new_id] = self._qc[id].copy() + if new_col_labels is not None: + new_qc.columns = new_col_labels + if new_row_labels is not None: + new_qc.index = new_row_labels + return new_id + + def columns(self, id): + return self._qc[id].columns + + def index(self, id): + return self._qc[id].index + + def dtypes(self, id): + return self._qc[id].dtypes + + def insert(self, id, loc, column, value, is_qc): + if is_qc: + value = self._qc[value] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].insert(loc, column, value) + return new_id + + def setitem(self, id, axis, key, value, is_qc): + if is_qc: + value = self._qc[value] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].setitem(axis, key, value) + return new_id + + def getitem_array(self, id, key, is_qc): + if is_qc: + key = self._qc[key] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].getitem_array(key) + return new_id + + def replace( + self, + id, + to_replace, + value, + inplace, + limit, + regex, + method, + is_to_replace_qc, + is_regex_qc, + ): + if is_to_replace_qc: + to_replace = self._qc[to_replace] + if is_regex_qc: + regex = self._qc[regex] + new_id = self._generate_id() + # TODO(GH#3108): Use positional arguments instead of keyword arguments + # in the query compilers so we don't have to name all the arguments + # here. 
+ self._qc[new_id] = self._qc[id].replace( + to_replace=to_replace, + value=value, + inplace=inplace, + limit=limit, + regex=regex, + method=method, + ) + return new_id + + def fillna( + self, + id, + squeeze_self, + squeeze_value, + value, + method, + axis, + inplace, + limit, + downcast, + is_qc, + ): + if is_qc: + value = self._qc[value] + new_id = self._generate_id() + # TODO(GH#3108): Use positional arguments instead of keyword arguments + # in the query compilers so we don't have to name all the + # arguments here. + self._qc[new_id] = self._qc[id].fillna( + squeeze_self=squeeze_self, + squeeze_value=squeeze_value, + value=value, + method=method, + axis=axis, + inplace=inplace, + limit=limit, + downcast=downcast, + ) + return new_id + + def concat(self, id, axis, other, **kwargs): + # convert id to query compiler + other = [self._qc[o] for o in other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].concat(axis, other, **kwargs) + return new_id + + def eq(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].eq(other, **kwargs) + return new_id + + def lt(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].lt(other, **kwargs) + return new_id + + def le(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].le(other, **kwargs) + return new_id + + def gt(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].gt(other, **kwargs) + return new_id + + def ge(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].ge(other, **kwargs) + return new_id + + def ne(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].ne(other, **kwargs) + return new_id + + def __and__(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].__and__(other, **kwargs) + return new_id + + def __or__(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].__or__(other, **kwargs) + return new_id + + def add(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].add(other, **kwargs) + return new_id + + def radd(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].radd(other, **kwargs) + return new_id + + def truediv(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].truediv(other, **kwargs) + return new_id + + def rtruediv(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].rtruediv(other, **kwargs) + return new_id + + def mod(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].mod(other, **kwargs) + return new_id + + def rmod(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + 
self._qc[new_id] = self._qc[id].rmod(other, **kwargs) + return new_id + + def sub(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].sub(other, **kwargs) + return new_id + + def rsub(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].rsub(other, **kwargs) + return new_id + + def mul(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].mul(other, **kwargs) + return new_id + + def rmul(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].rmul(other, **kwargs) + return new_id + + def floordiv(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].floordiv(other, **kwargs) + return new_id + + def rfloordiv(self, id, other, is_qc, **kwargs): + if is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].rfloordiv(other, **kwargs) + return new_id + + def merge(self, id, right, **kwargs): + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].merge(self._qc[right], **kwargs) + return new_id + + def groupby_mean( + self, + id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_mean( + by, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + return new_id + + def groupby_count( + self, + id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_count( + by, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + return new_id + + def groupby_max( + self, + id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_max( + by, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + return new_id + + def groupby_min( + self, + id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_min( + by, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + return new_id + + def groupby_sum( + self, + id, + by, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_sum( + by, axis, groupby_kwargs, agg_args, agg_kwargs, drop + ) + return new_id + + def groupby_agg( + self, + id, + by, + agg_func, + axis, + groupby_kwargs, + agg_args, + agg_kwargs, + how="axis_wise", + drop=False, + is_qc: bool = False, + ): + if is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].groupby_agg( + by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop + ) + return new_id + + def read_csv(self, connection, filepath, **kwargs) -> UUID: + io_result = self._io_type._read_csv(filepath, **kwargs) + if isinstance(io_result, self._qc_type): + new_id = self._generate_id() + 
self._qc[new_id] = io_result + return new_id + return io_result + + def read_sql(self, sql, connection, **kwargs) -> UUID: + new_id = self._generate_id() + self._qc[new_id] = self._io_type._read_sql(sql, connection, **kwargs) + return new_id + + def to_sql( + self, + id, + name, + con, + schema=None, + if_exists="fail", + index=True, + index_label=None, + chunksize=None, + dtype=None, + method=None, + ): + self._io_type.to_sql( + self._qc[id], + name, + con, + schema, + if_exists, + index, + index_label, + chunksize, + dtype, + method, + ) + + +def _set_forwarding_method_for_single_id(method_name: str): + def forwarding_method( + self: "ForwardingQueryCompilerService", id: UUID, *args, **kwargs + ): + new_id = self._generate_id() + self._qc[new_id] = getattr(self._qc[id], method_name)(*args, **kwargs) + return new_id + + setattr(ForwardingQueryCompilerService, method_name, forwarding_method) + + +_SINGLE_ID_FORWARDING_METHODS = frozenset( + { + "columnarize", + "transpose", + "take_2d", + "getitem_column_array", + "getitem_row_array", + "pivot", + "get_dummies", + "drop", + "isna", + "notna", + "add_prefix", + "add_suffix", + "astype", + "dropna", + "sum", + "prod", + "count", + "mean", + "median", + "std", + "min", + "max", + "any", + "all", + "quantile_for_single_value", + "quantile_for_list_of_values", + "describe", + "set_index_from_columns", + "reset_index", + "sort_rows_by_column_values", + "sort_index", + "dt_nanosecond", + "dt_microsecond", + "dt_second", + "dt_minute", + "dt_hour", + "dt_day", + "dt_dayofweek", + "dt_weekday", + "dt_day_name", + "dt_dayofyear", + "dt_week", + "dt_weekofyear", + "dt_month", + "dt_month_name", + "dt_quarter", + "dt_year", + "str_capitalize", + "str_isalnum", + "str_isalpha", + "str_isdecimal", + "str_isdigit", + "str_islower", + "str_isnumeric", + "str_isspace", + "str_istitle", + "str_isupper", + "str_len", + "str_lower", + "str_title", + "str_upper", + "str_center", + "str_contains", + "str_count", + "str_endswith", + "str_find", + "str_index", + "str_rfind", + "str_findall", + "str_get", + "str_join", + "str_lstrip", + "str_ljust", + "str_rjust", + "str_match", + "str_pad", + "str_repeat", + "str_split", + "str_rsplit", + "str_rstrip", + "str_slice", + "str_slice_replace", + "str_startswith", + "str_strip", + "str_zfill", + "cummax", + "cummin", + "cumsum", + "cumprod", + "is_monotonic_increasing", + "is_monotonic_decreasing", + "idxmax", + "idxmin", + "query", + } +) + +for method in _SINGLE_ID_FORWARDING_METHODS: + _set_forwarding_method_for_single_id(method) From 7405550c40c12427b8e3707c0bc735a021cdcde6 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 15:51:13 -0500 Subject: [PATCH 44/77] REFACTOR: Dedupe binary code and refactor some is_qc. 
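
Binary methods follow a second shared shape on both sides of the
connection: the client replaces a query-compiler operand with its ID plus
an `other_is_qc` flag, and the service resolves that ID back into a query
compiler before dispatching. As a sketch, the generated pair for "add"
behaves like:

    # Client side: only IDs cross the wire, never compiler objects.
    def add(self, other, **kwargs):
        other_is_qc = isinstance(other, type(self))
        if other_is_qc:
            other = other._id
        return self.__constructor__(
            self._service.add(self._id, other_is_qc, other, **kwargs)
        )

    # Service side: resolve the ID and forward to the stored compiler.
    def add(self, id, other_is_qc, other, **kwargs):
        if other_is_qc:
            other = self._qc[other]
        new_id = self._generate_id()
        self._qc[new_id] = self._qc[id].add(other, **kwargs)
        return new_id
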
Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 273 +++++------------- modin/core/execution/client/service.py | 158 ++++------ 2 files changed, 128 insertions(+), 303 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 635f9b00ad2..095aea6aab0 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -18,7 +18,7 @@ from pandas._libs.lib import no_default, NoDefault from pandas.api.types import is_list_like from typing import Any -import uuid +from uuid import UUID from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler from modin.utils import _inherit_docstrings @@ -34,13 +34,13 @@ class ClientQueryCompiler(BaseQueryCompiler): Parameters ---------- - id : uuid.UUID + id : UUID ID of this query compiler. """ lazy_execution: bool = True - def __init__(self, id: uuid.UUID): + def __init__(self, id: UUID): self._id = id @classmethod @@ -140,32 +140,28 @@ def add_suffix(self, suffix, axis=1): return self.__constructor__(self._service.add_suffix(self._id, suffix, axis)) def insert(self, loc, column, value): - if isinstance(value, type(self)): + value_is_qc = isinstance(value, type(self)) + if value_is_qc: value = value._id - is_qc = True - else: - is_qc = False return self.__constructor__( - self._service.insert(self._id, loc, column, value, is_qc) + self._service.insert(self._id, value_is_qc, loc, column, value) ) def setitem(self, axis, key, value): - if isinstance(value, type(self)): + value_is_qc = isinstance(value, type(self)) + if value_is_qc: value = value._id - is_qc = True - else: - is_qc = False return self.__constructor__( - self._service.setitem(self._id, axis, key, value, is_qc) + self._service.setitem(self._id, value_is_qc, axis, key, value) ) def getitem_array(self, key): - if isinstance(key, type(self)): + key_is_qc = isinstance(key, type(self)) + if key_is_qc: key = key._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.getitem_array(self._id, key, is_qc)) + return self.__constructor__( + self._service.getitem_array(self._id, key_is_qc, key) + ) def getitem_column_array(self, key, numeric=False): return self.__constructor__( @@ -215,25 +211,23 @@ def replace( regex=False, method: "str | NoDefault" = no_default, ): - if isinstance(to_replace, type(self)): - is_to_replace_qc = True - else: - is_to_replace_qc = False - if isinstance(regex, type(self)): - is_regex_qc = True - else: - is_regex_qc = False + to_replace_is_qc = isinstance(to_replace, type(self)) + if to_replace_is_qc: + to_replace = to_replace._id + regex_is_qc = isinstance(regex, type(self)) + if regex_is_qc: + regex = regex._id return self.__constructor__( self._service.replace( self._id, + to_replace_is_qc, + regex_is_qc, to_replace, value, inplace, limit, regex, method, - is_to_replace_qc, - is_regex_qc, ) ) @@ -248,14 +242,13 @@ def fillna( limit=None, downcast=None, ): - if isinstance(value, type(self)): + value_is_qc = isinstance(value, type(self)) + if value_is_qc: value = value._id - is_qc = True - else: - is_qc = False return self.__constructor__( self._service.fillna( self._id, + value_is_qc, squeeze_self, squeeze_value, value, @@ -264,7 +257,6 @@ def fillna( inplace, limit, downcast, - is_qc, ) ) @@ -331,176 +323,6 @@ def concat(self, axis, other, **kwargs): self._service.concat(self._id, axis, other, **kwargs) ) - def eq(self, other, **kwargs): - if isinstance(other, type(self)): - other = 
other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.eq(self._id, other, is_qc, **kwargs)) - - def lt(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.lt(self._id, other, is_qc, **kwargs)) - - def le(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.le(self._id, other, is_qc, **kwargs)) - - def gt(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.gt(self._id, other, is_qc, **kwargs)) - - def ge(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.ge(self._id, other, is_qc, **kwargs)) - - def ne(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.ne(self._id, other, is_qc, **kwargs)) - - def __and__(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.__and__(self._id, other, is_qc, **kwargs) - ) - - def __or__(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.__or__(self._id, other, is_qc, **kwargs) - ) - - def add(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.add(self._id, other, is_qc, **kwargs)) - - def radd(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.radd(self._id, other, is_qc, **kwargs) - ) - - def truediv(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.truediv(self._id, other, is_qc, **kwargs) - ) - - def mod(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.mod(self._id, other, is_qc, **kwargs)) - - def rmod(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.rmod(self._id, other, is_qc, **kwargs) - ) - - def sub(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.sub(self._id, other, is_qc, **kwargs)) - - def rsub(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.rsub(self._id, other, is_qc, **kwargs) - ) - - def mul(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__(self._service.mul(self._id, other, is_qc, **kwargs)) - - def rmul(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return 
self.__constructor__( - self._service.rmul(self._id, other, is_qc, **kwargs) - ) - - def floordiv(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.floordiv(self._id, other, is_qc, **kwargs) - ) - - def rfloordiv(self, other, **kwargs): - if isinstance(other, type(self)): - other = other._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.rfloordiv(self._id, other, is_qc, **kwargs) - ) - def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): return self.__constructor__( self._service.sort_rows_by_column_values( @@ -885,3 +707,48 @@ def from_dataframe(cls, df, data_cls): def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): raise NotImplementedError + + +def _set_forwarding_method_for_binary_function(method_name: str): + def forwarding_method( + self: ClientQueryCompiler, + other: Any, + **kwargs, + ): + other_is_qc = isinstance(other, type(self)) + if other_is_qc: + other = other._id + return self.__constructor__( + getattr(self._service, method_name)(self._id, other_is_qc, other, **kwargs) + ) + + setattr(ClientQueryCompiler, method_name, forwarding_method) + + +_BINARY_FORWARDING_METHODS = frozenset( + { + "eq", + "lt", + "le", + "gt", + "ge", + "ne", + "__and__", + "__or__", + "add", + "radd", + "truediv", + "rtruediv", + "mod", + "rmod", + "sub", + "rsub", + "mul", + "rmul", + "floordiv", + "rfloordiv", + } +) + +for method in _BINARY_FORWARDING_METHODS: + _set_forwarding_method_for_binary_function(method) diff --git a/modin/core/execution/client/service.py b/modin/core/execution/client/service.py index f376f334562..94f64a4a848 100644 --- a/modin/core/execution/client/service.py +++ b/modin/core/execution/client/service.py @@ -1,6 +1,4 @@ -import numpy as np -import pickle -from typing import Any, NamedTuple, Optional +from typing import Any, NamedTuple, Optional, Union from uuid import UUID, uuid4 from modin.core.io.io import BaseIO @@ -62,22 +60,22 @@ def index(self, id): def dtypes(self, id): return self._qc[id].dtypes - def insert(self, id, loc, column, value, is_qc): - if is_qc: + def insert(self, id, value_is_qc: bool, loc, column, value): + if value_is_qc: value = self._qc[value] new_id = self._generate_id() self._qc[new_id] = self._qc[id].insert(loc, column, value) return new_id - def setitem(self, id, axis, key, value, is_qc): - if is_qc: + def setitem(self, id, value_is_qc: bool, axis, key, value): + if value_is_qc: value = self._qc[value] new_id = self._generate_id() self._qc[new_id] = self._qc[id].setitem(axis, key, value) return new_id - def getitem_array(self, id, key, is_qc): - if is_qc: + def getitem_array(self, key_is_qc: bool, id, key): + if key_is_qc: key = self._qc[key] new_id = self._generate_id() self._qc[new_id] = self._qc[id].getitem_array(key) @@ -86,18 +84,18 @@ def getitem_array(self, id, key, is_qc): def replace( self, id, + to_replace_is_qc: bool, + regex_is_qc: bool, to_replace, value, inplace, limit, regex, method, - is_to_replace_qc, - is_regex_qc, ): - if is_to_replace_qc: + if to_replace_is_qc: to_replace = self._qc[to_replace] - if is_regex_qc: + if regex_is_qc: regex = self._qc[regex] new_id = self._generate_id() # TODO(GH#3108): Use positional arguments instead of keyword arguments @@ -116,6 +114,7 @@ def replace( def fillna( self, id, + value_is_qc: bool, squeeze_self, squeeze_value, value, @@ -124,9 +123,8 @@ def fillna( inplace, limit, downcast, - is_qc, 
): - if is_qc: + if value_is_qc: value = self._qc[value] new_id = self._generate_id() # TODO(GH#3108): Use positional arguments instead of keyword arguments @@ -145,96 +143,11 @@ def fillna( return new_id def concat(self, id, axis, other, **kwargs): - # convert id to query compiler other = [self._qc[o] for o in other] new_id = self._generate_id() self._qc[new_id] = self._qc[id].concat(axis, other, **kwargs) return new_id - def eq(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].eq(other, **kwargs) - return new_id - - def lt(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].lt(other, **kwargs) - return new_id - - def le(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].le(other, **kwargs) - return new_id - - def gt(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].gt(other, **kwargs) - return new_id - - def ge(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].ge(other, **kwargs) - return new_id - - def ne(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].ne(other, **kwargs) - return new_id - - def __and__(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].__and__(other, **kwargs) - return new_id - - def __or__(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].__or__(other, **kwargs) - return new_id - - def add(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].add(other, **kwargs) - return new_id - - def radd(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].radd(other, **kwargs) - return new_id - - def truediv(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].truediv(other, **kwargs) - return new_id - - def rtruediv(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].rtruediv(other, **kwargs) - return new_id - def mod(self, id, other, is_qc, **kwargs): if is_qc: other = self._qc[other] @@ -463,6 +376,48 @@ def forwarding_method( setattr(ForwardingQueryCompilerService, method_name, forwarding_method) +def _set_forwarding_method_for_binary_function(method_name: str): + def forwarding_method( + self: ForwardingQueryCompilerService, + id: UUID, + other_is_qc: bool, + other: Union[UUID, Any], + **kwargs, + ): + if other_is_qc: + other = self._qc[other] + new_id = self._generate_id() + self._qc[new_id] = getattr(self._qc[id], method_name)(other, **kwargs) + return new_id + + setattr(ForwardingQueryCompilerService, method_name, forwarding_method) + + +_BINARY_FORWARDING_METHODS = frozenset( + { + "eq", + "lt", + "le", + "gt", + "ge", + "ne", + "__and__", + "__or__", + "add", + "radd", + "truediv", + "rtruediv", + "mod", + "rmod", + "sub", + "rsub", + "mul", + 
"rmul", + "floordiv", + "rfloordiv", + } +) + _SINGLE_ID_FORWARDING_METHODS = frozenset( { "columnarize", @@ -564,3 +519,6 @@ def forwarding_method( for method in _SINGLE_ID_FORWARDING_METHODS: _set_forwarding_method_for_single_id(method) + +for method in _BINARY_FORWARDING_METHODS: + _set_forwarding_method_for_binary_function(method) From 89ba4b08d4d1c0c251419b5bf2ef2ef8c28d65f7 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 16:23:08 -0500 Subject: [PATCH 45/77] Fix query compiler refactoring. Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 442 +++++------------- modin/core/execution/client/service.py | 2 +- 2 files changed, 116 insertions(+), 328 deletions(-) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 095aea6aab0..267a17d250d 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -13,7 +13,6 @@ """Module contains ``ClientQueryCompiler`` class.""" -import numpy as np import pandas from pandas._libs.lib import no_default, NoDefault from pandas.api.types import is_list_like @@ -124,21 +123,9 @@ def to_pandas(self): def default_to_pandas(self, pandas_op, *args, **kwargs): raise NotImplementedError - def columnarize(self): - return self.__constructor__(self._service.columnarize(self._id)) - - def transpose(self): - return self.__constructor__(self._service.transpose(self._id)) - def copy(self): return self.__constructor__(self._id) - def add_prefix(self, prefix, axis=1): - return self.__constructor__(self._service.add_prefix(self._id, prefix, axis)) - - def add_suffix(self, suffix, axis=1): - return self.__constructor__(self._service.add_suffix(self._id, suffix, axis)) - def insert(self, loc, column, value): value_is_qc = isinstance(value, type(self)) if value_is_qc: @@ -163,45 +150,6 @@ def getitem_array(self, key): self._service.getitem_array(self._id, key_is_qc, key) ) - def getitem_column_array(self, key, numeric=False): - return self.__constructor__( - self._service.getitem_column_array(self._id, key, numeric) - ) - - def getitem_row_array(self, key, numeric=False): - return self.__constructor__( - self._service.getitem_row_array(self._id, key, numeric) - ) - - def pivot(self, index, columns, values): - return self.__constructor__( - self._service.pivot(self._id, index, columns, values) - ) - - def get_dummies(self, columns, **kwargs): - return self.__constructor__( - self._service.get_dummies(self._id, columns, **kwargs) - ) - - def take_2d(self, index=None, columns=None): - return self.__constructor__(self._service.take_2d(self._id, index, columns)) - - def drop(self, index=None, columns=None, errors: str = "raise"): - return self.__constructor__( - self._service.drop(self._id, index, columns, errors) - ) - - def isna(self): - return self.__constructor__(self._service.isna(self._id)) - - def notna(self): - return self.__constructor__(self._service.notna(self._id)) - - def astype(self, col_dtypes, **kwargs): - return self.__constructor__( - self._service.astype(self._id, col_dtypes, **kwargs) - ) - def replace( self, to_replace=None, @@ -260,60 +208,6 @@ def fillna( ) ) - def dropna(self, **kwargs): - return self.__constructor__(self._service.dropna(self._id, **kwargs)) - - def sum(self, **kwargs): - return self.__constructor__(self._service.sum(self._id, **kwargs)) - - def prod(self, **kwargs): - return self.__constructor__(self._service.prod(self._id, **kwargs)) - - def count(self, **kwargs): - return 
self.__constructor__(self._service.count(self._id, **kwargs)) - - def mean(self, **kwargs): - return self.__constructor__(self._service.mean(self._id, **kwargs)) - - def median(self, **kwargs): - return self.__constructor__(self._service.median(self._id, **kwargs)) - - def std(self, **kwargs): - return self.__constructor__(self._service.std(self._id, **kwargs)) - - def min(self, **kwargs): - return self.__constructor__(self._service.min(self._id, **kwargs)) - - def max(self, **kwargs): - return self.__constructor__(self._service.max(self._id, **kwargs)) - - def any(self, **kwargs): - return self.__constructor__(self._service.any(self._id, **kwargs)) - - def all(self, **kwargs): - return self.__constructor__(self._service.all(self._id, **kwargs)) - - def quantile_for_single_value(self, **kwargs): - return self.__constructor__( - self._service.quantile_for_single_value(self._id, **kwargs) - ) - - def quantile_for_list_of_values(self, **kwargs): - return self.__constructor__( - self._service.quantile_for_list_of_values(self._id, **kwargs) - ) - - def describe(self, **kwargs): - return self.__constructor__(self._service.describe(self._id, **kwargs)) - - def set_index_from_columns(self, keys, drop: bool = True, append: bool = False): - return self.__constructor__( - self._service.set_index_from_columns(self._id, keys, drop, append) - ) - - def reset_index(self, **kwargs): - return self.__constructor__(self._service.reset_index(self._id, **kwargs)) - def concat(self, axis, other, **kwargs): if is_list_like(other): other = [o._id for o in other] @@ -323,192 +217,6 @@ def concat(self, axis, other, **kwargs): self._service.concat(self._id, axis, other, **kwargs) ) - def sort_rows_by_column_values(self, columns, ascending=True, **kwargs): - return self.__constructor__( - self._service.sort_rows_by_column_values( - self._id, columns, ascending=ascending, **kwargs - ) - ) - - def sort_index(self, **kwargs): - return self.__constructor__(self._service.sort_index(self._id, **kwargs)) - - def dt_nanosecond(self): - return self.__constructor__(self._service.dt_nanosecond(self._id)) - - def dt_microsecond(self): - return self.__constructor__(self._service.dt_microsecond(self._id)) - - def dt_second(self): - return self.__constructor__(self._service.dt_second(self._id)) - - def dt_minute(self): - return self.__constructor__(self._service.dt_minute(self._id)) - - def dt_hour(self): - return self.__constructor__(self._service.dt_hour(self._id)) - - def dt_day(self): - return self.__constructor__(self._service.dt_day(self._id)) - - def dt_dayofweek(self): - return self.__constructor__(self._service.dt_dayofweek(self._id)) - - def dt_weekday(self): - return self.__constructor__(self._service.dt_weekday(self._id)) - - def dt_day_name(self): - return self.__constructor__(self._service.dt_day_name(self._id)) - - def dt_dayofyear(self): - return self.__constructor__(self._service.dt_dayofyear(self._id)) - - def dt_week(self): - return self.__constructor__(self._service.dt_week(self._id)) - - def dt_weekofyear(self): - return self.__constructor__(self._service.dt_weekofyear(self._id)) - - def dt_month(self): - return self.__constructor__(self._service.dt_month(self._id)) - - def dt_month_name(self): - return self.__constructor__(self._service.dt_month_name(self._id)) - - def dt_quarter(self): - return self.__constructor__(self._service.dt_quarter(self._id)) - - def dt_year(self): - return self.__constructor__(self._service.dt_year(self._id)) - - def str_capitalize(self): - return 
self.__constructor__(self._service.str_capitalize(self._id)) - - def str_isalnum(self): - return self.__constructor__(self._service.str_isalnum(self._id)) - - def str_isalpha(self): - return self.__constructor__(self._service.str_isalpha(self._id)) - - def str_isdecimal(self): - return self.__constructor__(self._service.str_isdecimal(self._id)) - - def str_isdigit(self): - return self.__constructor__(self._service.str_isdigit(self._id)) - - def str_islower(self): - return self.__constructor__(self._service.str_islower(self._id)) - - def str_isnumeric(self): - return self.__constructor__(self._service.str_isnumeric(self._id)) - - def str_isspace(self): - return self.__constructor__(self._service.str_isspace(self._id)) - - def str_istitle(self): - return self.__constructor__(self._service.str_istitle(self._id)) - - def str_isupper(self): - return self.__constructor__(self._service.str_isupper(self._id)) - - def str_len(self): - return self.__constructor__(self._service.str_len(self._id)) - - def str_lower(self): - return self.__constructor__(self._service.str_lower(self._id)) - - def str_title(self): - return self.__constructor__(self._service.str_title(self._id)) - - def str_upper(self): - return self.__constructor__(self._service.str_upper(self._id)) - - def str_center(self, width, fillchar=" "): - return self.__constructor__(self._service.str_center(self._id, width, fillchar)) - - def str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - return self.__constructor__( - self._service.str_contains(self._id, pat, case, flags, na, regex) - ) - - def str_count(self, pat, flags=0, **kwargs): - return self.__constructor__( - self._service.str_count(self._id, pat, flags, **kwargs) - ) - - def str_endswith(self, pat, na=np.nan): - return self.__constructor__(self._service.str_endswith(self._id, pat, na)) - - def str_find(self, sub, start=0, end=None): - return self.__constructor__(self._service.str_find(self._id, sub, start, end)) - - def str_rfind(self, sub, start=0, end=None): - return self.__constructor__(self._service.str_rfind(self._id, sub, start, end)) - - def str_findall(self, pat, flags=0, **kwargs): - return self.__constructor__( - self._service.str_findall(self._id, pat, flags, **kwargs) - ) - - def str_get(self, i): - return self.__constructor__(self._service.str_get(self._id, i)) - - def str_index(self, sub, start=0, end=None): - return self.__constructor__(self._service.str_index(self._id, sub, start, end)) - - def str_join(self, sep): - return self.__constructor__(self._service.str_join(self._id, sep)) - - def str_lstrip(self, to_strip=None): - return self.__constructor__(self._service.str_lstrip(self._id, to_strip)) - - def str_ljust(self, width, fillchar=" "): - return self.__constructor__(self._service.str_ljust(self._id, width, fillchar)) - - def str_rjust(self, width, fillchar=" "): - return self.__constructor__(self._service.str_rjust(self._id, width, fillchar)) - - def str_match(self, pat, case=True, flags=0, na=np.nan): - return self.__constructor__( - self._service.str_match(self._id, pat, case, flags, na) - ) - - def str_pad(self, width, side="left", fillchar=" "): - return self.__constructor__( - self._service.str_pad(self._id, width, side, fillchar) - ) - - def str_repeat(self, repeats): - return self.__constructor__(self._service.str_repeat(self._id, repeats)) - - def str_split(self, pat=None, n=-1, expand=False): - return self.__constructor__(self._service.str_split(self._id, pat, n, expand)) - - def str_rsplit(self, pat=None, n=-1, expand=False): - 
return self.__constructor__(self._service.str_rsplit(self._id, pat, n, expand)) - - def str_rstrip(self, to_strip=None): - return self.__constructor__(self._service.str_rstrip(self._id, to_strip)) - - def str_slice(self, start=None, stop=None, step=None): - return self.__constructor__( - self._service.str_slice(self._id, start, stop, step) - ) - - def str_slice_replace(self, start=None, stop=None, repl=None): - return self.__constructor__( - self._service.str_slice_replace(self._id, start, stop, repl) - ) - - def str_startswith(self, pat, na=np.nan): - return self.__constructor__(self._service.str_startswith(self._id, pat, na)) - - def str_strip(self, to_strip=None): - return self.__constructor__(self._service.str_strip(self._id, to_strip)) - - def str_zfill(self, width): - return self.__constructor__(self._service.str_zfill(self._id, width)) - def merge(self, right, **kwargs): return self.__constructor__(self._service.merge(self._id, right._id, **kwargs)) @@ -650,47 +358,12 @@ def groupby_agg( ) ) - def cummax(self, fold_axis, axis, skipna, *args, **kwargs): - return self.__constructor__( - self._service.cummax(self._id, fold_axis, axis, skipna, *args, **kwargs) - ) - - def cummin(self, fold_axis, axis, skipna, *args, **kwargs): - return self.__constructor__( - self._service.cummin(self._id, fold_axis, axis, skipna, *args, **kwargs) - ) - - def cumsum(self, fold_axis, axis, skipna, *args, **kwargs): - return self.__constructor__( - self._service.cumsum(self._id, fold_axis, axis, skipna, *args, **kwargs) - ) - - def cumprod(self, fold_axis, axis, skipna, *args, **kwargs): - return self.__constructor__( - self._service.cumprod(self._id, fold_axis, axis, skipna, *args, **kwargs) - ) - def get_index_names(self, axis=0): if axis == 0: return self.index.names else: return self.columns.names - def is_monotonic_increasing(self): - return self.__constructor__(self._service.is_monotonic_increasing(self._id)) - - def is_monotonic_decreasing(self): - return self.__constructor__(self._service.is_monotonic_decreasing(self._id)) - - def idxmin(self, **kwargs): - return self.__constructor__(self._service.idxmin(self._id, **kwargs)) - - def idxmax(self, **kwargs): - return self.__constructor__(self._service.idxmax(self._id, **kwargs)) - - def query(self, expr, **kwargs): - return self.__constructor__(self._service.query(self._id, expr, **kwargs)) - def finalize(self): raise NotImplementedError @@ -724,6 +397,18 @@ def forwarding_method( setattr(ClientQueryCompiler, method_name, forwarding_method) +def _set_forwarding_method_for_single_id(method_name: str): + def forwarding_method( + self: ClientQueryCompiler, + *args, + **kwargs, + ): + return self.__constructor__( + getattr(self._service, method_name)(self._id, *args, **kwargs) + ) + + setattr(ClientQueryCompiler, method_name, forwarding_method) + _BINARY_FORWARDING_METHODS = frozenset( { @@ -752,3 +437,106 @@ def forwarding_method( for method in _BINARY_FORWARDING_METHODS: _set_forwarding_method_for_binary_function(method) + +_SINGLE_ID_FORWARDING_METHODS = frozenset( + { + "columnarize", + "transpose", + "take_2d", + "getitem_column_array", + "getitem_row_array", + "pivot", + "get_dummies", + "drop", + "isna", + "notna", + "add_prefix", + "add_suffix", + "astype", + "dropna", + "sum", + "prod", + "count", + "mean", + "median", + "std", + "min", + "max", + "any", + "all", + "quantile_for_single_value", + "quantile_for_list_of_values", + "describe", + "set_index_from_columns", + "reset_index", + "sort_rows_by_column_values", + "sort_index", + 
"dt_nanosecond", + "dt_microsecond", + "dt_second", + "dt_minute", + "dt_hour", + "dt_day", + "dt_dayofweek", + "dt_weekday", + "dt_day_name", + "dt_dayofyear", + "dt_week", + "dt_weekofyear", + "dt_month", + "dt_month_name", + "dt_quarter", + "dt_year", + "str_capitalize", + "str_isalnum", + "str_isalpha", + "str_isdecimal", + "str_isdigit", + "str_islower", + "str_isnumeric", + "str_isspace", + "str_istitle", + "str_isupper", + "str_len", + "str_lower", + "str_title", + "str_upper", + "str_center", + "str_contains", + "str_count", + "str_endswith", + "str_find", + "str_index", + "str_rfind", + "str_findall", + "str_get", + "str_join", + "str_lstrip", + "str_ljust", + "str_rjust", + "str_match", + "str_pad", + "str_repeat", + "str_split", + "str_rsplit", + "str_rstrip", + "str_slice", + "str_slice_replace", + "str_startswith", + "str_strip", + "str_zfill", + "cummax", + "cummin", + "cumsum", + "cumprod", + "is_monotonic_increasing", + "is_monotonic_decreasing", + "idxmax", + "idxmin", + "query", + } +) + + +for method in _SINGLE_ID_FORWARDING_METHODS: + _set_forwarding_method_for_single_id(method) diff --git a/modin/core/execution/client/service.py b/modin/core/execution/client/service.py index 94f64a4a848..610b4e43acf 100644 --- a/modin/core/execution/client/service.py +++ b/modin/core/execution/client/service.py @@ -74,7 +74,7 @@ def setitem(self, id, value_is_qc: bool, axis, key, value): self._qc[new_id] = self._qc[id].setitem(axis, key, value) return new_id - def getitem_array(self, key_is_qc: bool, id, key): + def getitem_array(self, id, key_is_qc: bool, key): if key_is_qc: key = self._qc[key] new_id = self._generate_id() From 0a3240f982ed951361583a9f14917dc1a7ccf538 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 16:26:09 -0500 Subject: [PATCH 46/77] Add a newline for black Signed-off-by: mvashishtha --- modin/core/execution/client/query_compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 267a17d250d..4d371177ef5 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -397,6 +397,7 @@ def forwarding_method( setattr(ClientQueryCompiler, method_name, forwarding_method) + def _set_forwarding_method_for_single_id(method_name: str): def forwarding_method( self: ClientQueryCompiler, From df5b2a5d2345b64bdc2259f0b6e1227b9800e099 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 20:32:41 -0500 Subject: [PATCH 47/77] Make doc_checker work for all new files except container groupby. 
Signed-off-by: mvashishtha --- .github/workflows/ci.yml | 5 +- modin/conftest.py | 6 +- modin/core/execution/client/container.py | 826 ++++++++++++++++++ modin/core/execution/client/query_compiler.py | 20 +- modin/core/execution/client/service.py | 524 ----------- 5 files changed, 851 insertions(+), 530 deletions(-) create mode 100644 modin/core/execution/client/container.py delete mode 100644 modin/core/execution/client/service.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 250e9fe1155..92c0768f0f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,7 +129,10 @@ jobs: - run: python scripts/doc_checker.py modin/core/storage_formats/base - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow - run: python scripts/doc_checker.py modin/core/storage_formats/pandas - - run: python scripts/doc_checker.py modin/core/execution/client + - run: | + python scripts/doc_checker.py modin/core/execution/client/container.py + python scripts/doc_checker.py modin/core/execution/client/io.py + python scripts/doc_checker.py modin/core/execution/client/query_compiler.py - run: | python scripts/doc_checker.py \ modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \ diff --git a/modin/conftest.py b/modin/conftest.py index f0cf0a49b26..53c1eac3022 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -54,8 +54,8 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url): ) from modin.core.execution.client.io import ClientIO # noqa: E402 from modin.core.execution.client.query_compiler import ClientQueryCompiler # noqa: E402 -from modin.core.execution.client.service import ( # noqa: E402 - ForwardingQueryCompilerService, +from modin.core.execution.client.container import ( # noqa: E402 + ForwardingQueryCompilerContainer, ) from modin.core.execution.python.implementations.pandas_on_python.dataframe.dataframe import ( # noqa: E402 PandasOnPythonDataframe, @@ -299,7 +299,7 @@ def prepare(cls): def set_client_execution(): - service = ForwardingQueryCompilerService(BaseQueryCompiler, PandasOnPythonIO) + service = ForwardingQueryCompilerContainer(BaseQueryCompiler, PandasOnPythonIO) ClientQueryCompiler.set_server_connection(service) ClientIO.query_compiler_cls = TestClientQueryCompiler ClientIO.set_server_connection(service) diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py new file mode 100644 index 00000000000..52879e520d0 --- /dev/null +++ b/modin/core/execution/client/container.py @@ -0,0 +1,826 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +"""Module contains ``ForwardingQueryCompilerContainer`` class.""" + +import numpy as np +import pandas +from typing import Any, NamedTuple, Optional, Union +from uuid import UUID, uuid4 + +from modin.core.io.io import BaseIO +from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler + + +class ForwardingQueryCompilerContainer: + """ + Container that forwards queries to query compilers within. + + Parameters + ---------- + query_compiler_class : BaseQueryCompiler + Query compiler class to contain. + io_class : BaseIO + The IO class to use for reading and writing data. + """ + + def __init__(self, query_compiler_class: BaseQueryCompiler, io_class: BaseIO): + self._qc = {} + self._query_compiler_class = query_compiler_class + self._io_class = io_class + + def _generate_id(self) -> UUID: + """ + Generate an ID for a new query compiler. + + Returns + ------- + UUID + The generated ID. + """ + id = uuid4() + while id in self._qc: + id = uuid4() + return id + + def add_query_compiler(self, qc: BaseQueryCompiler) -> UUID: + """ + Add a query compiler to the container. + + Parameters + ---------- + qc : BaseQueryCompiler + + Returns + ------- + UUID + The ID of the query compiler. + """ + id = self._generate_id() + self._qc[id] = qc + return id + + def to_pandas(self, id: UUID) -> pandas.DataFrame: + """ + Convert the query compiler to a pandas DataFrame. + + Parameters + ---------- + id : UUID + The ID of the query compiler to convert. + + Returns + ------- + pandas.DataFrame + The converted DataFrame. + """ + return self._qc[id].to_pandas() + + class DefaultToPandasResult(NamedTuple): + """ + The result of ``default_to_pandas``. + + Parameters + ---------- + result : Any + The result of the operation. + result_is_qc_id : bool + Whether the result is a query compiler ID. + """ + + result: Any + result_is_qc_id: bool + + def default_to_pandas( + self, id: UUID, pandas_op: Union[str, callable], *args: Any, **kwargs: dict + ) -> DefaultToPandasResult: # noqa: D401 + """ + Default to pandas for an operation on a query compiler. + + Use the inner query compiler's default_to_pandas to execute the + operation on a pandas dataframe. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + pandas_op : Union[str, callable] + The operation to perform. + *args : iterable + The arguments to pass to the operation. + **kwargs : dict + The keyword arguments to pass to the operation. + + Returns + ------- + DefaultToPandasResult + The result of the operation. The result is a query compiler ID if + and only if the result of the pandas operation is a new + query compiler. + """ + result = self._qc[id].default_to_pandas(pandas_op, *args, **kwargs) + result_is_qc_id = isinstance(result, self._query_compiler_class) + if result_is_qc_id: + new_id = self._generate_id() + self._qc[new_id] = result + result = new_id + return self.DefaultToPandasResult( + result=result, result_is_qc_id=result_is_qc_id + ) + + def rename( + self, + id: UUID, + new_col_labels: Optional[pandas.Index] = None, + new_row_labels: Optional[pandas.Index] = None, + ) -> UUID: + """ + Rename the columns and/or rows of a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + new_col_labels : pandas.Index, default: None + The new column labels. + new_row_labels : pandas.Index, default: None + The new row labels. + + Returns + ------- + UUID + The ID of the renamed query compiler. 
+ """ + new_id = self._generate_id() + new_qc = self._qc[new_id] = self._qc[id].copy() + if new_col_labels is not None: + new_qc.columns = new_col_labels + if new_row_labels is not None: + new_qc.index = new_row_labels + return new_id + + def columns(self, id) -> pandas.Index: + """ + Get the columns of the query compiler. + + Parameters + ---------- + id : UUID + The ID of a query compiler. + + Returns + ------- + pandas.Index + The columns. + """ + return self._qc[id].columns + + def index(self, id: UUID) -> pandas.Index: + """ + Get the index of a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + + Returns + ------- + pandas.Index + The index. + """ + return self._qc[id].index + + def dtypes(self, id: UUID) -> pandas.Series: + """ + Get the dtypes of a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + + Returns + ------- + pandas.Series + The dtypes. + """ + return self._qc[id].dtypes + + def insert(self, id: UUID, value_is_qc: bool, loc, column, value) -> UUID: + """ + Insert a value into a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + value_is_qc : bool + Whether ``value`` is the ID of a query compiler. + loc : int + The location to insert the value. + column : str + The column to insert the value. + value : Any + The value to insert. + + Returns + ------- + UUID + The ID of the query compiler with the inserted value. + """ + if value_is_qc: + value = self._qc[value] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].insert(loc, column, value) + return new_id + + def setitem(self, id, value_is_qc: bool, axis, key, value) -> UUID: + """ + Set a value in a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + value_is_qc : bool + Whether ``value`` is the ID of a query compiler. + axis : int + The axis to set the value. + key : Any + The key to set the value. + value : Any + The value to set. + + Returns + ------- + UUID + The ID of the query compiler with the value set. + """ + if value_is_qc: + value = self._qc[value] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].setitem(axis, key, value) + return new_id + + def getitem_array( + self, id, key_is_qc: bool, key: Union[UUID, np.ndarray, list] + ) -> UUID: + """ + Get the values at ``key`` from a query compiler. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + key_is_qc : bool + Whether ``key`` is the ID of a query compiler. + key : UUID, np.ndarray or list of column labels + Boolean mask represented by QueryCompiler UUID or ``np.ndarray`` of the same + shape as query compiler with ID ``id``, or enumerable of columns to pick. + + Returns + ------- + UUID + The ID of the new query compiler. + """ + if key_is_qc: + key = self._qc[key] + new_id = self._generate_id() + self._qc[new_id] = self._qc[id].getitem_array(key) + return new_id + + def replace( + self, + id, + to_replace_is_qc: bool, + regex_is_qc: bool, + to_replace, + value, + inplace, + limit, + regex, + method, + ): + """ + Replace values given in `to_replace` by `value`. + + Parameters + ---------- + id : UUID + The ID of the query compiler. + to_replace_is_qc : bool + Whether ``to_replace`` is the ID of a query compiler. + regex_is_qc : bool + Whether ``regex`` is the ID of a query compiler. + to_replace : scalar, list-like, regex, modin.pandas.Series, or None + Value to replace. + value : scalar, list-like, regex or dict + Value to replace matching values with. 
+        inplace : bool
+            This parameter is for compatibility. Always has to be False.
+        limit : Optional[int]
+            Maximum size gap to forward or backward fill.
+        regex : bool or same types as ``to_replace``
+            Whether to interpret ``to_replace`` and/or ``value`` as regular
+            expressions.
+        method : {"pad", "ffill", "bfill", None}
+            The method to use for replacement when ``to_replace`` is a
+            scalar, list or tuple and ``value`` is None.
+
+        Returns
+        -------
+        UUID
+            UUID of query compiler with all `to_replace` values replaced by `value`.
+        """
+        if to_replace_is_qc:
+            to_replace = self._qc[to_replace]
+        if regex_is_qc:
+            regex = self._qc[regex]
+        new_id = self._generate_id()
+        # TODO(GH#3108): Use positional arguments instead of keyword arguments
+        # in the query compilers so we don't have to name all the arguments
+        # here.
+        self._qc[new_id] = self._qc[id].replace(
+            to_replace=to_replace,
+            value=value,
+            inplace=inplace,
+            limit=limit,
+            regex=regex,
+            method=method,
+        )
+        return new_id
+
+    def fillna(
+        self,
+        id,
+        value_is_qc: bool,
+        squeeze_self: bool,
+        squeeze_value: bool,
+        value,
+        method,
+        axis,
+        inplace,
+        limit,
+        downcast,
+    ):
+        """
+        Replace NaN values using provided method.
+
+        Parameters
+        ----------
+        id : UUID
+            The ID of the query compiler.
+        value_is_qc : bool
+            Whether ``value`` is the ID of a query compiler.
+        squeeze_self : bool
+            Whether to squeeze ``self``.
+        squeeze_value : bool
+            Whether to squeeze ``value``.
+        value : scalar or dict
+        method : {"backfill", "bfill", "pad", "ffill", None}
+        axis : {0, 1}
+        inplace : {False}
+            This parameter is for compatibility. Always has to be False.
+        limit : int, optional
+        downcast : dict, optional
+
+        Returns
+        -------
+        UUID
+            UUID of query compiler with all null values filled.
+        """
+        if value_is_qc:
+            value = self._qc[value]
+        new_id = self._generate_id()
+        # TODO(GH#3108): Use positional arguments instead of keyword arguments
+        # in the query compilers so we don't have to name all the
+        # arguments here.
+        self._qc[new_id] = self._qc[id].fillna(
+            squeeze_self=squeeze_self,
+            squeeze_value=squeeze_value,
+            value=value,
+            method=method,
+            axis=axis,
+            inplace=inplace,
+            limit=limit,
+            downcast=downcast,
+        )
+        return new_id
+
+    def concat(self, id, axis, other, **kwargs):
+        """
+        Concatenate query compilers along the specified axis.
+
+        Parameters
+        ----------
+        id : UUID
+            The ID of the main query compiler to concatenate.
+        axis : {0, 1}
+            The axis to concatenate along.
+        other : list of UUIDs
+            The IDs of the query compilers to concatenate to the one
+            represented by ``id``.
+        **kwargs : dict
+            Additional parameters to pass to the concatenation function.
+
+        Returns
+        -------
+        UUID
+            The ID of the query compiler containing the concatenation result.
+        """
+        other = [self._qc[o] for o in other]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].concat(axis, other, **kwargs)
+        return new_id
+
+    def merge(self, id, right, **kwargs):
+        """
+        Merge two query compilers using a database-style join.
+
+        Parameters
+        ----------
+        id : UUID
+            The ID of the left query compiler.
+        right : UUID
+            The ID of the right query compiler.
+        **kwargs : dict
+            Additional parameters to pass to the merge function.
+
+        Returns
+        -------
+        UUID
+            The ID of the query compiler containing the merge result.
+        """
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].merge(self._qc[right], **kwargs)
+        return new_id
+
+    def groupby_mean(
+        self,
+        id,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_mean(
+            by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+        )
+        return new_id
+
+    def groupby_count(
+        self,
+        id,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_count(
+            by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+        )
+        return new_id
+
+    def groupby_max(
+        self,
+        id,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_max(
+            by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+        )
+        return new_id
+
+    def groupby_min(
+        self,
+        id,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_min(
+            by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+        )
+        return new_id
+
+    def groupby_sum(
+        self,
+        id,
+        by,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_sum(
+            by, axis, groupby_kwargs, agg_args, agg_kwargs, drop
+        )
+        return new_id
+
+    def groupby_agg(
+        self,
+        id,
+        by,
+        agg_func,
+        axis,
+        groupby_kwargs,
+        agg_args,
+        agg_kwargs,
+        how="axis_wise",
+        drop=False,
+        is_qc: bool = False,
+    ):
+        if is_qc:
+            by = self._qc[by]
+        new_id = self._generate_id()
+        self._qc[new_id] = self._qc[id].groupby_agg(
+            by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop
+        )
+        return new_id
+
+    def read_csv(self, connection, filepath, **kwargs) -> UUID:
+        """
+        Read a CSV file from the specified filepath.
+
+        Parameters
+        ----------
+        connection : object
+            The data connection, e.g. a connection to the database where the
+            service will store the result.
+        filepath : str
+            The filepath to read the CSV file from.
+        **kwargs : dict
+            Additional parameters to pass to the pandas read_csv function.
+
+        Returns
+        -------
+        UUID
+            The ID of the query compiler containing the read result.
+        """
+        io_result = self._io_class._read_csv(filepath, **kwargs)
+        if isinstance(io_result, self._query_compiler_class):
+            new_id = self._generate_id()
+            self._qc[new_id] = io_result
+            return new_id
+        return io_result
+
+    def read_sql(self, sql, connection, **kwargs) -> UUID:
+        """
+        Read data from a SQL connection.
+
+        Parameters
+        ----------
+        sql : str
+            SQL query to be executed or a table name.
+        connection : SQLAlchemy connectable, str, or sqlite3 connection
+            Using SQLAlchemy makes it possible to use any DB supported by that
+            library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
+            for engine disposal and connection closure for the SQLAlchemy
+            connectable; str connections are closed automatically. See
+            `here `_.
+        **kwargs : dict
+            Parameters of ``read_sql`` function.
+
+        Returns
+        -------
+        UUID
+            ID of query compiler with data read in from SQL connection.
+        """
+        new_id = self._generate_id()
+        self._qc[new_id] = self._io_class._read_sql(sql, connection, **kwargs)
+        return new_id
+
+    def to_sql(self, id, **kwargs) -> None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Databases supported by SQLAlchemy [1]_ are supported. Tables can be
+        newly created, appended to, or overwritten.
+
+        Parameters
+        ----------
+        id : UUID
+            ID of query compiler to write to database.
+        **kwargs : dict
+            Parameters of ``to_sql`` function.
+        """
+        self._io_class.to_sql(self._qc[id], **kwargs)
+
+
+def _set_forwarding_method_for_single_id(method_name: str):
+    """
+    Define a method that forwards arguments to the inner query compiler.
+
+    Parameters
+    ----------
+    method_name : str
+    """
+
+    def forwarding_method(
+        self: ForwardingQueryCompilerContainer, id: UUID, *args, **kwargs
+    ):
+        new_id = self._generate_id()
+        self._qc[new_id] = getattr(self._qc[id], method_name)(*args, **kwargs)
+        return new_id
+
+    setattr(ForwardingQueryCompilerContainer, method_name, forwarding_method)
+
+
+def _set_forwarding_method_for_binary_function(method_name: str):
+    """
+    Define a binary method that forwards arguments to the inner query compiler.
+
+    Parameters
+    ----------
+    method_name : str
+    """
+
+    def forwarding_method(
+        self: ForwardingQueryCompilerContainer,
+        id: UUID,
+        other_is_qc: bool,
+        other: Union[UUID, Any],
+        **kwargs,
+    ):
+        if other_is_qc:
+            other = self._qc[other]
+        new_id = self._generate_id()
+        self._qc[new_id] = getattr(self._qc[id], method_name)(other, **kwargs)
+        return new_id
+
+    setattr(ForwardingQueryCompilerContainer, method_name, forwarding_method)
+
+
+_BINARY_FORWARDING_METHODS = frozenset(
+    {
+        "eq",
+        "lt",
+        "le",
+        "gt",
+        "ge",
+        "ne",
+        "__and__",
+        "__or__",
+        "add",
+        "radd",
+        "truediv",
+        "rtruediv",
+        "mod",
+        "rmod",
+        "sub",
+        "rsub",
+        "mul",
+        "rmul",
+        "floordiv",
+        "rfloordiv",
+    }
+)
+
+_SINGLE_ID_FORWARDING_METHODS = frozenset(
+    {
+        "columnarize",
+        "transpose",
+        "take_2d",
+        "getitem_column_array",
+        "getitem_row_array",
+        "pivot",
+        "get_dummies",
+        "drop",
+        "isna",
+        "notna",
+        "add_prefix",
+        "add_suffix",
+        "astype",
+        "dropna",
+        "sum",
+        "prod",
+        "count",
+        "mean",
+        "median",
+        "std",
+        "min",
+        "max",
+        "any",
+        "all",
+        "quantile_for_single_value",
+        "quantile_for_list_of_values",
+        "describe",
+        "set_index_from_columns",
+        "reset_index",
+        "sort_rows_by_column_values",
+        "sort_index",
+        "dt_nanosecond",
+        "dt_microsecond",
+        "dt_second",
+        "dt_minute",
+        "dt_hour",
+        "dt_day",
+        "dt_dayofweek",
+        "dt_weekday",
+        "dt_day_name",
+        "dt_dayofyear",
+        "dt_week",
+        "dt_weekofyear",
+        "dt_month",
+        "dt_month_name",
+        "dt_quarter",
+        "dt_year",
+        "str_capitalize",
+        "str_isalnum",
+        "str_isalpha",
+        "str_isdecimal",
+        "str_isdigit",
+        "str_islower",
+        "str_isnumeric",
+        "str_isspace",
+        "str_istitle",
+        "str_isupper",
+        "str_len",
+        "str_lower",
+        "str_title",
+        "str_upper",
+        "str_center",
+        "str_contains",
+        "str_count",
+        "str_endswith",
+        "str_find",
+        "str_index",
+        "str_rfind",
+        "str_findall",
+        "str_get",
+        "str_join",
+        "str_lstrip",
+        "str_ljust",
+        "str_rjust",
+        "str_match",
+        "str_pad",
+        "str_repeat",
+        "str_split",
+        "str_rsplit",
+        "str_rstrip",
+        "str_slice",
+        "str_slice_replace",
+        "str_startswith",
+        "str_strip",
+        "str_zfill",
+        "cummax",
+        "cummin",
+        "cumsum",
+        "cumprod",
+        "is_monotonic_increasing",
+        "is_monotonic_decreasing",
+        "idxmax",
+        "idxmin",
+        "query",
+    }
+)
+
+for method in _SINGLE_ID_FORWARDING_METHODS:
+    _set_forwarding_method_for_single_id(method)
+
+for
method in _BINARY_FORWARDING_METHODS: + _set_forwarding_method_for_binary_function(method) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 4d371177ef5..4d0bdb13f09 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -382,7 +382,15 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): raise NotImplementedError -def _set_forwarding_method_for_binary_function(method_name: str): +def _set_forwarding_method_for_binary_function(method_name: str) -> None: + """ + Define a binary method that forwards arguments to the service. + + Parameters + ---------- + method_name : str + """ + def forwarding_method( self: ClientQueryCompiler, other: Any, @@ -398,7 +406,15 @@ def forwarding_method( setattr(ClientQueryCompiler, method_name, forwarding_method) -def _set_forwarding_method_for_single_id(method_name: str): +def _set_forwarding_method_for_single_id(method_name: str) -> None: + """ + Define a method that forwards arguments to the service. + + Parameters + ---------- + method_name : str + """ + def forwarding_method( self: ClientQueryCompiler, *args, diff --git a/modin/core/execution/client/service.py b/modin/core/execution/client/service.py deleted file mode 100644 index 610b4e43acf..00000000000 --- a/modin/core/execution/client/service.py +++ /dev/null @@ -1,524 +0,0 @@ -from typing import Any, NamedTuple, Optional, Union -from uuid import UUID, uuid4 -from modin.core.io.io import BaseIO - -from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler - - -class ForwardingQueryCompilerService: - def __init__(self, query_compiler_type: BaseQueryCompiler, io_type: BaseIO): - self._qc = {} - self._qc_type = query_compiler_type - self._io_type = io_type - - def _generate_id(self) -> UUID: - id = uuid4() - while id in self._qc: - id = uuid4() - return id - - def add_query_compiler(self, qc) -> UUID: - id = self._generate_id() - self._qc[id] = qc - return id - - def to_pandas(self, id): - return self._qc[id].to_pandas() - - class DefaultToPandasResult(NamedTuple): - result: Optional[Any] - result_is_qc_id: bool - - def default_to_pandas( - self, id: UUID, pandas_op, *args, **kwargs - ) -> DefaultToPandasResult: - result = self._qc[id].default_to_pandas(pandas_op, *args, **kwargs) - result_is_qc_id = isinstance(result, self._qc_type) - if result_is_qc_id: - new_id = self._generate_id() - self._qc[new_id] = result - result = new_id - return self.DefaultToPandasResult( - result=result, result_is_qc_id=result_is_qc_id - ) - - def rename(self, id, new_col_labels=None, new_row_labels=None): - new_id = self._generate_id() - new_qc = self._qc[new_id] = self._qc[id].copy() - if new_col_labels is not None: - new_qc.columns = new_col_labels - if new_row_labels is not None: - new_qc.index = new_row_labels - return new_id - - def columns(self, id): - return self._qc[id].columns - - def index(self, id): - return self._qc[id].index - - def dtypes(self, id): - return self._qc[id].dtypes - - def insert(self, id, value_is_qc: bool, loc, column, value): - if value_is_qc: - value = self._qc[value] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].insert(loc, column, value) - return new_id - - def setitem(self, id, value_is_qc: bool, axis, key, value): - if value_is_qc: - value = self._qc[value] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].setitem(axis, key, value) - return new_id - - def getitem_array(self, id, key_is_qc: bool, 
key): - if key_is_qc: - key = self._qc[key] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].getitem_array(key) - return new_id - - def replace( - self, - id, - to_replace_is_qc: bool, - regex_is_qc: bool, - to_replace, - value, - inplace, - limit, - regex, - method, - ): - if to_replace_is_qc: - to_replace = self._qc[to_replace] - if regex_is_qc: - regex = self._qc[regex] - new_id = self._generate_id() - # TODO(GH#3108): Use positional arguments instead of keyword arguments - # in the query compilers so we don't have to name all the arguments - # here. - self._qc[new_id] = self._qc[id].replace( - to_replace=to_replace, - value=value, - inplace=inplace, - limit=limit, - regex=regex, - method=method, - ) - return new_id - - def fillna( - self, - id, - value_is_qc: bool, - squeeze_self, - squeeze_value, - value, - method, - axis, - inplace, - limit, - downcast, - ): - if value_is_qc: - value = self._qc[value] - new_id = self._generate_id() - # TODO(GH#3108): Use positional arguments instead of keyword arguments - # in the query compilers so we don't have to name all the - # arguments here. - self._qc[new_id] = self._qc[id].fillna( - squeeze_self=squeeze_self, - squeeze_value=squeeze_value, - value=value, - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - return new_id - - def concat(self, id, axis, other, **kwargs): - other = [self._qc[o] for o in other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].concat(axis, other, **kwargs) - return new_id - - def mod(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].mod(other, **kwargs) - return new_id - - def rmod(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].rmod(other, **kwargs) - return new_id - - def sub(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].sub(other, **kwargs) - return new_id - - def rsub(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].rsub(other, **kwargs) - return new_id - - def mul(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].mul(other, **kwargs) - return new_id - - def rmul(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].rmul(other, **kwargs) - return new_id - - def floordiv(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].floordiv(other, **kwargs) - return new_id - - def rfloordiv(self, id, other, is_qc, **kwargs): - if is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].rfloordiv(other, **kwargs) - return new_id - - def merge(self, id, right, **kwargs): - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].merge(self._qc[right], **kwargs) - return new_id - - def groupby_mean( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_mean( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def 
groupby_count( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_count( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_max( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_max( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_min( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_min( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_sum( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_sum( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_agg( - self, - id, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how="axis_wise", - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_agg( - by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop - ) - return new_id - - def read_csv(self, connection, filepath, **kwargs) -> UUID: - io_result = self._io_type._read_csv(filepath, **kwargs) - if isinstance(io_result, self._qc_type): - new_id = self._generate_id() - self._qc[new_id] = io_result - return new_id - return io_result - - def read_sql(self, sql, connection, **kwargs) -> UUID: - new_id = self._generate_id() - self._qc[new_id] = self._io_type._read_sql(sql, connection, **kwargs) - return new_id - - def to_sql( - self, - id, - name, - con, - schema=None, - if_exists="fail", - index=True, - index_label=None, - chunksize=None, - dtype=None, - method=None, - ): - self._io_type.to_sql( - self._qc[id], - name, - con, - schema, - if_exists, - index, - index_label, - chunksize, - dtype, - method, - ) - - -def _set_forwarding_method_for_single_id(method_name: str): - def forwarding_method( - self: "ForwardingQueryCompilerService", id: UUID, *args, **kwargs - ): - new_id = self._generate_id() - self._qc[new_id] = getattr(self._qc[id], method_name)(*args, **kwargs) - return new_id - - setattr(ForwardingQueryCompilerService, method_name, forwarding_method) - - -def _set_forwarding_method_for_binary_function(method_name: str): - def forwarding_method( - self: ForwardingQueryCompilerService, - id: UUID, - other_is_qc: bool, - other: Union[UUID, Any], - **kwargs, - ): - if other_is_qc: - other = self._qc[other] - new_id = self._generate_id() - self._qc[new_id] = getattr(self._qc[id], method_name)(other, **kwargs) - return new_id - - setattr(ForwardingQueryCompilerService, method_name, forwarding_method) - - -_BINARY_FORWARDING_METHODS = frozenset( - { - "eq", - "lt", - "le", - "gt", - "ge", - "ne", - "__and__", - "__or__", - "add", - "radd", - "truediv", - "rtruediv", - "mod", - "rmod", - "sub", - "rsub", - "mul", - "rmul", - "floordiv", - "rfloordiv", - } -) - -_SINGLE_ID_FORWARDING_METHODS = frozenset( - { - "columnarize", - "transpose", - "take_2d", 
- "getitem_column_array", - "getitem_row_array", - "pivot", - "get_dummies", - "drop", - "isna", - "notna", - "add_prefix", - "add_suffix", - "astype", - "dropna", - "sum", - "prod", - "count", - "mean", - "median", - "std", - "min", - "max", - "any", - "all", - "quantile_for_single_value", - "quantile_for_list_of_values", - "describe", - "set_index_from_columns", - "reset_index", - "sort_rows_by_column_values", - "sort_index", - "dt_nanosecond", - "dt_microsecond", - "dt_second", - "dt_minute", - "dt_hour", - "dt_day", - "dt_dayofweek", - "dt_weekday", - "dt_day_name", - "dt_dayofyear", - "dt_week", - "dt_weekofyear", - "dt_month", - "dt_month_name", - "dt_quarter", - "dt_year", - "str_capitalize", - "str_isalnum", - "str_isalpha", - "str_isdecimal", - "str_isdigit", - "str_islower", - "str_isnumeric", - "str_isspace", - "str_istitle", - "str_isupper", - "str_len", - "str_lower", - "str_title", - "str_upper", - "str_center", - "str_contains", - "str_count", - "str_endswith", - "str_find", - "str_index", - "str_rfind", - "str_findall", - "str_get", - "str_join", - "str_lstrip", - "str_ljust", - "str_rjust", - "str_match", - "str_pad", - "str_repeat", - "str_split", - "str_rsplit", - "str_rstrip", - "str_slice", - "str_slice_replace", - "str_startswith", - "str_strip", - "str_zfill", - "cummax", - "cummin", - "cumsum", - "cumprod", - "is_monotonic_increasing", - "is_monotonic_decreasing", - "idxmax", - "idxmin", - "query", - } -) - -for method in _SINGLE_ID_FORWARDING_METHODS: - _set_forwarding_method_for_single_id(method) - -for method in _BINARY_FORWARDING_METHODS: - _set_forwarding_method_for_binary_function(method) From 7d2751ae300edd9f7a018b9698d16d66033131e3 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Fri, 28 Oct 2022 21:23:15 -0500 Subject: [PATCH 48/77] Fix all docstrings and add ci.yml and push.yml. 
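
The six near-identical groupby_* methods on both the client and the container
collapse into factories that attach generated forwarding methods with setattr.
A self-contained toy version of the pattern, to make the closure behavior
explicit (Echo and Proxy are hypothetical stand-ins, not Modin classes):

    class Echo:
        def groupby_sum(self, id, by_is_qc, by):
            return f"groupby_sum(id={id}, by_is_qc={by_is_qc}, by={by})"

    class Proxy:
        def __init__(self, service, id):
            self._service = service
            self._id = id

    def _set_forwarding_groupby_method(method_name: str):
        # The generated method closes over method_name and ships self._id along.
        def forwarding_method(self, by, *args, **kwargs):
            return getattr(self._service, method_name)(
                self._id, False, by, *args, **kwargs
            )

        setattr(Proxy, method_name, forwarding_method)

    for name in ("groupby_sum",):
        _set_forwarding_groupby_method(name)

    assert Proxy(Echo(), 42).groupby_sum("col") == "groupby_sum(id=42, by_is_qc=False, by=col)"

Because method_name is a parameter of the factory rather than a bare loop
variable, each generated method binds the right name; building the closures
directly in the loop would late-bind and send every call to the last method
in the list.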
Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml                      |   8 +-
 .github/workflows/push.yml                    |   4 +
 modin/core/execution/client/container.py      | 144 +++-------------
 modin/core/execution/client/query_compiler.py | 168 +++---------------
 4 files changed, 65 insertions(+), 259 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 92c0768f0f6..ec006744465 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -339,7 +339,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        execution: [BaseOnPython]
+        execution: [BaseOnPython, Client]
     env:
       MODIN_TEST_DATASET_SIZE: "small"
     name: Test ${{ matrix.execution }} execution, Python 3.8
@@ -372,7 +372,11 @@ jobs:
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - run: pytest modin/experimental/xgboost/test/test_default.py --execution=${{ matrix.execution }}
+        # Client execution doesn't need to work with xgboost
+        if: matrix.execution != 'Client'
       - run: python -m pytest -n 2 modin/test/storage_formats/base/test_internals.py --execution=${{ matrix.execution }}
+        # Client execution has different internals that we don't test yet
+        if: matrix.execution != 'Client'
       - run: pytest -n 2 modin/pandas/test/dataframe/test_binary.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_default.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_indexing.py --execution=${{ matrix.execution }}
@@ -383,6 +387,8 @@ jobs:
       - run: pytest -n 2 modin/pandas/test/dataframe/test_udf.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_window.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_pickle.py --execution=${{ matrix.execution }}
+        # Client execution doesn't need to pickle modin.pandas objects.
+        if: matrix.execution != 'Client'
       - run: python -m pytest -n 2 modin/pandas/test/test_series.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_concat.py --execution=${{ matrix.execution }}
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index eb3ea952f2e..04fc359419b 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -84,6 +84,8 @@ jobs:
       - name: Install HDF5
         run: sudo apt update && sudo apt install -y libhdf5-dev
       - run: pytest -n 2 modin/experimental/xgboost/test/test_default.py --execution=${{ matrix.execution }}
+        # Client execution doesn't need to work with xgboost
+        if: matrix.execution != 'Client'
       - run: pytest -n 2 modin/pandas/test/dataframe/test_binary.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_default.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_indexing.py --execution=${{ matrix.execution }}
@@ -94,6 +96,8 @@ jobs:
      - run: pytest -n 2 modin/pandas/test/dataframe/test_udf.py --execution=${{ matrix.execution }}
      - run: pytest -n 2 modin/pandas/test/dataframe/test_window.py --execution=${{ matrix.execution }}
      - run: pytest -n 2 modin/pandas/test/dataframe/test_pickle.py --execution=${{ matrix.execution }}
+        # Client execution doesn't need to pickle modin.pandas objects.
+ if: matrix.execution != 'Client' - run: python -m pytest -n 2 modin/pandas/test/test_series.py --execution=${{ matrix.execution }} - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py --execution=${{ matrix.execution }} - run: python -m pytest -n 2 modin/pandas/test/test_concat.py --execution=${{ matrix.execution }} diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py index 52879e520d0..65e86837172 100644 --- a/modin/core/execution/client/container.py +++ b/modin/core/execution/client/container.py @@ -466,121 +466,7 @@ def merge(self, id, right, **kwargs): self._qc[new_id] = self._qc[id].merge(self._qc[right], **kwargs) return new_id - def groupby_mean( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_mean( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_count( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_count( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_max( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_max( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_min( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_min( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_sum( - self, - id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_sum( - by, axis, groupby_kwargs, agg_args, agg_kwargs, drop - ) - return new_id - - def groupby_agg( - self, - id, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how="axis_wise", - drop=False, - is_qc: bool = False, - ): - if is_qc: - by = self._qc[by] - new_id = self._generate_id() - self._qc[new_id] = self._qc[id].groupby_agg( - by, agg_func, axis, groupby_kwargs, agg_args, agg_kwargs, how, drop - ) - return new_id + ### I/O methods go below. ### def read_csv(self, connection, filepath, **kwargs) -> UUID: """ @@ -651,9 +537,28 @@ def to_sql(self, id, **kwargs) -> None: self._io_class.to_sql(self._qc[id], **kwargs) +def _set_forwarding_groupby_method(method_name: str): + """ + Define a groupby method that forwards arguments to an inner query compiler. + + Parameters + ---------- + method_name : str + """ + + def forwarding_method(self, id, by_is_qc, by, *args, **kwargs): + if by_is_qc: + by = self._qc[by] + new_id = self._generate_id() + self._qc[new_id] = getattr(self._qc[id], method_name)(by, *args, **kwargs) + return new_id + + setattr(ForwardingQueryCompilerContainer, method_name, forwarding_method) + + def _set_forwarding_method_for_single_id(method_name: str): """ - Define a method that forwards arguments to the inner query compiler. 
+ Define a method that forwards arguments to an inner query compiler. Parameters ---------- @@ -672,7 +577,7 @@ def forwarding_method( def _set_forwarding_method_for_binary_function(method_name: str): """ - Define a binary method that forwards arguments to the inner query compiler. + Define a binary method that forwards arguments to an inner query compiler. Parameters ---------- @@ -695,6 +600,8 @@ def forwarding_method( setattr(ForwardingQueryCompilerContainer, method_name, forwarding_method) +_GROUPBY_FORWARDING_METHODS = frozenset({"mean", "count", "max", "min", "sum", "agg"}) + _BINARY_FORWARDING_METHODS = frozenset( { "eq", @@ -824,3 +731,6 @@ def forwarding_method( for method in _BINARY_FORWARDING_METHODS: _set_forwarding_method_for_binary_function(method) + +for method in _GROUPBY_FORWARDING_METHODS: + _set_forwarding_groupby_method("groupby_" + method) diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 4d0bdb13f09..68b5474a4ab 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -220,144 +220,6 @@ def concat(self, axis, other, **kwargs): def merge(self, right, **kwargs): return self.__constructor__(self._service.merge(self._id, right._id, **kwargs)) - def groupby_mean( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_mean( - self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc - ) - ) - - def groupby_count( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_count( - self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc - ) - ) - - def groupby_max( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_max( - self._id, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop, - is_qc, - ) - ) - - def groupby_min( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_min( - self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc - ) - ) - - def groupby_sum( - self, - by, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_sum( - self._id, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop, is_qc - ) - ) - - def groupby_agg( - self, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how="axis_wise", - drop=False, - ): - if isinstance(by, type(self)): - by = by._id - is_qc = True - else: - is_qc = False - return self.__constructor__( - self._service.groupby_agg( - self._id, - by, - agg_func, - axis, - groupby_kwargs, - agg_args, - agg_kwargs, - how, - drop, - is_qc, - ) - ) - def get_index_names(self, axis=0): if axis == 0: return self.index.names @@ -382,6 +244,26 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): raise 
NotImplementedError
+
+def _set_forwarding_groupby_method(method_name: str):
+    """
+    Define a groupby method that forwards arguments to the service.
+
+    Parameters
+    ----------
+    method_name : str
+    """
+
+    def forwarding_method(self, by, *args, **kwargs):
+        by_is_qc: bool = isinstance(by, type(self))
+        if by_is_qc:
+            by = by._id
+        return self.__constructor__(
+            getattr(self._service, method_name)(self._id, by_is_qc, by, *args, **kwargs)
+        )
+
+    setattr(ClientQueryCompiler, method_name, forwarding_method)
+
+
 def _set_forwarding_method_for_binary_function(method_name: str) -> None:
     """
     Define a binary method that forwards arguments to the service.
@@ -427,6 +309,8 @@ def forwarding_method(
     setattr(ClientQueryCompiler, method_name, forwarding_method)
 
 
+_GROUPBY_FORWARDING_METHODS = frozenset({"mean", "count", "max", "min", "sum", "agg"})
+
 _BINARY_FORWARDING_METHODS = frozenset(
     {
         "eq",
@@ -452,9 +336,6 @@ def forwarding_method(
     }
 )
 
-for method in _BINARY_FORWARDING_METHODS:
-    _set_forwarding_method_for_binary_function(method)
-
 _SINGLE_ID_FORWARDING_METHODS = frozenset(
     {
         "columnarize",
@@ -554,6 +435,11 @@ def forwarding_method(
     }
 )
 
+for method in _BINARY_FORWARDING_METHODS:
+    _set_forwarding_method_for_binary_function(method)
 
 for method in _SINGLE_ID_FORWARDING_METHODS:
     _set_forwarding_method_for_single_id(method)
+
+for method in _GROUPBY_FORWARDING_METHODS:
+    _set_forwarding_groupby_method("groupby_" + method)
From aa5be586423282abf5c0eea20ff3c5030965d79d Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Fri, 28 Oct 2022 21:26:47 -0500
Subject: [PATCH 49/77] Add binary methods from Hazem's dfce9189226190bddf6aacab35cbcf44e1a74977.

Signed-off-by: mvashishtha
---
 modin/core/execution/client/container.py      | 6 ++++++
 modin/core/execution/client/query_compiler.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py
index 65e86837172..e06f6dd6278 100644
--- a/modin/core/execution/client/container.py
+++ b/modin/core/execution/client/container.py
@@ -624,6 +624,12 @@ def forwarding_method(
         "rmul",
         "floordiv",
         "rfloordiv",
+        "__rand__",
+        "__ror__",
+        "__xor__",
+        "__rxor__",
+        "pow",
+        "rpow",
     }
 )
 
diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py
index 68b5474a4ab..546bc2685e9 100644
--- a/modin/core/execution/client/query_compiler.py
+++ b/modin/core/execution/client/query_compiler.py
@@ -333,6 +333,12 @@ def forwarding_method(
         "rmul",
         "floordiv",
         "rfloordiv",
+        "__rand__",
+        "__ror__",
+        "__xor__",
+        "__rxor__",
+        "pow",
+        "rpow",
     }
 )
 
From 1e3bdc640a0ef8d1d4bcdef2f8452e7bd75322f6 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Fri, 28 Oct 2022 22:50:46 -0500
Subject: [PATCH 50/77] Fix CI failures.
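
A pattern behind several of these test fixes: executions that default to
pandas must assert the warning, the rest must not, so the assertion context
is chosen at runtime. A minimal runnable sketch of that pattern (the warning
text and trigger are illustrative; the real tests use
warns_that_defaulting_to_pandas from Modin's test utilities):

    import contextlib
    import warnings

    import pytest

    @contextlib.contextmanager
    def _nullcontext():
        # Shim for contextlib.nullcontext, which older Python versions lack.
        yield

    def check_read(defaults_to_pandas: bool):
        ctx = pytest.warns(UserWarning) if defaults_to_pandas else _nullcontext()
        with ctx:
            if defaults_to_pandas:
                warnings.warn("Defaulting to pandas implementation.", UserWarning)

    check_read(True)   # passes: the expected warning is raised
    check_read(False)  # passes: no warning is required or raised

The same test body then serves BaseOnPython, Client, and the distributed
executions without duplication.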
Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml                   |  2 --
 modin/conftest.py                          | 33 +++++++++++--------
 .../storage_formats/base/query_compiler.py |  2 +-
 .../storage_formats/hdk/query_compiler.py  |  6 +++-
 modin/pandas/test/dataframe/test_iter.py   | 13 +++++++-
 5 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ec006744465..3e4ff320f08 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -375,8 +375,6 @@ jobs:
         # Client execution doesn't need to work with xgboost
         if: matrix.execution != 'Client'
       - run: python -m pytest -n 2 modin/test/storage_formats/base/test_internals.py --execution=${{ matrix.execution }}
-        # Client execution has different internals that we don't test yet
-        if: matrix.execution != 'Client'
       - run: pytest -n 2 modin/pandas/test/dataframe/test_binary.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_default.py --execution=${{ matrix.execution }}
       - run: pytest -n 2 modin/pandas/test/dataframe/test_indexing.py --execution=${{ matrix.execution }}
diff --git a/modin/conftest.py b/modin/conftest.py
index 53c1eac3022..b6e7be44ec8 100644
--- a/modin/conftest.py
+++ b/modin/conftest.py
@@ -53,7 +53,6 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url):
     BaseQueryCompiler,
 )
 from modin.core.execution.client.io import ClientIO  # noqa: E402
-from modin.core.execution.client.query_compiler import ClientQueryCompiler  # noqa: E402
 from modin.core.execution.client.container import (  # noqa: E402
     ForwardingQueryCompilerContainer,
 )
@@ -280,18 +279,6 @@ def set_base_on_python_execution():
     modin.set_execution(engine="python", storage_format="Base")
 
 
-class TestClientQueryCompiler(ClientQueryCompiler):
-    @classmethod
-    def from_pandas(cls, df, data_cls):
-        return cls(cls._service.add_query_compiler(TestQC.from_pandas(df, data_cls)))
-
-    def default_to_pandas(self, pandas_op, *args, **kwargs):
-        result = self._service.default_to_pandas(self._id, pandas_op, *args, **kwargs)
-        if result.result_is_qc_id:
-            return self.__constructor__(result.result)
-        return result.result
-
-
 class ClientFactory(factories.BaseFactory):
     @classmethod
     def prepare(cls):
@@ -299,6 +286,26 @@ def prepare(cls):
 
 
 def set_client_execution():
+    # Can't always import ClientQueryCompiler, because it uses NoDefault, which
+    # is not available on older pandas.
+
+    from modin.core.execution.client.query_compiler import ClientQueryCompiler
+
+    class TestClientQueryCompiler(ClientQueryCompiler):
+        @classmethod
+        def from_pandas(cls, df, data_cls):
+            return cls(
+                cls._service.add_query_compiler(TestQC.from_pandas(df, data_cls))
+            )
+
+        def default_to_pandas(self, pandas_op, *args, **kwargs):
+            result = self._service.default_to_pandas(
+                self._id, pandas_op, *args, **kwargs
+            )
+            if result.result_is_qc_id:
+                return self.__constructor__(result.result)
+            return result.result
+
     service = ForwardingQueryCompilerContainer(BaseQueryCompiler, PandasOnPythonIO)
     ClientQueryCompiler.set_server_connection(service)
     ClientIO.query_compiler_cls = TestClientQueryCompiler
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index 4d674d8c4d2..bb10ef73385 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -3155,7 +3155,7 @@ def mask(idx):
             return (
                 self.getitem_column_array(idx, numeric=True)
                 if axis
-                else self.getitem_row_array(idx)
+                else self.getitem_row_array(idx, numeric=True)
             )
 
         if 0 <= loc < len(self.get_axis(axis)):
diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py
index 7ac735b4a02..b07348e84da 100644
--- a/modin/experimental/core/storage_formats/hdk/query_compiler.py
+++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py
@@ -555,7 +555,11 @@ def concat(self, axis, other, **kwargs):
         )
         return self.__constructor__(new_modin_frame)
 
-    def drop(self, index=None, columns=None):
+    def drop(self, index=None, columns=None, errors: str = "raise"):
+        # `errors` parameter needs to be part of the function signature because
+        # other query compilers may not take care of error handling at the API
+        # layer. This query compiler assumes there won't be any errors due to
+        # invalid keys.
         assert index is None, "Only column drop is supported"
         return self.__constructor__(
             self._modin_frame.take_2d_labels_or_positional(
diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index d678ee11efa..9b6d5ad15bb 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -13,10 +13,12 @@
 
 import pytest
 
+import contextlib
 import numpy as np
 import pandas
 import matplotlib
 import modin.pandas as pd
+from modin.utils import get_current_execution
 from pandas._testing import ensure_clean
 import warnings
 
@@ -39,6 +41,12 @@
 matplotlib.use("Agg")
 
 
+@contextlib.contextmanager
+def _nullcontext():
+    """Replacement for contextlib.nullcontext missing in older Python."""
+    yield
+
+
 @pytest.mark.parametrize("method", ["items", "iteritems", "iterrows"])
 def test_items_iteritems_iterrows(method):
     data = test_data["float_nan_data"]
@@ -230,7 +238,10 @@ def test___repr__():
     with open(path, "w") as f:
         f.write(string_data)
     pandas_df = pandas.read_csv(path)
-    with warns_that_defaulting_to_pandas():
+    with warns_that_defaulting_to_pandas() if get_current_execution() in (
+        "BaseOnPython",
+        "Client",
+    ) else _nullcontext():
         modin_df = pd.read_csv(path)
     assert repr(pandas_df) == repr(modin_df)
 
From f9e0605818fc7be05091dd76a94f25190238dc3d Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Fri, 28 Oct 2022 23:44:22 -0500
Subject: [PATCH 51/77] Fix more tests.
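
The container docstring now annotates query_compiler_class and io_class as
plain type, which matches how they are used: the container never instantiates
either, it only calls classmethods on whatever classes it was handed. A toy
version of that dependency-injection pattern (both classes below are
hypothetical):

    class CsvIO:
        @classmethod
        def _read_csv(cls, path, **kwargs):
            return f"frame from {path}"

    class Container:
        def __init__(self, io_class: type):
            # Store the class itself; classmethods are looked up on it later.
            self._io_class = io_class

        def read_csv(self, path, **kwargs):
            return self._io_class._read_csv(path, **kwargs)

    assert Container(CsvIO).read_csv("data.csv") == "frame from data.csv"

Any class exposing the same classmethods is interchangeable, which is exactly
what lets the test harness swap PandasOnPythonIO for BaseOnPythonIO without
touching the container.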
Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml                 |  5 +++++
 modin/conftest.py                        |  2 +-
 modin/core/execution/client/container.py | 11 ++++++-----
 modin/pandas/test/dataframe/test_iter.py |  1 +
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3e4ff320f08..b4a60af3820 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -393,6 +393,11 @@ jobs:
       - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
+      - run: |
+          python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
+          python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
+        # Client has to be able to do CSV and SQL I/O.
+        if: matrix.execution == 'Client'
       - uses: codecov/codecov-action@v2

   test-hdk:
diff --git a/modin/conftest.py b/modin/conftest.py
index b6e7be44ec8..da83c4c8a68 100644
--- a/modin/conftest.py
+++ b/modin/conftest.py
@@ -306,7 +306,7 @@ def default_to_pandas(self, pandas_op, *args, **kwargs):
             return self.__constructor__(result.result)
         return result.result

-    service = ForwardingQueryCompilerContainer(BaseQueryCompiler, PandasOnPythonIO)
+    service = ForwardingQueryCompilerContainer(BaseQueryCompiler, BaseOnPythonIO)
     ClientQueryCompiler.set_server_connection(service)
     ClientIO.query_compiler_cls = TestClientQueryCompiler
     ClientIO.set_server_connection(service)
diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py
index e06f6dd6278..ea341c64255 100644
--- a/modin/core/execution/client/container.py
+++ b/modin/core/execution/client/container.py
@@ -28,13 +28,14 @@ class ForwardingQueryCompilerContainer:

     Parameters
     ----------
-    query_compiler_class : BaseQueryCompiler
-        Query compiler class to contain.
-    io_class : BaseIO
-        The IO class to use for reading and writing data.
+    query_compiler_class : type
+        Query compiler class to contain. Should be a subclass of BaseQueryCompiler.
+    io_class : type
+        The IO class to use for reading and writing data. Should be a subclass
+        of modin.core.io.io.BaseIO.
     """

-    def __init__(self, query_compiler_class: BaseQueryCompiler, io_class: BaseIO):
+    def __init__(self, query_compiler_class: type, io_class: type):
         self._qc = {}
         self._query_compiler_class = query_compiler_class
         self._io_class = io_class
diff --git a/modin/pandas/test/dataframe/test_iter.py b/modin/pandas/test/dataframe/test_iter.py
index 9b6d5ad15bb..4fab87739cf 100644
--- a/modin/pandas/test/dataframe/test_iter.py
+++ b/modin/pandas/test/dataframe/test_iter.py
@@ -240,6 +240,7 @@ def test___repr__():
     pandas_df = pandas.read_csv(path)
     with warns_that_defaulting_to_pandas() if get_current_execution() in (
         "BaseOnPython",
+        "PandasOnPython",
         "Client",
     ) else _nullcontext():
         modin_df = pd.read_csv(path)

From 09c07f93852f7acc4981800dd522c9620a537abf Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Fri, 28 Oct 2022 23:46:27 -0500
Subject: [PATCH 52/77] Fix flake8.
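flake8 flags modin.core.io.io.BaseIO as an unused import (F401) now that the
previous patch annotates the container's arguments as plain `type`. For
reference, a sketch of the signature that makes the import unnecessary:

    # The container receives classes, not instances, so `type` is the
    # honest annotation and no BaseIO import is needed.
    def __init__(self, query_compiler_class: type, io_class: type):
        self._qc = {}
        self._query_compiler_class = query_compiler_class
        self._io_class = io_class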
Signed-off-by: mvashishtha
---
 modin/core/execution/client/container.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py
index ea341c64255..8c998bd5c24 100644
--- a/modin/core/execution/client/container.py
+++ b/modin/core/execution/client/container.py
@@ -18,7 +18,6 @@
 from typing import Any, NamedTuple, Optional, Union
 from uuid import UUID, uuid4

-from modin.core.io.io import BaseIO
 from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler

From 0f53343f77cbd4bdc1bf0cc0d82c6cacaf846e4b Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Sat, 29 Oct 2022 00:19:30 -0500
Subject: [PATCH 53/77] Fix some tests.

Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml         |  3 ++-
 .github/workflows/push.yml       |  6 ++++++
 modin/pandas/test/test_series.py | 11 +++++++----
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b4a60af3820..f97e5f77ed4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -393,7 +393,8 @@ jobs:
       - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
-      - run: |
+      - name: I/O tests
+        run: |
           python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
           python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
         # Client has to be able to do CSV and SQL I/O.
         if: matrix.execution == 'Client'
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 04fc359419b..92e5c042221 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -104,6 +104,12 @@ jobs:
       - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
+      - name: I/O tests
+        run: |
+          python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
+          python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
+        # Client has to be able to do CSV and SQL I/O.
+        if: matrix.execution == 'Client'
       - uses: codecov/codecov-action@v2

   test-hdk:
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 61ff44770b4..458552bbe2f 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -1739,11 +1739,14 @@ def dt_with_empty_partition(lib):
     df_b = lib.DataFrame({"B": [lib.to_datetime("27/10/2020")]})
     df = lib.concat([df_a, df_b], axis=1)
     eval_result = df.eval("B - A", engine="python")
-    # BaseOnPython had a single partition after the concat, and it
-    # maintains that partition after eval. In other execution modes,
-    # eval() should re-split the result into two column partitions,
+    # BaseOnPython and Client have a single partition after the concat,
+    # and they maintain that partition after eval. In other execution
+    # modes, eval() should re-split the result into two column partitions,
     # one of which is empty.
- if isinstance(df, pd.DataFrame) and get_current_execution() != "BaseOnPython": + if isinstance(df, pd.DataFrame) and get_current_execution() not in ( + "BaseOnPython", + "Client", + ): assert eval_result._query_compiler._modin_frame._partitions.shape == (1, 2) return eval_result.dt.days From ff9478200cb51372a9ad468b6bf9b7b4ca75c4cf Mon Sep 17 00:00:00 2001 From: Mahesh Vashishtha Date: Sat, 29 Oct 2022 00:39:36 -0500 Subject: [PATCH 54/77] Update modin/core/execution/client/io.py --- modin/core/execution/client/io.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 679ce7017ba..80af63e1f98 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -67,10 +67,11 @@ def read_csv(cls, filepath_or_buffer, **kwargs): """ if isinstance(filepath_or_buffer, str): filepath_or_buffer = fsspec.open(filepath_or_buffer).full_name - if filepath_or_buffer.startswith("file://"): + file_protocol = "file://" + if filepath_or_buffer.startswith(file_protocol): # We will do this so that the backend can know whether this # is a path or a URL. - filepath_or_buffer = filepath_or_buffer[7:] + filepath_or_buffer = filepath_or_buffer[len(file_protocol):] else: raise NotImplementedError("Only filepaths are supported for read_csv") if cls._server_conn is None: From d4fbf0acdd6ba42d663ffe69730863b007e8ac8b Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Sat, 29 Oct 2022 00:42:05 -0500 Subject: [PATCH 55/77] Fix black. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 80af63e1f98..091d33fbe52 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -71,7 +71,7 @@ def read_csv(cls, filepath_or_buffer, **kwargs): if filepath_or_buffer.startswith(file_protocol): # We will do this so that the backend can know whether this # is a path or a URL. - filepath_or_buffer = filepath_or_buffer[len(file_protocol):] + filepath_or_buffer = filepath_or_buffer[len(file_protocol) :] else: raise NotImplementedError("Only filepaths are supported for read_csv") if cls._server_conn is None: From 6119b8eb6c4ab6c6e87df7d92677ea2aab9ff65d Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Sat, 29 Oct 2022 01:22:29 -0500 Subject: [PATCH 56/77] Fix omnisci by restoring lazy execution check. 
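Checking `len(obj.index)` forces a lazy query compiler to materialize its
index, which the OmniSci execution relies on avoiding, so the emptiness filter
in concat() is gated on `lazy_execution` again. The call shape this affects,
as exercised by the newly xfailed test below:

    import modin.pandas as pd

    # Eager executions filter the empty frame out before concatenating;
    # lazy ones (OmniSci, Client) forward every object to the backend.
    pd.concat([pd.DataFrame(), pd.DataFrame({"a": [1, 2]})])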
Signed-off-by: mvashishtha
---
 modin/pandas/general.py          | 3 ++-
 modin/pandas/test/test_concat.py | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index b2993b4c9bc..a34dac3a2e5 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -472,7 +472,8 @@ def concat(
     list_of_objs = [
         obj._query_compiler
         for obj in list_of_objs
-        if len(obj.index) or len(obj.columns)
+        if (not obj._query_compiler.lazy_execution and len(obj.index))
+        or len(obj.columns)
     ]
     if keys is not None:
         if all_series:
diff --git a/modin/pandas/test/test_concat.py b/modin/pandas/test/test_concat.py
index 12d23892b2e..4ed4af2c5e5 100644
--- a/modin/pandas/test/test_concat.py
+++ b/modin/pandas/test/test_concat.py
@@ -170,6 +170,10 @@ def test_concat_series_only():
     )


+@pytest.mark.xfail_executions(
+    "Client",
+    reason="Client query compiler has lazy_execution=True, so it doesn't detect any frames when looking for query compilers here: https://github.com/modin-project/modin/blob/f492ba9888fc05ff7c224db8a22faac8c0106a4b/modin/pandas/general.py#L472-L477",
+)
 def test_concat_with_empty_frame():
     modin_empty_df = pd.DataFrame()
     pandas_empty_df = pandas.DataFrame()

From 7db25b70638b65f1db173f08ad1f60e1ec22107d Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Sat, 29 Oct 2022 02:11:51 -0500
Subject: [PATCH 57/77] Try fixing Client io yml.

Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f97e5f77ed4..93593fe3a65 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -393,11 +393,11 @@ jobs:
       - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
-      - name: I/O tests
-        run: |
-          python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
-          python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
-        # Client has to be able to do CSV and SQL I/O.
+      - run: python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
+        # Client has to be able to do CSV I/O.
         if: matrix.execution == 'Client'
+      - run: python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
+        # Client has to be able to do SQL I/O.
+        if: matrix.execution == 'Client'
       - uses: codecov/codecov-action@v2

   test-hdk:

From 86bbc75e331f1d9190cf9baed3fbf3141e43638b Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Sat, 29 Oct 2022 02:37:03 -0500
Subject: [PATCH 58/77] Make test dataset size normal so I/O tests pass.
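The Client I/O tests only pass with a NORMAL-sized test dataset, so the
workflow sets MODIN_TEST_DATASET_SIZE inline for just these two pytest runs.
The variable has to be in the environment before the test suite starts; a
sketch of the equivalent from Python:

    import os

    # Must be set before the test utilities are imported, since they
    # read the dataset size when building their test frames.
    os.environ["MODIN_TEST_DATASET_SIZE"] = "NORMAL"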
Signed-off-by: mvashishtha
---
 .github/workflows/ci.yml | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 93593fe3a65..10224ab1406 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -393,11 +393,13 @@ jobs:
       - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
       - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
-      - run: python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
-        # Client has to be able to do CSV I/O.
-        if: matrix.execution == 'Client'
-      - run: python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
-        # Client has to be able to do SQL I/O.
+      - name: Test I/O
+        # note that if the test dataset size is small, like for the other
+        # tests in this job, the tests fail.
+        run: |
+          MODIN_TEST_DATASET_SIZE=NORMAL python -m pytest modin/pandas/test/test_io.py::TestCsv --execution=${{ matrix.execution }}
+          MODIN_TEST_DATASET_SIZE=NORMAL python -m pytest modin/pandas/test/test_io.py::TestSql --execution=${{ matrix.execution }}
+        # Client has to be able to do CSV and SQL I/O.
         if: matrix.execution == 'Client'
       - uses: codecov/codecov-action@v2

 test-hdk:

From 8826c547c5335002594e602ce87cb95b0850963e Mon Sep 17 00:00:00 2001
From: Mahesh Vashishtha
Date: Mon, 31 Oct 2022 10:06:54 -0500
Subject: [PATCH 59/77] Apply suggestions from code review

Co-authored-by: Karthik Velayutham
---
 modin/core/storage_formats/pandas/query_compiler.py           | 2 +-
 modin/experimental/core/storage_formats/hdk/query_compiler.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 3a8e78822d5..1f10ff00f7c 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2359,7 +2359,7 @@ def drop(self, index=None, columns=None, errors: str = "raise"):
         # `errors` parameter needs to be part of the function signature because
         # other query compilers may not take care of error handling at the API
         # layer. This query compiler assumes there won't be any errors due to
-        # invald keys.
+        # invalid keys.
         if index is not None:
             index = np.sort(self.index.get_indexer_for(self.index.difference(index)))
         if columns is not None:
diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py
index b07348e84da..711068621e9 100644
--- a/modin/experimental/core/storage_formats/hdk/query_compiler.py
+++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py
@@ -555,7 +555,7 @@ def drop(self, index=None, columns=None, errors: str = "raise"):
         # `errors` parameter needs to be part of the function signature because
         # other query compilers may not take care of error handling at the API
         # layer. This query compiler assumes there won't be any errors due to
-        # invald keys.
+        # invalid keys.
         assert index is None, "Only column drop is supported"
         return self.__constructor__(
             self._modin_frame.take_2d_labels_or_positional(

From 584ef10cfcdc331ca27585c79f6497a13664c226 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Mon, 31 Oct 2022 12:02:11 -0500
Subject: [PATCH 60/77] Address comments.
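Code review asked for plain os.path handling in read_csv instead of fsspec: a
local path is made absolute before being forwarded, because the service may
run with a different working directory on the same machine, while anything
that does not exist locally is passed through untouched. A sketch of the
dispatch (the paths are illustrative):

    import os

    path = "data/table.csv"
    if os.path.exists(path):
        # Local file: resolve it so the service can open it no matter
        # what its own working directory is.
        path = os.path.abspath(path)
    # Nonexistent locally (e.g. "s3://bucket/table.csv"): forwarded
    # unchanged for the service to interpret.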
Signed-off-by: mvashishtha
---
 modin/core/execution/client/io.py           | 18 ++++++++----------
 .../storage_formats/base/query_compiler.py  |  2 ++
 modin/pandas/base.py                        | 12 ++----------
 3 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py
index 091d33fbe52..be165e168fb 100644
--- a/modin/core/execution/client/io.py
+++ b/modin/core/execution/client/io.py
@@ -14,7 +14,7 @@
 """The module holds the factory which performs I/O using pandas on a Client."""

 from modin.core.io.io import BaseIO
-import fsspec
+import os
 import pandas

@@ -65,19 +65,17 @@ def read_csv(cls, filepath_or_buffer, **kwargs):
         self.query_compiler_cls
             Query compiler with CSV data read in.
         """
-        if isinstance(filepath_or_buffer, str):
-            filepath_or_buffer = fsspec.open(filepath_or_buffer).full_name
-            file_protocol = "file://"
-            if filepath_or_buffer.startswith(file_protocol):
-                # We will do this so that the backend can know whether this
-                # is a path or a URL.
-                filepath_or_buffer = filepath_or_buffer[len(file_protocol) :]
-        else:
-            raise NotImplementedError("Only filepaths are supported for read_csv")
         if cls._server_conn is None:
             raise ConnectionError(
                 "Missing server connection, did you initialize the connection?"
             )
+        if not isinstance(filepath_or_buffer, str):
+            raise NotImplementedError("Only filepaths are supported for read_csv")
+        if os.path.exists(filepath_or_buffer):
+            # In case this is a local path, we should use the absolute path
+            # because the service might be running in a different directory
+            # on the same machine.
+            filepath_or_buffer = os.path.abspath(filepath_or_buffer)
         server_result = cls._server_conn.read_csv(
             cls._data_conn, filepath_or_buffer, **kwargs
         )
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index bb10ef73385..8764b369257 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -2162,6 +2162,8 @@ def getitem_row_array(self, key: List[Hashable], numeric: bool = False):
         key : list-like
             Numeric indices of the rows to pick.
         numeric : bool, default: False
+            Whether the key passed in represents the numeric row positions
+            or the possibly non-numeric row labels.

         Returns
         -------
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index b1609a35684..87271fd8053 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -3181,16 +3181,8 @@ def __getitem__(self, key):
         BasePandasDataset
             Located dataset.
         """
-        if not self._query_compiler.lazy_execution:
-            if len(self) == 0:
-                return self._default_to_pandas("__getitem__", key)
-            # fastpath for common case
-            if isinstance(key, str) and key in self._query_compiler.columns:
-                return self._getitem(key)
-            elif is_list_like(key) and all(
-                k in self._query_compiler.columns for k in key
-            ):
-                return self._getitem(key)
+        if not self._query_compiler.lazy_execution and len(self) == 0:
+            return self._default_to_pandas("__getitem__", key)
         # see if we can slice the rows
         # This lets us reuse code in pandas to error check
         indexer = None

From 187115403a0dd7d5dbb0c8cb0d7411584c523986 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Mon, 31 Oct 2022 23:51:50 -0500
Subject: [PATCH 61/77] Respond to comments.
Signed-off-by: mvashishtha --- modin/conftest.py | 2 +- modin/pandas/test/dataframe/test_map_metadata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/conftest.py b/modin/conftest.py index da83c4c8a68..13571d45487 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -364,8 +364,8 @@ def pytest_configure(config): set_base_on_python_execution() config.addinivalue_line("filterwarnings", default_to_pandas_ignore_string) elif execution == "Client": - config.addinivalue_line("filterwarnings", default_to_pandas_ignore_string) set_client_execution() + config.addinivalue_line("filterwarnings", default_to_pandas_ignore_string) else: partition, engine = execution.split("On") modin.set_execution(engine=engine, storage_format=partition) diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index 37a3a1cd0d4..5a82bbd72e6 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -715,7 +715,7 @@ def test_drop(): # TODO(https://github.com/modin-project/modin/issues/5163): raise a # KeyError like pandas when the label is not found when lazy_execution is - # off. Also use df_equals instead of + # off. check_exception_type = modin_simple._query_compiler.lazy_execution eval_general( modin_simple, From 17073902b0c6d4d20afe3634064964a162feed7c Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 1 Nov 2022 00:12:02 -0500 Subject: [PATCH 62/77] Fix fuzzydata by making getitem_row_array use numeric=True everywhere. Signed-off-by: mvashishtha --- .../execution/ray/implementations/pandas_on_ray/io/io.py | 2 +- modin/core/storage_formats/base/query_compiler.py | 2 +- modin/core/storage_formats/pandas/query_compiler.py | 6 +++--- modin/pandas/base.py | 4 +++- modin/pandas/series.py | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py b/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py index 6922ef0406e..329ddea8075 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py @@ -91,7 +91,7 @@ def to_sql(cls, qc, **kwargs): # since the mapping operation is non-blocking, each partition will return an empty DF # so at the end, the blocking operation will be this empty DF to_pandas - empty_df = qc.getitem_row_array([0]).to_pandas().head(0) + empty_df = qc.getitem_row_array([0], numeric=True).to_pandas().head(0) empty_df.to_sql(**kwargs) # so each partition will append its respective DF kwargs["if_exists"] = "append" diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 8764b369257..52ad27dea8a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -2153,7 +2153,7 @@ def get_column(df, key): return DataFrameDefault.register(get_column)(self, key=key) - def getitem_row_array(self, key: List[Hashable], numeric: bool = False): + def getitem_row_array(self, key: List[Hashable], numeric: bool): """ Get row data for target indices. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 1f10ff00f7c..a9e5d62c749 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2221,7 +2221,7 @@ def getitem_array(self, key): # requested. 
key = pandas.RangeIndex(len(self.index))[key] if len(key): - return self.getitem_row_array(key) + return self.getitem_row_array(key, numeric=True) else: return self.from_pandas( pandas.DataFrame(columns=self.columns), type(self._modin_frame) @@ -2247,7 +2247,7 @@ def getitem_column_array(self, key, numeric=False): ) return self.__constructor__(new_modin_frame) - def getitem_row_array(self, key: List[Hashable], numeric: bool = False): + def getitem_row_array(self, key: List[Hashable], numeric: bool): if numeric: kwargs = {"row_positions": key} else: @@ -3302,7 +3302,7 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): rows = [rows] ErrorMessage.default_to_pandas("sort_values") broadcast_value_list = [ - self.getitem_row_array([row]).to_pandas() for row in rows + self.getitem_row_array([row], numeric=True).to_pandas() for row in rows ] index_builder = list(zip(broadcast_value_list, rows)) broadcast_values = pandas.concat( diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 87271fd8053..7c776e07971 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2499,7 +2499,9 @@ def _sample( query_compiler = self._query_compiler.getitem_column_array(samples) return self.__constructor__(query_compiler=query_compiler) else: - query_compiler = self._query_compiler.getitem_row_array(samples) + query_compiler = self._query_compiler.getitem_row_array( + samples, numeric=True + ) return self.__constructor__(query_compiler=query_compiler) def _sem( diff --git a/modin/pandas/series.py b/modin/pandas/series.py index 17ffa05310f..ea83ab9f6d2 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -2443,7 +2443,7 @@ def _getitem(self, key): if is_bool_indexer(key): return self.__constructor__( query_compiler=self._query_compiler.getitem_row_array( - pandas.RangeIndex(len(self.index))[key] + pandas.RangeIndex(len(self.index))[key], numeric=True ) ) # TODO: More efficiently handle `tuple` case for `Series.__getitem__` From e7af275cedd31d023b7d3e00369ed2be51e60feb Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 1 Nov 2022 18:22:43 -0500 Subject: [PATCH 63/77] Pass errors through astype. Signed-off-by: mvashishtha --- modin/core/storage_formats/base/query_compiler.py | 6 ++++-- modin/core/storage_formats/pandas/query_compiler.py | 4 ++-- modin/pandas/base.py | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 52ad27dea8a..3258f57c6f8 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -1416,7 +1416,7 @@ def stack(self, level, dropna): ) # Abstract map partitions across select indices - def astype(self, col_dtypes, **kwargs): # noqa: PR02 + def astype(self, col_dtypes, errors: str): """ Convert columns dtypes to given dtypes. @@ -1424,6 +1424,8 @@ def astype(self, col_dtypes, **kwargs): # noqa: PR02 ---------- col_dtypes : dict Map for column names and new dtypes. + error : {"raise", "ignore"} + Control raising of exceptions on invalid data for provided dtype. **kwargs : dict Serves the compatibility purpose. Does not affect the result. @@ -1433,7 +1435,7 @@ def astype(self, col_dtypes, **kwargs): # noqa: PR02 New QueryCompiler with updated dtypes. 
""" return DataFrameDefault.register(pandas.DataFrame.astype)( - self, dtype=col_dtypes, **kwargs + self, dtype=col_dtypes, errors=errors ) def infer_objects(self): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index a9e5d62c749..3ac654f85fc 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1539,8 +1539,8 @@ def datetime_freq(df): # END Dt map partitions operations - def astype(self, col_dtypes, **kwargs): - return self.__constructor__(self._modin_frame.astype(col_dtypes)) + def astype(self, col_dtypes, errors: str): + return self.__constructor__(self._modin_frame.astype(col_dtypes, errors=errors)) def infer_objects(self): return self.__constructor__(self._modin_frame.infer_objects()) diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 7c776e07971..e021970558a 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -923,7 +923,7 @@ def astype(self, dtype, copy=True, errors="raise"): # noqa: PR01, RT01, D200 # Assume that the dtype is a scalar. col_dtypes = {column: dtype for column in self._query_compiler.columns} - new_query_compiler = self._query_compiler.astype(col_dtypes) + new_query_compiler = self._query_compiler.astype(col_dtypes, errors=errors) return self._create_or_update_from_compiler(new_query_compiler, not copy) @property From edf99f8960790801696fd7063024767fa0db90ff Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 1 Nov 2022 19:35:33 -0500 Subject: [PATCH 64/77] Fix astype errors. Signed-off-by: mvashishtha --- modin/core/storage_formats/pandas/query_compiler.py | 4 ++-- modin/experimental/core/storage_formats/hdk/query_compiler.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 3ac654f85fc..d80323184bb 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -1540,7 +1540,7 @@ def datetime_freq(df): # END Dt map partitions operations def astype(self, col_dtypes, errors: str): - return self.__constructor__(self._modin_frame.astype(col_dtypes, errors=errors)) + return self.__constructor__(self._modin_frame.astype(col_dtypes)) def infer_objects(self): return self.__constructor__(self._modin_frame.infer_objects()) @@ -2638,7 +2638,7 @@ def groupby_mean(self, by, axis, groupby_kwargs, agg_args, agg_kwargs, drop=Fals ) qc_with_converted_datetime_cols = ( - self.astype({col: "int64" for col in datetime_cols.keys()}) + self.astype({col: "int64" for col in datetime_cols.keys()}, errors="raise") if len(datetime_cols) > 0 else self ) diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py index 711068621e9..02bc5187052 100644 --- a/modin/experimental/core/storage_formats/hdk/query_compiler.py +++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py @@ -691,9 +691,9 @@ def reset_index(self, **kwargs): self._modin_frame.reset_index(drop), shape_hint=shape_hint ) - def astype(self, col_dtypes, **kwargs): + def astype(self, col_dtypes, errors: str): return self.__constructor__( - self._modin_frame.astype(col_dtypes), self._shape_hint + self._modin_frame.astype(col_dtypes, errors), self._shape_hint ) def setitem(self, axis, key, value): From e74db7cecd12d88e9bd68dc8ce7a6400fdc94ac2 Mon Sep 17 00:00:00 2001 From: mvashishtha 
Date: Mon, 7 Nov 2022 17:51:37 -0600 Subject: [PATCH 65/77] Use new take_2d_labels for most insertion. test_indexing passes except one multiindexing case Signed-off-by: mvashishtha --- modin/core/execution/client/container.py | 2 + modin/core/execution/client/query_compiler.py | 2 + .../storage_formats/base/query_compiler.py | 36 +++++++ modin/pandas/indexing.py | 97 +++++++++++++------ 4 files changed, 107 insertions(+), 30 deletions(-) diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py index 8c998bd5c24..6e4c0f31268 100644 --- a/modin/core/execution/client/container.py +++ b/modin/core/execution/client/container.py @@ -639,7 +639,9 @@ def forwarding_method( "transpose", "take_2d", "getitem_column_array", + "get_columns_with_labels", "getitem_row_array", + "get_rows_with_labels", "pivot", "get_dummies", "drop", diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 546bc2685e9..dffcf750059 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -348,7 +348,9 @@ def forwarding_method( "transpose", "take_2d", "getitem_column_array", + "get_columns_with_labels", "getitem_row_array", + "get_rows_with_labels", "pivot", "get_dummies", "drop", diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 3258f57c6f8..de724bca17a 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -3126,6 +3126,42 @@ def applyer(df): return DataFrameDefault.register(applyer)(self) + def take_2d_labels( + self, + index, + columns, + index_is_series, + column_is_series, + ): + """ + Take the given labels. + + Parameters + ---------- + index : slice, scalar, or list-like + Labels of rows to grab. + columns : slice, scalar, or list-like + Labels of columns to grab. + + Returns + ------- + BaseQueryCompiler + Subset of this QueryCompiler. + """ + if isinstance(index, type(self)): + index = index.to_pandas() + if index_is_series: + index = index.iloc[:, 0] + if isinstance(columns, type(self)): + columns = columns.to_pandas() + if column_is_series: + columns = columns.iloc[:, 0] + + def applyer(df): + return df.loc[index, columns] + + return DataFrameDefault.register(applyer)(self) + def insert_item(self, axis, loc, value, how="inner", replace=False): """ Insert rows/columns defined by `value` at the specified position. diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 16b3003f0f9..7b0e07ca873 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -35,6 +35,9 @@ from pandas.api.types import is_list_like, is_bool from pandas.core.dtypes.common import is_integer, is_bool_dtype, is_integer_dtype from pandas.core.indexing import IndexingError +from typing import Union + +from modin.core.execution.client.query_compiler import ClientQueryCompiler from modin.error_message import ErrorMessage from modin.logging import ClassLogger @@ -319,10 +322,60 @@ def __setitem__(self, key, item): # pragma: no cover """ raise NotImplementedError("Implemented by subclasses") - def _getitem_positional( + def _take_2d_labels(self, row_lookup, col_lookup): + """ + Take 2D labels from the DataFrame. + + Parameters + ---------- + row_lookup : list-like + List of row labels to take. + col_lookup : list-like + List of column labels to take. + + Returns + ------- + modin.pandas.DataFrame + DataFrame with taken labels. 
+ """ + row_is_series = isinstance(row_lookup, Series) + col_is_series = isinstance(col_lookup, Series) + if is_scalar(row_lookup): + row_lookup = [row_lookup] + elif isinstance(row_lookup, (Series, DataFrame)): + row_lookup = row_lookup._query_compiler + if is_scalar(col_lookup): + col_lookup = [col_lookup] + elif isinstance(col_lookup, (Series, DataFrame)): + col_lookup = col_lookup._query_compiler + return self.qc.take_2d_labels( + row_lookup, col_lookup, row_is_series, col_is_series + ) + + def _take_2d_positional( + self, + row_lookup: Union[slice, range, np.ndarray], + col_lookup: Union[slice, range, np.ndarray], + ): + """ """ + if isinstance(row_lookup, slice): + ErrorMessage.catch_bugs_and_request_email( + failure_condition=row_lookup != slice(None), + extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}", + ) + row_lookup = None + if isinstance(col_lookup, slice): + ErrorMessage.catch_bugs_and_request_email( + failure_condition=col_lookup != slice(None), + extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}", + ) + col_lookup = None + + return self.qc.take_2d(row_lookup, col_lookup) + + def _get_pandas_object_from_qc_view( self, - row_lookup, - col_lookup, + qc_view, row_multiindex_full_lookup: bool, col_multiindex_full_lookup: bool, row_scalar: bool, @@ -334,10 +387,8 @@ def _getitem_positional( Parameters ---------- - row_lookup : slice(None), range or np.ndarray - The global row index to retrieve data from. - col_lookup : slice(None), range or np.ndarray - The global col index to retrieve data from. + qc_view : BaseQueryCompiler + Query compiler to operate on. row_multiindex_full_lookup : bool See _multiindex_possibly_contains_key.__doc__. col_multiindex_full_lookup : bool @@ -361,20 +412,6 @@ def _getitem_positional( Ideally, this API should get rid of using slices as indexers and either use a common ``Indexer`` object or range and ``np.ndarray`` only. """ - if isinstance(row_lookup, slice): - ErrorMessage.catch_bugs_and_request_email( - failure_condition=row_lookup != slice(None), - extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}", - ) - row_lookup = None - if isinstance(col_lookup, slice): - ErrorMessage.catch_bugs_and_request_email( - failure_condition=col_lookup != slice(None), - extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}", - ) - col_lookup = None - - qc_view = self.qc.take_2d(row_lookup, col_lookup) if ndim == 2: return self.df.__constructor__(query_compiler=qc_view) @@ -678,11 +715,12 @@ def __getitem__(self, key): if isinstance(row_loc, Series) and is_boolean_array(row_loc): return self._handle_boolean_masking(row_loc, col_loc) - row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - - result = self._getitem_positional( - row_lookup, - col_lookup, + if isinstance(self.qc, ClientQueryCompiler): + qc_view = self._take_2d_labels(row_loc, col_loc) + else: + qc_view = self._take_2d_positional(*self._compute_lookup(row_loc, col_loc)) + result = self._get_pandas_object_from_qc_view( + qc_view, row_multiindex_full_lookup, col_multiindex_full_lookup, row_scalar, @@ -731,7 +769,7 @@ def __getitem__(self, key): # This is done for cases where the index passed in has other state, like a # frequency in the case of DateTimeIndex. 
if ( - row_lookup is not None + row_loc is not None and isinstance(col_loc, slice) and col_loc == slice(None) and isinstance(key, pandas.Index) @@ -1064,9 +1102,8 @@ def __getitem__(self, key): row_lookup, col_lookup = self._compute_lookup(row_loc, col_loc) - result = self._getitem_positional( - row_lookup, - col_lookup, + result = self._get_pandas_object_from_qc_view( + self._take_2d_positional(row_lookup, col_lookup), row_multiindex_full_lookup=False, col_multiindex_full_lookup=False, row_scalar=row_scalar, From b616c28a1c0064d7cda6a6a09239f9429d8b1900 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Mon, 7 Nov 2022 18:07:56 -0600 Subject: [PATCH 66/77] Actually use client query compiler. Signed-off-by: mvashishtha --- modin/core/execution/client/container.py | 3 +-- modin/core/execution/client/query_compiler.py | 3 +-- modin/core/storage_formats/base/query_compiler.py | 8 ++++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/modin/core/execution/client/container.py b/modin/core/execution/client/container.py index 6e4c0f31268..559156c4487 100644 --- a/modin/core/execution/client/container.py +++ b/modin/core/execution/client/container.py @@ -639,9 +639,8 @@ def forwarding_method( "transpose", "take_2d", "getitem_column_array", - "get_columns_with_labels", "getitem_row_array", - "get_rows_with_labels", + "take_2d_labels", "pivot", "get_dummies", "drop", diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index dffcf750059..93776f57569 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -348,9 +348,8 @@ def forwarding_method( "transpose", "take_2d", "getitem_column_array", - "get_columns_with_labels", "getitem_row_array", - "get_rows_with_labels", + "take_2d_labels", "pivot", "get_dummies", "drop", diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index de724bca17a..72ffd2d24ab 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -3148,16 +3148,20 @@ def take_2d_labels( BaseQueryCompiler Subset of this QueryCompiler. """ - if isinstance(index, type(self)): + print( + f"call take_2d_labels with index {index} and columns {columns} and {isinstance(index, type(self))=} and {isinstance(columns, type(self))=}" + ) + if isinstance(index, BaseQueryCompiler): index = index.to_pandas() if index_is_series: index = index.iloc[:, 0] - if isinstance(columns, type(self)): + if isinstance(columns, BaseQueryCompiler): columns = columns.to_pandas() if column_is_series: columns = columns.iloc[:, 0] def applyer(df): + print(f"getting loc with index {index} and columns {columns}") return df.loc[index, columns] return DataFrameDefault.register(applyer)(self) From 832556d79cfd41543ff886ff05ca8879e68d6a41 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 8 Nov 2022 11:51:52 -0600 Subject: [PATCH 67/77] Fix multiindex and fix doc_checker. 
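The previous patch listed forwarding targets that don't exist on
BaseQueryCompiler (get_columns_with_labels, get_rows_with_labels); both lists
have to name take_2d_labels for the generated methods to line up with what the
service implements. Roughly the pattern in play (a simplified sketch;
_make_forwarding_method is a hypothetical stand-in for the real closure in
container.py and query_compiler.py):

    def _make_forwarding_method(name):
        def forwarding_method(self, *args, **kwargs):
            # Run the named operation on our ID at the service and wrap
            # the returned ID in a new client query compiler.
            return self.__constructor__(
                getattr(self._service, name)(self._id, *args, **kwargs)
            )
        return forwarding_method

    for _name in ("transpose", "take_2d", "take_2d_labels", "getitem_row_array"):
        setattr(ClientQueryCompiler, _name, _make_forwarding_method(_name))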
Signed-off-by: mvashishtha --- .../storage_formats/base/query_compiler.py | 22 +++----- modin/pandas/indexing.py | 53 +++++++++++++------ 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 72ffd2d24ab..4cc171e2ce9 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -1424,10 +1424,8 @@ def astype(self, col_dtypes, errors: str): ---------- col_dtypes : dict Map for column names and new dtypes. - error : {"raise", "ignore"} + errors : {"raise", "ignore"} Control raising of exceptions on invalid data for provided dtype. - **kwargs : dict - Serves the compatibility purpose. Does not affect the result. Returns ------- @@ -3130,17 +3128,15 @@ def take_2d_labels( self, index, columns, - index_is_series, - column_is_series, ): """ Take the given labels. Parameters ---------- - index : slice, scalar, or list-like + index : slice, scalar, list-like, or BaseQueryCompiler Labels of rows to grab. - columns : slice, scalar, or list-like + columns : slice, scalar, list-like, or BaseQueryCompiler Labels of columns to grab. Returns @@ -3148,20 +3144,16 @@ def take_2d_labels( BaseQueryCompiler Subset of this QueryCompiler. """ - print( - f"call take_2d_labels with index {index} and columns {columns} and {isinstance(index, type(self))=} and {isinstance(columns, type(self))=}" - ) if isinstance(index, BaseQueryCompiler): index = index.to_pandas() - if index_is_series: - index = index.iloc[:, 0] + assert len(index.columns) == 1 + index = index.iloc[:, 0] if isinstance(columns, BaseQueryCompiler): columns = columns.to_pandas() - if column_is_series: - columns = columns.iloc[:, 0] + assert len(columns.columns) == 1 + columns = columns.iloc[:, 0] def applyer(df): - print(f"getting loc with index {index} and columns {columns}") return df.loc[index, columns] return DataFrameDefault.register(applyer)(self) diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index 7b0e07ca873..d63e278c79b 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -322,7 +322,13 @@ def __setitem__(self, key, item): # pragma: no cover """ raise NotImplementedError("Implemented by subclasses") - def _take_2d_labels(self, row_lookup, col_lookup): + def _take_2d_labels( + self, + row_lookup, + col_lookup, + row_multiindex_full_lookup, + col_multiindex_full_lookup, + ): """ Take 2D labels from the DataFrame. @@ -332,32 +338,46 @@ def _take_2d_labels(self, row_lookup, col_lookup): List of row labels to take. col_lookup : list-like List of column labels to take. + row_multiindex_full_lookup : bool + See _multiindex_possibly_contains_key.__doc__. + col_multiindex_full_lookup : bool + See _multiindex_possibly_contains_key.__doc__. Returns ------- modin.pandas.DataFrame DataFrame with taken labels. 
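take_2d_labels may now receive another query compiler as a lookup, which is
how a boolean mask travels; the default implementation materializes it,
squeezes the single column back into a Series, and lets pandas .loc align it.
The round trip in plain pandas (runnable sketch):

    import pandas

    df = pandas.DataFrame({"a": [1, 2, 3]})
    mask_frame = (df["a"] > 1).to_frame()  # the mask as a one-column frame
    assert len(mask_frame.columns) == 1
    # Recover the Series so .loc aligns on the index as expected.
    print(df.loc[mask_frame.iloc[:, 0], :])

The multiindex part of the fix: a full-length key tuple counts as a scalar
lookup in _take_2d_labels and gets wrapped in a list like any other scalar.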
""" - row_is_series = isinstance(row_lookup, Series) - col_is_series = isinstance(col_lookup, Series) - if is_scalar(row_lookup): + if is_scalar(row_lookup) or row_multiindex_full_lookup: row_lookup = [row_lookup] elif isinstance(row_lookup, (Series, DataFrame)): row_lookup = row_lookup._query_compiler - if is_scalar(col_lookup): + if is_scalar(col_lookup) or col_multiindex_full_lookup: col_lookup = [col_lookup] elif isinstance(col_lookup, (Series, DataFrame)): col_lookup = col_lookup._query_compiler - return self.qc.take_2d_labels( - row_lookup, col_lookup, row_is_series, col_is_series - ) + return self.qc.take_2d_labels(row_lookup, col_lookup) def _take_2d_positional( self, row_lookup: Union[slice, range, np.ndarray], col_lookup: Union[slice, range, np.ndarray], ): - """ """ + """ + Take 2D positional data from the DataFrame. + + Parameters + ---------- + row_lookup : slice, range, or np.ndarray + Row positions to take. + col_lookup : slice, range, or np.ndarray + Column positions to take. + + Returns + ------- + BaseQueryCompiler + Query compiler with given positions. + """ if isinstance(row_lookup, slice): ErrorMessage.catch_bugs_and_request_email( failure_condition=row_lookup != slice(None), @@ -383,27 +403,27 @@ def _get_pandas_object_from_qc_view( ndim: int, ): """ - Retrieve dataset according to `row_lookup` and `col_lookup`. + Convert the query compiler view to the appropriate pandas object. Parameters ---------- qc_view : BaseQueryCompiler - Query compiler to operate on. + Query compiler to convert. row_multiindex_full_lookup : bool See _multiindex_possibly_contains_key.__doc__. col_multiindex_full_lookup : bool See _multiindex_possibly_contains_key.__doc__. row_scalar : bool - Whether indexer for rows is scalar or not. + Whether indexer for rows is scalar. col_scalar : bool - Whether indexer for columns is scalar or not. + Whether indexer for columns is scalar. ndim : {0, 1, 2} Number of dimensions in dataset to be retrieved. Returns ------- modin.pandas.DataFrame or modin.pandas.Series - Located dataset. + The pandas object with the data from the query compiler view. Notes ----- @@ -412,7 +432,6 @@ def _get_pandas_object_from_qc_view( Ideally, this API should get rid of using slices as indexers and either use a common ``Indexer`` object or range and ``np.ndarray`` only. """ - if ndim == 2: return self.df.__constructor__(query_compiler=qc_view) if isinstance(self.df, Series) and not row_scalar: @@ -716,7 +735,9 @@ def __getitem__(self, key): return self._handle_boolean_masking(row_loc, col_loc) if isinstance(self.qc, ClientQueryCompiler): - qc_view = self._take_2d_labels(row_loc, col_loc) + qc_view = self._take_2d_labels( + row_loc, col_loc, row_multiindex_full_lookup, col_multiindex_full_lookup + ) else: qc_view = self._take_2d_positional(*self._compute_lookup(row_loc, col_loc)) result = self._get_pandas_object_from_qc_view( From 51fd254d6cfda880280ad6b079056cb4a38d91ba Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 8 Nov 2022 13:19:36 -0600 Subject: [PATCH 68/77] Fix IO astype bug. 
Signed-off-by: mvashishtha --- modin/core/io/file_dispatcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/core/io/file_dispatcher.py b/modin/core/io/file_dispatcher.py index b0247d90cd7..efda84f31ec 100644 --- a/modin/core/io/file_dispatcher.py +++ b/modin/core/io/file_dispatcher.py @@ -170,7 +170,8 @@ def read(cls, *args, **kwargs): t: dtypes[t] for t in dtypes.index if isinstance(dtypes[t], kernel_lib.CategoricalDtype) - } + }, + kwargs.get("errors", "raise"), ) return query_compiler From ebe2719dfe1ac672d37b2f225acf2ffd32a5504e Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 8 Nov 2022 14:38:48 -0600 Subject: [PATCH 69/77] Make ClientIO use ClientQueryCompiler by default. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index be165e168fb..52361e37ebc 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -13,6 +13,7 @@ """The module holds the factory which performs I/O using pandas on a Client.""" +from .query_compiler import ClientQueryCompiler from modin.core.io.io import BaseIO import os import pandas @@ -23,6 +24,7 @@ class ClientIO(BaseIO): _server_conn = None _data_conn = None + query_compiler_cls = ClientQueryCompiler @classmethod def set_server_connection(cls, conn): From f2058015884b3f481f62c6f03ed29fc28f77b7d2 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 8 Nov 2022 15:04:07 -0600 Subject: [PATCH 70/77] Debug read_sql. Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 1 + modin/core/execution/client/query_compiler.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 52361e37ebc..4a00e11f6e6 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -110,6 +110,7 @@ def read_sql(cls, sql, con, **kwargs): self.query_compiler_cls Query compiler with data read in from SQL connection. """ + print(f'called client io read_sql with {sql=} and {con=} and {kwargs=}') if isinstance(con, str) and con.lower() == "auto" and cls._data_conn is None: raise ConnectionError( "Cannot connect with parameter 'auto' because connection is not set. Did you initialize it?" diff --git a/modin/core/execution/client/query_compiler.py b/modin/core/execution/client/query_compiler.py index 93776f57569..cd670ce8ded 100644 --- a/modin/core/execution/client/query_compiler.py +++ b/modin/core/execution/client/query_compiler.py @@ -40,6 +40,12 @@ class ClientQueryCompiler(BaseQueryCompiler): lazy_execution: bool = True def __init__(self, id: UUID): + # The service can return an exception instead of the ID of a new query + # compiler. + if isinstance(id, Exception): + raise id + if not isinstance(id, UUID): + raise TypeError(f"Expected UUID, got {type(id)}") self._id = id @classmethod From 92ad6dd62e266116ff075fa57def8bed0daad7ec Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Tue, 8 Nov 2022 16:32:19 -0600 Subject: [PATCH 71/77] Fix getitem_row_array. 
Signed-off-by: mvashishtha --- .../ray/implementations/pandas_on_ray/io/io.py | 2 +- modin/core/storage_formats/base/query_compiler.py | 12 +++--------- modin/core/storage_formats/pandas/query_compiler.py | 12 ++++-------- modin/pandas/base.py | 2 +- modin/pandas/groupby.py | 4 ++-- modin/pandas/series.py | 4 ++-- 6 files changed, 13 insertions(+), 23 deletions(-) diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py b/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py index 329ddea8075..6922ef0406e 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/io/io.py @@ -91,7 +91,7 @@ def to_sql(cls, qc, **kwargs): # since the mapping operation is non-blocking, each partition will return an empty DF # so at the end, the blocking operation will be this empty DF to_pandas - empty_df = qc.getitem_row_array([0], numeric=True).to_pandas().head(0) + empty_df = qc.getitem_row_array([0]).to_pandas().head(0) empty_df.to_sql(**kwargs) # so each partition will append its respective DF kwargs["if_exists"] = "append" diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 4cc171e2ce9..08eed980632 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -2153,7 +2153,7 @@ def get_column(df, key): return DataFrameDefault.register(get_column)(self, key=key) - def getitem_row_array(self, key: List[Hashable], numeric: bool): + def getitem_row_array(self, key: List[Hashable]): """ Get row data for target indices. @@ -2161,9 +2161,6 @@ def getitem_row_array(self, key: List[Hashable], numeric: bool): ---------- key : list-like Numeric indices of the rows to pick. - numeric : bool, default: False - Whether the key passed in represents the numeric row positions or - or the possibly non-numeric row labels. Returns ------- @@ -2172,10 +2169,7 @@ def getitem_row_array(self, key: List[Hashable], numeric: bool): """ def get_row(df, key): - if numeric: - return df.iloc[key] - else: - return df.loc[key] + return df.loc[key] return DataFrameDefault.register(get_row)(self, key=key) @@ -3191,7 +3185,7 @@ def mask(idx): return ( self.getitem_column_array(idx, numeric=True) if axis - else self.getitem_row_array(idx, numeric=True) + else self.getitem_row_array(idx) ) if 0 <= loc < len(self.get_axis(axis)): diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index d80323184bb..08cc058ca96 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2221,7 +2221,7 @@ def getitem_array(self, key): # requested. 
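getitem_row_array drops the numeric flag and goes back to accepting row
positions only; label-based lookups belong to take_2d_labels now. Callers that
start from labels translate first, as Series._getitem does (sketch matching
the diff below):

    # Convert labels to positions, then fetch by position.
    row_positions = self.index.get_indexer_for(key)
    result = self._query_compiler.getitem_row_array(row_positions)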
key = pandas.RangeIndex(len(self.index))[key] if len(key): - return self.getitem_row_array(key, numeric=True) + return self.getitem_row_array(key) else: return self.from_pandas( pandas.DataFrame(columns=self.columns), type(self._modin_frame) @@ -2247,13 +2247,9 @@ def getitem_column_array(self, key, numeric=False): ) return self.__constructor__(new_modin_frame) - def getitem_row_array(self, key: List[Hashable], numeric: bool): - if numeric: - kwargs = {"row_positions": key} - else: - kwargs = {"row_labels": key} + def getitem_row_array(self, key: List[Hashable]): return self.__constructor__( - self._modin_frame.take_2d_labels_or_positional(**kwargs) + self._modin_frame.take_2d_labels_or_positional(row_labels=key) ) def setitem(self, axis, key, value): @@ -3302,7 +3298,7 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): rows = [rows] ErrorMessage.default_to_pandas("sort_values") broadcast_value_list = [ - self.getitem_row_array([row], numeric=True).to_pandas() for row in rows + self.getitem_row_array([row]).to_pandas() for row in rows ] index_builder = list(zip(broadcast_value_list, rows)) broadcast_values = pandas.concat( diff --git a/modin/pandas/base.py b/modin/pandas/base.py index e021970558a..8672844acc9 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2500,7 +2500,7 @@ def _sample( return self.__constructor__(query_compiler=query_compiler) else: query_compiler = self._query_compiler.getitem_row_array( - samples, numeric=True + samples ) return self.__constructor__(query_compiler=query_compiler) diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index fed97028282..dcd990f13b2 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -918,7 +918,7 @@ def _iter(self): k, DataFrame( query_compiler=self._query_compiler.getitem_row_array( - indices[k], numeric=True + indices[k] ) ), ) @@ -1228,7 +1228,7 @@ def _iter(self): k, Series( query_compiler=self._query_compiler.getitem_row_array( - indices[k], numeric=True + indices[k] ) ), ) diff --git a/modin/pandas/series.py b/modin/pandas/series.py index ea83ab9f6d2..0e8ff29a920 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -2443,7 +2443,7 @@ def _getitem(self, key): if is_bool_indexer(key): return self.__constructor__( query_compiler=self._query_compiler.getitem_row_array( - pandas.RangeIndex(len(self.index))[key], numeric=True + pandas.RangeIndex(len(self.index))[key] ) ) # TODO: More efficiently handle `tuple` case for `Series.__getitem__` @@ -2465,7 +2465,7 @@ def _getitem(self, key): row_positions = self.index.get_indexer_for(key) if is_indexer else key if not all(is_integer(x) for x in row_positions): raise KeyError(key[0] if reduce_dimension else key) - result = self._query_compiler.getitem_row_array(row_positions, numeric=True) + result = self._query_compiler.getitem_row_array(row_positions) if reduce_dimension: return self._reduce_dimension(result) From 1d7e4942728777a8c2c412629706099c2200bd9c Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 9 Nov 2022 10:14:24 -0600 Subject: [PATCH 72/77] Fix black and flake8, and add a comment. 
Signed-off-by: mvashishtha --- modin/core/execution/client/io.py | 1 - modin/pandas/base.py | 4 +--- modin/pandas/indexing.py | 6 ++++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/modin/core/execution/client/io.py b/modin/core/execution/client/io.py index 4a00e11f6e6..52361e37ebc 100644 --- a/modin/core/execution/client/io.py +++ b/modin/core/execution/client/io.py @@ -110,7 +110,6 @@ def read_sql(cls, sql, con, **kwargs): self.query_compiler_cls Query compiler with data read in from SQL connection. """ - print(f'called client io read_sql with {sql=} and {con=} and {kwargs=}') if isinstance(con, str) and con.lower() == "auto" and cls._data_conn is None: raise ConnectionError( "Cannot connect with parameter 'auto' because connection is not set. Did you initialize it?" diff --git a/modin/pandas/base.py b/modin/pandas/base.py index 8672844acc9..de4956206f3 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -2499,9 +2499,7 @@ def _sample( query_compiler = self._query_compiler.getitem_column_array(samples) return self.__constructor__(query_compiler=query_compiler) else: - query_compiler = self._query_compiler.getitem_row_array( - samples - ) + query_compiler = self._query_compiler.getitem_row_array(samples) return self.__constructor__(query_compiler=query_compiler) def _sem( diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py index d63e278c79b..369d1277c1f 100644 --- a/modin/pandas/indexing.py +++ b/modin/pandas/indexing.py @@ -735,6 +735,12 @@ def __getitem__(self, key): return self._handle_boolean_masking(row_loc, col_loc) if isinstance(self.qc, ClientQueryCompiler): + # TODO(https://github.com/modin-project/modin/issues/5202): + # currently only the client query compiler implements + # take_2d_labels without defaulting to pandas. Eventually we want + # the query compilers to use take_2d_labels to do loc indexing + # instead of always converting row and column labels to positions + # here and passing positions to the query compilers. qc_view = self._take_2d_labels( row_loc, col_loc, row_multiindex_full_lookup, col_multiindex_full_lookup ) From 8be834aa68b887515aae99f183aa68d2cd6b1019 Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 9 Nov 2022 10:28:56 -0600 Subject: [PATCH 73/77] Fix getitem_row_array. Signed-off-by: mvashishtha --- modin/core/storage_formats/base/query_compiler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index 08eed980632..b05e1229535 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -2153,7 +2153,7 @@ def get_column(df, key): return DataFrameDefault.register(get_column)(self, key=key) - def getitem_row_array(self, key: List[Hashable]): + def getitem_row_array(self, key): """ Get row data for target indices. @@ -2169,7 +2169,7 @@ def getitem_row_array(self, key: List[Hashable]): """ def get_row(df, key): - return df.loc[key] + return df.iloc[key] return DataFrameDefault.register(get_row)(self, key=key) From 61a7aadac6d58e1a0f9b33ed552b59b7d69841fc Mon Sep 17 00:00:00 2001 From: mvashishtha Date: Wed, 9 Nov 2022 10:47:47 -0600 Subject: [PATCH 74/77] Fix getitem_row_array again. 

Signed-off-by: mvashishtha
---
 modin/core/storage_formats/pandas/query_compiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index 08cc058ca96..2cfaad5513d 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -2247,9 +2247,9 @@ def getitem_column_array(self, key, numeric=False):
             )
         return self.__constructor__(new_modin_frame)
 
-    def getitem_row_array(self, key: List[Hashable]):
+    def getitem_row_array(self, key):
         return self.__constructor__(
-            self._modin_frame.take_2d_labels_or_positional(row_labels=key)
+            self._modin_frame.take_2d_labels_or_positional(row_positions=key)
         )
 
     def setitem(self, axis, key, value):

From 07ad5c5293dd8326d2e0f10844db6cbfa07ee929 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Wed, 9 Nov 2022 10:59:20 -0600
Subject: [PATCH 75/77] Fix bugs that showed up in CI.

Signed-off-by: mvashishtha
---
 modin/conftest.py        |  8 ++++++--
 modin/pandas/indexing.py | 12 ++++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/modin/conftest.py b/modin/conftest.py
index 13571d45487..8115a94fd54 100644
--- a/modin/conftest.py
+++ b/modin/conftest.py
@@ -52,7 +52,6 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url):
     PandasQueryCompiler,
     BaseQueryCompiler,
 )
-from modin.core.execution.client.io import ClientIO  # noqa: E402
 from modin.core.execution.client.container import (  # noqa: E402
     ForwardingQueryCompilerContainer,
 )
@@ -282,14 +281,19 @@ def set_base_on_python_execution():
 class ClientFactory(factories.BaseFactory):
     @classmethod
     def prepare(cls):
+        # Can't always import ClientIO, because it uses NoDefault, which
+        # is not available on older pandas.
+        from modin.core.execution.client.io import ClientIO
+
         cls.io_cls = ClientIO
 
 
 def set_client_execution():
     # Can't always import ClientQueryCompiler, because it uses NoDefault, which
-    # is not available on older pandas.
+    # is not available on older pandas. ClientIO also uses ClientQueryCompiler.
     from modin.core.execution.client.query_compiler import ClientQueryCompiler
+    from modin.core.execution.client.io import ClientIO
 
     class TestClientQueryCompiler(ClientQueryCompiler):
         @classmethod
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index 369d1277c1f..f5b26995594 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -37,7 +37,7 @@
 from pandas.core.indexing import IndexingError
 from typing import Union
 
-from modin.core.execution.client.query_compiler import ClientQueryCompiler
+from modin._compat import PandasCompatVersion
 from modin.error_message import ErrorMessage
 from modin.logging import ClassLogger
 
@@ -734,13 +734,21 @@ def __getitem__(self, key):
         if isinstance(row_loc, Series) and is_boolean_array(row_loc):
             return self._handle_boolean_masking(row_loc, col_loc)
 
-        if isinstance(self.qc, ClientQueryCompiler):
+        is_client_qc = False
+        if PandasCompatVersion.CURRENT == PandasCompatVersion.LATEST:
+            # Can't always import ClientQueryCompiler, because it uses NoDefault, which
+            # is not available on older pandas.
+
+            from modin.core.execution.client.query_compiler import ClientQueryCompiler
+
+            is_client_qc = isinstance(self.qc, ClientQueryCompiler)
             # TODO(https://github.com/modin-project/modin/issues/5202):
             # currently only the client query compiler implements
             # take_2d_labels without defaulting to pandas. Eventually we want
             # the query compilers to use take_2d_labels to do loc indexing
             # instead of always converting row and column labels to positions
             # here and passing positions to the query compilers.
+        if is_client_qc:
             qc_view = self._take_2d_labels(
                 row_loc, col_loc, row_multiindex_full_lookup, col_multiindex_full_lookup
             )

From 38ef127408149d259e1a8420e44d8a88403f1672 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Wed, 9 Nov 2022 12:11:25 -0600
Subject: [PATCH 76/77] Fix a multiindex Client bug, and fix an hdk astype bug.

Signed-off-by: mvashishtha
---
 .../core/storage_formats/hdk/query_compiler.py | 2 +-
 modin/pandas/indexing.py                       | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/modin/experimental/core/storage_formats/hdk/query_compiler.py b/modin/experimental/core/storage_formats/hdk/query_compiler.py
index 02bc5187052..5da3a54027e 100644
--- a/modin/experimental/core/storage_formats/hdk/query_compiler.py
+++ b/modin/experimental/core/storage_formats/hdk/query_compiler.py
@@ -693,7 +693,7 @@ def reset_index(self, **kwargs):
 
     def astype(self, col_dtypes, errors: str):
         return self.__constructor__(
-            self._modin_frame.astype(col_dtypes, errors), self._shape_hint
+            self._modin_frame.astype(col_dtypes), self._shape_hint
         )
 
     def setitem(self, axis, key, value):
diff --git a/modin/pandas/indexing.py b/modin/pandas/indexing.py
index f5b26995594..ae4571831e7 100644
--- a/modin/pandas/indexing.py
+++ b/modin/pandas/indexing.py
@@ -678,7 +678,14 @@ def _multiindex_possibly_contains_key(self, axis, key):
             return False
 
         multiindex = self.df.index if axis == 0 else self.df.columns
-        return isinstance(key, tuple) and len(key) == len(multiindex.levels)
+        # If not every element of the key is a scalar, e.g. the key is
+        # (slice(None), 0), then the key isn't a full key-lookup, and the
+        # entire key behaves more like a slice than like a scalar.
+        return (
+            isinstance(key, tuple)
+            and len(key) == len(multiindex.levels)
+            and all(is_scalar(k) for k in key)
+        )
 
 
 class _LocIndexer(_LocationIndexerBase):

From ead877ef8176c260ddb59e2fb305d101d92175f3 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Thu, 10 Nov 2022 16:09:35 -0600
Subject: [PATCH 77/77] Respond to comments.

Signed-off-by: mvashishtha
---
 modin/pandas/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index c4f8b4dccbf..db976ca45ad 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -116,6 +116,11 @@ def _update_engine(publisher: Parameter):
     # Set this so that Pandas doesn't try to multithread by itself
     os.environ["OMP_NUM_THREADS"] = "1"
 
+    if Engine.get() == "Client":
+        if publisher.get_value_source() == ValueSource.DEFAULT:
+            StorageFormat.put("")
+        return
+
     sfmt = StorageFormat.get()
 
     if sfmt == "Hdk":
@@ -130,11 +135,6 @@ def _update_engine(publisher: Parameter):
     else:
         is_hdk = False
 
-    if Engine.get() == "Client":
-        if publisher.get_value_source() == ValueSource.DEFAULT:
-            StorageFormat.put("")
-        return
-
     if is_hdk and publisher.get_value_source() == ValueSource.DEFAULT:
         publisher.put("Native")
         IsExperimental.put(True)
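
The label-versus-position distinction that PATCH 73 and PATCH 74 settle is easy to reproduce in plain pandas once a frame's index is not the default RangeIndex. A minimal sketch (plain pandas with illustrative data, not Modin internals):

    import pandas as pd

    df = pd.DataFrame({"a": [10, 20, 30]}, index=[2, 0, 1])

    # Label-based lookup: the rows whose *labels* are 0 and 1.
    df.loc[[0, 1]]   # values 20 and 30

    # Position-based lookup: the first two *physical* rows.
    df.iloc[[0, 1]]  # values 10 and 20

Every call site converted earlier in the series passes positions (for example `pandas.RangeIndex(len(self.index))[key]` and `self.index.get_indexer_for(key)`), so the base implementation has to use `df.iloc` and the pandas storage format has to forward the key as `row_positions=key`; the `df.loc`/`row_labels=key` versions only agreed with that on a default index.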
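The deferred imports in PATCH 75 all follow one pattern: anything that transitively needs pandas' `NoDefault` is imported inside the function that uses it, behind a compat-version check, instead of at module scope. A generic sketch of that guard, reusing the `PandasCompatVersion` and `ClientQueryCompiler` names from the patch (the helper function itself is hypothetical, for illustration only):

    from modin._compat import PandasCompatVersion

    def _qc_is_client(qc):
        # On older pandas the client module cannot even be imported,
        # because it references NoDefault at import time; there, nothing
        # can be a client query compiler.
        if PandasCompatVersion.CURRENT != PandasCompatVersion.LATEST:
            return False
        from modin.core.execution.client.query_compiler import ClientQueryCompiler

        return isinstance(qc, ClientQueryCompiler)

After the first call on a new-enough pandas, the import is served from `sys.modules`, so the guard costs almost nothing, and older-pandas CI never evaluates the client module at all.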
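The `all(is_scalar(k) for k in key)` guard added in PATCH 76 separates a full key-lookup from a slice-like key of the same length, which the old length-only check conflated. A short sketch of the difference in plain pandas (illustrative data):

    import pandas as pd
    from pandas.api.types import is_scalar

    index = pd.MultiIndex.from_product([["a", "b"], [0, 1]])
    df = pd.DataFrame({"x": range(4)}, index=index)

    all(is_scalar(k) for k in ("a", 0))          # True: names exactly one row
    all(is_scalar(k) for k in (slice(None), 0))  # False: behaves like a slice

    df.loc[("a", 0), :]          # a single row, reduced to a Series
    df.loc[(slice(None), 0), :]  # a cross-section: rows ("a", 0) and ("b", 0)

Both keys are tuples whose length equals `len(multiindex.levels)`, so the pre-PATCH-76 check treated them identically.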