Skip to content

Commit

Permalink
added python typing to datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
rmj3197 committed Sep 3, 2024
1 parent ac42cfa commit 0c76b1d
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 47 deletions.
176 changes: 139 additions & 37 deletions QuadratiK/datasets/_dataset.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
from importlib import resources

from typing import Union, Tuple, Optional

import numpy as np
import pandas as pd


def load_wireless_data(desc=False, return_X_y=False, as_dataframe=True, scaled=False):
def load_wireless_data(
desc: bool = False,
return_X_y: bool = False,
as_dataframe: bool = True,
scaled: bool = False,
) -> Union[
Tuple[str, pd.DataFrame, pd.DataFrame],
Tuple[str, pd.DataFrame],
Tuple[str, np.ndarray],
Tuple[pd.DataFrame, pd.DataFrame],
Tuple[np.ndarray, np.ndarray],
pd.DataFrame,
np.ndarray,
]:
"""
The wireless data frame has 2000 rows and 8 columns. The first 7 variables
report the measurements of the Wi-Fi signal strength received from 7 Wi-Fi routers in an
Expand Down Expand Up @@ -96,25 +111,57 @@ def load_wireless_data(desc=False, return_X_y=False, as_dataframe=True, scaled=F
if return_X_y:
X = data[:, :-1]
y = data[:, -1].astype(int)
if desc:
return (fdescr, X, y)
else:
return (X, y)

if as_dataframe:
data = pd.DataFrame(data, columns=feature_names)
data["Class"] = data["Class"].astype(int)
if desc:
return (fdescr, data)
else:
return data
else:
data_df = pd.DataFrame(data, columns=feature_names)
data_df["Class"] = data_df["Class"].astype(int)

if desc and return_X_y and as_dataframe:
return (
fdescr,
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if desc and return_X_y and not as_dataframe:
return (fdescr, X, y)

if desc and not return_X_y and as_dataframe:
return (fdescr, data_df)

if desc and not return_X_y and not as_dataframe:
return (fdescr, data)

if not desc and return_X_y and as_dataframe:
return (
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if not desc and return_X_y and not as_dataframe:
return (X, y)

if not desc and not return_X_y and as_dataframe:
return data_df

if not desc and not return_X_y and not as_dataframe:
return data


def load_wisconsin_breast_cancer_data(
desc=False, return_X_y=False, as_dataframe=True, scaled=False
):
desc: bool = False,
return_X_y: bool = False,
as_dataframe: bool = True,
scaled: bool = False,
) -> Union[
Tuple[str, pd.DataFrame, pd.DataFrame],
Tuple[str, pd.DataFrame],
Tuple[str, np.ndarray],
Tuple[pd.DataFrame, pd.DataFrame],
Tuple[np.ndarray, np.ndarray],
pd.DataFrame,
np.ndarray,
]:
"""
The Wisconsin breast cancer dataset data frame has 569 rows and 31 columns. The first 30 variables
report the features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
Expand Down Expand Up @@ -239,23 +286,57 @@ def load_wisconsin_breast_cancer_data(
if return_X_y:
X = data[:, :-1]
y = data[:, -1].astype(int)
if desc:
return (fdescr, X, y)
else:
return (X, y)

if as_dataframe:
data = pd.DataFrame(data, columns=feature_names)
data["Class"] = data["Class"].astype(int)
if desc:
return (fdescr, data)
else:
return data
else:
data_df = pd.DataFrame(data, columns=feature_names)
data_df["Class"] = data_df["Class"].astype(int)

if desc and return_X_y and as_dataframe:
return (
fdescr,
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if desc and return_X_y and not as_dataframe:
return (fdescr, X, y)

if desc and not return_X_y and as_dataframe:
return (fdescr, data_df)

if desc and not return_X_y and not as_dataframe:
return (fdescr, data)

if not desc and return_X_y and as_dataframe:
return (
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if not desc and return_X_y and not as_dataframe:
return (X, y)

if not desc and not return_X_y and as_dataframe:
return data_df

if not desc and not return_X_y and not as_dataframe:
return data


def load_wine_data(desc=False, return_X_y=False, as_dataframe=True, scaled=False):
def load_wine_data(
desc: bool = False,
return_X_y: bool = False,
as_dataframe: bool = True,
scaled: bool = False,
) -> Union[
Tuple[str, pd.DataFrame, pd.DataFrame],
Tuple[str, pd.DataFrame],
Tuple[str, np.ndarray],
Tuple[pd.DataFrame, pd.DataFrame],
Tuple[np.ndarray, np.ndarray],
pd.DataFrame,
np.ndarray,
]:
"""
The wine data frame has 178 rows and 14 columns. The first 13 variables
report 13 constituents found in each of the three types of wines.
Expand Down Expand Up @@ -357,17 +438,38 @@ def load_wine_data(desc=False, return_X_y=False, as_dataframe=True, scaled=False
if return_X_y:
X = data[:, :-1]
y = data[:, -1].astype(int)
if desc:
return (fdescr, X, y)
else:
return (X, y)

if as_dataframe:
data = pd.DataFrame(data, columns=feature_names)
data["Class"] = data["Class"].astype(int)
if desc:
return (fdescr, data)
else:
return data
else:
data_df = pd.DataFrame(data, columns=feature_names)
data_df["Class"] = data_df["Class"].astype(int)

if desc and return_X_y and as_dataframe:
return (
fdescr,
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if desc and return_X_y and not as_dataframe:
return (fdescr, X, y)

if desc and not return_X_y and as_dataframe:
return (fdescr, data_df)

if desc and not return_X_y and not as_dataframe:
return (fdescr, data)

if not desc and return_X_y and as_dataframe:
return (
pd.DataFrame(X, columns=feature_names[:-1]),
pd.DataFrame(y, columns=["Class"]),
)

if not desc and return_X_y and not as_dataframe:
return (X, y)

if not desc and not return_X_y and as_dataframe:
return data_df

if not desc and not return_X_y and not as_dataframe:
return data
2 changes: 1 addition & 1 deletion QuadratiK/kernel_test/_h_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def _objective_k_sample(

def select_h(
x: Union[np.ndarray, pd.DataFrame],
y=Optional[Union[np.ndarray, pd.DataFrame]],
y: Optional[Union[np.ndarray, pd.DataFrame]] = None,
alternative: str = "location",
method: str = "subsampling",
b: float = 0.8,
Expand Down
20 changes: 11 additions & 9 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,11 @@ def test_desc(self):
self.assertTrue(isinstance(descr3, str))

def test_scaled_as_numpy(self):
X1, y1 = load_wireless_data(scaled=True, return_X_y=True)
X2, y2 = load_wisconsin_breast_cancer_data(scaled=True, return_X_y=True)
X3, y3 = load_wine_data(scaled=True, return_X_y=True)
X1, y1 = load_wireless_data(scaled=True, return_X_y=True, as_dataframe=False)
X2, y2 = load_wisconsin_breast_cancer_data(
scaled=True, return_X_y=True, as_dataframe=False
)
X3, y3 = load_wine_data(scaled=True, return_X_y=True, as_dataframe=False)
self.assertTrue(isinstance(X1, np.ndarray))
self.assertTrue(isinstance(y1, np.ndarray))
self.assertTrue(isinstance(X2, np.ndarray))
Expand All @@ -82,12 +84,12 @@ def test_desc_with_Xy(self):
self.assertTrue(isinstance(descr1, str))
self.assertTrue(isinstance(descr2, str))
self.assertTrue(isinstance(descr3, str))
self.assertTrue(isinstance(X1, np.ndarray))
self.assertTrue(isinstance(y1, np.ndarray))
self.assertTrue(isinstance(X2, np.ndarray))
self.assertTrue(isinstance(y2, np.ndarray))
self.assertTrue(isinstance(X3, np.ndarray))
self.assertTrue(isinstance(y3, np.ndarray))
self.assertTrue(isinstance(X1, pd.DataFrame))
self.assertTrue(isinstance(y1, pd.DataFrame))
self.assertTrue(isinstance(X2, pd.DataFrame))
self.assertTrue(isinstance(y2, pd.DataFrame))
self.assertTrue(isinstance(X3, pd.DataFrame))
self.assertTrue(isinstance(y3, pd.DataFrame))

def test_as_numpy(self):
df1 = load_wireless_data(as_dataframe=False)
Expand Down

0 comments on commit 0c76b1d

Please sign in to comment.