-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
131 lines (99 loc) · 4.53 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
# Module-level side effect: install a blanket "ignore" filter. No category or
# module is given, so this applies to every warning raised in the process once
# this module has been imported, not just warnings from the helpers below.
warnings.filterwarnings("ignore")
def generate_example_data(
    rows: int = 1000,
    *,
    n_features: int = 10,
    test_size: float = 0.2,
    random_state: int = 42,
):
    """
    Generate a synthetic binary-classification dataset and split it into train and test parts.

    The data comes from ``sklearn.datasets.make_classification`` and is then
    partitioned with ``sklearn.model_selection.train_test_split``. The same
    seed is used for both steps so repeated calls are reproducible.

    Args:
        rows (int): Total number of samples generated before splitting.
        n_features (int): Number of feature columns to generate. Defaults to 10.
        test_size (float): Forwarded unchanged to ``train_test_split``; with the
            default of 0.2, 20% of the samples are held out for testing.
        random_state (int): Seed passed to both the generator and the splitter.
            Defaults to 42.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` exactly as returned by
        ``train_test_split`` (array splits of the generated ``X`` and ``y``),
        not a pandas DataFrame.
    """
    X, y = make_classification(
        n_samples=rows,
        n_features=n_features,
        n_classes=2,
        random_state=random_state,
    )
    return train_test_split(X, y, test_size=test_size, random_state=random_state)
def standardize_features(x_train, x_test):
    """
    Rescale train and test features with a Min-Max scaler fitted on the training data.

    Despite the name, this performs Min-Max scaling to the ``(0, 1)`` range
    rather than z-score standardization. The scaler learns its parameters from
    ``x_train`` only, and those same parameters are applied to ``x_test``.

    Args:
        x_train (pandas.DataFrame or numpy.ndarray): The training data used to fit the scaler.
        x_test (pandas.DataFrame or numpy.ndarray): The test data, transformed with the fitted scaler.

    Returns:
        tuple: A tuple containing two pandas DataFrames.
            - The first DataFrame holds the scaled training features.
            - The second DataFrame holds the scaled test features.
        Both frames are built directly from the scaler's output; no column
        labels or index are passed to the DataFrame constructor.
    """
    # Fit once on the training split so no information from the test split
    # influences the scaling parameters.
    fitted_scaler = MinMaxScaler(feature_range=(0, 1)).fit(x_train)

    # Transform train first, then test, wrapping each result in a DataFrame.
    train_frame, test_frame = (
        pd.DataFrame(fitted_scaler.transform(split)) for split in (x_train, x_test)
    )
    return train_frame, test_frame
def train_decision_tree_model(X_train, y_train):
    """
    Fit a scikit-learn decision tree classifier on the given training data.

    Args:
        X_train (array-like or sparse matrix of shape (n_samples, n_features)): The training input samples.
        y_train (array-like of shape (n_samples,)): The target values for training.

    Returns:
        sklearn.tree.DecisionTreeClassifier: The fitted classifier, created with ``random_state=42``.
    """
    # ``fit`` returns the estimator itself, so construction and training chain.
    return DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
def prepare_plot_df(model, X, X_focus):
    """
    Build two plot-ready DataFrames: a 2-component PCA projection plus model predictions.

    The PCA is fitted on ``X`` only; ``X_focus`` is projected into that same
    2-D space so both sets of points share one coordinate system.

    Args:
        model (object): A trained model exposing ``predict``.
        X (array-like): The original input data; used to fit the PCA and to get predictions.
        X_focus (array-like): The second set of inputs (the docstring of the
            caller-facing API refers to these as the cfxplorer / FOCUS data).

    Returns:
        tuple: A tuple containing two pandas DataFrames, each with columns
        ``pca1``, ``pca2`` and ``predictions``.
            - The first DataFrame describes ``X``.
            - The second DataFrame describes ``X_focus``.
    """
    component_names = ["pca1", "pca2"]

    def _as_frame(values, names):
        # Wrap raw model/PCA output in a DataFrame with the given column names.
        return pd.DataFrame(values, columns=names)

    # Predictions are computed before the projection is fitted.
    base_labels = _as_frame(model.predict(X), ["predictions"])
    focus_labels = _as_frame(model.predict(X_focus), ["predictions"])

    # Fit the projection on X alone, then reuse it for both inputs.
    projector = PCA(n_components=2).fit(X)
    base_coords = _as_frame(projector.transform(X), component_names)
    focus_coords = _as_frame(projector.transform(X_focus), component_names)

    base_plot = pd.concat([base_coords, base_labels], axis=1)
    focus_plot = pd.concat([focus_coords, focus_labels], axis=1)
    return base_plot, focus_plot
def plot_pca(plot_df, focus_plot_df):
    """
    Show side-by-side scatter plots of the PCA projections, coloured by prediction.

    The left panel shows the data after applying FOCUS and the right panel the
    data before applying FOCUS.

    Args:
        plot_df (pandas.DataFrame): PCA-transformed features and predictions
            before applying FOCUS; must have ``pca1``, ``pca2`` and ``predictions`` columns.
        focus_plot_df (pandas.DataFrame): PCA-transformed features and predictions
            after applying FOCUS; same columns as ``plot_df``.

    Returns:
        None: This function displays the plot but does not return any value.
    """
    figure, (left_axis, right_axis) = plt.subplots(1, 2, figsize=(20, 8))

    # (data, target axis, panel title) for each panel, drawn left to right.
    panels = (
        (focus_plot_df, left_axis, "After applying FOCUS"),
        (plot_df, right_axis, "Before applying FOCUS"),
    )
    for frame, axis, title in panels:
        sns.scatterplot(data=frame, x="pca1", y="pca2", hue="predictions", ax=axis)
        axis.set_title(title)

    figure.suptitle("Prediction Before and After FOCUS comparison")
    plt.show()