466 add feature importance CLI and modify tool (#468)
* Change n_repeats default to 10 from 50

* Rename parameters, improve documentation

* Add missing checks

* Add CLI function
msorvoja authored Dec 4, 2024
1 parent 6bee396 commit 5ca25b3
Showing 3 changed files with 89 additions and 29 deletions.
34 changes: 34 additions & 0 deletions eis_toolkit/cli.py
@@ -871,6 +871,40 @@ def local_morans_i_cli(
    typer.echo(f"Local Moran's I completed, output vector saved to {output_vector}.")


+# FEATURE IMPORTANCE
+@app.command()
+def feature_importance_cli(
+    model_file: INPUT_FILE_OPTION,
+    input_rasters: INPUT_FILES_ARGUMENT,
+    target_labels: INPUT_FILE_OPTION,
+    n_repeats: int = 10,
+    random_state: Optional[int] = None,
+):
+    """Evaluate the feature importance of a sklearn classifier or regressor."""
+    from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance
+    from eis_toolkit.prediction.machine_learning_general import load_model, prepare_data_for_ml
+
+    typer.echo("Progress: 10%")
+
+    model = load_model(model_file)
+    typer.echo("Progress: 20%")
+
+    X, y, _, _ = prepare_data_for_ml(input_rasters, target_labels)
+    typer.echo("Progress: 30%")
+
+    feature_names = [raster.name for raster in input_rasters]
+    typer.echo("Progress: 40%")
+
+    feature_importance, _ = evaluate_feature_importance(model, X, y, feature_names, n_repeats, random_state)
+    typer.echo("Progress: 80%")
+
+    results = dict(zip(feature_importance["Feature"], feature_importance["Importance"]))
+    json_str = json.dumps(results)
+    typer.echo("Progress: 100%")
+
+    typer.echo(f"Results: {json_str}")


# --- RASTER PROCESSING ---


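For a quick smoke test of the new command, Typer's built-in test runner can invoke it in-process. A minimal sketch — the command and flag names (feature-importance-cli, --model-file, --target-labels, --n-repeats) assume Typer's default underscore-to-hyphen naming, and the file paths are placeholders:

from typer.testing import CliRunner

from eis_toolkit.cli import app

runner = CliRunner()
# Placeholder paths; the input rasters are positional arguments, the rest are options.
result = runner.invoke(
    app,
    [
        "feature-importance-cli",
        "raster_1.tif",
        "raster_2.tif",
        "--model-file", "trained_model.joblib",
        "--target-labels", "labels.tif",
        "--n-repeats", "10",
    ],
)
print(result.output)  # progress echoes, then the JSON results line

Printing the results as a JSON string on stdout (rather than writing a file) presumably lets a caller parse them directly from the process output.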
42 changes: 26 additions & 16 deletions eis_toolkit/exploratory_analyses/feature_importance.py
@@ -5,28 +5,32 @@
from beartype.typing import Optional, Sequence
from sklearn.inspection import permutation_importance

-from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException
+from eis_toolkit.exceptions import (
+    InvalidDatasetException,
+    InvalidParameterValueException,
+    NonMatchingParameterLengthsException,
+)


@beartype
def evaluate_feature_importance(
    model: sklearn.base.BaseEstimator,
-    x_test: np.ndarray,
-    y_test: np.ndarray,
+    X: np.ndarray,
+    y: np.ndarray,
    feature_names: Sequence[str],
-    n_repeats: int = 50,
+    n_repeats: int = 10,
    random_state: Optional[int] = None,
) -> tuple[pd.DataFrame, dict]:
    """
-    Evaluate the feature importance of a sklearn classifier or regressor.
+    Evaluate the feature importance of a Sklearn classifier or regressor.

    Args:
        model: A trained and fitted Sklearn model.
-        x_test: Testing feature data (X data need to be normalized / standardized).
-        y_test: Testing label data.
-        feature_names: Names of the feature columns.
-        n_repeats: Number of iteration used when calculate feature importance. Defaults to 50.
-        random_state: random state for repeatability of results. Optional parameter.
+        X: Feature data.
+        y: Target labels.
+        feature_names: Names of features in X.
+        n_repeats: Number of iterations used when calculating feature importance. Defaults to 10.
+        random_state: Seed for random number generation. Defaults to None.

    Returns:
        A dataframe containing features and their importance.
@@ -37,18 +41,24 @@ def evaluate_feature_importance(
        InvalidParameterValueException: Value for 'n_repeats' is not at least one.
    """

-    if x_test.size == 0:
-        raise InvalidDatasetException("Array 'x_test' is empty.")
+    if X.size == 0:
+        raise InvalidDatasetException("Feature matrix X is empty.")

-    if y_test.size == 0:
-        raise InvalidDatasetException("Array 'y_test' is empty.")
+    if y.size == 0:
+        raise InvalidDatasetException("Target labels y are empty.")

    if n_repeats < 1:
        raise InvalidParameterValueException("Value for 'n_repeats' is less than one.")

-    result = permutation_importance(model, x_test, y_test.ravel(), n_repeats=n_repeats, random_state=random_state)
+    if len(X) != len(y):
+        raise NonMatchingParameterLengthsException("Feature matrix X and target labels y must have the same length.")

-    feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean * 100})
+    if len(feature_names) != X.shape[1]:
+        raise InvalidParameterValueException("Number of feature names must match the number of input features.")

+    result = permutation_importance(model, X, y.ravel(), n_repeats=n_repeats, random_state=random_state)

+    feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": result.importances_mean})

    feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

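Outside the CLI, the updated function can be called directly with the new parameter names. A minimal sketch on synthetic data — the RandomForestClassifier and the random arrays are illustrative stand-ins, not the toolkit's test fixtures:

import numpy as np
from sklearn.ensemble import RandomForestClassifier

from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance

# Illustrative synthetic data: 100 samples, 3 features, binary labels.
rng = np.random.default_rng(seed=0)
X = rng.normal(size=(100, 3))
y = (X[:, 0] > 0).astype(int)

model = RandomForestClassifier(random_state=0).fit(X, y)

# Returns a DataFrame sorted by mean importance plus the raw permutation_importance result.
importance_df, raw_results = evaluate_feature_importance(
    model=model,
    X=X,
    y=y,
    feature_names=["feat_a", "feat_b", "feat_c"],
    n_repeats=10,
    random_state=0,
)
print(importance_df)

Note that with the "* 100" scaling removed above, the Importance column now holds raw mean importances (fractions rather than percentages), which is why the expected values in the tests below shrink by a factor of 100.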
42 changes: 29 additions & 13 deletions tests/exploratory_analyses/feature_importance_test.py
@@ -4,7 +4,11 @@
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

-from eis_toolkit.exceptions import InvalidDatasetException, InvalidParameterValueException
+from eis_toolkit.exceptions import (
+    InvalidDatasetException,
+    InvalidParameterValueException,
+    NonMatchingParameterLengthsException,
+)
from eis_toolkit.exploratory_analyses.feature_importance import evaluate_feature_importance

feature_names = [
@@ -42,43 +46,55 @@ def test_empty_data():
    empty_data = np.array([])
    empty_labels = np.array([])
    with pytest.raises(InvalidDatasetException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=empty_data, y_test=labels, feature_names=feature_names
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=empty_data, y=labels, feature_names=feature_names)

    with pytest.raises(InvalidDatasetException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=data, y_test=empty_labels, feature_names=feature_names
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=empty_labels, feature_names=feature_names)


def test_invalid_n_repeats():
    """Test that invalid value for 'n_repeats' raises exception."""
    with pytest.raises(InvalidParameterValueException):
-        _, _ = evaluate_feature_importance(
-            model=classifier, x_test=data, y_test=labels, feature_names=feature_names, n_repeats=0
-        )
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=0)


def test_model_output():
    """Test that function output is as expected."""
    classifier.fit(data, labels.ravel())
    feature_importance, importance_results = evaluate_feature_importance(
-        model=classifier, x_test=data, y_test=labels, feature_names=feature_names, random_state=0
+        model=classifier, X=data, y=labels, feature_names=feature_names, n_repeats=50, random_state=0
    )

    np.testing.assert_almost_equal(
        feature_importance.loc[feature_importance["Feature"] == "EM_ratio", "Importance"].values[0],
-        desired=12.923077,
+        desired=0.129231,
        decimal=6,
    )
    np.testing.assert_almost_equal(
        feature_importance.loc[feature_importance["Feature"] == "EM_Qd", "Importance"].values[0],
-        desired=4.461538,
+        desired=0.044615,
        decimal=6,
    )
    np.testing.assert_equal(len(feature_importance), desired=len(feature_names))
    np.testing.assert_equal(
        tuple(importance_results.keys()),
        desired=("importances_mean", "importances_std", "importances"),
    )


+def test_invalid_input_lengths():
+    """Test that non-matching X and y lengths raise an exception."""
+    labels = np.random.randint(2, size=12)
+    with pytest.raises(NonMatchingParameterLengthsException):
+        _, _ = evaluate_feature_importance(model=classifier, X=data, y=labels, feature_names=feature_names)
+
+
+def test_invalid_number_of_feature_names():
+    """Test that invalid number of feature names raises an exception."""
+    with pytest.raises(InvalidParameterValueException):
+        _, _ = evaluate_feature_importance(
+            model=classifier,
+            X=data,
+            y=labels,
+            feature_names=["a", "b", "c"],
+        )
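The module-level fixtures these tests reference (data, labels, classifier) are defined in the part of the test file elided from this diff. For orientation, a hypothetical stand-in of that setup — the sample and feature counts are guesses, and the repository's actual fixtures load real test data, which is where the asserted values above come from:

import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# Hypothetical stand-ins for the elided module-level fixtures.
rng = np.random.default_rng(seed=0)
data = StandardScaler().fit_transform(rng.normal(size=(13, 4)))  # shape is a guess
labels = rng.integers(0, 2, size=13)
classifier = MLPClassifier(solver="lbfgs", max_iter=500, random_state=0)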
