Usability tweaks & new featurizer preset #215

Merged
merged 5 commits on May 7, 2024
219 changes: 219 additions & 0 deletions modnet/featurizers/presets/matminer_2024_fast.py
@@ -0,0 +1,219 @@
"""This submodule contains the `Matminer2024FastFeaturizer` class. """

import numpy as np
import modnet.featurizers
import contextlib


class Matminer2024FastFeaturizer(modnet.featurizers.MODFeaturizer):
A set of efficient featurizers for features implemented in matminer
at the time of creation (matminer v0.9.2, 2024).

Removes featurizers that are known to be slow (i.e., orders of magnitude
more intensive to compute than the rest of the featurizers).

"""

def __init__(
self,
fast_oxid: bool = True,
continuous_only: bool = True,
):
"""Creates the featurizer and imports all featurizer functions.

Parameters:
fast_oxid: Whether to use pymatgen's accelerated oxidation-state guessing, which
constrains all sites occupied by the same species in a structure to share the
same oxidation state (recommended when featurizing any structure
with large unit cells).
continuous_only: Whether to keep only the features that are continuous
with respect to the composition (applies to composition featurizers only).
Discontinuous features may lead to discontinuities in the model predictions.

"""

super().__init__()
self.drop_allnan = False
self.fast_oxid = fast_oxid
self.continuous_only = continuous_only
self.load_featurizers()

def load_featurizers(self):
with contextlib.redirect_stdout(None):
from matminer.featurizers.composition import (
BandCenter,
ElementFraction,
ElementProperty,
Stoichiometry,
TMetalFraction,
ValenceOrbital,
)
from matminer.featurizers.structure import (
DensityFeatures,
EwaldEnergy,
GlobalSymmetryFeatures,
StructuralComplexity,
)
from matminer.utils.data import (
DemlData,
PymatgenData,
)

pymatgen_features = [
"block",
"mendeleev_no",
"electrical_resistivity",
"velocity_of_sound",
"thermal_conductivity",
"bulk_modulus",
"coefficient_of_linear_thermal_expansion",
]

deml_features = [
"atom_radius",
"molar_vol",
"heat_fusion",
"boiling_point",
"heat_cap",
"first_ioniz",
"electric_pol",
"GGAU_Etot",
"mus_fere",
"FERE correction",
]

magpie_featurizer = ElementProperty.from_preset("magpie")
magpie_featurizer.stats = ["mean", "avg_dev"]

pymatgen_featurizer = ElementProperty(
data_source=PymatgenData(),
stats=["mean", "avg_dev"],
features=pymatgen_features,
)

deml_featurizer = ElementProperty(
data_source=DemlData(),
stats=["mean", "avg_dev"],
features=deml_features,
)

self.composition_featurizers = (
BandCenter(),
ElementFraction(),
magpie_featurizer,
pymatgen_featurizer,
deml_featurizer,
Stoichiometry(p_list=[2, 3, 5, 7, 10]),
TMetalFraction(),
ValenceOrbital(props=["frac"]),
)

self.oxid_composition_featurizers = []

self.structure_featurizers = (
DensityFeatures(),
EwaldEnergy(),
GlobalSymmetryFeatures(),
StructuralComplexity(),
)

self.site_featurizers = []

def featurize_composition(self, df):
"""Applies the preset composition featurizers to the input dataframe,
renames some fields and cleans the output dataframe.

"""
from pymatgen.core.periodic_table import Element

df = super().featurize_composition(df)

if self.composition_featurizers and not self.continuous_only:
_orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
df["AtomicOrbitals|HOMO_character"] = df[
"AtomicOrbitals|HOMO_character"
].map(_orbitals)
df["AtomicOrbitals|LUMO_character"] = df[
"AtomicOrbitals|LUMO_character"
].map(_orbitals)

df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)
df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)

if self.continuous_only:
# These are additional features that have shown discontinuities in my tests.
# Hopefully, I got them all...
df.drop(
columns=[
"ElementProperty|DemlData mean electric_pol",
"ElementProperty|DemlData mean FERE correction",
"ElementProperty|DemlData mean GGAU_Etot",
"ElementProperty|DemlData mean heat_fusion",
"ElementProperty|DemlData mean mus_fere",
],
inplace=True,
errors="ignore",
)

if self.oxid_composition_featurizers:
df.drop(columns=["IonProperty|max ionic char"], inplace=True)

return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

def featurize_structure(self, df):
"""Applies the preset structural featurizers to the input dataframe,
renames some fields and cleans the output dataframe.

"""

if self.structure_featurizers:
df = super().featurize_structure(df)

_crystal_system = {
"cubic": 1,
"tetragonal": 2,
"orthorombic": 3,
"hexagonal": 4,
"trigonal": 5,
"monoclinic": 6,
"triclinic": 7,
}

def _int_map(x):
# `x == np.nan` is always False, so check for missing values explicitly
if x is None or (isinstance(x, float) and np.isnan(x)):
return 0
elif x:
return 1
else:
return 0

df["GlobalSymmetryFeatures|crystal_system"] = df[
"GlobalSymmetryFeatures|crystal_system"
].map(_crystal_system)
df["GlobalSymmetryFeatures|is_centrosymmetric"] = df[
"GlobalSymmetryFeatures|is_centrosymmetric"
].map(_int_map)

return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)

def featurize_site(self, df):
"""Applies the preset site featurizers to the input dataframe,
renames some fields and cleans the output dataframe.

"""

# rename some features for backwards compatibility with pretrained models
aliases = {
"GeneralizedRadialDistributionFunction": "GeneralizedRDF",
"AGNIFingerprints": "AGNIFingerPrint",
"BondOrientationalParameter": "BondOrientationParameter",
}
df = super().featurize_site(df, aliases=aliases)
df = df.loc[:, (df != 0).any(axis=0)]

return modnet.featurizers.clean_df(df, drop_allnan=self.drop_allnan)
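As a rough usage sketch (the import path and keyword names are assumed to mirror the existing presets, e.g. `modnet.featurizers.presets` and the `featurizer` argument of `MODData`), the new preset could be exercised like this:

```python
# Hypothetical usage sketch of the new fast preset; the example structure
# and the "formation_energy" target name are placeholders.
from pymatgen.core import Lattice, Structure

from modnet.featurizers.presets import Matminer2024FastFeaturizer
from modnet.preprocessing import MODData

structures = [Structure(Lattice.cubic(3.6), ["Cu"], [[0, 0, 0]])]
targets = [[0.0]]  # one target value per structure

data = MODData(
    materials=structures,
    targets=targets,
    target_names=["formation_energy"],
    featurizer=Matminer2024FastFeaturizer(fast_oxid=True, continuous_only=True),
)
data.featurize()  # runs only the fast composition and structure featurizers
```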
13 changes: 11 additions & 2 deletions modnet/models/ensemble.py
@@ -144,7 +144,11 @@ def fit(
pool.join()

def predict(
self, test_data: MODData, return_unc=False, return_prob=False
self,
test_data: MODData,
return_unc: bool = False,
return_prob: bool = False,
remap_out_of_bounds: bool = True,
) -> pd.DataFrame:
"""Predict the target values for the passed MODData.

@@ -154,6 +158,7 @@ def predict(
return_prob: For a classification task only: whether to return the probability of each
class OR only return the most probable class.
return_unc: whether to return a second dataframe containing the uncertainties
remap_out_of_bounds: whether to remap out-of-bounds predictions back into the range of the training targets.

Returns:
A `pandas.DataFrame` containing the predicted values of the targets.
@@ -163,7 +168,11 @@ class OR only return the most probable class.

all_predictions = []
for i in range(self.n_models):
p = self.models[i].predict(test_data, return_prob=return_prob)
p = self.models[i].predict(
test_data,
return_prob=return_prob,
remap_out_of_bounds=remap_out_of_bounds,
)
all_predictions.append(p.values)

p_mean = np.array(all_predictions).mean(axis=0)
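The new `remap_out_of_bounds` flag is simply forwarded to each member model; a minimal call sketch (assuming a fitted `EnsembleMODNetModel` named `ensemble` and a featurized, feature-selected `MODData` named `test_data`):

```python
# Sketch only: `ensemble` and `test_data` are assumed to exist already.
predictions, uncertainties = ensemble.predict(
    test_data,
    return_unc=True,
    remap_out_of_bounds=False,  # keep the raw network outputs
)
print(predictions.head())
```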
38 changes: 23 additions & 15 deletions modnet/models/vanilla.py
@@ -693,14 +693,20 @@ def fit_preset(

return models, val_losses, best_learning_curve, learning_curves, best_preset

def predict(self, test_data: MODData, return_prob=False) -> pd.DataFrame:
def predict(
self,
test_data: MODData,
return_prob: bool = False,
remap_out_of_bounds: bool = True,
) -> pd.DataFrame:
"""Predict the target values for the passed MODData.

Parameters:
test_data: A featurized and feature-selected `MODData`
object containing the descriptors used in training.
return_prob: For classification tasks only: whether to return the probability of each
class OR only return the most probable class.
remap_out_of_bounds: Whether to remap out-of-bounds predictions to the training data distribution.

Returns:
A `pandas.DataFrame` containing the predicted values of the targets.
@@ -724,20 +730,22 @@ class OR only return the most probable class.
p = [p]

# post-process based on training data
if max(self.num_classes.values()) <= 2: # regression
for i, vals in enumerate(p):
yrange = self.max_y[i] - self.min_y[i]
upper_bound = self.max_y[i] + 0.25 * yrange
lower_bound = self.min_y[i] - 0.25 * yrange
for j in range(len(self.targets_groups[i])):
out_of_range_idxs = np.where(
(vals[:, j] < lower_bound[j]) | (vals[:, j] > upper_bound[j])
)
vals[out_of_range_idxs, j] = (
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
* (yrange[j])
+ self.min_y[i][j]
)
if remap_out_of_bounds:
if max(self.num_classes.values()) <= 2: # regression
for i, vals in enumerate(p):
yrange = self.max_y[i] - self.min_y[i]
upper_bound = self.max_y[i] + 0.25 * yrange
lower_bound = self.min_y[i] - 0.25 * yrange
for j in range(len(self.targets_groups[i])):
out_of_range_idxs = np.where(
(vals[:, j] < lower_bound[j])
| (vals[:, j] > upper_bound[j])
)
vals[out_of_range_idxs, j] = (
np.random.uniform(0, 1, size=len(out_of_range_idxs[0]))
* (yrange[j])
+ self.min_y[i][j]
)

p_dic = {}

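The remapping rule itself can be read in isolation: predictions falling more than 25% of the training range outside `[min_y, max_y]` are resampled uniformly from the training range. A standalone NumPy sketch with made-up numbers:

```python
import numpy as np

# Made-up training bounds and predictions to illustrate the remapping rule.
min_y, max_y = 0.0, 10.0
yrange = max_y - min_y
lower_bound = min_y - 0.25 * yrange  # -2.5
upper_bound = max_y + 0.25 * yrange  # 12.5

vals = np.array([5.0, 13.0, -4.0, 11.0])
out_of_range_idxs = np.where((vals < lower_bound) | (vals > upper_bound))
vals[out_of_range_idxs] = (
    np.random.uniform(0, 1, size=len(out_of_range_idxs[0])) * yrange + min_y
)
print(vals)  # 13.0 and -4.0 are resampled into [0, 10]; 5.0 and 11.0 are kept
```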
26 changes: 17 additions & 9 deletions modnet/preprocessing.py
@@ -664,8 +664,12 @@ def __init__(
LOG.info(f"Loaded {self.featurizer.__class__.__name__} featurizer.")

if target_names is not None:
if isinstance(target_names, str):
target_names = [target_names]
if np.shape(targets)[-1] != len(target_names):
raise ValueError("Target names must be supplied for every target.")
raise ValueError(
f"Target names must be supplied for every target: {np.shape(targets)} vs {target_names=}"
)
elif targets is not None:
if len(np.shape(targets)) == 1:
target_names = ["prop0"]
@@ -681,16 +685,20 @@ def __init__(
"List of IDs (`structure_ids`) provided must be unique."
)

if len(structure_ids) != len(materials):
raise ValueError(
"List of IDs (`structure_ids`) must have same length as list of structure."
)
if materials is not None:
if len(structure_ids) != len(materials):
raise ValueError(
"List of IDs (`structure_ids`) must have same length as list of structure."
)

else:
num_entries = (
len(materials) if materials is not None else len(df_featurized)
)
structure_ids = [f"id{i}" for i in range(num_entries)]
if df_featurized is not None:
structure_ids = df_featurized.index
else:
num_entries = (
len(materials) if materials is not None else len(df_featurized)
)
structure_ids = [f"id{i}" for i in range(num_entries)]

if targets is not None:
# set up dataframe for targets with columns (id, property_1, ..., property_n)
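A constructor sketch matching the relaxed checks above (assuming `df_featurized` remains a valid `MODData` keyword; feature and target names here are placeholders):

```python
import pandas as pd

from modnet.preprocessing import MODData

# Pre-featurized data with no structures: IDs are now taken from the
# dataframe index, and a single target name may be given as a plain string.
df_featurized = pd.DataFrame(
    {"feat_a": [0.1, 0.2], "feat_b": [1.0, 2.0]},
    index=["mat-1", "mat-2"],
)

data = MODData(
    df_featurized=df_featurized,
    targets=[[1.0], [2.0]],
    target_names="band_gap",  # wrapped into ["band_gap"] internally
)
```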