Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding a featurizer preset with additional matminer featurizer #150

Merged
merged 15 commits into from
Jun 1, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 35 additions & 5 deletions modnet/featurizers/featurizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Optional, Iterable, Tuple, Dict

import pandas as pd
from pymatgen.core import Composition

from matminer.featurizers.base import MultipleFeaturizer, BaseFeaturizer
from matminer.featurizers.structure import SiteStatsFingerprint
Expand Down Expand Up @@ -204,14 +205,41 @@ def featurize_composition(self, df: pd.DataFrame) -> pd.DataFrame:

if self.oxid_composition_featurizers:
LOG.info("Applying oxidation state featurizers...")
# Build integer-valued compositions if any composition has non-integer amounts
col_comp = "composition"
if not all(
all(amt == int(amt) for amt in comp.values())
for comp in df["composition"].values
):
LOG.info(
"There are non-integer compositions in the dataset, and featurizers that need them. "
"Computing..."
)
df["integer_composition"] = [
Composition(
comp.get_integer_formula_and_factor(
max_denominator=10
if getattr(self, "fast_oxid", False)
else 100
)[0]
)
for comp in df["composition"].values
]
# df["integer_composition"] = df["composition"].apply(
# lambda c: c.get_integer_formula_and_factor(
# max_denominator=10 if getattr(self, "fast_oxid", False) else 100
# )[0]
# )

col_comp = "integer_composition"
if getattr(self, "fast_oxid", False):
df = CompositionToOxidComposition(
all_oxi_states=False, max_sites=-1
).featurize_dataframe(df, "composition")
).featurize_dataframe(df, col_id=col_comp)
else:
df = CompositionToOxidComposition().featurize_dataframe(
df, "composition"
)
df = CompositionToOxidComposition(
max_sites=-1 if getattr(self, "continuous_only", False) else None
).featurize_dataframe(df, col_id=col_comp, ignore_errors=True)
df = self._fit_apply_featurizers(
df,
self.oxid_composition_featurizers,
Expand Down Expand Up @@ -271,14 +299,16 @@ def featurize_site(
df.columns = ["Input data|" + x for x in df.columns]

for fingerprint in self.site_featurizers:
fingerprint_name = fingerprint.__class__.__name__
if fingerprint_name == "SOAP":
fingerprint.fit(df["Input data|structure"])
site_stats_fingerprint = SiteStatsFingerprint(
fingerprint, stats=self.site_stats
)
df = site_stats_fingerprint.featurize_dataframe(
df, "Input data|structure", multiindex=False, ignore_errors=True
)

fingerprint_name = fingerprint.__class__.__name__
if aliases:
fingerprint_name = aliases.get(fingerprint_name, fingerprint_name)
if "|" not in fingerprint_name:
Expand Down
6 changes: 6 additions & 0 deletions modnet/featurizers/presets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from typing import Dict, Type
from .debreuck_2020 import DeBreuck2020Featurizer, CompositionOnlyFeaturizer
from .matminer_2023 import Matminer2023Featurizer, CompositionOnlyMatminer2023Featurizer
from .matminer_all_2023 import (
MatminerAll2023Featurizer,
CompositionOnlyMatminerAll2023Featurizer,
)
from modnet.featurizers import MODFeaturizer

DEFAULT_FEATURIZER: str = "Matminer2023"
Expand All @@ -16,5 +20,7 @@
"DeBreuck2020": DeBreuck2020Featurizer,
"CompositionOnly": CompositionOnlyFeaturizer,
"Matminer2023": Matminer2023Featurizer,
"MatminerAll2023": MatminerAll2023Featurizer,
"CompositionOnlyMatminer2023": CompositionOnlyMatminer2023Featurizer,
"CompositionOnlyMatminerAll2023": CompositionOnlyMatminerAll2023Featurizer,
}
81 changes: 50 additions & 31 deletions modnet/featurizers/presets/matminer_2023.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class Matminer2023Featurizer(modnet.featurizers.MODFeaturizer):

"""

def __init__(self, fast_oxid: bool = False):
def __init__(self, fast_oxid: bool = False, continuous_only: bool = False):
"""Creates the featurizer and imports all featurizer functions.

Parameters:
Expand All @@ -28,8 +28,9 @@ def __init__(self, fast_oxid: bool = False):
"""

super().__init__()
self.load_featurizers()
self.continuous_only = continuous_only
self.fast_oxid = fast_oxid
self.load_featurizers()

def load_featurizers(self):
with contextlib.redirect_stdout(None):
Expand Down Expand Up @@ -82,19 +83,33 @@ def load_featurizers(self):
VoronoiFingerprint,
)

self.composition_featurizers = (
AtomicOrbitals(),
AtomicPackingEfficiency(),
BandCenter(),
ElementFraction(),
ElementProperty.from_preset("magpie"),
IonProperty(),
Miedema(),
Stoichiometry(),
TMetalFraction(),
ValenceOrbital(),
YangSolidSolution(),
)
if self.continuous_only:
magpie_featurizer = ElementProperty.from_preset("magpie")
magpie_featurizer.stats = ["mean", "avg_dev"]

self.composition_featurizers = (
BandCenter(),
ElementFraction(),
magpie_featurizer,
IonProperty(fast=self.fast_oxid),
Stoichiometry(p_list=[2, 3, 5, 7, 10]),
TMetalFraction(),
ValenceOrbital(props=["frac"]),
)
else:
self.composition_featurizers = (
AtomicOrbitals(),
AtomicPackingEfficiency(),
BandCenter(),
ElementFraction(),
ElementProperty.from_preset("magpie"),
IonProperty(),
Miedema(),
Stoichiometry(),
TMetalFraction(),
ValenceOrbital(),
YangSolidSolution(),
)

self.oxid_composition_featurizers = (
ElectronegativityDiff(),
Expand Down Expand Up @@ -145,20 +160,24 @@ def featurize_composition(self, df):

df = super().featurize_composition(df)

_orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
df["AtomicOrbitals|HOMO_character"] = df["AtomicOrbitals|HOMO_character"].map(
_orbitals
)
df["AtomicOrbitals|LUMO_character"] = df["AtomicOrbitals|LUMO_character"].map(
_orbitals
)

df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)
df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)
if not self.continuous_only:
_orbitals = {"s": 1, "p": 2, "d": 3, "f": 4}
df["AtomicOrbitals|HOMO_character"] = df[
"AtomicOrbitals|HOMO_character"
].map(_orbitals)
df["AtomicOrbitals|LUMO_character"] = df[
"AtomicOrbitals|LUMO_character"
].map(_orbitals)

df["AtomicOrbitals|HOMO_element"] = df["AtomicOrbitals|HOMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)
df["AtomicOrbitals|LUMO_element"] = df["AtomicOrbitals|LUMO_element"].apply(
lambda x: -1 if not isinstance(x, str) else Element(x).Z
)

else:
df.drop(columns=["IonProperty|max ionic char"], inplace=True)

return modnet.featurizers.clean_df(df)

Expand Down Expand Up @@ -224,8 +243,8 @@ class CompositionOnlyMatminer2023Featurizer(Matminer2023Featurizer):

"""

def __init__(self):
super().__init__()
def __init__(self, continuous_only: bool = False, fast_oxid: bool = False):
super().__init__(fast_oxid=fast_oxid, continuous_only=continuous_only)
self.oxid_composition_featurizers = ()
self.structure_featurizers = ()
self.site_featurizers = ()
Loading