Skip to content

Commit 22f47c5

Browse files
scaffold new namespace
1 parent ca157a4 commit 22f47c5

File tree

12 files changed

+2470
-0
lines changed

12 files changed

+2470
-0
lines changed

mpcontribs-lux/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

mpcontribs-lux/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
## MPContribs-LUX
2+
3+
<span style="color:forestgreen"><i>Ego sum lux datorum</i></span>.
4+
5+
MPContribs-lux is a package which <i>sheds light</i> on data stored on the [Materials Project's AWS S3 OpenData bucket](https://materialsproject-contribs.s3.amazonaws.com/index.html#) by providing annotated schemas and, optionally, analysis tools to better explore user-submitted data.

mpcontribs-lux/mpcontribs/lux/autogen/__init__.py

Whitespace-only changes.

mpcontribs-lux/mpcontribs/lux/projects/__init__.py

Whitespace-only changes.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc
2+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MatPES_2025_1 import MatPESTrainDoc
3+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MP_ALOE_2025 import MPAloeTrainDoc
4+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MPtrj_2022_9 import MPtrjTrainDoc
5+
6+
# Names re-exported as the public API of this package.
__all__ = [
    "MLTrainDoc",
    "MatPESTrainDoc",
    "MPAloeTrainDoc",
    "MPtrjTrainDoc",
]
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""Define schemas for the MP-ALOE 2025 dataset."""
2+
from pydantic import Field
3+
4+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.MatPES_2025_1 import MatPESTrainDoc
5+
6+
class MPAloeTrainDoc(MatPESTrainDoc):
    """Schema for entries of the MP-ALOE dataset.

    Extends ``MatPESTrainDoc`` with the MP-ALOE-specific fields declared
    below. Every field added here is optional and defaults to ``None``.
    """

    # Identifier of the entry within MP-ALOE.
    mp_aloe_id: str | None = Field(
        default=None,
        description="The identifier of this entry in MP-ALOE.",
    )

    # Index of the ionic step this frame corresponds to.
    ionic_step_number: int | None = Field(
        default=None,
        description="The ionic step index of this frame.",
    )

    # Index of the prototype structure used when generating the entry.
    prototype_number: int | None = Field(
        default=None,
        description="The index of the prototype structure used in generation.",
    )

    # Flag for (likely) charge-balanced structures.
    is_charge_balanced: bool | None = Field(
        default=None,
        description="Whether the structure is likely charge balanced.",
    )

    # Flag for overlapping pseudopotential cores between nearest neighbors.
    has_overlapping_pseudo_cores: bool | None = Field(
        default=None,
        description=(
            "Whether the pseudopotential cores overlap "
            "for at least one set of nearest neighbors."
        ),
    )
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""Define schemas for the MPtrj v2022.9 dataset."""
2+
3+
from pydantic import BaseModel, Field
4+
5+
from emmet.core.types.typing import IdentifierType
6+
7+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc
8+
9+
class MPtrjProvenance(BaseModel):
    """Metadata describing where an MPtrj entry came from.

    All fields are optional and default to ``None``.
    """

    # Identifier of the material this frame belongs to.
    material_id: IdentifierType | None = Field(
        None, description="The Materials Project (summary) ID for this material."
    )
    # Identifier of the task (calculation) this frame was taken from.
    # The description previously duplicated the one for `material_id`.
    task_id: IdentifierType | None = Field(
        None,
        description="The Materials Project task ID of the calculation for this frame.",
    )
    # Position of the source calculation within the reversed calculation list.
    calcs_reversed_index: int | None = Field(
        None, description="The index of the reversed calculations, if applicable."
    )
    # Position of the frame within the ionic steps of that calculation.
    ionic_step_index: int | None = Field(
        None, description="The index of the ionic step, if applicable."
    )
24+
25+
26+
class MPtrjTrainDoc(MLTrainDoc):
    """Schema for entries of the MPtrj dataset.

    Extends ``MLTrainDoc`` with cohesive energies and provenance metadata,
    and redeclares ``energy`` with an MPtrj-specific description.
    Every field declared here is optional and defaults to ``None``.
    """

    # Total energy, without corrections applied.
    energy: float | None = Field(
        default=None,
        description="The total uncorrected energy associated with this structure.",
    )

    # Cohesive energy per atom, without corrections applied.
    cohesive_energy_per_atom: float | None = Field(
        default=None,
        description="The uncorrected cohesive energy per atom of this material.",
    )

    # Cohesive energy per atom after the GGA / GGA+U mixing corrections.
    corrected_cohesive_energy_per_atom: float | None = Field(
        default=None,
        description=(
            "The corrected cohesive energy per atom of this material, using "
            "the Materials Project GGA / GGA+U mixing scheme."
        ),
    )

    # Origin of the frame (material, task, calculation and ionic step).
    provenance: MPtrjProvenance | None = Field(
        default=None,
        description="Metadata for this frame.",
    )
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""Define schemas for the MatPES 2025.1 dataset."""
2+
3+
from pydantic import BaseModel, Field
4+
5+
from emmet.core.types.typing import IdentifierType
6+
7+
from mpcontribs.lux.projects.esoteric_ephemera.schemas.base import MLTrainDoc
8+
9+
class MatPESProvenanceDoc(BaseModel):
    """Information regarding the origins of a MatPES structure.

    All fields are optional and default to ``None``.
    """

    # Identifier of the Materials Project structure the entry derives from.
    original_mp_id: IdentifierType | None = Field(
        None,
        description="MP identifier corresponding to the Materials Project structure from which this entry was sourced.",
    )
    # Materials Project release the source structure was taken from.
    materials_project_version: str | None = Field(
        None,
        description="The version of the Materials Project from which the structure was sourced.",
    )
    # Name of the molecular dynamics ensemble used for sampling.
    md_ensemble: str | None = Field(
        None,
        description="The molecular dynamics ensemble used to generate this structure.",
    )
    # MD temperature, in Kelvin per the field description.
    md_temperature: float | None = Field(
        None,
        description="If a float, the temperature in Kelvin at which MLMD was performed.",
    )
    # MD pressure, in atmospheres per the field description.
    md_pressure: float | None = Field(
        None,
        description="If a float, the pressure in atmosphere at which MLMD was performed.",
    )
    # MD step the structure was sampled from.
    md_step: int | None = Field(
        None,
        description="The step in the MD simulation from which the structure was sampled.",
    )
    # Name of the machine-learned interatomic potential that drove the MD.
    mlip_name: str | None = Field(
        None, description="The name of the ML potential used to perform MLMD."
    )
39+
40+
41+
class MatPESTrainDoc(MLTrainDoc):
    """
    Schema for VASP data in the Materials Potential Energy Surface (MatPES) effort.

    This schema describes the data entries of MatPES v2025.1, which are
    available for download:
        - on [MPContribs](https://materialsproject-contribs.s3.amazonaws.com/index.html#MatPES_2025_1/)
        - or on the MatPES project website.

    Every field declared here is optional and defaults to ``None``.
    """

    # Identifier of the entry within MatPES.
    matpes_id: str | None = Field(default=None, description="MatPES identifier.")

    # Formation enthalpy per atom, without corrections applied.
    formation_energy_per_atom: float | None = Field(
        default=None,
        description=(
            "The uncorrected formation enthalpy per atom "
            "at zero pressure and temperature."
        ),
    )

    # Cohesive energy per atom, without corrections applied.
    cohesive_energy_per_atom: float | None = Field(
        default=None,
        description="The uncorrected cohesive energy per atom.",
    )

    # Where the structure came from (source MP entry, MD settings, ...).
    provenance: MatPESProvenanceDoc | None = Field(
        default=None,
        description="Information about the provenance of the structure.",
    )

    @property
    def pressure(self) -> float | None:
        """Return the pressure from the DFT stress tensor.

        The value is the mean of the first three entries of ``stress``
        (the xx, yy, zz components in the Voigt ordering used by the
        parent schema). ``None`` is returned when ``stress`` is unset
        or otherwise falsy.
        """
        if not self.stress:
            return None
        diagonal = self.stress[:3]
        return sum(diagonal) / 3.0

mpcontribs-lux/mpcontribs/lux/projects/esoteric_ephemera/schemas/__init__.py

Whitespace-only changes.
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
"""Define base schemas for machine learning interatomic potential data."""
2+
3+
from __future__ import annotations
4+
5+
from functools import cached_property
6+
from typing import TYPE_CHECKING
7+
8+
import numpy as np
9+
from pydantic import Field
10+
11+
from emmet.core.structure import StructureMetadata
12+
from emmet.core.math import Matrix3D, Vector3D, Vector6D, matrix_3x3_to_voigt
13+
from emmet.core.types.pymatgen_types.composition_adapter import CompositionType
14+
from emmet.core.types.pymatgen_types.element_adapter import ElementType
15+
from emmet.core.vasp.calc_types import RunType as VaspRunType
16+
17+
from pymatgen.core import Element, Structure
18+
19+
if TYPE_CHECKING:
20+
from typing_extensions import Self
21+
22+
from emmet.core.tasks import TaskDoc
23+
24+
class MLTrainDoc(StructureMetadata, extra="allow"):  # type: ignore[call-arg]
    """Generic schema for ML training data.

    A document stores one frame: the cell, atomic numbers and Cartesian
    coordinates of a structure together with optional labels (energy,
    forces, stress, magnetic moments, band gap, Bader analysis).

    The model is declared with ``extra="allow"``. All fields declared here
    are optional and default to ``None``.
    """

    cell: Matrix3D | None = Field(
        None,
        description="The 3x3 matrix of cell/lattice vectors, such that a is the first row, b the second, and c the third.",
    )

    atomic_numbers: list[int] | None = Field(
        None,
        description="The list of proton numbers at each site. Should be the same length as `cart_coords`",
    )

    cart_coords: list[Vector3D] | None = Field(
        None,
        description="The list of Cartesian coordinates of each atom. Should be the same length as `atomic_numbers`.",
    )

    magmoms: list[float] | None = Field(
        None, description="The list of on-site magnetic moments."
    )

    energy: float | None = Field(
        None, description="The total energy associated with this structure."
    )

    forces: list[Vector3D] | None = Field(
        None,
        description="The interatomic forces corresponding to each site in the structure.",
    )

    abs_forces: list[float] | None = Field(
        None, description="The magnitude of the interatomic force on each site."
    )

    stress: Vector6D | None = Field(
        None,
        description="The components of the symmetric stress tensor in Voigt notation (xx, yy, zz, yz, xz, xy).",
    )

    stress_matrix: Matrix3D | None = Field(
        None,
        description="The 3x3 stress tensor. Use this if the tensor is unphysically non-symmetric.",
    )

    bandgap: float | None = Field(None, description="The final DFT bandgap.")

    elements: list[ElementType] | None = Field(
        None,
        description="List of unique elements in the material sorted alphabetically.",
    )

    composition: CompositionType | None = Field(
        None, description="Full composition for the material."
    )

    composition_reduced: CompositionType | None = Field(
        None,
        title="Reduced Composition",
        description="Simplified representation of the composition.",
    )

    functional: VaspRunType | None = Field(
        None, description="The approximate functional used to generate this entry."
    )

    bader_charges: list[float] | None = Field(
        None, description="Bader charges on each site of the structure."
    )
    bader_magmoms: list[float] | None = Field(
        None,
        description="Bader on-site magnetic moments for each site of the structure.",
    )

    @cached_property
    def structure(self) -> Structure:
        """Get the structure associated with this entry.

        The structure is built from ``cell``, ``atomic_numbers`` and
        ``cart_coords`` (passed as Cartesian coordinates). When ``magmoms``
        is non-empty it is attached as the ``"magmom"`` site property.
        The result is cached after the first access.
        """
        site_props = {"magmom": self.magmoms} if self.magmoms else None
        return Structure(
            np.array(self.cell),
            [Element.from_Z(z) for z in self.atomic_numbers],  # type: ignore[union-attr]
            self.cart_coords,  # type: ignore[arg-type]
            coords_are_cartesian=True,
            site_properties=site_props,
        )

    @classmethod
    def from_structure(
        cls,
        meta_structure: Structure,
        fields: list[str] | None = None,
        **kwargs,
    ) -> Self:
        """
        Create an ML training document from an ordered structure and fields.

        This method mostly exists to ensure that the structure field is
        set because `meta_structure` does not populate it automatically.

        Parameters
        -----------
        meta_structure : Structure
            An ordered structure
        fields : list of str or None
            Additional fields in the document to populate
        **kwargs
            Any other fields / constructor kwargs

        Returns
        -----------
        Self
            The document returned by the parent ``from_structure``, with
            ``cell``, ``atomic_numbers`` and ``cart_coords`` taken from
            `meta_structure`.

        Raises
        -----------
        ValueError
            If `meta_structure` is not ordered.
        """
        if not meta_structure.is_ordered:
            raise ValueError(
                f"{cls.__name__} only supports ordered structures at this time."
            )

        # Derive per-site force magnitudes unless the caller supplied them.
        if (forces := kwargs.get("forces")) is not None and kwargs.get(
            "abs_forces"
        ) is None:
            kwargs["abs_forces"] = [np.linalg.norm(f) for f in forces]

        # Magnetic moments stored on the structure take precedence over
        # any `magmoms` passed through kwargs.
        if magmoms := meta_structure.site_properties.get("magmom"):
            kwargs["magmoms"] = magmoms

        return super().from_structure(
            meta_structure=meta_structure,
            fields=fields,
            cell=meta_structure.lattice.matrix,
            atomic_numbers=[site.specie.Z for site in meta_structure],
            cart_coords=meta_structure.cart_coords,
            **kwargs,
        )

    @classmethod
    def from_task_doc(
        cls,
        task_doc: TaskDoc,
        **kwargs,
    ) -> list[Self]:
        """Create a list of ML training documents from the ionic steps in a TaskDoc.

        One document is created per ionic step of every calculation in
        ``task_doc.calcs_reversed``, which is traversed in reverse order.

        Parameters
        -----------
        task_doc : TaskDoc
        **kwargs
            Any kwargs to pass to `from_structure`.

        Returns
        -----------
        list of Self
            The documents, in the order the ionic steps were visited.
        """
        entries = []

        for cr in task_doc.calcs_reversed[::-1]:
            nion = len(cr.output.ionic_steps)

            for iion, ionic_step in enumerate(cr.output.ionic_steps):
                structure = Structure.from_dict(ionic_step.structure.as_dict())
                # these are fields that should only be set on the final frame of a calculation
                # also patch in magmoms because of how Calculation works
                last_step_kwargs = {}
                if iion == nion - 1:
                    if magmom := cr.output.structure.site_properties.get("magmom"):
                        structure.add_site_property("magmom", magmom)
                    last_step_kwargs["bandgap"] = cr.output.bandgap
                    if bader_analysis := cr.bader:
                        for bk in (
                            "charge",
                            "magmom",
                        ):
                            last_step_kwargs[f"bader_{bk}s"] = bader_analysis[bk]

                # NOTE(review): the stress is assumed to be recorded for every
                # ionic step (not only the last one) - confirm against the
                # upstream file, whose indentation is not visible here.
                if (_st := ionic_step.stress) is not None:
                    st = np.array(_st)
                    if np.allclose(st, st.T, rtol=1e-8):
                        # Stress tensor is symmetric
                        last_step_kwargs["stress"] = matrix_3x3_to_voigt(_st)
                    else:
                        # Stress tensor is non-symmetric
                        last_step_kwargs["stress_matrix"] = _st

                entries.append(
                    cls.from_structure(
                        meta_structure=structure,
                        energy=ionic_step.e_0_energy,
                        forces=ionic_step.forces,
                        functional=cr.run_type,
                        **last_step_kwargs,
                        **kwargs,
                    )
                )
        return entries

    @cached_property
    def to_ase_atoms(self):
        """Get the ASE Atoms associated with this entry.

        This is a cached property: access it as ``doc.to_ase_atoms``
        without calling it. The returned ``Atoms`` carries a
        ``SinglePointCalculator`` populated with this entry's ``energy``,
        ``forces``, ``stress`` and ``magmoms``.

        Raises
        -----------
        ImportError
            If ASE is not installed.
        """

        try:
            from ase.calculators.singlepoint import SinglePointCalculator
            from ase import Atoms
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "You must `pip install ase` to use the atoms functionality here!"
            ) from exc

        atoms = Atoms(
            positions=self.cart_coords,
            numbers=self.atomic_numbers,
            cell=self.cell,
        )
        calc = SinglePointCalculator(
            atoms,
            **{
                k: getattr(self, k, None)
                # A tuple keeps the keyword order deterministic.
                for k in ("energy", "forces", "stress", "magmoms")
            },
        )
        atoms.calc = calc
        return atoms

0 commit comments

Comments
 (0)