|
| 1 | +"""Define base schemas for machine learning interatomic potential data.""" |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from functools import cached_property |
| 6 | +from typing import TYPE_CHECKING |
| 7 | + |
| 8 | +import numpy as np |
| 9 | +from pydantic import Field |
| 10 | + |
| 11 | +from emmet.core.structure import StructureMetadata |
| 12 | +from emmet.core.math import Matrix3D, Vector3D, Vector6D, matrix_3x3_to_voigt |
| 13 | +from emmet.core.types.pymatgen_types.composition_adapter import CompositionType |
| 14 | +from emmet.core.types.pymatgen_types.element_adapter import ElementType |
| 15 | +from emmet.core.vasp.calc_types import RunType as VaspRunType |
| 16 | + |
| 17 | +from pymatgen.core import Element, Structure |
| 18 | + |
| 19 | +if TYPE_CHECKING: |
| 20 | + from typing_extensions import Self |
| 21 | + |
| 22 | + from emmet.core.tasks import TaskDoc |
| 23 | + |
class MLTrainDoc(StructureMetadata, extra="allow"):  # type: ignore[call-arg]
    """Generic schema for ML training data.

    Stores a lightweight representation of a structure (cell, atomic numbers,
    Cartesian coordinates) together with the labels commonly used to train
    interatomic potentials (energy, forces, stress, magnetic moments, ...).
    Extra fields are permitted via ``extra="allow"``.
    """

    cell: Matrix3D | None = Field(
        None,
        description="The 3x3 matrix of cell/lattice vectors, such that a is the first row, b the second, and c the third.",
    )

    atomic_numbers: list[int] | None = Field(
        None,
        description="The list of proton numbers at each site. Should be the same length as `cart_coords`",
    )

    cart_coords: list[Vector3D] | None = Field(
        None,
        description="The list of Cartesian coordinates of each atom. Should be the same length as `atomic_numbers`.",
    )

    magmoms: list[float] | None = Field(
        None, description="The list of on-site magnetic moments."
    )

    energy: float | None = Field(
        None, description="The total energy associated with this structure."
    )

    forces: list[Vector3D] | None = Field(
        None,
        description="The interatomic forces corresponding to each site in the structure.",
    )

    abs_forces: list[float] | None = Field(
        None, description="The magnitude of the interatomic force on each site."
    )

    stress: Vector6D | None = Field(
        None,
        description="The components of the symmetric stress tensor in Voigt notation (xx, yy, zz, yz, xz, xy).",
    )

    stress_matrix: Matrix3D | None = Field(
        None,
        description="The 3x3 stress tensor. Use this if the tensor is unphysically non-symmetric.",
    )

    bandgap: float | None = Field(None, description="The final DFT bandgap.")

    elements: list[ElementType] | None = Field(
        None,
        description="List of unique elements in the material sorted alphabetically.",
    )

    composition: CompositionType | None = Field(
        None, description="Full composition for the material."
    )

    composition_reduced: CompositionType | None = Field(
        None,
        title="Reduced Composition",
        description="Simplified representation of the composition.",
    )

    functional: VaspRunType | None = Field(
        None, description="The approximate functional used to generate this entry."
    )

    bader_charges: list[float] | None = Field(
        None, description="Bader charges on each site of the structure."
    )
    bader_magmoms: list[float] | None = Field(
        None,
        description="Bader on-site magnetic moments for each site of the structure.",
    )

    @cached_property
    def structure(self) -> Structure:
        """Get the structure associated with this entry.

        The structure is rebuilt from `cell`, `atomic_numbers` and
        `cart_coords`. When `magmoms` is set (and non-empty) it is attached
        as the ``magmom`` site property.
        """
        site_props = {"magmom": self.magmoms} if self.magmoms else None
        return Structure(
            np.array(self.cell),
            [Element.from_Z(z) for z in self.atomic_numbers],  # type: ignore[union-attr]
            self.cart_coords,  # type: ignore[arg-type]
            coords_are_cartesian=True,
            site_properties=site_props,
        )

    @classmethod
    def from_structure(
        cls,
        meta_structure: Structure,
        fields: list[str] | None = None,
        **kwargs,
    ) -> Self:
        """
        Create an ML training document from an ordered structure and fields.

        This method mostly exists to ensure that the structure field is
        set because `meta_structure` does not populate it automatically.

        Parameters
        -----------
        meta_structure : Structure
            An ordered structure
        fields : list of str or None
            Additional fields in the document to populate
        **kwargs
            Any other fields / constructor kwargs

        Raises
        ------
        ValueError
            If `meta_structure` is not ordered.
        """
        if not meta_structure.is_ordered:
            raise ValueError(
                f"{cls.__name__} only supports ordered structures at this time."
            )

        # Derive per-site force magnitudes unless the caller supplied them.
        if (forces := kwargs.get("forces")) is not None and kwargs.get(
            "abs_forces"
        ) is None:
            # Cast to built-in floats so the stored values are plain Python
            # numbers rather than numpy scalars.
            kwargs["abs_forces"] = [float(np.linalg.norm(f)) for f in forces]

        # Magnetic moments carried by the structure take precedence over any
        # `magmoms` passed through kwargs.
        if magmoms := meta_structure.site_properties.get("magmom"):
            kwargs["magmoms"] = magmoms

        return super().from_structure(
            meta_structure=meta_structure,
            fields=fields,
            cell=meta_structure.lattice.matrix,
            atomic_numbers=[site.specie.Z for site in meta_structure],
            cart_coords=meta_structure.cart_coords,
            **kwargs,
        )

    @classmethod
    def from_task_doc(
        cls,
        task_doc: TaskDoc,
        **kwargs,
    ) -> list[Self]:
        """Create a list of ML training documents from the ionic steps in a TaskDoc.

        One document is produced per ionic step of every calculation in
        `task_doc.calcs_reversed`, which is traversed back-to-front
        (presumably chronological order -- verify against `TaskDoc`).

        Parameters
        -----------
        task_doc : TaskDoc
        **kwargs
            Any kwargs to pass to `from_structure`.

        Returns
        -------
        list
            The training documents, one per ionic step.
        """
        entries = []

        for cr in task_doc.calcs_reversed[::-1]:
            nion = len(cr.output.ionic_steps)

            for iion, ionic_step in enumerate(cr.output.ionic_steps):
                structure = Structure.from_dict(ionic_step.structure.as_dict())
                step_kwargs = {}

                # These fields should only be set on the final frame of a
                # calculation. Magmoms are patched in here as well because
                # they are read from the calculation's output structure
                # rather than from the ionic step.
                if iion == nion - 1:
                    if magmom := cr.output.structure.site_properties.get("magmom"):
                        structure.add_site_property("magmom", magmom)
                    step_kwargs["bandgap"] = cr.output.bandgap
                    if bader_analysis := cr.bader:
                        for bk in ("charge", "magmom"):
                            # Not every Bader summary carries both keys
                            # (e.g. no "magmom" without spin polarization);
                            # skip missing ones instead of raising KeyError.
                            if bk in bader_analysis:
                                step_kwargs[f"bader_{bk}s"] = bader_analysis[bk]

                # Stress is read from the ionic step itself, so it is
                # recorded for every frame that provides one.
                if (_st := ionic_step.stress) is not None:
                    st = np.array(_st)
                    if np.allclose(st, st.T, rtol=1e-8):
                        # Stress tensor is symmetric
                        step_kwargs["stress"] = matrix_3x3_to_voigt(_st)
                    else:
                        # Stress tensor is non-symmetric
                        step_kwargs["stress_matrix"] = _st

                entries.append(
                    cls.from_structure(
                        meta_structure=structure,
                        energy=ionic_step.e_0_energy,
                        forces=ionic_step.forces,
                        functional=cr.run_type,
                        **step_kwargs,
                        **kwargs,
                    )
                )
        return entries

    @cached_property
    def to_ase_atoms(self):
        """Get the ASE Atoms associated with this entry.

        The returned atoms carry a `SinglePointCalculator` populated with
        this entry's `energy`, `forces`, `stress` and `magmoms`.

        Raises
        ------
        ImportError
            If ASE is not installed.
        """

        try:
            from ase.calculators.singlepoint import SinglePointCalculator
            from ase import Atoms
        except ImportError as exc:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(
                "You must `pip install ase` to use the atoms functionality here!"
            ) from exc

        # NOTE(review): `pbc` is left at the ASE default even though a cell
        # is supplied -- confirm whether periodic boundaries should be set.
        atoms = Atoms(
            positions=self.cart_coords,
            numbers=self.atomic_numbers,
            cell=self.cell,
        )
        # NOTE(review): `stress_matrix` is not forwarded to the calculator;
        # only the Voigt-form `stress` is -- confirm this is intended.
        calc = SinglePointCalculator(
            atoms,
            **{
                k: getattr(self, k, None)
                for k in ("energy", "forces", "stress", "magmoms")
            },
        )
        atoms.calc = calc
        return atoms
0 commit comments