Source code for mattertune.data.matbench

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Literal

import ase
from torch.utils.data import Dataset
from typing_extensions import override

from ..registry import data_registry
from ..util import optional_import_error_message
from .base import DatasetConfigBase

if TYPE_CHECKING:
    from pymatgen.core.structure import Structure

log = logging.getLogger(__name__)


@data_registry.register
class MatbenchDatasetConfig(DatasetConfigBase):
    """Configuration for the Matbench dataset.

    Selects a single Matbench task and one of its folds. Calling
    :meth:`create_dataset` hands this configuration to ``MatbenchDataset``,
    which loads the combined train-and-validation split of that fold.
    """

    type: Literal["matbench"] = "matbench"
    """Discriminator for the Matbench dataset."""

    # Name of the Matbench task to load. The ``None`` default is only a
    # placeholder: ``MatbenchDataset`` raises ``ValueError`` (listing the
    # available task names) when no task is given.
    task: str | None = None
    """The name of the self.tasks to include in the dataset."""

    # Key under which each target value is stored in ``atoms.info``. When left
    # as ``None`` the dataset uses the task name as the key instead.
    property_name: str | None = None
    """Assign a property name for the self.task. Must match the property head in the model."""

    # Used by the dataset as an index into the loaded task's ``folds``.
    fold_idx: Literal[0, 1, 2, 3, 4] = 0
    """The index of the fold to be used in the dataset."""

    @override
    def create_dataset(self) -> MatbenchDataset:
        """Build the ``MatbenchDataset`` described by this configuration.

        Returns:
            A ``MatbenchDataset`` constructed from ``self``. Construction loads
            the task data immediately (see ``MatbenchDataset.__init__``).
        """
        return MatbenchDataset(self)
class MatbenchDataset(Dataset[ase.Atoms]):
    """Map-style dataset of ASE ``Atoms`` built from one fold of a Matbench task.

    On construction the configured task is loaded, the combined
    train-and-validation split of the configured fold is fetched, and every
    structure is converted to an ``ase.Atoms`` object whose ``info`` dict
    carries the target value. The held-out test structures of the same fold
    are available through :meth:`get_test_data`.
    """

    def __init__(self, config: MatbenchDatasetConfig):
        """Load the task and fold selected by ``config``.

        Args:
            config: Dataset configuration; ``config.task`` must be set.

        Raises:
            ValueError: If ``config.task`` is ``None``.
        """
        super().__init__()
        self.config = config
        self._initialize_benchmark()
        self._load_data()

    def _initialize_benchmark(self) -> None:
        """Initialize the Matbench benchmark and load the configured task.

        Raises:
            ValueError: If no task is configured. The message lists the task
                names reported by the benchmark's metadata.
        """
        with optional_import_error_message("matbench"):
            from matbench.bench import MatbenchBenchmark  # type: ignore[reportMissingImports] # noqa

        if self.config.task is None:
            # Instantiate without loading any data, only to enumerate the
            # valid task names for the error message.
            mb = MatbenchBenchmark(autoload=False)
            all_tasks = list(mb.metadata.keys())
            raise ValueError(f"Please specify a task from {all_tasks}")

        # Restrict the benchmark to the requested task, so exactly one task
        # object is created; then load its data explicitly.
        mb = MatbenchBenchmark(autoload=False, subset=[self.config.task])
        self._task = list(mb.tasks)[0]
        self._task.load()

    def _load_data(self) -> None:
        """Load the train-and-validation split of the configured fold."""
        fold = self._task.folds[self.config.fold_idx]
        inputs_data, outputs_data = self._task.get_train_and_val_data(fold)
        self._atoms_list = self._convert_structures_to_atoms(inputs_data, outputs_data)
        # Lazy %-formatting: the message is only rendered if INFO is enabled.
        log.info(
            "Loaded %d samples (fold %d)",
            len(self._atoms_list),
            self.config.fold_idx,
        )

    def _convert_structures_to_atoms(
        self,
        structures: list[Structure],
        property_values: list[float] | None = None,
    ) -> list[ase.Atoms]:
        """Convert pymatgen structures to ASE atoms.

        Args:
            structures: Sequence of pymatgen Structure objects.
            property_values: Optional sequence of target values, aligned
                position-by-position with ``structures``. Each value is stored
                in ``atoms.info`` under ``config.property_name`` (or under
                ``config.task`` when no property name is configured).

        Returns:
            List of ASE ase.Atoms objects, in the order of ``structures``.

        Raises:
            ValueError: If ``property_values`` is given and its length differs
                from the number of structures.
        """
        with optional_import_error_message("pymatgen"):
            from pymatgen.io.ase import AseAtomsAdaptor  # type: ignore[reportMissingImports] # noqa

        # Materialize the targets by iteration so the later lookup is purely
        # positional. The values passed in from the task may be a
        # label-indexed pandas Series, for which ``series[i]`` with an integer
        # is a deprecated positional fallback.
        targets = None if property_values is None else list(property_values)
        if targets is not None and len(targets) != len(structures):
            raise ValueError(
                f"Got {len(targets)} property values for "
                f"{len(structures)} structures; expected equal lengths."
            )

        adapter = AseAtomsAdaptor()
        prop_name = (
            self.config.property_name
            if self.config.property_name is not None
            else self.config.task
        )

        atoms_list: list[ase.Atoms] = []
        for i, structure in enumerate(structures):
            atoms = adapter.get_atoms(structure)
            # Narrows the adaptor's return type for static type checkers.
            assert isinstance(atoms, ase.Atoms), "Expected an Atoms object"
            if targets is not None:
                atoms.info[prop_name] = targets[i]
            atoms_list.append(atoms)

        return atoms_list

    @override
    def __getitem__(self, idx: int) -> ase.Atoms:
        """Get an item from the dataset by index."""
        return self._atoms_list[idx]

    def __len__(self) -> int:
        """Get the total number of items in the dataset."""
        return len(self._atoms_list)

    def get_test_data(self) -> list[ase.Atoms]:
        """Load the test data for the current task and fold.

        The test split is requested without targets, so the returned atoms do
        not have the property value set in ``atoms.info``.

        Returns:
            List of ASE ase.Atoms objects from the test set.
        """
        fold = self._task.folds[self.config.fold_idx]
        test_inputs = self._task.get_test_data(fold, include_target=False)
        return self._convert_structures_to_atoms(test_inputs)