Source code for kosmos.ml.datasets.bloodmnist_dataset

from importlib.resources import as_file, files
from typing import ClassVar

import numpy as np

from kosmos.ml.datasets.dataset import SLDataset



[docs]
class BloodMNISTDataset(SLDataset):
    """BloodMNIST dataset for blood cell classification from biomedical images.

    The train, validation, and test splits stored in the packaged
    ``bloodmnist.npz`` archive are concatenated (in that order) into a single
    dataset, and every image is flattened into one feature vector.

    Notes:
      - Number of instances: 17,092 (11,959 train + 1,712 val + 3,421 test)
      - Number of features: 2,352 numeric (28x28x3 RGB images, flattened)
      - Classes: 8 (different blood cell types)

    References:
      - MedMNIST: https://medmnist.com/

    """

    # Human-readable names of the eight blood cell classes.
    # NOTE(review): presumably ordered so that list index == integer label in
    # the archive -- confirm against the MedMNIST label mapping.
    BLOOD_CELL_CLASSES: ClassVar[list[str]] = [
        "basophil",
        "eosinophil",
        "erythroblast",
        "immature_granulocytes",
        "lymphocyte",
        "monocyte",
        "neutrophil",
        "platelet",
    ]

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Initialize the dataset.

        Loads ``bloodmnist.npz`` from the ``kosmos.ml.datasets.data`` package,
        merges its train/val/test splits, flattens the images and labels, and
        forwards the resulting arrays to the parent class.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features.
                Defaults to True.

        """
        path = files("kosmos.ml.datasets.data") / "bloodmnist.npz"

        # ``np.load`` on an ``.npz`` archive returns a lazy ``NpzFile`` that keeps
        # the underlying file open. Managing it in the same ``with`` statement
        # closes the handle deterministically, and before ``as_file`` cleans up
        # any temporary copy of the resource.
        with as_file(path) as p, np.load(p) as data:
            splits = ("train", "val", "test")

            # Combine train, validation, and test sets (arrays are read from the
            # archive here, while it is still open).
            x = np.concatenate([data[f"{split}_images"] for split in splits], axis=0)
            y = np.concatenate([data[f"{split}_labels"] for split in splits], axis=0)

        # Flatten each image into a single float32 feature vector.
        x = x.reshape(x.shape[0], -1).astype(np.float32, copy=False)

        # Flatten labels to a 1-D int64 vector.
        y = y.flatten().astype(np.int64, copy=False)

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class labels for blood cell types.

        A new list is returned on every access so that callers cannot
        accidentally mutate the shared class-level ``BLOOD_CELL_CLASSES``.
        """
        return list(self.BLOOD_CELL_CLASSES)