Source code for kosmos.ml.datasets.bloodmnist_dataset

from importlib.resources import as_file, files
from typing import ClassVar

import numpy as np

from kosmos.ml.datasets.dataset import SLDataset


class BloodMNISTDataset(SLDataset):
    """BloodMNIST dataset for blood cell classification from biomedical images.

    The train, validation, and test splits stored in the packaged
    ``bloodmnist.npz`` archive are concatenated (in that order) into a single
    feature matrix and label vector before being handed to ``SLDataset``.

    Notes:
        - Number of instances: 17,092 (11,959 train + 1,712 val + 3,421 test)
        - Number of features: 2,352 numeric (28x28x3 RGB images, flattened)
        - Classes: 8 (different blood cell types)

    References:
        - MedMNIST: https://medmnist.com/

    """

    # Human-readable class labels, returned as-is by ``class_names``.
    BLOOD_CELL_CLASSES: ClassVar[list[str]] = [
        "basophil",
        "eosinophil",
        "erythroblast",
        "immature_granulocytes",
        "lymphocyte",
        "monocyte",
        "neutrophil",
        "platelet",
    ]

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Initialize the dataset.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features.
                Defaults to True.

        """
        path = files("kosmos.ml.datasets.data") / "bloodmnist.npz"
        splits = ("train", "val", "test")

        # ``np.load`` on an .npz archive returns a lazy NpzFile that holds an
        # open file handle. Read every array while both the resource path and
        # the archive are still open, then let the context managers close
        # them. ``np.concatenate`` allocates new arrays, so ``x`` and ``y``
        # remain valid after the archive is closed.
        with as_file(path) as p, np.load(p) as data:
            # Combine train, validation, and test sets
            x = np.concatenate([data[f"{split}_images"] for split in splits], axis=0)
            y = np.concatenate([data[f"{split}_labels"] for split in splits], axis=0)

        # Flatten RGB images to one feature row per sample
        x = x.reshape(x.shape[0], -1).astype(np.float32, copy=False)
        # Flatten labels to a 1-D vector
        y = y.flatten().astype(np.int64, copy=False)

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class labels for blood cell types."""
        return self.BLOOD_CELL_CLASSES