Source code for kosmos.ml.datasets.wdbc_dataset
from importlib.resources import files
import numpy as np
import pandas as pd
from kosmos.ml.datasets.dataset import SLDataset
[docs]
class WDBCDataset(SLDataset):
    """Wisconsin Diagnostic Breast Cancer (WDBC) dataset for binary classification.

    The raw data is read from the ``wdbc.data`` resource bundled in the
    ``kosmos.ml.datasets.data`` package. The sample ``id`` column is
    discarded, and the diagnosis letter is encoded as an integer label
    (``"B"`` -> 0, ``"M"`` -> 1).

    Notes:
        - Number of instances: 569
        - Number of features: 30 numeric
        - Classes: 2 (slightly imbalanced; 357 benign, 212 malignant)

    References:
        - UCI Machine Learning Repository — WDBC dataset: https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic
    """

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Load the bundled WDBC file and initialize the dataset.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the
                features. The flag is passed through unchanged to the
                ``SLDataset`` initializer. Defaults to True.
        """
        # The raw file has no header row: a sample id, the diagnosis letter,
        # then 30 feature columns, which are given synthetic names f0..f29.
        column_names = ["id", "diagnosis", *(f"f{i}" for i in range(30))]

        resource = files("kosmos.ml.datasets.data") / "wdbc.data"
        with resource.open("r", encoding="utf-8") as handle:
            frame = pd.read_csv(handle, header=None, names=column_names)

        # Encode the diagnosis letter as an integer class label.
        labels = frame["diagnosis"].map({"B": 0, "M": 1}).astype("int64")

        # Everything except the sample id and the label is a feature.
        features = frame.drop(columns=["id", "diagnosis"])

        x = features.to_numpy(np.float32)
        y = labels.to_numpy()
        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return the human-readable class names, indexed by integer label.

        Index 0 is "Benign" (raw label ``B``) and index 1 is "Malignant"
        (raw label ``M``), matching the encoding applied in ``__init__``.
        """
        return ["Benign", "Malignant"]