Source code for kosmos.ml.datasets.income_dataset

from importlib.resources import files
from typing import ClassVar

import numpy as np
import pandas as pd

from kosmos.ml.datasets.dataset import SLDataset



[docs]
class IncomeDataset(SLDataset):
    """Adult Income (Census) dataset — binary classification (>50K vs <=50K).

    Notes:
        - Instances: 48,842 (32,561 train + 16,281 test); this class loads only
          the packaged ``adult.data`` file.
        - Features: 14 (mix of numerical and categorical); one-hot encoding
          expands the categorical ones into additional columns.
        - Classes: 2 (imbalanced; ~24% >50K, ~76% <=50K)

    References:
        - UCI ML Repository — Adult
          https://archive.ics.uci.edu/ml/machine-learning-databases/adult/

    """

    # Column names in file order; the raw CSV has no header row.
    COLS: ClassVar[list[str]] = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
        "income",
    ]

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Initialize the dataset.

        Reads ``adult.data`` from the ``kosmos.ml.datasets.data`` package,
        encodes the ``income`` column as 0/1 labels, one-hot encodes the
        categorical features and passes the resulting arrays to the base class.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features.
                Defaults to True.

        Raises:
            ValueError: If the ``income`` column contains a value other than
                ``>50K`` or ``<=50K`` (an optional trailing ``.`` is ignored).

        """
        resource = files("kosmos.ml.datasets.data") / "adult.data"
        with resource.open("r", encoding="utf-8") as f:
            df = pd.read_csv(
                f,
                header=None,
                names=self.COLS,
                sep=",",
                skipinitialspace=True,
                na_values="?",  # the file marks missing entries with "?"
            )

        # Normalise the label text (strip whitespace, remove any "." so that
        # ">50K." style labels are accepted) and map it to 0/1.
        labels = (
            df["income"]
            .astype(str)
            .str.strip()
            .str.replace(".", "", regex=False)
            .map({">50K": 1, "<=50K": 0})
        )
        unmapped = labels.isna()
        if unmapped.any():
            # Fail with an explicit message instead of an opaque NA -> int cast error.
            bad_values = df.loc[unmapped, "income"].unique().tolist()
            msg = f"Unexpected income labels in adult.data: {bad_values!r}"
            raise ValueError(msg)
        y = labels.astype("int64").to_numpy()

        x = df.drop(columns=["income"])

        # One-hot encoding. Categorical columns are selected as "everything that
        # is not numeric/bool" so the selection works whether pandas stores text
        # as ``object`` or as the dedicated ``str`` dtype; a literal "str"
        # selector is rejected by pandas versions that use ``object`` for text.
        cat_cols = x.select_dtypes(exclude=["number", "bool"]).columns.tolist()
        x = pd.get_dummies(x, columns=cat_cols, drop_first=True)

        # Set any remaining missing values to 0.
        x = x.fillna(0)

        x = x.to_numpy(dtype=np.float32)

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class labels: 0 -> <=50K, 1 -> >50K."""
        return ["<=50K", ">50K"]