"""Source code for kosmos.ml.datasets.income_dataset."""
from importlib.resources import files
from typing import ClassVar
import numpy as np
import pandas as pd
from kosmos.ml.datasets.dataset import SLDataset
# [docs]
class IncomeDataset(SLDataset):
    """Adult Income (Census) dataset — binary classification (>50K vs <=50K).

    Notes:
        - Instances: 48,842 in the full UCI release (32,561 train + 16,281 test).
          This class reads only the packaged ``adult.data`` file.
        - Features: 14 (mix of numerical and categorical); one-hot encoding of the
          categorical features yields more columns.
        - Classes: 2 (imbalanced; ~24% >50K, ~76% <=50K)

    References:
        - UCI ML Repository — Adult
          https://archive.ics.uci.edu/ml/machine-learning-databases/adult/

    """

    # Column names in file order; the data file itself has no header row.
    COLS: ClassVar[list[str]] = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
        "income",
    ]

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Load the packaged data file, encode it and initialize the dataset.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features.
                Defaults to True.

        Raises:
            ValueError: If the income column holds a label other than ``>50K`` or
                ``<=50K`` (surrounding whitespace and a period are tolerated).

        """
        data_file = files("kosmos.ml.datasets.data") / "adult.data"
        with data_file.open("r", encoding="utf-8") as f:
            df = pd.read_csv(
                f,
                header=None,
                names=self.COLS,
                sep=",",
                skipinitialspace=True,
                na_values="?",
            )

        # Normalize the raw label text before mapping it to 0/1: strip whitespace
        # and drop any period so that ">50K." and ">50K" are treated alike.
        labels = (
            df["income"]
            .astype(str)
            .str.strip()
            .str.replace(".", "", regex=False)
            .map({">50K": 1, "<=50K": 0})
            .astype("Int64")
        )

        # Fail with a descriptive message instead of the generic pandas error
        # that the integer cast below would raise on unmapped (NA) labels.
        unmapped = labels.isna()
        if unmapped.any():
            unknown = sorted(df.loc[unmapped, "income"].astype(str).unique())
            msg = f"Unexpected income label(s) in adult.data: {unknown}"
            raise ValueError(msg)

        y = labels.astype("int64").to_numpy()
        x = df.drop(columns=["income"])

        # One-hot encoding. Categorical columns are picked as "everything that is
        # not numeric" rather than via the "str" dtype alias, because pandas
        # releases before 3.0 reject string dtype selectors in select_dtypes.
        cat_cols = x.select_dtypes(exclude=["number"]).columns.tolist()
        x = pd.get_dummies(x, columns=cat_cols, drop_first=True)

        # "?" entries were read as NaN. get_dummies emits no indicator for a
        # missing category (all-zero row); any NaN left in other columns becomes 0.
        x = x.fillna(0)
        x = x.to_numpy(dtype=np.float32)

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class labels: 0 -> <=50K, 1 -> >50K."""
        return ["<=50K", ">50K"]