Source code for kosmos.ml.datasets.qsar_dataset
from importlib.resources import files
import numpy as np
import pandas as pd
from kosmos.ml.datasets.dataset import SLDataset
[docs]
class QSARDataset(SLDataset):
    """QSAR biodegradation dataset for binary classification.

    Notes:
        - Number of instances: 1055
        - Number of features: 41 numeric
        - Classes: 2 (slightly imbalanced, RB (ready biodegradable) ca 34%; NRB ca 66%)

    References:
        - UCI Machine Learning Repository — QSAR dataset: https://archive.ics.uci.edu/dataset/254/qsar+biodegradation

    """

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Initialize the dataset.

        Reads the packaged ``qsar.data`` file, uses every column except the last as
        features and the last column as the class label, then hands both arrays to
        the ``SLDataset`` initializer.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features.
                Forwarded unchanged to ``SLDataset``. Defaults to True.

        Raises:
            ValueError: If the label column contains a value other than
                ``NRB``, ``RB``, ``0`` or ``1`` (after stripping whitespace).

        """
        # The data file is semicolon-separated and has no header row.
        with (files("kosmos.ml.datasets.data") / "qsar.data").open("r", encoding="utf-8") as f:
            df = pd.read_csv(f, sep=";", header=None)

        x = df.iloc[:, :-1].to_numpy(dtype=np.float32)

        # Accept both the textual labels and already-encoded "0"/"1" strings.
        label_to_index = {"NRB": 0, "RB": 1, "0": 0, "1": 1}
        raw_labels = df.iloc[:, -1].astype(str).str.strip()
        encoded = raw_labels.map(label_to_index)

        # Series.map leaves NaN for labels missing from the mapping; report them
        # explicitly instead of letting the int64 cast fail with an opaque message.
        unknown = raw_labels[encoded.isna()]
        if not unknown.empty:
            bad_labels = sorted(unknown.unique())
            raise ValueError(f"Unrecognized class label(s) in qsar.data: {bad_labels}")

        y = encoded.astype("int64").to_numpy()

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class names indexed by encoded label.

        Index 0 corresponds to NRB (not ready biodegradable) and index 1 to RB
        (ready biodegradable), matching the encoding applied in ``__init__``.
        """
        return ["Not Ready Biodegradable", "Ready Biodegradable"]