Source code for kosmos.ml.datasets.qsar_dataset

from importlib.resources import files

import numpy as np
import pandas as pd

from kosmos.ml.datasets.dataset import SLDataset


class QSARDataset(SLDataset):
    """QSAR biodegradation dataset for binary classification.

    Notes:
        - Number of instances: 1055
        - Number of features: 41 numeric
        - Classes: 2 (slightly imbalanced, RB (ready biodegradable) ca 34%; NRB ca 66%)

    References:
        - UCI Machine Learning Repository — QSAR dataset:
          https://archive.ics.uci.edu/dataset/254/qsar+biodegradation

    """

    def __init__(self, *, min_max_scaler: bool = True) -> None:
        """Initialize the dataset.

        Reads the packaged ``qsar.data`` file (semicolon-separated, no header row),
        uses every column except the last as float32 features and the last column
        as the class label, encoded as ``NRB -> 0`` and ``RB -> 1``.

        Args:
            min_max_scaler (bool): Whether to apply min-max scaling to the features
                (forwarded to the base class). Defaults to True.

        Raises:
            ValueError: If the label column contains a value other than
                ``"NRB"``, ``"RB"``, ``"0"`` or ``"1"``.

        """
        data_file = files("kosmos.ml.datasets.data") / "qsar.data"
        with data_file.open("r", encoding="utf-8") as f:
            df = pd.read_csv(f, sep=";", header=None)

        x = df.iloc[:, :-1].to_numpy(dtype=np.float32)

        # Normalise labels to stripped strings so textual ("NRB"/"RB") and
        # numeric ("0"/"1") encodings both go through the same lookup.
        raw_labels = df.iloc[:, -1].astype(str).str.strip()
        encoded = raw_labels.map({"NRB": 0, "RB": 1, "0": 0, "1": 1})

        # Series.map leaves NaN for labels missing from the dict; fail with a
        # message naming them rather than an opaque NaN-to-int cast error.
        unmapped = encoded.isna()
        if unmapped.any():
            unknown = sorted(raw_labels[unmapped].unique())
            raise ValueError(f"Unexpected class labels in qsar.data: {unknown}")

        y = encoded.astype("int64").to_numpy()

        super().__init__(x, y, min_max_scaler=min_max_scaler)

    @property
    def class_names(self) -> list[str]:
        """Return human-readable class labels, ordered by class index.

        Index 0 corresponds to NRB (not ready biodegradable) and index 1 to RB
        (ready biodegradable).
        """
        return ["Not Ready Biodegradable", "Ready Biodegradable"]