from typing import Sequence, Union, Callable
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from dpipe.itertools import extract
from .base import split_train, kfold_split, indices_to_ids


def split(ids, *, n_splits, random_state=42):
    """Standard K-fold cross-validation: splits ``ids`` into ``n_splits`` pairs of (train, test) ids."""
    split_indices = kfold_split(ids, n_splits, random_state=random_state)
    return indices_to_ids(split_indices, ids)
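
# Usage sketch (hypothetical ids; not part of the original module): each fold
# holds out a different 1/3 of the ids for testing.
#
#     folds = split(['a', 'b', 'c', 'd', 'e', 'f'], n_splits=3)
#     for train_ids, test_ids in folds:
#         assert len(test_ids) == 2 and not set(train_ids) & set(test_ids)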


def leave_group_out(ids, groups, *, val_size=None, random_state=42):
    """
    Leave-one-group-out CV: creates one fold per group, holding that group out for testing.
    If ``val_size`` is given, a validation subset is additionally split off from each train set at random.
    """
    n_splits = len(np.unique(groups))
    splits = kfold_split(ids, n_splits, groups=groups)
    if val_size is not None:
        splits = split_train(splits, val_size, random_state=random_state)
    return indices_to_ids(splits, ids)
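
# Usage sketch (hypothetical ids and groups): with three distinct groups there
# are three folds, and each fold's test set is one whole group.
#
#     ids = ['a', 'b', 'c', 'd', 'e', 'f']
#     groups = [0, 0, 1, 1, 2, 2]
#     for train_ids, test_ids in leave_group_out(ids, groups):
#         pass  # the two test ids always share a group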


def train_val_test_split(ids, *, val_size, n_splits, random_state=42):
    """
    Splits the dataset's ids into triplets (train, validation, test).

    The test ids are determined as in the standard K-fold cross-validation setting:
    for each fold a different portion of 1 / K ids is kept for testing.
    The remaining (K - 1) / K ids are split into train and validation sets according to ``val_size``.

    Parameters
    ----------
    ids
        the sequence of ids to split.
    val_size: float, int
        If ``float``, should be between 0.0 and 1.0 and represent the proportion
        of the train set to include in the validation set. If ``int``, represents the
        absolute number of validation samples.
    n_splits: int
        the number of cross-validation folds.

    Returns
    -------
    splits: Sequence of triplets
    """
    split_indices = kfold_split(subj_ids=ids, n_splits=n_splits, random_state=random_state)
    split_indices = split_train(splits=split_indices, val_size=val_size, random_state=random_state)
    return indices_to_ids(split_indices, ids)
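
# Usage sketch (hypothetical ids): 4 folds; within each fold 25% of the
# non-test ids go to validation and the rest to train.
#
#     ids = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
#     splits = train_val_test_split(ids, val_size=0.25, n_splits=4)
#     train_ids, val_ids, test_ids = splits[0]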


def group_train_val_test_split(ids: Sequence, groups: Union[Callable, Sequence], *, val_size, n_splits,
                               random_state=42):
    """
    Splits the dataset's ids into triplets (train, validation, test) keeping all the objects
    from a group in the same set (either train, validation or test).

    The test ids are determined as in the standard K-fold cross-validation setting:
    for each fold a different portion of 1 / K ids is kept for testing.
    The remaining (K - 1) / K ids are split into train and validation sets according to ``val_size``.
    The splitter guarantees that no objects belonging to the same group will end up in different sets.

    Parameters
    ----------
    ids
        the sequence of ids to split.
    groups: Union[Callable, Sequence]
        the group label of each id, or a callable that maps an id to its group label.
    val_size: float, int
        If ``float``, should be between 0.0 and 1.0 and represent the proportion
        of the train set to include in the validation set. If ``int``, represents the
        absolute number of validation samples.
    n_splits: int
        the number of cross-validation folds.
    """
    if callable(groups):
        groups = list(map(groups, ids))
    groups = np.asarray(groups)

    split_indices = kfold_split(ids, n_splits, groups=groups, random_state=random_state)
    split_indices = split_train(split_indices, val_size, groups=groups, random_state=random_state)
    return indices_to_ids(split_indices, ids)
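
# Usage sketch (hypothetical id scheme): `groups` may be a callable, e.g. one
# that maps a scan id like 'p1_s2' to its patient 'p1', so that no patient
# contributes to more than one of train/validation/test. Since ``groups`` is
# also passed to `split_train`, the validation split is group-wise as well.
#
#     ids = ['p1_s1', 'p1_s2', 'p2_s1', 'p2_s2', 'p3_s1', 'p3_s2']
#     splits = group_train_val_test_split(
#         ids, groups=lambda i: i.split('_')[0], val_size=0.5, n_splits=3)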


def stratified_train_val_test_split(ids: Sequence, labels: Union[Callable, Sequence], *, val_size, n_splits,
                                    random_state=42):
    """
    Like `train_val_test_split`, but the test folds are stratified by ``labels``:
    each test fold approximately preserves the overall class proportions.
    ``labels`` is either a sequence aligned with ``ids`` or a callable that maps an id to its label.
    The validation subset is split off from each train set with a plain (non-stratified) random split.
    """
    if callable(labels):
        labels = list(map(labels, ids))

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    train_val_test_ids = []
    for i, (train_val_indices, test_indices) in enumerate(cv.split(ids, labels)):
        train_val_ids = extract(ids, train_val_indices)
        test_ids = extract(ids, test_indices)
        if val_size:
            # a distinct but deterministic seed per fold keeps the validation splits reproducible
            train_ids, val_ids = train_test_split(train_val_ids, test_size=val_size, random_state=25 + i)
        else:
            train_ids, val_ids = train_val_ids, []

        train_val_test_ids.append((train_ids, val_ids, test_ids))
    return train_val_test_ids
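
# Usage sketch (hypothetical ids and labels): with a 50/50 label balance each
# test fold holds roughly one id per class; an integer ``val_size=2`` requests
# exactly two validation samples per fold.
#
#     ids = ['a', 'b', 'c', 'd', 'e', 'f']
#     labels = [0, 1, 0, 1, 0, 1]
#     splits = stratified_train_val_test_split(ids, labels, val_size=2, n_splits=3)
#     train_ids, val_ids, test_ids = splits[0]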