Source code for gemclus.data.synthetic_data

from numbers import Integral, Real
from typing import Tuple

import numpy as np
from scipy.linalg import block_diag
from sklearn.utils import check_random_state, check_array
from sklearn.utils._param_validation import Interval
from .._constraints import constraint_params

@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "loc": ["array-like"],
        "scale": ["array-like"],
        "pvals": ["array-like"],
        "random_state": ["random_state"]
    }
)
def draw_gmm(n, loc, scale, pvals, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns :math:`n` samples drawn from a mixture of Gaussian distributions. The number of
    components is determined by the number of elements in the lists of the parameters.

    Parameters
    ----------
    n: int
        The number of samples to draw from the GMM.
    loc: list of K ndarray of shape (d,)
        A list containing the means of all components of the Gaussian mixture distribution.
    scale: list of K ndarray of shape (d,d)
        A list containing the covariances of all components of the Gaussian mixture distribution.
    pvals: ndarray of shape (K,)
        The proportions of each component of the Gaussian mixture.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, d)
        The array containing the samples drawn from the mixture model, where d is the dimension
        of the component means.
    y: ndarray of shape (n,)
        The component from which each sample originates.
    """
    loc = check_array(loc, ensure_2d=True, ensure_min_samples=2, input_name="Means")
    scale = check_array(scale, allow_nd=True, ensure_min_samples=2, input_name="Covariances")
    pvals = check_array(pvals, ensure_2d=False, ensure_min_samples=2, input_name="Proportions")

    # Check that the parameters have consistent shapes and valid values
    K, d = loc.shape
    if K != scale.shape[0]:
        raise ValueError("The means and the covariances do not contain the same number of components")
    if d != 1:
        if d != scale.shape[1] or d != scale.shape[2]:
            raise ValueError("The covariances should be square matrices")
    if K != pvals.shape[0]:
        raise ValueError("The proportions and the means do not contain the same number of components")
    if np.any(pvals <= 0):
        raise ValueError("Proportions of components should be strictly positive.")
    if not np.isclose(np.sum(pvals), 1):
        raise ValueError("Proportions of components do not add up to one.")

    generator = check_random_state(random_state)

    # Draw samples from each distribution
    X = []
    # Draw the true cluster from which each sample will be taken
    y = generator.choice(K, p=pvals, size=(n,))
    if d == 1:
        for k in range(K):
            if scale[k] <= 0:
                raise ValueError(f"The {k}-th variance is negative.")
        for k in range(len(loc)):
            X += [generator.normal(loc[k], scale[k], size=(n,))]
    else:
        for k in range(K):
            if np.any(np.linalg.eigvals(scale[k]) < 0):
                raise ValueError(f"The {k}-th covariance is not positive semi-definite")
            if np.all(scale[k] == 0):
                raise ValueError(f"The {k}-th covariance matrix contains only zeroes")
        for k in range(len(loc)):
            X += [generator.multivariate_normal(loc[k], scale[k], size=(n,))]
    # For each sample, keep only the draw coming from its assigned component
    X = [X[k][i].reshape((1, -1)) for i, k in enumerate(y)]
    return np.concatenate(X, axis=0), y
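
# A minimal usage sketch of draw_gmm (illustrative, not part of the original module):
# 100 samples from a balanced two-component bivariate Gaussian mixture.
#
#     >>> X, y = draw_gmm(100, [np.zeros(2), 3 * np.ones(2)],
#     ...                 [np.eye(2), np.eye(2)], np.array([0.5, 0.5]),
#     ...                 random_state=0)
#     >>> X.shape, y.shape
#     ((100, 2), (100,))
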
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "loc": ["array-like"],
        "scale": ["array-like"],
        "df": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def multivariate_student_t(n, loc, scale, df=10, random_state=None) -> np.ndarray:
    """
    Draws :math:`n` samples from a multivariate Student-t distribution.

    Parameters
    ----------
    n: int
        The number of samples to draw from the distribution.
    loc: ndarray of shape (d,)
        The location of the distribution to sample from.
    scale: ndarray of shape (d,d)
        Positive semi-definite scale matrix.
    df: float, default=10
        Degrees of freedom of the distribution. Controls the spread of the samples.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, d)
        The samples drawn from the Student-t distribution.
    """
    loc = check_array(loc, ensure_2d=False, input_name="Location")
    scale = check_array(scale, ensure_2d=True, input_name="Scale")
    d = len(loc)
    if scale.shape[0] != d or scale.shape[1] != d:
        raise ValueError("The scale matrix must have shape (d, d), consistent with the location of shape (d,)")

    generator = check_random_state(random_state)

    # Start the sampling process by generating from a zero-mean multivariate normal distribution
    nx = generator.multivariate_normal(np.zeros(d), scale, size=n)
    # Scale each normal sample by an independent chi-square draw, then shift by the location
    u = generator.chisquare(df, n).reshape((-1, 1))
    X = np.sqrt(df / u) * nx + loc.reshape((1, -1))
    return X
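
# A minimal usage sketch of multivariate_student_t (illustrative, not part of the
# original module): heavy-tailed samples around a chosen location; smaller df means
# heavier tails.
#
#     >>> X = multivariate_student_t(50, np.array([1.0, -1.0]), np.eye(2), df=3,
#     ...                            random_state=0)
#     >>> X.shape
#     (50, 2)
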
@constraint_params(
    {
        "n": [Interval(Integral, 4, None, closed="left")],
        "alpha": [Interval(Real, 0, None, closed="neither")],
        "df": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def gstm(n=500, alpha=2, df=1, random_state=None):
    """
    Reproduces the Gaussian-Student Mixture dataset from the GEMINI article.

    Parameters
    ----------
    n: int, default=500
        The number of samples to draw from the dataset.
    alpha: float, default=2
        Controls how close the means of the Gaussian distributions and the location of the
        Student-t distribution are to each other.
    df: float, default=1
        The degrees of freedom of the Student-t distribution.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 2)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the mixture from which each sample was drawn.

    References
    ----------
    GEMINI - Ohl, L., Mattei, P. A., Bouveyron, C., Harchaoui, W., Leclercq, M., Droit, A., &
        Precioso, F. (2022, October). Generalised Mutual Information for Discriminative
        Clustering. In Advances in Neural Information Processing Systems.
    """
    generator = check_random_state(random_state)

    # Build the location and scale of each distribution
    locations = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1]]) * alpha
    covariance = np.eye(2)

    # For the 3 Gaussian distributions, draw 3/4 of the samples from a GMM with proportions 1/3
    n_gaussian = 3 * n // 4
    X_gaussian, y_gaussian = draw_gmm(n_gaussian, locations[:-1], [covariance] * 3, np.ones(3) / 3, generator)

    # Then sample the Student-t distribution for the remaining samples
    n_student = n - n_gaussian
    X_student = multivariate_student_t(n_student, locations[-1], covariance, df, generator)

    X = np.vstack([X_gaussian, X_student])
    y = np.concatenate([y_gaussian, np.ones(n_student) * 3])

    # Apply one final random permutation to shuffle the data
    order = generator.permutation(n)
    return X[order], y[order]
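
# A minimal usage sketch of gstm (illustrative, not part of the original module):
# labels 0-2 correspond to the Gaussian components and label 3 to the Student-t
# component.
#
#     >>> X, y = gstm(n=400, alpha=2, df=1, random_state=0)
#     >>> X.shape
#     (400, 2)
#     >>> np.unique(y).tolist()
#     [0.0, 1.0, 2.0, 3.0]
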
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "p": [Interval(Integral, 1, None, closed="left")],
        "mu": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def celeux_one(n=300, p=20, mu=1.7, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    r"""
    Draws :math:`n` samples from a Gaussian mixture with 3 isotropic components over 5 dimensions,
    with respective means 1, -1 and 0 scaled by :math:`\mu`. The data is concatenated with
    :math:`p` additional noisy variables that are independent of the true clusters. This dataset
    is taken from Celeux et al., section 3.1.

    Parameters
    ----------
    n: int, default=300
        The number of samples to draw from the Gaussian mixture model.
    p: int, default=20
        The number of excessive noisy variables to concatenate to the dataset.
    mu: float, default=1.7
        Controls how close the means of the components are to each other by scaling them.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 5+p)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the GMM from which each sample was drawn.

    References
    ----------
    Dataset - Celeux, G., Martin-Magniette, M. L., Maugis-Rabusseau, C., & Raftery, A. E. (2014).
        Comparing model selection and regularization approaches to variable selection in
        model-based clustering. Journal de la Societe francaise de statistique, 155(2), 57-71.
    """
    generator = check_random_state(random_state)

    # Draw the first five variables according to a balanced Gaussian mixture
    mu1 = np.ones(5) * mu
    mu2 = -mu1
    mu3 = np.zeros(5)
    cov = np.eye(5)
    good_variables, y = draw_gmm(n, [mu1, mu2, mu3], [cov, cov, cov], np.ones(3) / 3, generator)

    # Create noisy independent variables
    noise = generator.normal(size=(n, p))

    return np.concatenate([good_variables, noise], axis=1), y
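
# A minimal usage sketch of celeux_one (illustrative, not part of the original
# module): only the first 5 of the 5+p returned features carry cluster information.
#
#     >>> X, y = celeux_one(n=150, p=20, mu=1.7, random_state=0)
#     >>> X.shape, y.shape
#     ((150, 25), (150,))
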
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "random_state": ["random_state"]
    }
)
def celeux_two(n=2000, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Draws samples from a mixture of 4 Gaussian distributions in 2d, concatenated with additional
    variables that depend linearly on the informative variables as well as non-informative noisy
    variables. This dataset is taken from Celeux et al., section 3.2.

    Parameters
    ----------
    n: int, default=2000
        The number of samples to draw.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 14)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the GMM from which each sample was drawn.

    References
    ----------
    Dataset - Celeux, G., Martin-Magniette, M. L., Maugis-Rabusseau, C., & Raftery, A. E. (2014).
        Comparing model selection and regularization approaches to variable selection in
        model-based clustering. Journal de la Societe francaise de statistique, 155(2), 57-71.
    """
    generator = check_random_state(random_state)

    # Start by generating the true informative variables
    mu1 = np.array([0, 0])
    mu2 = np.array([4, 0])
    mu3 = np.array([0, 2])
    mu4 = np.array([4, 2])
    cov = np.eye(2)
    pis = np.ones(4) / 4
    good_variables, y = draw_gmm(n, [mu1, mu2, mu3, mu4], [cov, cov, cov, cov], pis, generator)

    # Apply affine transformations to produce correlated variables up to some noise
    b = np.array([[0.5, 1], [2, 0], [0, 3], [-1, 2], [2, -4], [0.5, 0], [4, 0.5], [3, 0], [2, 1]]).T
    rot_pi_3 = np.array([[0.5, -np.sqrt(3) / 2], [np.sqrt(3) / 2, 0.5]])
    rot_pi_6 = np.array([[np.sqrt(3) / 2, -0.5], [0.5, np.sqrt(3) / 2]])
    cov_noise = [np.eye(3), 0.5 * np.eye(2)]
    cov_noise += [rot_pi_3.T @ np.diag(np.array([1, 3])) @ rot_pi_3]
    cov_noise += [rot_pi_6.T @ np.diag(np.array([2, 6])) @ rot_pi_6]
    cov_noise = block_diag(*cov_noise)
    noise = generator.multivariate_normal(np.zeros(9), cov_noise, size=(n,))
    X3_11 = np.array([0, 0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8]) + good_variables @ b + noise

    # Add noisy independent variables
    X12_14 = generator.multivariate_normal(np.array([3.2, 3.6, 4]), np.eye(3), size=(n,))

    # Complete the dataset by joining everything
    bad_variables = np.concatenate([X3_11, X12_14], axis=1)

    return np.concatenate([good_variables, bad_variables], axis=1), y
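
# A minimal usage sketch of celeux_two (illustrative, not part of the original
# module): features 1-2 are informative, features 3-11 depend linearly on them, and
# features 12-14 are independent noise.
#
#     >>> X, y = celeux_two(n=500, random_state=0)
#     >>> X.shape, y.shape
#     ((500, 14), (500,))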