Source code for gemclus.data.synthetic_data

from numbers import Integral, Real
from typing import Tuple

import numpy as np
from scipy.linalg import block_diag
from sklearn.utils import check_random_state, check_array
from sklearn.utils._param_validation import Interval
from .._constraints import constraint_params

@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "loc": ["array-like"],
        "scale": ["array-like"],
        "pvals": ["array-like"],
        "random_state": ["random_state"]
    }
)
def draw_gmm(n, loc, scale, pvals, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Returns :math:`n` samples drawn from a mixture of Gaussian distributions. The number of
    components is determined by the number of elements in the lists of the parameters.

    Parameters
    ----------
    n: int
        The number of samples to draw from the GMM.
    loc: list of K ndarray of shape (d,)
        A list containing the means of all components of the Gaussian mixture distribution.
    scale: list of K ndarray of shape (d,d)
        A list containing the covariances of all components of the Gaussian mixture distribution.
    pvals: ndarray of shape (K,)
        The proportions of each component of the Gaussian mixture.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, d)
        The array containing the samples drawn from the mixture model, where d is the dimension
        of the component means.
    y: ndarray of shape (n,)
        The component from which each sample originates.
    """
    loc = check_array(loc, ensure_2d=True, ensure_min_samples=2, input_name="Means")
    scale = check_array(scale, allow_nd=True, ensure_min_samples=2, input_name="Covariances")
    pvals = check_array(pvals, ensure_2d=False, ensure_min_samples=2, input_name="Proportions")

    # Check that the parameters have consistent shapes and valid values
    K, d = loc.shape
    if K != scale.shape[0]:
        raise ValueError("The means and the covariances do not contain the same number of components")
    if d != 1:
        if d != scale.shape[1] or d != scale.shape[2]:
            raise ValueError("The covariances should be square matrices")
    if K != pvals.shape[0]:
        raise ValueError("The proportions and the means do not contain the same number of components")
    if np.any(pvals <= 0):
        raise ValueError("Proportions of components should be strictly positive.")
    if not np.isclose(np.sum(pvals), 1):
        raise ValueError("Proportions of components do not add up to one.")

    generator = check_random_state(random_state)

    # Draw samples from each distribution
    X = []
    # Draw the true cluster from which each sample will be taken
    y = generator.choice(K, p=pvals, size=(n,))
    if d == 1:
        for k in range(K):
            if scale[k] <= 0:
                raise ValueError(f"The {k}-th variance is negative.")
        for k in range(len(loc)):
            X += [generator.normal(loc[k], scale[k], size=(n,))]
    else:
        for k in range(K):
            if np.any(np.linalg.eigvals(scale[k]) < 0):
                raise ValueError(f"The {k}-th covariance is not positive semi-definite")
            if np.all(scale[k] == 0):
                raise ValueError(f"The {k}-th covariance matrix contains only zeroes")
        for k in range(len(loc)):
            X += [generator.multivariate_normal(loc[k], scale[k], size=(n,))]
    # For each sample, keep only the draw coming from its assigned component
    X = [X[k][i].reshape((1, -1)) for i, k in enumerate(y)]
    return np.concatenate(X, axis=0), y
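
# A minimal usage sketch of draw_gmm (illustrative, not part of the original module):
# 100 samples from a balanced two-component bivariate Gaussian mixture.
#
#     >>> X, y = draw_gmm(100, [np.zeros(2), 3 * np.ones(2)],
#     ...                 [np.eye(2), np.eye(2)], np.array([0.5, 0.5]),
#     ...                 random_state=0)
#     >>> X.shape, y.shape
#     ((100, 2), (100,))
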
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "loc": ["array-like"],
        "scale": ["array-like"],
        "df": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def multivariate_student_t(n, loc, scale, df=10, random_state=None) -> np.ndarray:
    """
    Draws :math:`n` samples from a multivariate Student-t distribution.

    Parameters
    ----------
    n: int
        The number of samples to draw from the distribution.
    loc: ndarray of shape (d,)
        The location of the distribution to sample from.
    scale: ndarray of shape (d,d)
        Positive semi-definite scale matrix.
    df: float, default=10
        Degrees of freedom of the distribution. Controls the spread of the samples.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, d)
        The samples drawn from the Student-t distribution.
    """
    loc = check_array(loc, ensure_2d=False, input_name="Location")
    scale = check_array(scale, ensure_2d=True, input_name="Scale")
    d = len(loc)
    if scale.shape[0] != d or scale.shape[1] != d:
        raise ValueError("The scale matrix must have shape (d, d), consistent with the location of shape (d,)")

    generator = check_random_state(random_state)

    # Start the sampling process by generating from a zero-mean multivariate normal distribution
    nx = generator.multivariate_normal(np.zeros(d), scale, size=n)
    # Scale each normal sample by an independent chi-square draw, then shift by the location
    u = generator.chisquare(df, n).reshape((-1, 1))
    X = np.sqrt(df / u) * nx + loc.reshape((1, -1))
    return X
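
# A minimal usage sketch of multivariate_student_t (illustrative, not part of the
# original module): heavy-tailed samples around a chosen location; smaller df means
# heavier tails.
#
#     >>> X = multivariate_student_t(50, np.array([1.0, -1.0]), np.eye(2), df=3,
#     ...                            random_state=0)
#     >>> X.shape
#     (50, 2)
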
@constraint_params(
    {
        "n": [Interval(Integral, 4, None, closed="left")],
        "alpha": [Interval(Real, 0, None, closed="neither")],
        "df": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def gstm(n=500, alpha=2, df=1, random_state=None):
    """
    Reproduces the Gaussian-Student Mixture dataset from the GEMINI article.

    Parameters
    ----------
    n: int, default=500
        The number of samples to draw from the dataset.
    alpha: float, default=2
        Controls how close the means of the Gaussian distributions and the location of the
        Student-t distribution are to each other.
    df: float, default=1
        The degrees of freedom of the Student-t distribution.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 2)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the mixture from which each sample was drawn.

    References
    ----------
    GEMINI - Ohl, L., Mattei, P. A., Bouveyron, C., Harchaoui, W., Leclercq, M., Droit, A., &
        Precioso, F. (2022, October). Generalised Mutual Information for Discriminative
        Clustering. In Advances in Neural Information Processing Systems.
    """
    generator = check_random_state(random_state)

    # Build the location and scale of each distribution
    locations = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1]]) * alpha
    covariance = np.eye(2)

    # For the 3 Gaussian distributions, draw 3/4 of the samples from a GMM with proportions 1/3
    n_gaussian = 3 * n // 4
    X_gaussian, y_gaussian = draw_gmm(n_gaussian, locations[:-1], [covariance] * 3, np.ones(3) / 3, generator)

    # Then sample the Student-t distribution for the remaining samples
    n_student = n - n_gaussian
    X_student = multivariate_student_t(n_student, locations[-1], covariance, df, generator)

    X = np.vstack([X_gaussian, X_student])
    y = np.concatenate([y_gaussian, np.ones(n_student) * 3])

    # Apply one final random permutation to shuffle the data
    order = generator.permutation(n)
    return X[order], y[order]
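
# A minimal usage sketch of gstm (illustrative, not part of the original module):
# labels 0-2 correspond to the Gaussian components and label 3 to the Student-t
# component.
#
#     >>> X, y = gstm(n=400, alpha=2, df=1, random_state=0)
#     >>> X.shape
#     (400, 2)
#     >>> np.unique(y).tolist()
#     [0.0, 1.0, 2.0, 3.0]
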
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "p": [Interval(Integral, 1, None, closed="left")],
        "mu": [Interval(Real, 0, None, closed="neither")],
        "random_state": ["random_state"]
    }
)
def celeux_one(n=300, p=20, mu=1.7, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    r"""
    Draws :math:`n` samples from a Gaussian mixture with 3 isotropic components over 5 dimensions,
    with respective means 1, -1 and 0 scaled by :math:`\mu`. The data is concatenated with
    :math:`p` additional noisy variables that are independent of the true clusters. This dataset
    is taken from Celeux et al., section 3.1.

    Parameters
    ----------
    n: int, default=300
        The number of samples to draw from the Gaussian mixture model.
    p: int, default=20
        The number of excessive noisy variables to concatenate to the dataset.
    mu: float, default=1.7
        Controls how close the means of the components are to each other by scaling them.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 5+p)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the GMM from which each sample was drawn.

    References
    ----------
    Dataset - Celeux, G., Martin-Magniette, M. L., Maugis-Rabusseau, C., & Raftery, A. E. (2014).
        Comparing model selection and regularization approaches to variable selection in
        model-based clustering. Journal de la Societe francaise de statistique, 155(2), 57-71.
    """
    generator = check_random_state(random_state)

    # Draw the first five variables according to a balanced Gaussian mixture
    mu1 = np.ones(5) * mu
    mu2 = -mu1
    mu3 = np.zeros(5)
    cov = np.eye(5)
    good_variables, y = draw_gmm(n, [mu1, mu2, mu3], [cov, cov, cov], np.ones(3) / 3, generator)

    # Create noisy independent variables
    noise = generator.normal(size=(n, p))

    return np.concatenate([good_variables, noise], axis=1), y
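
# A minimal usage sketch of celeux_one (illustrative, not part of the original
# module): only the first 5 of the 5+p returned features carry cluster information.
#
#     >>> X, y = celeux_one(n=150, p=20, mu=1.7, random_state=0)
#     >>> X.shape, y.shape
#     ((150, 25), (150,))
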
@constraint_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "random_state": ["random_state"]
    }
)
def celeux_two(n=2000, random_state=None) -> Tuple[np.ndarray, np.ndarray]:
    """
    Draws samples from a mixture of 4 Gaussian distributions in 2d, concatenated with additional
    variables that depend linearly on the informative variables as well as non-informative noisy
    variables. This dataset is taken from Celeux et al., section 3.2.

    Parameters
    ----------
    n: int, default=2000
        The number of samples to draw.
    random_state: int, RandomState instance or None, default=None
        Determines random number generation for dataset creation. Pass an int for reproducible
        output across multiple runs.

    Returns
    -------
    X: ndarray of shape (n, 14)
        The samples of the dataset in an array of shape n_samples x n_features.
    y: ndarray of shape (n,)
        The component of the GMM from which each sample was drawn.

    References
    ----------
    Dataset - Celeux, G., Martin-Magniette, M. L., Maugis-Rabusseau, C., & Raftery, A. E. (2014).
        Comparing model selection and regularization approaches to variable selection in
        model-based clustering. Journal de la Societe francaise de statistique, 155(2), 57-71.
    """
    generator = check_random_state(random_state)

    # Start by generating the true informative variables
    mu1 = np.array([0, 0])
    mu2 = np.array([4, 0])
    mu3 = np.array([0, 2])
    mu4 = np.array([4, 2])
    cov = np.eye(2)
    pis = np.ones(4) / 4
    good_variables, y = draw_gmm(n, [mu1, mu2, mu3, mu4], [cov, cov, cov, cov], pis, generator)

    # Apply affine transformations to produce correlated variables up to some noise
    b = np.array([[0.5, 1], [2, 0], [0, 3], [-1, 2], [2, -4], [0.5, 0], [4, 0.5], [3, 0], [2, 1]]).T
    rot_pi_3 = np.array([[0.5, -np.sqrt(3) / 2], [np.sqrt(3) / 2, 0.5]])
    rot_pi_6 = np.array([[np.sqrt(3) / 2, -0.5], [0.5, np.sqrt(3) / 2]])
    cov_noise = [np.eye(3), 0.5 * np.eye(2)]
    cov_noise += [rot_pi_3.T @ np.diag(np.array([1, 3])) @ rot_pi_3]
    cov_noise += [rot_pi_6.T @ np.diag(np.array([2, 6])) @ rot_pi_6]
    cov_noise = block_diag(*cov_noise)
    noise = generator.multivariate_normal(np.zeros(9), cov_noise, size=(n,))
    X3_11 = np.array([0, 0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8]) + good_variables @ b + noise

    # Add noisy independent variables
    X12_14 = generator.multivariate_normal(np.array([3.2, 3.6, 4]), np.eye(3), size=(n,))

    # Complete the dataset by joining everything
    bad_variables = np.concatenate([X3_11, X12_14], axis=1)

    return np.concatenate([good_variables, bad_variables], axis=1), y
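
# A minimal usage sketch of celeux_two (illustrative, not part of the original
# module): features 1-2 are informative, features 3-11 depend linearly on them, and
# features 12-14 are independent noise.
#
#     >>> X, y = celeux_two(n=500, random_state=0)
#     >>> X.shape, y.shape
#     ((500, 14), (500,))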