Source code for gemclus.mlp._mlp_geminis

from numbers import Integral

import numpy as np
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, PAIRWISE_DISTANCE_FUNCTIONS
from sklearn.neural_network._stochastic_optimizers import AdamOptimizer, SGDOptimizer
from sklearn.utils._param_validation import Interval, StrOptions
from sklearn.utils.extmath import softmax

from .._base_gemini import DiscriminativeModel
from ..gemini import MMDGEMINI, WassersteinGEMINI


class MLPModel(DiscriminativeModel):
    """ Implementation of a two-layer neural network as a clustering distribution :math:`p(y|x)`. Any GEMINI
    can be used to train this model.

    Parameters
    ----------
    n_clusters : int, default=3
        The maximum number of clusters to form as well as the number of output neurons in the neural network.

    gemini: str, GEMINI instance or None, default="mmd_ova"
        GEMINI objective used to train this discriminative model. Can be "mmd_ova", "mmd_ovo", "wasserstein_ova",
        "wasserstein_ovo", "mi" or any other GEMINI available in `gemclus.gemini.AVAILABLE_GEMINI`. Default
        GEMINIs involve the Euclidean metric or linear kernel. To incorporate custom metrics, a GEMINI can also
        be passed as an instance. If set to None, the GEMINI will be MMD OvA with linear kernel.

    max_iter: int, default=1000
        Maximum number of epochs to perform gradient descent in a single run.

    learning_rate: float, default=1e-3
        Initial learning rate used. It controls the step size when updating the weights.

    n_hidden_dim: int, default=20
        The number of neurons in the hidden layer of the neural network.

    solver: {'sgd', 'adam'}, default='adam'
        The solver for weight optimisation.

        - 'sgd' refers to stochastic gradient descent.
        - 'adam' refers to a stochastic gradient-based optimiser proposed by Kingma, Diederik and Jimmy Ba.

    batch_size: int, default=None
        The size of batches during gradient descent training. If set to None, the whole data will be considered.

    verbose: bool, default=False
        Whether to print progress messages to stdout.

    random_state: int, RandomState instance, default=None
        Determines random number generation for weights and bias initialisation.
        Pass an int for reproducible results across multiple function calls.

    Attributes
    ----------
    W1_: ndarray of shape (n_features, n_hidden_dim)
        The linear weights of the first layer.

    b1_: ndarray of shape (1, n_hidden_dim)
        The biases of the first layer.

    W2_: ndarray of shape (n_hidden_dim, n_clusters)
        The linear weights of the hidden layer.

    b2_: ndarray of shape (1, n_clusters)
        The biases of the hidden layer.

    optimiser_: `AdamOptimizer` or `SGDOptimizer`
        The optimisation algorithm used for training, depending on the chosen solver parameter.

    labels_: ndarray of shape (n_samples,)
        The labels that were assigned to the samples passed to the :meth:`fit` method.

    n_iter_: int
        The number of iterations the model took to converge.

    H_: ndarray of shape (n_samples, n_hidden_dim)
        The hidden representation of the samples after fitting.

    References
    ----------
    GEMINI - Generalised Mutual Information for Discriminative Clustering
        Louis Ohl, Pierre-Alexandre Mattei, Charles Bouveyron, Warith Harchaoui, Mickaël Leclercq,
        Arnaud Droit, Frederic Precioso

    See Also
    --------
    MLPMMD: two-layer neural network trained for clustering with the MMD GEMINI
    MLPWasserstein: two-layer neural network trained for clustering with the Wasserstein GEMINI

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from gemclus.mlp import MLPModel
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = MLPModel(gemini="mi", random_state=0).fit(X)
    >>> clf.predict(X[:2,:])
    array([0, 0])
    >>> clf.predict_proba(X[:2,:]).shape
    (2, 3)
    >>> clf.score(X)
    0.6325592616
    """
    _parameter_constraints: dict = {
        **DiscriminativeModel._parameter_constraints,
        "n_hidden_dim": [Interval(Integral, 1, None, closed="left")],
    }
    def __init__(self, n_clusters=3, gemini="mmd_ova", max_iter=1000, learning_rate=1e-3, solver="adam",
                 n_hidden_dim=20, batch_size=None, verbose=False, random_state=None):
        super().__init__(
            n_clusters=n_clusters,
            gemini=gemini,
            max_iter=max_iter,
            learning_rate=learning_rate,
            solver=solver,
            batch_size=batch_size,
            verbose=verbose,
            random_state=random_state
        )
        self.n_hidden_dim = n_hidden_dim
    def _init_params(self, random_state, X=None):
        in_threshold = np.sqrt(1 / self.n_features_in_)
        hidden_threshold = np.sqrt(1 / self.n_hidden_dim)
        self.W1_ = random_state.uniform(-in_threshold, in_threshold, size=(self.n_features_in_, self.n_hidden_dim))
        self.b1_ = random_state.uniform(-in_threshold, in_threshold, size=(1, self.n_hidden_dim))
        self.W2_ = random_state.uniform(-hidden_threshold, hidden_threshold,
                                        size=(self.n_hidden_dim, self.n_clusters))
        self.b2_ = random_state.uniform(-hidden_threshold, hidden_threshold, size=(1, self.n_clusters))

    def _compute_grads(self, X, y_pred, gradient):
        # Backpropagate the GEMINI gradient through the softmax output:
        # for s = softmax(z), (J_s)^T g = s * (g - <s, g>) row-wise
        tau_hat_grad = y_pred * (gradient - (y_pred * gradient).sum(1, keepdims=True))  # Shape NxK

        W2_grad = self.H_.T @ tau_hat_grad  # Shape (n_hidden_dim, n_clusters)
        b2_grad = tau_hat_grad.sum(0, keepdims=True)

        # Chain rule through the second-layer weights, then the ReLU mask of the hidden activations
        backprop_grad = tau_hat_grad @ self.W2_.T
        backprop_grad *= self.H_ > 0

        W1_grad = X.T @ backprop_grad
        b1_grad = backprop_grad.sum(0, keepdims=True)

        # Negative sign to force the optimiser to maximise instead of minimise
        gradients = [-W1_grad, -W2_grad, -b1_grad, -b2_grad]

        return gradients

    def _get_weights(self):
        return [self.W1_, self.W2_, self.b1_, self.b2_]

    def _infer(self, X, retain=True):
        H = np.maximum(X @ self.W1_ + self.b1_, 0)
        if retain:
            self.H_ = H
        return softmax(H @ self.W2_ + self.b2_)
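
# --- Editor's sketch (not part of gemclus) ----------------------------------
# `MLPModel._compute_grads` folds the row-wise softmax Jacobian into the
# incoming GEMINI gradient using the identity (J_softmax)^T g = s * (g - <s, g>).
# The helper below is a hypothetical, self-contained finite-difference check of
# that identity; the name, shapes and tolerances are illustrative only.
def _check_softmax_jacobian(seed=0, n=4, k=3, eps=1e-6):
    rng = np.random.RandomState(seed)
    z = rng.normal(size=(n, k))  # arbitrary logits
    g = rng.normal(size=(n, k))  # arbitrary upstream gradient
    s = softmax(z)
    # Analytic Jacobian-vector product, as computed in _compute_grads
    analytic = s * (g - (s * g).sum(1, keepdims=True))
    # Central finite differences of <softmax(z)_i, g_i> w.r.t. each logit
    numeric = np.empty_like(z)
    for i in range(n):
        for j in range(k):
            z_plus, z_minus = z.copy(), z.copy()
            z_plus[i, j] += eps
            z_minus[i, j] -= eps
            numeric[i, j] = (softmax(z_plus)[i] - softmax(z_minus)[i]) @ g[i] / (2 * eps)
    return np.allclose(analytic, numeric, atol=1e-5)
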
class MLPMMD(MLPModel):
    """ Implementation of the maximisation of the MMD GEMINI using a two-layer neural network as a clustering
    distribution :math:`p(y|x)`.

    Parameters
    ----------
    n_clusters : int, default=3
        The maximum number of clusters to form as well as the number of output neurons in the neural network.

    max_iter: int, default=1000
        Maximum number of epochs to perform gradient descent in a single run.

    learning_rate: float, default=1e-3
        Initial learning rate used. It controls the step size when updating the weights.

    n_hidden_dim: int, default=20
        The number of neurons in the hidden layer of the neural network.

    kernel: {'additive_chi2', 'chi2', 'cosine', 'linear', 'poly', 'polynomial', 'rbf', 'laplacian', 'sigmoid',
            'precomputed'}, default='linear'
        The kernel to use in combination with the MMD objective. It corresponds to one value of `KERNEL_PARAMS`.
        Currently, all kernel parameters are the default ones. If the kernel is set to 'precomputed', then a
        custom kernel matrix must be passed to the argument `y` of `fit`, `fit_predict` and/or `score`.

    ovo: bool, default=False
        Whether to run the model using the MMD OvA (False) or the MMD OvO (True).

    solver: {'sgd', 'adam'}, default='adam'
        The solver for weight optimisation.

        - 'sgd' refers to stochastic gradient descent.
        - 'adam' refers to a stochastic gradient-based optimiser proposed by Kingma, Diederik and Jimmy Ba.

    batch_size: int, default=None
        The size of batches during gradient descent training. If set to None, the whole data will be considered.

    verbose: bool, default=False
        Whether to print progress messages to stdout.

    random_state: int, RandomState instance, default=None
        Determines random number generation for weights and bias initialisation.
        Pass an int for reproducible results across multiple function calls.

    kernel_params: dict, default=None
        A dictionary of keyword arguments to pass to the chosen kernel function.

    Attributes
    ----------
    W1_: ndarray of shape (n_features, n_hidden_dim)
        The linear weights of the first layer.

    b1_: ndarray of shape (1, n_hidden_dim)
        The biases of the first layer.

    W2_: ndarray of shape (n_hidden_dim, n_clusters)
        The linear weights of the hidden layer.

    b2_: ndarray of shape (1, n_clusters)
        The biases of the hidden layer.

    optimiser_: `AdamOptimizer` or `SGDOptimizer`
        The optimisation algorithm used for training, depending on the chosen solver parameter.

    labels_: ndarray of shape (n_samples,)
        The labels that were assigned to the samples passed to the :meth:`fit` method.

    n_iter_: int
        The number of iterations the model took to converge.

    H_: ndarray of shape (n_samples, n_hidden_dim)
        The hidden representation of the samples after fitting.

    References
    ----------
    GEMINI - Generalised Mutual Information for Discriminative Clustering
        Louis Ohl, Pierre-Alexandre Mattei, Charles Bouveyron, Warith Harchaoui, Mickaël Leclercq,
        Arnaud Droit, Frederic Precioso

    See Also
    --------
    MLPModel: two-layer neural network trained for clustering with any GEMINI
    MLPWasserstein: two-layer neural network trained for clustering with the Wasserstein GEMINI

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from gemclus.mlp import MLPMMD
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = MLPMMD(random_state=0).fit(X)
    >>> clf.predict(X[:2,:])
    array([2, 2])
    >>> clf.predict_proba(X[:2,:]).shape
    (2, 3)
    >>> clf.score(X)
    1.7592155845
    """
    _parameter_constraints: dict = {
        **MLPModel._parameter_constraints,
        "kernel": [StrOptions(set(list(PAIRWISE_KERNEL_FUNCTIONS) + ["precomputed"])), callable],
        "kernel_params": [dict, None],
        "ovo": [bool],
    }
    def __init__(self, n_clusters=3, max_iter=1000, learning_rate=1e-3, n_hidden_dim=20, kernel="linear",
                 ovo=False, solver="adam", batch_size=None, verbose=False, random_state=None, kernel_params=None):
        super().__init__(
            n_clusters=n_clusters,
            gemini=None,
            max_iter=max_iter,
            learning_rate=learning_rate,
            n_hidden_dim=n_hidden_dim,
            solver=solver,
            batch_size=batch_size,
            verbose=verbose,
            random_state=random_state
        )
        self.ovo = ovo
        self.kernel = kernel
        self.kernel_params = kernel_params
    def get_gemini(self):
        return MMDGEMINI(ovo=self.ovo, kernel=self.kernel, kernel_params=self.kernel_params)
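
# --- Editor's sketch (not part of gemclus) ----------------------------------
# Per the MLPMMD docstring, kernel="precomputed" expects the kernel matrix to
# be supplied through the `y` argument of `fit`. A minimal usage sketch; the
# helper name and the gamma value are illustrative assumptions.
def _example_precomputed_mmd():
    from sklearn.datasets import load_iris
    from sklearn.metrics.pairwise import rbf_kernel
    X, _ = load_iris(return_X_y=True)
    K = rbf_kernel(X, gamma=0.5)  # custom kernel matrix, one row/column per sample
    return MLPMMD(kernel="precomputed", random_state=0).fit(X, K)
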
class MLPWasserstein(MLPModel):
    """ Implementation of the maximisation of the Wasserstein GEMINI using a two-layer neural network as a
    clustering distribution :math:`p(y|x)`.

    Parameters
    ----------
    n_clusters : int, default=3
        The maximum number of clusters to form as well as the number of output neurons in the neural network.

    max_iter: int, default=1000
        Maximum number of epochs to perform gradient descent in a single run.

    learning_rate: float, default=1e-3
        Initial learning rate used. It controls the step size when updating the weights.

    n_hidden_dim: int, default=20
        The number of neurons in the hidden layer of the neural network.

    metric: {'cosine', 'euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'precomputed'}, default='euclidean'
        The metric to use in combination with the Wasserstein objective. It corresponds to one value of
        `PAIRED_DISTANCES`. Currently, all metric parameters are the default ones. If the metric is set to
        'precomputed', then a custom distance matrix must be passed to the argument `y` of `fit`, `fit_predict`
        and/or `score`.

    ovo: bool, default=False
        Whether to run the model using the Wasserstein OvA (False) or the Wasserstein OvO (True).

    solver: {'sgd', 'adam'}, default='adam'
        The solver for weight optimisation.

        - 'sgd' refers to stochastic gradient descent.
        - 'adam' refers to a stochastic gradient-based optimiser proposed by Kingma, Diederik and Jimmy Ba.

    batch_size: int, default=None
        The size of batches during gradient descent training. If set to None, the whole data will be considered.

    verbose: bool, default=False
        Whether to print progress messages to stdout.

    random_state: int, RandomState instance, default=None
        Determines random number generation for weights and bias initialisation.
        Pass an int for reproducible results across multiple function calls.

    metric_params: dict, default=None
        A dictionary of keyword arguments to pass to the chosen metric function.

    Attributes
    ----------
    W1_: ndarray of shape (n_features, n_hidden_dim)
        The linear weights of the first layer.

    b1_: ndarray of shape (1, n_hidden_dim)
        The biases of the first layer.

    W2_: ndarray of shape (n_hidden_dim, n_clusters)
        The linear weights of the hidden layer.

    b2_: ndarray of shape (1, n_clusters)
        The biases of the hidden layer.

    optimiser_: `AdamOptimizer` or `SGDOptimizer`
        The optimisation algorithm used for training, depending on the chosen solver parameter.

    labels_: ndarray of shape (n_samples,)
        The labels that were assigned to the samples passed to the :meth:`fit` method.

    n_iter_: int
        The number of iterations the model took to converge.

    H_: ndarray of shape (n_samples, n_hidden_dim)
        The hidden representation of the samples after fitting.

    References
    ----------
    GEMINI - Generalised Mutual Information for Discriminative Clustering
        Louis Ohl, Pierre-Alexandre Mattei, Charles Bouveyron, Warith Harchaoui, Mickaël Leclercq,
        Arnaud Droit, Frederic Precioso

    See Also
    --------
    MLPModel: two-layer neural network trained for clustering with any GEMINI
    MLPMMD: two-layer neural network trained for clustering with the MMD GEMINI

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from gemclus.mlp import MLPWasserstein
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = MLPWasserstein(random_state=0).fit(X)
    >>> clf.predict(X[:2,:])
    array([2, 2])
    >>> clf.predict_proba(X[:2,:]).shape
    (2, 3)
    >>> clf.score(X)
    1.7636482655
    """
    _parameter_constraints: dict = {
        **MLPModel._parameter_constraints,
        "metric": [StrOptions(set(list(PAIRWISE_DISTANCE_FUNCTIONS) + ["precomputed"])), callable],
        "metric_params": [dict, None],
        "ovo": [bool],
    }
    def __init__(self, n_clusters=3, max_iter=1000, learning_rate=1e-3, n_hidden_dim=20, metric="euclidean",
                 ovo=False, solver="adam", batch_size=None, verbose=False, random_state=None, metric_params=None):
        super().__init__(
            n_clusters=n_clusters,
            gemini=None,
            max_iter=max_iter,
            learning_rate=learning_rate,
            n_hidden_dim=n_hidden_dim,
            solver=solver,
            batch_size=batch_size,
            verbose=verbose,
            random_state=random_state
        )
        self.ovo = ovo
        self.metric = metric
        self.metric_params = metric_params
    def get_gemini(self):
        return WassersteinGEMINI(ovo=self.ovo, metric=self.metric, metric_params=self.metric_params)
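
# --- Editor's sketch (not part of gemclus) ----------------------------------
# As `get_gemini` shows, MLPWasserstein(ovo=True, metric="manhattan") optimises
# the same objective as handing an equivalent WassersteinGEMINI instance to the
# generic MLPModel, whose docstring states that GEMINI instances are accepted.
# The helper name below is an illustrative assumption.
def _example_custom_gemini():
    from sklearn.datasets import load_iris
    X, _ = load_iris(return_X_y=True)
    gemini = WassersteinGEMINI(ovo=True, metric="manhattan")
    return MLPModel(gemini=gemini, random_state=0).fit(X)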