Source code for gemclus.gemini._fdivergences

from abc import ABC
from numbers import Real

import numpy as np
from sklearn.utils._param_validation import Interval

from .._constraints import constraint_params
from ._base_loss import _GEMINI


class _FDivergence(_GEMINI, ABC):
    # Intermediate helper class that defines compute_affinity once for all f-divergences
    def compute_affinity(self, X, y=None):
        """
        Unused for f-divergences.

        Returns
        -------
        None
        """
        return None


class KLGEMINI(_FDivergence):
    r"""
    Implements the one-vs-all and one-vs-one KL GEMINI.

    The one-vs-all version compares the KL divergence between a cluster distribution and the data
    distribution. It is the classical mutual information.

    .. math::
        \mathcal{I} = \mathbb{E}_{y \sim p(y)}[\text{KL}(p(x|y)\|p(x))]

    The one-vs-one version compares the KL divergence between two cluster distributions.

    .. math::
        \mathcal{I} = \mathbb{E}_{y_a,y_b \sim p(y)}[\text{KL}(p(x|y_a)\|p(x|y_b))]

    Parameters
    ----------
    ovo: bool, default=False
        Whether to use the one-vs-all objective (False) or the one-vs-one objective (True).

    epsilon: float, default=1e-12
        The precision for clipping the prediction values in order to avoid numerical instabilities.
    """

    @constraint_params(
        {
            "ovo": [bool],
            "epsilon": [Interval(Real, 0, 1, closed="neither")]
        }
    )
    def __init__(self, ovo=False, epsilon=1e-12):
        super().__init__(epsilon)
        self.ovo = ovo

    def evaluate(self, y_pred, affinity, return_grad=False):
        # Use a clip mask for numerical stability in gradients
        clip_mask = (y_pred > self.epsilon) & (y_pred < 1 - self.epsilon)
        p_y_x = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
        p_y = p_y_x.mean(0)

        log_p_y_x = np.log(p_y_x)
        log_p_y = np.log(p_y)

        cluster_entropy = np.sum(p_y * log_p_y)
        prediction_entropy = np.sum(np.mean(p_y_x * log_p_y_x, axis=0))

        if self.ovo:
            mutual_information = prediction_entropy - np.sum(p_y * np.mean(log_p_y_x, axis=0))
        else:
            mutual_information = prediction_entropy - cluster_entropy

        if return_grad:
            if self.ovo:
                gradient_mi = (log_p_y_x + 1) / log_p_y_x.shape[0] \
                              - (p_y / p_y_x + np.mean(log_p_y_x, axis=0)) / log_p_y_x.shape[0]
            else:
                gradient_mi = log_p_y_x / log_p_y_x.shape[0] - log_p_y / log_p_y_x.shape[0]
            return mutual_information, gradient_mi * clip_mask
        else:
            return mutual_information
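

# A minimal usage sketch (hypothetical helper, not part of the library): soft
# cluster assignments are built from a softmax over random logits, then scored
# with the KL GEMINI. f-divergence GEMINIs ignore the affinity argument, so
# None is passed in its place.
def _demo_kl_gemini():
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(100, 3))
    exp_logits = np.exp(logits)
    # Rows of y_pred sum to 1: soft assignments p(y|x) of shape (N, K)
    y_pred = exp_logits / exp_logits.sum(axis=1, keepdims=True)
    score, grads = KLGEMINI(ovo=False).evaluate(y_pred, None, return_grad=True)
    return score, grads  # grads matches the (N, K) shape of y_pred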


class MI(KLGEMINI):
    r"""
    Implements the classical mutual information between cluster conditional probabilities and the complete
    data probabilities:

    .. math::
        \mathcal{I} = \mathbb{E}_{y \sim p(y)}[\text{KL}(p(x|y)\|p(x))]

    This class is a simplified shortcut for KLGEMINI(ovo=False).

    Parameters
    ----------
    epsilon: float, default=1e-12
        The precision for clipping the prediction values in order to avoid numerical instabilities.
    """

    def __init__(self, epsilon=1e-12):
        super().__init__(ovo=False, epsilon=epsilon)
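

# Quick sanity sketch (hypothetical helper, not part of the library): since MI
# is only a shortcut for KLGEMINI(ovo=False), both objectives must agree on the
# same soft assignments.
def _demo_mi_equivalence():
    rng = np.random.default_rng(1)
    y_pred = rng.dirichlet(np.ones(4), size=50)  # valid rows on the simplex
    assert np.isclose(MI().evaluate(y_pred, None),
                      KLGEMINI(ovo=False).evaluate(y_pred, None))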


class TVGEMINI(_FDivergence):
    r"""
    Implements the one-vs-all and one-vs-one Total Variation distance GEMINI.

    The one-vs-all version compares the total variation distance between a cluster distribution and the
    data distribution.

    .. math::
        \mathcal{I} = \mathbb{E}_{y \sim p(y)}[\text{TV}(p(x|y)\|p(x))]

    The one-vs-one version compares the TV distance between two cluster distributions.

    .. math::
        \mathcal{I} = \mathbb{E}_{y_a,y_b \sim p(y)}[\text{TV}(p(x|y_a)\|p(x|y_b))]

    Parameters
    ----------
    ovo: bool, default=False
        Whether to use the one-vs-all objective (False) or the one-vs-one objective (True).

    epsilon: float, default=1e-12
        The precision for clipping the prediction values in order to avoid numerical instabilities.
    """

    @constraint_params(
        {
            "ovo": [bool],
            "epsilon": [Interval(Real, 0, 1, closed="neither")]
        }
    )
    def __init__(self, ovo=False, epsilon=1e-12):
        super().__init__(epsilon)
        self.ovo = ovo

    def evaluate(self, y_pred, affinity, return_grad=False):
        # Use a clip mask for numerical stability in gradients
        clip_mask = (y_pred > self.epsilon) & (y_pred < 1 - self.epsilon)
        p_y_x = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
        p_y = p_y_x.mean(0)

        if self.ovo:
            # Extend to 3d tensors to compute p(y=k|x_i) * p(y=k')
            # Shape: NxKx1
            extended_p_y = np.expand_dims(np.repeat(np.expand_dims(p_y, axis=0), y_pred.shape[0], axis=0), axis=-1)
            # Shape: Nx1xK
            extended_p_y_x = np.expand_dims(p_y_x, axis=1)
            # Shape: NxKxK
            cross_product = extended_p_y @ extended_p_y_x
            difference = cross_product - np.transpose(cross_product, axes=[0, 2, 1])
        else:
            difference = p_y_x - p_y

        sign_mask = np.sign(difference)
        # Taking the mean over the data axis handles the sum of differences for both 2d and 3d arrays
        pseudo_estimates = np.mean(np.abs(difference), axis=0)
        tv_gemini = 0.5 * np.sum(pseudo_estimates)

        if return_grad:
            if self.ovo:
                base_grad = sign_mask / y_pred.shape[0]
                cross_prod_grad = base_grad - np.transpose(base_grad, axes=[0, 2, 1])  # NxKxK
                # Nx1xK @ NxKxK => Nx1xK
                extended_p_y_x_grad = np.transpose(extended_p_y, axes=[0, 2, 1]) @ cross_prod_grad
                # NxKxK @ NxKx1 => NxKx1
                extended_p_y_grad = cross_prod_grad @ np.transpose(extended_p_y_x, axes=[0, 2, 1])
                gradients = np.squeeze(extended_p_y_x_grad) + np.squeeze(extended_p_y_grad).mean(0)
            else:
                gradients = (sign_mask - np.mean(sign_mask, axis=0)) / y_pred.shape[0]
            return tv_gemini, 0.5 * gradients * clip_mask
        else:
            return tv_gemini
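

# Sketch of a finite-difference check for the TV gradient (hypothetical helper,
# not part of the library). The TV objective is piecewise linear in the
# predictions, so away from the kinks of np.abs the analytic gradient and the
# numerical slope should coincide.
def _demo_tv_gradient_check(eps=1e-6):
    rng = np.random.default_rng(2)
    y_pred = rng.dirichlet(np.ones(3), size=20)
    gemini = TVGEMINI(ovo=False)
    value, grads = gemini.evaluate(y_pred, None, return_grad=True)
    perturbed = y_pred.copy()
    perturbed[0, 0] += eps  # perturb a single prediction entry
    slope = (gemini.evaluate(perturbed, None) - value) / eps
    return grads[0, 0], slope  # expected to be approximately equal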


class HellingerGEMINI(_FDivergence):
    r"""
    Implements the one-vs-all and one-vs-one Squared Hellinger distance GEMINI.

    The one-vs-all version compares the squared Hellinger distance between a cluster distribution and the
    data distribution.

    .. math::
        \mathcal{I} = \mathbb{E}_{y \sim p(y)}[\text{H}^2(p(x|y)\|p(x))]

    The one-vs-one version compares the squared Hellinger distance between two cluster distributions.

    .. math::
        \mathcal{I} = \mathbb{E}_{y_a,y_b \sim p(y)}[\text{H}^2(p(x|y_a)\|p(x|y_b))]

    Parameters
    ----------
    ovo: bool, default=False
        Whether to use the one-vs-all objective (False) or the one-vs-one objective (True).

    epsilon: float, default=1e-12
        The precision for clipping the prediction values in order to avoid numerical instabilities.
    """

    @constraint_params(
        {
            "ovo": [bool],
            "epsilon": [Interval(Real, 0, 1, closed="neither")]
        }
    )
    def __init__(self, ovo=False, epsilon=1e-12):
        super().__init__(epsilon)
        self.ovo = ovo

    def evaluate(self, y_pred, affinity, return_grad=False):
        # Use a clip mask for numerical stability in gradients
        clip_mask = (y_pred > self.epsilon) & (y_pred < 1 - self.epsilon)
        p_y_x = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
        p_y = p_y_x.mean(0)

        cluster_wise_estimates = np.sqrt(p_y_x * p_y)
        estimates = np.sum(cluster_wise_estimates, axis=1)

        if self.ovo:
            estimates = np.square(estimates)

        hellinger_gemini = 1 - np.mean(estimates, axis=0)

        if return_grad:
            if self.ovo:
                sqrt_estimates = np.sqrt(estimates.reshape((-1, 1)))
                gradients = -(p_y / cluster_wise_estimates * sqrt_estimates
                              + np.mean(p_y_x / cluster_wise_estimates * sqrt_estimates, axis=0))
            else:
                gradients = -0.5 * (p_y / cluster_wise_estimates + np.mean(p_y_x / cluster_wise_estimates, axis=0))
            gradients /= y_pred.shape[0]
            return hellinger_gemini, gradients * clip_mask
        else:
            return hellinger_gemini
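

# Boundary-case sketch (hypothetical helper, not part of the library): when
# every sample receives the same soft assignment, each cluster distribution
# equals the data distribution, so the squared Hellinger GEMINI falls to 0
# (up to clipping precision).
def _demo_hellinger_degenerate():
    y_pred = np.full((30, 3), 1.0 / 3.0)  # identical assignment everywhere
    return HellingerGEMINI(ovo=False).evaluate(y_pred, None)  # ~0.0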


class ChiSquareGEMINI(_FDivergence):
    r"""
    Implements the one-vs-all and one-vs-one Chi Squared divergence GEMINI.

    The one-vs-all version compares the chi square divergence between a cluster distribution and the data
    distribution.

    .. math::
        \mathcal{I} = \mathbb{E}_{y \sim p(y)}[D_{\chi^2}(p(x|y)\|p(x))]

    The one-vs-one version compares the chi square divergence between two cluster distributions.

    .. math::
        \mathcal{I} = \mathbb{E}_{y_a,y_b \sim p(y)}[D_{\chi^2}(p(x|y_a)\|p(x|y_b))]

    Parameters
    ----------
    ovo: bool, default=False
        Whether to use the one-vs-all objective (False) or the one-vs-one objective (True).

    epsilon: float, default=1e-12
        The precision for clipping the prediction values in order to avoid numerical instabilities.

    References
    ----------
    Sugiyama, M., Yamada, M., Kimura, M., & Hachiya, H. (2011). On information-maximization clustering:
    Tuning parameter selection and analytic solution. In Proceedings of the 28th International Conference
    on Machine Learning (ICML-11) (pp. 65-72).
    """

    @constraint_params(
        {
            "ovo": [bool],
            "epsilon": [Interval(Real, 0, 1, closed="neither")]
        }
    )
    def __init__(self, ovo=False, epsilon=1e-12):
        super().__init__(epsilon)
        self.ovo = ovo

    def evaluate(self, y_pred, affinity, return_grad=False):
        # Use a clip mask for numerical stability in gradients
        clip_mask = (y_pred > self.epsilon) & (y_pred < 1 - self.epsilon)
        p_y_x = np.clip(y_pred, self.epsilon, 1 - self.epsilon)
        p_y = p_y_x.mean(0)

        cluster_wise_estimates = p_y_x / p_y

        if self.ovo:
            alpha = np.sum(p_y_x * cluster_wise_estimates, axis=1, keepdims=True)
            beta = np.sum(p_y / cluster_wise_estimates, axis=1, keepdims=True)
            chi2_gemini = np.mean(alpha * beta)
        else:
            chi2_gemini = np.sum(p_y_x * cluster_wise_estimates, axis=1).mean()

        if return_grad:
            if self.ovo:
                single_beta = beta * cluster_wise_estimates
                double_beta = single_beta * cluster_wise_estimates
                single_alpha = alpha / cluster_wise_estimates
                double_alpha = single_alpha / cluster_wise_estimates
                gradients = 2 * single_beta - double_alpha + np.mean(2 * single_alpha - double_beta, axis=0)
            else:
                gradients = 2 * cluster_wise_estimates - np.square(cluster_wise_estimates).mean(0)
            gradients /= y_pred.shape[0]
            return 0.5 * chi2_gemini, 0.5 * gradients * clip_mask
        else:
            return 0.5 * chi2_gemini
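

# Usage sketch (hypothetical helper, not part of the library): scoring the
# one-vs-one chi-square GEMINI together with its gradient; the gradient shares
# the (N, K) shape of the soft assignments so it can flow back to a model.
def _demo_chi_square():
    rng = np.random.default_rng(3)
    y_pred = rng.dirichlet(np.ones(3), size=40)
    score, grads = ChiSquareGEMINI(ovo=True).evaluate(y_pred, None, return_grad=True)
    return score, grads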