import os
import sklearn
from sklearn.utils.validation import check_is_fitted
from sklearn.base import clone
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils._joblib import Parallel
from sklearn.utils._joblib import delayed
from sklearn.metrics.pairwise import pairwise_distances
from utils.metrics import interpolate_nan, idx_bins, compute_bins
import gpflow
from gpflow.utilities import set_trainable
from gpflow.mean_functions import MeanFunction
#from gp_class import BoundedPositiveBijector, Log
import tensorflow as tf
import tensorflow_probability as tfp

from utils.gp_pac import Softmax_PAC, SVGP_PAC

import numpy as np
import scipy

import torch
from torch import nn

import warnings
from tqdm.auto import tqdm

import h5py


class LogMeanFunction(MeanFunction):
    """Mean function that returns the element-wise natural logarithm of its input.

    The input is cast to ``tf.float64`` and shifted by ``1e-6`` before the
    logarithm is taken, so an input of exactly zero maps to a finite value.
    """

    def __call__(self, X):
        shifted = tf.cast(X, dtype=tf.float64) + 1e-6
        return tf.math.log(shifted)

class BoundedPositiveBijector(tfp.bijectors.Bijector):
    """Bijector mapping an unconstrained real value into the interval ``(lower, upper)``.

    The forward transform is ``lower + (upper - lower) * sigmoid(x)``; the
    inverse is the logit of the value rescaled to ``(0, 1)``.

    Parameters
    ----------
    lower : float
        Lower end of the target interval (stored as ``tf.float64``).
    upper : float
        Upper end of the target interval (stored as ``tf.float64``).
    validate_args : bool, default=False
        Forwarded to ``tfp.bijectors.Bijector``.
    name : str, default="bounded_positive"
        Forwarded to ``tfp.bijectors.Bijector``.
    """

    def __init__(self, lower, upper, validate_args=False, name="bounded_positive"):
        super().__init__(forward_min_event_ndims=0, validate_args=validate_args, name=name)
        self.lower = tf.cast(lower, tf.float64)
        self.upper = tf.cast(upper, tf.float64)

    def _forward(self, x):
        # Squash to (0, 1), then stretch/shift to (lower, upper).
        scaled_x = tf.sigmoid(x)
        return self.lower + (self.upper - self.lower) * scaled_x

    def _inverse(self, y):
        # Rescale y from (lower, upper) to (0, 1) ...
        scaled_y = (y - self.lower) / (self.upper - self.lower)
        # ... and apply the inverse of the sigmoid (the logit).
        return tf.math.log(scaled_y / (1 - scaled_y))

    def _forward_log_det_jacobian(self, x):
        # d/dx [lower + (upper - lower) * sigmoid(x)]
        #     = (upper - lower) * sigmoid(x) * sigmoid(-x),
        # and log(sigmoid(t)) = -softplus(-t), hence the expression below.
        # (The previous implementation returned a constant 0, which is not the
        # Jacobian of this transform.)
        log_width = tf.cast(tf.math.log(self.upper - self.lower), x.dtype)
        return log_width - tf.nn.softplus(-x) - tf.nn.softplus(x)


class CalibrationMethod(sklearn.base.BaseEstimator):
    """
    Abstract base class for probability calibration methods.

    A calibration method maps the posterior class probabilities produced by a classifier to calibrated posterior
    probabilities, i.e. probabilities for which the empirical frequency of a correct class prediction matches the
    predicted value. Subclasses have to provide `fit` and `predict_proba`.
    """

    def __init__(self):
        super().__init__()

    def fit(self, X, y):
        """
        Fit the calibration method on uncalibrated class probabilities X and ground truth labels y.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            Predicted probabilities of the base classifier on the calibration set.
        y : array-like, shape (n_samples,)
            Target classes.

        Returns
        -------
        self : object
            The fitted instance.

        Raises
        ------
        NotImplementedError
            Always; this method must be overridden by a subclass.
        """
        raise NotImplementedError("Subclass must implement this method.")

    def predict_proba(self, X):
        """
        Compute calibrated posterior probabilities from the posterior probabilities of an arbitrary classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            The uncalibrated posterior probabilities.

        Returns
        -------
        P : array, shape (n_samples, n_classes)
            The calibrated probabilities.

        Raises
        ------
        NotImplementedError
            Always; this method must be overridden by a subclass.
        """
        raise NotImplementedError("Subclass must implement this method.")

    def predict(self, X):
        """
        Predict the class of new samples as the arg-max of the calibrated probabilities.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            The uncalibrated posterior probabilities.

        Returns
        -------
        C : array, shape (n_samples,)
            Index of the most probable class for every sample.
        """
        calibrated = self.predict_proba(X)
        return np.argmax(calibrated, axis=1)

class TemperatureScaling(CalibrationMethod):
    """
    Probability calibration using temperature scaling

    Temperature scaling [1]_ is a one parameter multi-class scaling method. Output confidence scores are calibrated,
    meaning they match empirical frequencies of the associated class prediction. Temperature scaling does not change the
    class predictions of the underlying model.

    Parameters
    ----------
    T_init : float, default=1
        Initial temperature parameter used for scaling. This parameter is optimized in order to calibrate output
        probabilities. Must be greater than 0.
    verbose : bool, default=False
        Print information on optimization procedure.

    Attributes
    ----------
    T : float
        Fitted temperature. Only available after `fit` has been called.

    References
    ----------
    .. [1] On calibration of modern neural networks, C. Guo, G. Pleiss, Y. Sun, K. Weinberger, ICML 2017
    """

    def __init__(self, T_init=1, verbose=False):
        super().__init__()
        if T_init <= 0:
            raise ValueError("Temperature not greater than 0.")
        self.T_init = T_init
        self.verbose = verbose

    def fit(self, X, y):
        """
        Fit the calibration method based on the given uncalibrated class probabilities or logits X and ground truth
        labels y.

        The temperature is found by minimising the negative log-likelihood of ``softmax(X / T)`` with BFGS.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            Training data, i.e. predicted probabilities or logits of the base classifier on the calibration set.
        y : array-like, shape (n_samples,)
            Target classes, used as integer column indices into X.

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If the optimised temperature is not greater than 0.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Quantities that do not depend on T are computed once, outside the optimisation loop.
        rows = np.arange(X.shape[0])
        X_true = X[rows, y].reshape(-1, 1)  # score of the true class, one row per sample
        tiny = np.finfo(np.float64).tiny  # keeps log() away from an exact 0

        # Objective function (NLL / cross entropy)
        def objective(T):
            P = scipy.special.softmax(X / T, axis=1)
            return - np.sum(np.log(P[rows, y] + tiny))

        # Derivative of the objective with respect to the temperature T:
        #   dNLL/dT = -(1 / T^2) * sum_i sum_j softmax(X_i / T)_j * (X_ij - X_i,y_i)
        # The softmax is used instead of exp(X / T) / sum(exp(X / T)) because the raw exponentials overflow for
        # large logits and turn the gradient into NaN.
        def gradient(T):
            P = scipy.special.softmax(X / T, axis=1)
            dT_i = np.sum(P * (X - X_true), axis=1)
            return - dT_i.sum() / T ** 2

        # Optimize; fmin_bfgs returns the optimum as a length-1 array.
        self.T = scipy.optimize.fmin_bfgs(f=objective, x0=self.T_init,
                                          fprime=gradient, gtol=1e-06, disp=self.verbose)[0]

        # Check for T > 0
        if self.T <= 0:
            raise ValueError("Temperature not greater than 0.")

        return self

    def predict_proba(self, X):
        """
        Compute calibrated posterior probabilities for a given array of posterior probabilities (or logits) from an
        arbitrary classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            The uncalibrated posterior probabilities or logits.

        Returns
        -------
        P : array, shape (n_samples, n_classes)
            ``softmax(X / T)`` with the fitted temperature T.
        """
        # Check is fitted
        check_is_fitted(self, "T")

        # Transform with scaled softmax
        return scipy.special.softmax(X / self.T, axis=1)
    

class HistogramBinning(CalibrationMethod):
    """
    Probability calibration using histogram binning

    Histogram binning [1]_ is a nonparametric approach to probability calibration. Classifier scores are binned into a given
    number of bins either based on fixed width or frequency. Classifier scores are then computed based on the empirical
    frequency of class 1 in each bin. Inputs with more than two classes are handled one-vs-rest.

    Parameters
    ----------
        mode : str, default='equal_freq'
            Binning mode used. One of ['equal_width', 'equal_freq'].
        n_bins : int, default=20
            Number of bins to bin classifier scores into.
        input_range : list, shape (2,), default=[0, 1]
            Range of the classifier scores.

    .. [1] Zadrozny, B. & Elkan, C. Obtaining calibrated probability estimates from decision trees and naive Bayesian
           classifiers in Proceedings of the 18th International Conference on Machine Learning (ICML, 2001), 609–616.
    """

    # The list default of `input_range` is kept for interface compatibility; it is only read, never mutated.
    def __init__(self, mode='equal_freq', n_bins=20, input_range=[0, 1]):
        super().__init__()
        if mode in ['equal_width', 'equal_freq']:
            self.mode = mode
        else:
            raise ValueError("Mode not recognized. Choose on of 'equal_width', or 'equal_freq'.")
        self.n_bins = n_bins
        self.input_range = input_range

    def fit(self, X, y, n_jobs=None):
        """
        Fit the calibration method based on the given uncalibrated class probabilities X and ground truth labels y.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            Training data, i.e. predicted probabilities of the base classifier on the calibration set.
        y : array-like, shape (n_samples,)
            Target classes.
        n_jobs : int or None, optional (default=None)
            The number of jobs to use for the computation.
            ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
            ``-1`` means using all processors.

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If X is one-dimensional or has fewer than two columns.
        """
        if X.ndim == 1:
            raise ValueError("Calibration training data must have shape (n_samples, n_classes).")

        n_columns = np.shape(X)[1]
        if n_columns == 2:
            return self._fit_binary(X, y)
        if n_columns > 2:
            # Multi-class: one binary histogram calibrator per class.
            self.onevsrest_calibrator_ = OneVsRestCalibrator(calibrator=clone(self), n_jobs=n_jobs)
            self.onevsrest_calibrator_.fit(X, y)
            return self

        # Fewer than two columns: nothing could be fitted, so fail loudly instead of returning an unfitted object.
        raise ValueError("Calibration training data must have shape (n_samples, n_classes).")

    def _fit_binary(self, X, y):
        """Fit the bin edges (`binning`) and per-bin frequency of class 1 (`prob_class_1`) from the scores X[:, 1]."""
        if self.mode == 'equal_width':
            # Compute probability of class 1 in each equal width bin
            binned_stat = scipy.stats.binned_statistic(x=X[:, 1], values=np.equal(1, y), statistic='mean',
                                                       bins=self.n_bins, range=self.input_range)
            self.prob_class_1 = interpolate_nan(binned_stat.statistic)
            self.binning = binned_stat.bin_edges
        elif self.mode == 'equal_freq':
            # Find binning based on equal frequency.
            # NOTE(review): `input_range` is reused here as the range of the quantile *levels*; this only yields
            # the full set of quantiles for the default [0, 1] - confirm this is intended for other ranges.
            self.binning = np.quantile(X[:, 1],
                                       q=np.linspace(self.input_range[0], self.input_range[1], self.n_bins + 1))

            # Compute probability of class 1 in equal frequency bins
            digitized = np.digitize(X[:, 1], bins=self.binning)
            digitized[digitized == len(self.binning)] = len(self.binning) - 1  # include rightmost edge in partition
            # Empty bins produce NaN means, which are filled in by interpolate_nan.
            self.prob_class_1 = interpolate_nan(np.array([y[digitized == i].mean() for i in range(1, len(self.binning))]))

        return self

    def predict_proba(self, X):
        """
        Compute calibrated posterior probabilities for a given array of posterior probabilities from an arbitrary
        classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_classes)
            The uncalibrated posterior probabilities.

        Returns
        -------
        P : array, shape (n_samples, n_classes)
            The predicted probabilities.

        Raises
        ------
        ValueError
            If X is one-dimensional or has fewer than two columns.
        """
        if X.ndim == 1:
            raise ValueError("Calibration data must have shape (n_samples, n_classes).")

        n_columns = np.shape(X)[1]
        if n_columns == 2:
            check_is_fitted(self, ["binning", "prob_class_1"])
            # Find bin of predictions. np.digitize returns 0 for scores below the first edge and len(binning) for
            # scores at or above the last edge; both are mapped to the nearest valid bin. (Without the lower clip,
            # `index - 1 == -1` would silently select the *last* bin for scores below the first edge.)
            bin_index = np.clip(np.digitize(X[:, 1], bins=self.binning), 1, len(self.binning) - 1)
            # Transform to empirical frequency of class 1 in each bin
            p1 = np.asarray(self.prob_class_1)[bin_index - 1]
            # If empirical frequency is NaN, do not change prediction
            p1 = np.where(np.isfinite(p1), p1, X[:, 1])
            assert np.all(np.isfinite(p1)), "Predictions are not all finite."

            return np.column_stack([1 - p1, p1])
        if n_columns > 2:
            check_is_fitted(self, "onevsrest_calibrator_")
            return self.onevsrest_calibrator_.predict_proba(X)

        # Fewer than two columns: previously fell through and returned None.
        raise ValueError("Calibration data must have shape (n_samples, n_classes).")

class NadarayaWatson(CalibrationMethod):
    """
    Kernel (Nadaraya-Watson style) estimate of class frequencies given classifier scores.

    `fit` builds the pairwise kernel matrix of the scores with itself, with the diagonal suppressed so that a
    sample never contributes to its own estimate. `predict_proba` then averages the one-hot labels of the same
    samples with these kernel weights.

    Note that, unlike the other calibrators in this module, `fit` takes only the scores and `predict_proba` takes
    the labels of those same samples.

    Parameters
    ----------
    kernel : str, default='dirichlet'
        Kernel used. One of ['dirichlet', 'gaussian'].
    """

    def __init__(self, kernel='dirichlet'):
        super().__init__()
        self.kernel = kernel

    def fit(self, X):
        """
        Compute and store the kernel matrix of the scores X.

        Parameters
        ----------
        X : torch.Tensor, shape (n_samples, n_classes)
            Classifier scores.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        if self.kernel == 'dirichlet':
            bandwidth = self.get_bandwidth(X)
        else:
            # Placeholder only: gaussian_kernel replaces it by the median pairwise squared distance.
            bandwidth = 0.1
        self.log_kern = self.get_kernel(X, bandwidth)
        self.kern = torch.exp(self.log_kern).to(torch.float32)
        return self

    def predict_proba(self, y):
        """
        Kernel-weighted average of the one-hot labels y of the samples passed to `fit`.

        Parameters
        ----------
        y : torch.Tensor of integer class labels, shape (n_samples,)
            Labels of the samples used in `fit`, in the same order.

        Returns
        -------
        ratio : torch.Tensor, shape (n_samples, n_label_columns)
            Row i holds the kernel-weighted label frequencies of all samples other than i.
        """
        check_is_fitted(self, "kern")

        # Size the one-hot encoding by the largest label rather than by the number of distinct labels, so that
        # labels with gaps (e.g. {0, 2}) do not make one_hot fail. Both agree when every class 0..K-1 is present.
        if y.numel() > 0:
            num_classes = int(y.max()) + 1
        else:
            num_classes = len(y.unique())
        y_onehot = nn.functional.one_hot(y, num_classes=num_classes).to(torch.float32)

        kern_y = torch.matmul(self.kern, y_onehot)
        den = torch.sum(self.kern, dim=1)

        # to avoid division by 0
        den = torch.clamp(den, min=1e-10)

        return kern_y / den.unsqueeze(-1)

    def get_kernel(self, f, bandwidth):
        """
        Return the pairwise log-kernel matrix of the scores f with (numerically) -inf on the diagonal.

        Raises
        ------
        ValueError
            If `self.kernel` is neither 'dirichlet' nor 'gaussian'.
        """
        if self.kernel == 'dirichlet':
            if f.shape[1] == 1:
                log_kern = self.beta_kernel(f, f, bandwidth).squeeze()
            else:
                log_kern = self.dirichlet_kernel(f, bandwidth).squeeze()
        elif self.kernel == 'gaussian':
            log_kern = self.gaussian_kernel(f, bandwidth).squeeze()
        else:
            raise ValueError("Kernel not recognized. Choose one of 'dirichlet' or 'gaussian'.")

        # Trick: -inf on the diagonal, so exp() of it is 0 and a sample gets no weight for itself
        return log_kern + torch.diag(torch.finfo(torch.float).min * torch.ones(len(f)))

    def gaussian_kernel(self, z, bandwidth=0.1, median=True):
        """Log of an (unnormalised) Gaussian kernel on pairwise squared Euclidean distances of z.

        With ``median=True`` the given bandwidth is ignored and the median squared distance is used instead.
        """
        Dxx = pairwise_distances(z.numpy(), metric='sqeuclidean')
        if median:
            bandwidth = np.median(Dxx)
        log_gauss_pdf = -Dxx / 2 / bandwidth

        return torch.tensor(log_gauss_pdf).float()

    def beta_kernel(self, z, zi, bandwidth=0.1):
        """Log-density of z under Beta(zi / bandwidth + 1, (1 - zi) / bandwidth + 1), for all pairs."""
        p = zi / bandwidth + 1
        q = (1-zi) / bandwidth + 1
        z = z.unsqueeze(-2)  # extra axis so that z broadcasts against every zi

        log_beta = torch.lgamma(p) + torch.lgamma(q) - torch.lgamma(p + q)
        log_num = (p-1) * torch.log(z) + (q-1) * torch.log(1-z)
        log_beta_pdf = log_num - log_beta

        return log_beta_pdf

    def dirichlet_kernel(self, z, bandwidth=0.1):
        """Log-density of every row of z under Dirichlet(z_j / bandwidth + 1), for every row j of z."""
        alphas = z / bandwidth + 1

        # Log of the Dirichlet normalising constant, one value per kernel centre.
        log_beta = (torch.sum((torch.lgamma(alphas)), dim=1) - torch.lgamma(torch.sum(alphas, dim=1)))
        # Entry [i, j] = sum_k log(z_ik) * (alpha_jk - 1)
        log_num = torch.matmul(torch.log(z), (alphas-1).T)
        log_dir_pdf = log_num - log_beta

        return log_dir_pdf

    def get_bandwidth(self, f):
        """
        Select a bandwidth for the kernel based on maximizing the leave-one-out likelihood (LOO MLE).

        :param f: The vector containing the probability scores, shape [num_samples, num_classes]
        :return: The candidate bandwidth with the highest LOO log-likelihood, or -1 if no candidate produced a
                 comparable (non-NaN) likelihood
        """
        bandwidths = torch.cat((torch.logspace(start=-5, end=-1, steps=15), torch.linspace(0.2, 1, steps=5)))
        max_b = -1
        # Start from -inf (not 0): log-likelihoods can be negative for every candidate, and a start value of 0
        # would then reject all of them and return the sentinel -1.
        max_l = float("-inf")
        n = len(f)
        for b in bandwidths:
            log_kern = self.get_kernel(f, b)
            log_fhat = torch.logsumexp(log_kern, 1) - np.log(n-1)
            l = torch.sum(log_fhat)
            if l > max_l:
                max_l = l
                max_b = b

        return max_b
        
class GPCalibration(CalibrationMethod):
    """
    Modified implementation of probability calibration using a latent Gaussian process.

    Gaussian process calibration [1]_ is a non-parametric approach to calibrate posterior probabilities from an arbitrary
    classifier based on a hold-out data set. Inference is performed using a sparse variational Gaussian process
    (SVGP) [2]_ implemented in `gpflow` [3]_.
    Since TensorFlow 1.x is no longer supported, the algorithm of [1]_ is implemented here with tensorflow 2.x and
    gpflow 2.x.

    Parameters
    ----------
    n_classes : int
        Number of classes in calibration data.
    logits : bool, default=False
        Are the inputs for calibration logits (e.g. from a neural network)?
    mean_function : GPflow object, default=None
        Mean function of the latent GP. If None, the identity is used for logits and `LogMeanFunction` otherwise.
    kernel : GPflow object, default=None
        Kernel function of the latent GP. If None, an RBF kernel is used for logits and a bounded RBF kernel plus a
        white-noise kernel otherwise.
    likelihood : GPflow object, default=None
        Likelihood giving a prior on the class prediction. If None, `Softmax_PAC` is used when `pac` is True and
        `gpflow.likelihoods.Softmax` otherwise.
    model_type : str, default='SVGP'
        Model to fit. Only 'SVGP' is supported.
    pac : bool, default=False
        If True, `Softmax_PAC` / `SVGP_PAC` are used instead of the plain GPflow likelihood / model.
    likelihood_type : object, default=None
        Passed on to `Softmax_PAC` when `pac` is True and no likelihood is given.
    loss_type : object, default=None
        Passed on to `SVGP_PAC` as `type` when `pac` is True.
    num_inducing : int, default=10
        Number of inducing points for the variational approximation.
    maxiter : int, default=1000
        Maximum number of iterations for the likelihood optimization procedure.
    n_monte_carlo : int, default=100
        Number of Monte Carlo samples for the inference procedure.
    max_samples_monte_carlo : int, default=10**7
        Maximum number of Monte Carlo samples to draw in one batch when predicting. Setting this value too large can
        cause memory issues.
    inf_mean_approx : bool, default=False
        If True, when inferring calibrated probabilities, only the mean of the latent Gaussian process is taken into
        account, not its covariance.
    random_state : int, default=1
        Random seed for reproducibility. Needed for Monte-Carlo sampling routine.
    verbose : bool, default=False
        Print information on the prediction batches.

    References
    ----------
    .. [1] Wenger, J., Kjellström H. & Triebel, R. Non-Parametric Calibration for Classification in
           Proceedings of AISTATS (2020)
    .. [2] Hensman, J., Matthews, A. G. d. G. & Ghahramani, Z. Scalable Variational Gaussian Process Classification in
           Proceedings of AISTATS (2015)
    .. [3] Matthews, A. G. d. G., van der Wilk, M., et al. GPflow: A Gaussian process library using TensorFlow. Journal
           of Machine Learning Research 18, 1–6 (Apr. 2017)
    """

    def __init__(self,
                 n_classes,
                 logits=False,
                 mean_function=None,
                 kernel=None,
                 likelihood=None,
                 model_type='SVGP',
                 pac = False,
                 likelihood_type=None,
                 loss_type=None,
                 num_inducing=10,
                 maxiter=1000,
                 n_monte_carlo=100,
                 max_samples_monte_carlo=10 ** 7,
                 inf_mean_approx=False,
                 random_state=1,
                 verbose=False
                 ):

        super().__init__()

        # BaseEstimator.get_params() (and therefore repr()) looks every constructor argument up as an attribute
        # of the same name, so `n_classes` and `logits` are stored under their own names too.
        self.n_classes = n_classes
        self.logits = logits

        self.num_classes = n_classes
        self.verbose = verbose
        self.model_type = model_type
        self.likelihood_type = likelihood_type
        self.pac = pac
        self.loss_type = loss_type
        self.num_inducing = num_inducing
        self.n_monte_carlo = n_monte_carlo
        self.max_samples_monte_carlo = max_samples_monte_carlo

        self.model = None  # set by fit
        self.maxiter = maxiter
        self.inf_mean_approx = inf_mean_approx
        self.random_state = random_state
        np.random.seed(self.random_state)

        # Set likelihood
        if likelihood is None:
            if self.pac:
                self.likelihood = Softmax_PAC(self.num_classes, self.likelihood_type)
            else:
                self.likelihood = gpflow.likelihoods.Softmax(self.num_classes)
        else:
            self.likelihood = likelihood

        # Set mean function
        if mean_function is None:
            if logits:
                self.mean_function = gpflow.mean_functions.Identity()
            else:
                self.mean_function = LogMeanFunction()
        else:
            self.mean_function = mean_function

        # Set kernel
        if kernel is None:
            if logits:
                # NOTE(review): no white-noise kernel is added in the logits case, unlike the probability case
                # below - confirm this is intended.
                kernel_lengthscale = 10
                self.kernel = gpflow.kernels.RBF(lengthscales=kernel_lengthscale, variance=1)
            else:
                k_white = gpflow.kernels.White(variance=0.01)
                kernel_lengthscale = 0.5
                k_rbf = gpflow.kernels.RBF(lengthscales=kernel_lengthscale, variance=1)

                # Place constraints [a,b] on kernel parameters
                transform_lengthscale = BoundedPositiveBijector(.001, 10)
                transform_variance = BoundedPositiveBijector(0.01, 5)

                k_rbf.lengthscales = gpflow.Parameter(kernel_lengthscale, transform=transform_lengthscale)
                k_rbf.variance = gpflow.Parameter(1, transform=transform_variance)
                self.kernel = k_rbf + k_white
        else:
            self.kernel = kernel

    def fit(self, X, y):
        """
        Fit the latent GP on uncalibrated scores X and ground truth labels y.

        Parameters
        ----------
        X : array, shape (n_samples, n_classes)
            Scores of the base classifier on the calibration set.
        y : array, shape (n_samples,)
            Target classes.

        Returns
        -------
        self : object
            Returns an instance of self.

        Raises
        ------
        ValueError
            If X does not have shape (n_samples, n_classes), or if `model_type` is not supported.
        """
        # Check for correct dimensions
        if X.ndim == 1 or np.shape(X)[1] != self.num_classes:
            raise ValueError("Calibration data must have shape (n_samples, n_classes).")

        self._fit_multiclass(X, y)

        return self

    def _fit_multiclass(self, X, y):
        """Build the (PAC-)SVGP model with k-means inducing points and optimise its training loss."""
        # Imported locally so the submodule is loaded even where `import scipy` alone does not expose it.
        from scipy.cluster.vq import kmeans

        y = y.reshape(-1, 1)
        if self.model_type == 'SVGP':
            # Inducing points are cluster centres of the rows of X, so there can be at most n_samples of them.
            n_inducing = min(X.shape[0], self.num_inducing)
            inducing_variable = kmeans(obs=X, k_or_guess=n_inducing)[0]

            if self.pac:
                self.model = SVGP_PAC(kernel=self.kernel, likelihood=self.likelihood, inducing_variable=inducing_variable,
                                            mean_function=self.mean_function, num_latent_gps=self.num_classes, whiten=True, q_diag=True, type=self.loss_type)
            else:
                self.model = gpflow.models.SVGP(kernel=self.kernel, likelihood=self.likelihood, inducing_variable=inducing_variable,
                                            mean_function=self.mean_function, num_latent_gps=self.num_classes, whiten=True, q_diag=True)

            # Optimization
            optimizer = gpflow.optimizers.Scipy()
            data = (X, y)
            training_loss = self.model.training_loss_closure(data, compile=False)
            optimizer.minimize(training_loss, variables=self.model.trainable_variables, options=dict(maxiter=self.maxiter))
        else:
            raise ValueError(f"Unexpected recalibratin method: {self.model_type}.")

        return self

    def predict_proba(self, X, mean_approx=False):
        """
        Compute calibrated posterior probabilities for the uncalibrated scores X.

        Parameters
        ----------
        X : array, shape (n_samples, n_classes)
            The uncalibrated scores.
        mean_approx : bool, default=False
            If True (or if `inf_mean_approx` was set), return the softmax of the latent GP mean instead of the
            Monte-Carlo estimate.

        Returns
        -------
        P : array, shape (n_samples, n_classes)
            The predicted probabilities.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If `fit` has not been called.
        ValueError
            If X does not have shape (n_samples, n_classes) (Monte-Carlo path only).
        """
        check_is_fitted(self, "model")
        # `model` always exists (it is None until fit), so the attribute check above cannot detect an unfitted
        # instance; test the value explicitly.
        if self.model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("This GPCalibration instance is not fitted yet. Call 'fit' before using it.")

        if mean_approx or self.inf_mean_approx:
            # Evaluate latent GP
            f, _ = self.model.predict_f(X)
            latent = f.numpy().reshape(np.shape(X))

            # Return softargmax of fitted GP at input
            return scipy.special.softmax(latent, axis=1)

        # Seed for Monte_Carlo
        tf.random.set_seed(self.random_state)
        if X.ndim == 1 or np.shape(X)[1] != self.num_classes:
            raise ValueError("Calibration data must have shape (n_samples, n_classes).")

        # Predict in batches of rows so that one batch draws at most `max_samples_monte_carlo` samples
        # (each row needs n_classes * n_monte_carlo of them); at least one row per batch.
        n_data = np.shape(X)[0]
        samples_per_row = max(1, self.num_classes * self.n_monte_carlo)
        batch_size = max(1, int(self.max_samples_monte_carlo // samples_per_row))
        # max(n_data, 1) keeps one (empty) batch for empty input so the result is still a (0, n_classes) array.
        batch_starts = range(0, max(n_data, 1), batch_size)
        n_pred_batches = len(batch_starts)

        p_pred_list = []
        for i, start in enumerate(batch_starts):
            if self.verbose:
                print("Predicting batch {}/{}.".format(i + 1, n_pred_batches))
            stop = min(start + batch_size, n_data)
            p_pred_list.append(tf.exp(self.predict_full_density(X[start:stop, :])).numpy())

        return np.concatenate(p_pred_list, axis=0)

    def predict_full_density(self, X):
        """Monte-Carlo estimate of the log predictive class probabilities at X.

        Draws `n_monte_carlo` samples of the latent function from its predicted marginals, pushes them through a
        softmax and returns the log of the sample mean.
        """
        mu, var = self.model.predict_f(X)
        N = tf.shape(mu)[0]
        epsilon = tf.random.normal((self.n_monte_carlo, N, self.num_classes), dtype=tf.float64)
        f_star = mu[None, :, :] + tf.sqrt(var[None, :, :]) * epsilon  # S x N x K
        p_y_f_star = tf.nn.softmax(f_star, axis=2)
        return tf.math.log(tf.reduce_mean(p_y_f_star, axis=0))

    def kl_divergence(self):
        """Return the value of the fitted model's `prior_kl()` as a NumPy value."""
        return self.model.prior_kl().numpy()


class OneVsRestCalibrator(sklearn.base.BaseEstimator):
    """One-vs-the-rest (OvR) multiclass strategy
    Also known as one-vs-all, this strategy consists in fitting one calibrator
    per class. The probabilities to be calibrated of the other classes are summed.
    For each calibrator, the class is fitted against all the other classes.

    Class labels are used as column indices into the calibration data.

    Parameters
    ----------
    calibrator : CalibrationMethod object
        A CalibrationMethod object implementing `fit` and `predict_proba`.
    n_jobs : int or None, optional (default=None)
        The number of jobs to use for the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

    Attributes
    ----------
    calibrators_ : list of `n_classes` estimators
        Estimators used for predictions.
    classes_ : array, shape = [`n_classes`]
        Class labels.
    label_binarizer_ : LabelBinarizer object
        Object used to transform multiclass labels to binary labels and
        vice-versa.
    """

    def __init__(self, calibrator, n_jobs=None):
        self.calibrator = calibrator
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit underlying estimators.
        Parameters
        ----------
        X : (sparse) array-like, shape = [n_samples, n_features]
            Calibration data.
        y : (sparse) array-like, shape = [n_samples, ]
            Multi-class labels.
        Returns
        -------
        self
        """
        # A sparse LabelBinarizer, with sparse_output=True, has been shown to
        # outperform or match a dense label binarizer in all cases and has also
        # resulted in less or equal memory consumption in the fit_ovr function
        # overall.
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer_.fit_transform(y)
        Y = Y.tocsc()
        self.classes_ = self.label_binarizer_.classes_
        columns = (col.toarray().ravel() for col in Y.T)
        # In cases where individual estimators are very fast to train, setting
        # n_jobs > 1 can result in slower performance due to the overhead
        # of spawning threads.  See joblib issue #112.
        self.calibrators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(OneVsRestCalibrator._fit_binary)(self.calibrator, X, column, classes=[
                "not %s" % self.label_binarizer_.classes_[i], self.label_binarizer_.classes_[i]]) for i, column in
            enumerate(columns))
        return self

    def predict_proba(self, X):
        """
        Probability estimates.

        The returned estimates for all classes are ordered by label of classes.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        T : (sparse) array-like, shape = [n_samples, n_classes]
            Returns the probability of the sample for each class in the model,
            where classes are ordered as they are in `self.classes_`.
        """
        check_is_fitted(self, ["classes_", "calibrators_"])

        # Y[i, j] gives the probability that sample i has the label j.
        per_class = []
        for i, c in enumerate(self.calibrators_):
            # Build the same two-column input as in _fit_binary: [sum of all other classes, this class].
            # The column to drop must be the class label, not the calibrator's position; the two differ when
            # some class is missing from the training labels.
            cl = self.classes_[i]
            X_binary = np.column_stack([np.sum(np.delete(X, obj=cl, axis=1), axis=1), X[:, cl]])
            per_class.append(c.predict_proba(X_binary)[:, 1])
        Y = np.array(per_class).T

        if len(self.calibrators_) == 1:
            # Only one estimator, but we still want to return probabilities for two classes.
            Y = np.concatenate(((1 - Y), Y), axis=1)

        # Pad with zeros for classes not in training data
        if np.shape(Y)[1] != np.shape(X)[1]:
            p_pred = np.zeros(np.shape(X))
            p_pred[:, self.classes_] = Y
            Y = p_pred

        # Normalize probabilities to 1.
        Y = sklearn.preprocessing.normalize(Y, norm='l1', axis=1, copy=True, return_norm=False)
        return np.clip(Y, a_min=0, a_max=1)

    @property
    def n_classes_(self):
        return len(self.classes_)

    @property
    def _first_calibrator(self):
        return self.calibrators_[0]

    @staticmethod
    def _fit_binary(calibrator, X, y, classes=None):
        """
        Fit a single binary calibrator.

        Parameters
        ----------
        calibrator : CalibrationMethod object
            Calibrator to clone and fit.
        X : array, shape (n_samples, n_classes)
            Multi-class calibration data.
        y : array, shape (n_samples,)
            Binary labels for the class being fitted.
        classes : list, shape (2,)
            ``[label of the "rest", label of the class]``. The second entry is used as the column index of the
            class in X; both are used for the warning message. Required despite the ``None`` default.

        Returns
        -------
        calibrator : CalibrationMethod object
            A fitted clone of `calibrator`, or a constant calibrator if y contains a single label only.

        Raises
        ------
        ValueError
            If `classes` is None.
        """
        if classes is None:
            raise ValueError("`classes` must be given as [rest_label, class_label].")

        # Sum probabilities of combined classes in calibration training data X
        cl = classes[1]
        X = np.column_stack([np.sum(np.delete(X, cl, axis=1), axis=1), X[:, cl]])

        # Check whether only one label is present in training data
        unique_y = np.unique(y)
        if len(unique_y) == 1:
            c = 0 if y[0] == -1 else y[0]
            warnings.warn("Label %s is present in all training examples." %
                          str(classes[c]))
            calibrator = _ConstantCalibrator().fit(X, unique_y)
        else:
            calibrator = clone(calibrator)
            calibrator.fit(X, y)
        return calibrator

class _ConstantCalibrator(CalibrationMethod):
    """Calibrator that ignores its input and always answers with the label value it was fitted on."""

    def fit(self, X, y):
        # X is not used; only the label value is remembered.
        self.y_ = y
        return self

    def predict(self, X):
        check_is_fitted(self, 'y_')

        n_samples = X.shape[0]
        return np.repeat(self.y_, n_samples)

    def predict_proba(self, X):
        check_is_fitted(self, 'y_')

        # One row [1 - y, y], repeated once per sample.
        single_row = np.hstack([1 - self.y_, self.y_])
        n_samples = X.shape[0]
        return np.repeat([single_row], n_samples, axis=0)