# Copyright (C) 2009  Arno Onken
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from numpy import *
from scipy.optimize import newton
from scipy.stats import kendalltau
from scipy.stats import chi2

def chi2franktest(x,y,alpha=0.05,mef=1):
    ''' Test for linear dependence structure between x and y, assuming that x and
        y are non-negative integer vectors. See Onken, Gruenewaelder, and
        Obermayer 2009, Advances in Neural Information Processing Systems 22.

        The observed contingency table of (x, y) is compared with the counts
        expected under a Frank copula model with empirical margins, using a
        Pearson chi-square statistic. The hypothesis is rejected when the
        statistic exceeds the upper (1 - alpha) chi-square quantile.

        Arguments:
         x         - Array of integer random values
         y         - Array of integer random values of the same size as x
         alpha     - Significance level (default alpha = 0.05)
         mef       - Minimum expected frequency (default mef = 1)
        Returns:
         h         - True (1) indicates rejection of the linear dependence
                     hypothesis at the specified significance level; False (0)
                     otherwise
         thresh    - chi-square threshold value, i.e. the (1 - alpha) quantile
         test_stat - Test statistic
    '''
    # Reshape into column vectors. reshape() returns a new array object, so
    # the shape of the arrays owned by the caller is left untouched.
    x = x.reshape(x.size, 1)
    y = y.reshape(y.size, 1)
    # Generate contingency table: cont[a, b] counts the samples with x == a and y == b
    cont = zeros((x.max()+1, y.max()+1), dtype='float64')
    for i in range(0, x.size):
        cont[x[i, 0], y[i, 0]] += 1
    # Fit the model: empirical margins plus the Frank parameter matching Kendall's tau
    marginfit = discempfit(concatenate((x, y), axis=1))
    tau = kendalltau(x.ravel(), y.ravel())[0]
    theta = frankparam(tau)
    # Expected counts for every cell of the contingency table
    (x2, y2) = mgrid[0:(cont.shape[0]), 0:(cont.shape[1])]
    x2 = x2.reshape(x2.size, 1)
    y2 = y2.reshape(y2.size, 1)
    cells = concatenate((x2, y2), axis=1)
    econt = dmfrankpdf(cells, marginfit, theta).reshape(cont.shape[0], cont.shape[1]) * x.size
    # Apply ordered expected-frequencies procedure (see Loukas and Kemp 1986, The Statistician):
    # visit the cells in order of decreasing expected count
    ordering = econt.ravel().argsort()[::-1]
    econt_array = econt.ravel()[ordering]
    cont_array = cont.ravel()[ordering]
    # Group according to minimum expected frequency (MEF): cells are pooled
    # until the pooled expected count reaches mef, then a new group is started
    econt_group = zeros(econt_array.shape, dtype='float64')
    cont_group = zeros(econt_array.shape, dtype='float64')
    ig = 0
    for i in range(0, econt_array.size):
        econt_group[ig] = econt_group[ig] + econt_array[i]
        cont_group[ig] = cont_group[ig] + cont_array[i]
        if econt_group[ig] >= mef:
            ig += 1
    # Keep only the completed groups; a trailing group that never reached mef is dropped
    econt_group = econt_group[0:ig]
    cont_group = cont_group[0:ig]
    # Compute test statistic for data
    test_stat = (((cont_group - econt_group)**2) / econt_group).sum()
    # Degrees of freedom; -1 for copula parameter
    df = (cont.shape[0]-1) * (cont.shape[1]-1) - 1
    # Large statistics indicate a poor fit, so the rejection region is the
    # upper tail of the chi-square distribution
    thresh = chi2.ppf(1 - alpha, df)
    h = test_stat > thresh
    return (h, thresh, test_stat)

def discempfit(x):
    ''' Fit an empirical probability mass function to each column of an
        integer-valued sample.
        Arguments:
         x - n-by-d matrix of random samples
        Returns:
         l - d-by-(m+1) matrix of pmf values, where m is the largest value in
             x; row k belongs to column k of x and column i holds the
             relative frequency of the value i, so every row sums to 1.
    '''
    # One row per dimension, one column per sample
    samples = x.transpose()
    n_dims = samples.shape[0]
    largest = samples.max()
    pmf = zeros((n_dims, largest + 1), dtype='float64')
    # Column `value` receives, per dimension, the number of samples equal to it
    for value in range(largest + 1):
        matches = (samples == value)
        pmf[:, value] = matches.sum(axis=1)
    # Turn the counts into relative frequencies, row by row
    row_totals = pmf.sum(axis=1).reshape(pmf.shape[0], 1)
    pmf /= row_totals
    return pmf

def discempcdf(x,l):
    ''' Cumulative distribution function of an empirical distribution over
        integer values.

        Values are floored before the lookup. Values below zero lie below
        the support and get probability 0; values beyond the last column of
        l get the last cumulative value of their row.

        Arguments:
         x - n-by-d matrix of random samples
         l - d-by-y matrix of PMF values; column 0 is for the zero values;
             every row should sum to 1.
        Returns:
         p - n-by-d matrix of cumulative probabilities
    '''
    # Bring x into the right format
    x = floor(x)
    below_support = x < 0
    # clip() returns a new array, so its result has to be kept; the upper
    # bound is the last valid column index of l
    idx = x.clip(min=0.0, max=l.shape[1] - 1).astype(int)
    # Cumulative sum for cumulative probabilities
    cum = l.cumsum(axis=1)
    p = zeros(idx.shape, dtype='float64')
    for j in range(0, idx.shape[1]):
        # Column j of x is looked up in the margin stored in row j of l
        p[:, j] = cum[j, idx[:, j]]
    # Nothing of the distribution's mass lies below zero
    p[below_support] = 0.0
    return p

def frankcdf(x,theta):
    """ Cumulative distribution function of the Frank copula.

         For a row u = (u_1, ..., u_d) and theta != 0 this evaluates
           C(u) = -1/theta * log(1 + prod_i(exp(-theta*u_i) - 1)
                                      / (exp(-theta) - 1)**(d-1)),
         and for theta == 0 the independence copula prod_i(u_i).

         Arguments:
          x     - n-by-d array of points in the unit cube, one point per
                  row; a 1-D array is treated as a single point
          theta - Scalar parameter of the Frank copula
         Returns:
          c     - n-by-1 array of cumulative probabilities
    """
    if len(x.shape) == 1:
        # reshape() gives a new array object; assigning to x.shape would
        # change the shape of the caller's array as a side effect
        x = x.reshape(1, x.size)
    if theta == 0:
        # Independence
        c = multiply.reduce(x, axis=1)
    else:
        # CDF of the Frank copula family. The product has d factors, so the
        # normaliser needs the power d-1; for d == 2 that is expm1(-theta).
        d = x.shape[1]
        numerator = multiply.reduce(expm1(-theta * x), axis=1)
        c = -log(1 + numerator / expm1(-theta) ** (d - 1)) / theta
    return c.reshape(c.size, 1)

def dmfrankpdf(X,l,theta):
    ''' Calculates the probability mass function for samples <X> of a Frank
        copula family based distribution with discrete margins. This function has
        exponential complexity in the number of elements.

        The probability of a point is obtained from the copula CDF with the
        inclusion-exclusion principle: the CDF is evaluated at the point and
        at every point obtained by lowering a subset of its non-zero
        coordinates by one, and the values are summed with alternating signs.

        Arguments:
         X         - Discrete sample, one point per row
         l         - marginal fit (PMF matrix as accepted by discempcdf)
         theta     - Copula parameter
        Returns:
         p         - column vector with the probability of each row of X
    '''
    X = X.transpose()
    n = X.shape[0]
    trials = X.shape[1]
    # All binary combinations in a (2**n)-by-n matrix; row k holds the binary
    # digits of k with the most significant bit in the first column
    bcomb = floor(dot(arange(0,2**n,dtype='float64').reshape(2**n,1),2 ** (arange(1-n,1,dtype='float64')).reshape(1,n))%2)
    p = zeros((trials,1),dtype='float64')
    for itrials in range(0,trials):
        x = X[:,itrials].reshape(1,n)
        # Apply the inclusion-exclusion principle
        # - but only to the subset of elements != 0
        nz = x.nonzero()[1]
        nnz = len(nz)
        if nnz > 0:
            # Compute cdf of all sub-parts: every row of x_sub lowers a
            # different subset of the non-zero coordinates by one
            x_sub = zeros((2**nnz,n),dtype='float64')
            x_sub[:,nz] = bcomb.take(range(0,2**nnz),axis=0).take(range(n-nnz,n),axis=1)
            csample = discempcdf(x-x_sub,l)
            subpart = frankcdf(csample,theta)
            # Signs of the terms: +1 when an even number of coordinates was
            # lowered, -1 otherwise
            signs = -ones((2**nnz,1),dtype='float64')
            signs[((x_sub[:,nz]).sum(axis=1) % 2) == 0] = 1
            p[itrials,0] = (signs * subpart).sum()
        else:
            # All coordinates are zero: the probability is the CDF itself
            csample = discempcdf(x,l)
            # frankcdf returns a 1-by-1 array here; pick the element
            # explicitly instead of relying on the implicit conversion of a
            # size-1 array to a scalar, which NumPy has deprecated
            p[itrials,0] = frankcdf(csample,theta)[0,0]
    return p

def frankparam(tau):
    ''' Estimates the parameter theta of the Frank copula family for which
        Kendall's tau of the Frank copula is equal to the function argument tau.

        The magnitude of theta is determined from |tau| and the sign of tau
        is applied afterwards. |tau| below sqrt(machine epsilon) maps to
        theta = 0 and |tau| >= 1 maps to an infinite theta; otherwise theta
        is found numerically with scipy.optimize.newton started at 1.

        Arguments:
         tau   - Kendall's tau of the Frank copula

        Returns:
         theta - Parameter of the Frank copula family
    '''
    tau_nn = abs(tau)
    if tau_nn < sqrt(finfo(float).eps):
        theta = 0
    elif tau_nn >= 1:
        # Perfect dependence is only reached in the limit theta -> infinity
        theta = float('inf')
    else:
        fkt_diff = lambda t: frank_kendall_tau(t) - tau_nn
        theta = newton(fkt_diff, 1)
    if tau < 0:
        theta = -theta
    return theta

def frank_kendall_tau(theta):
    ''' Calculates Kendall's tau for the Frank copula with parameter theta using
        the Debye function:
          tau(theta) = 1 - 4/theta * (1 - D_1(theta))
        The expression is odd in theta, so it is evaluated for |theta| and
        the sign of theta is applied to the result. For |theta| below
        sqrt(machine epsilon) the result is 0.

        Arguments:
         theta - Scalar parameter of the Frank copula
        Returns:
         tau   - Kendall's tau of the Frank copula with parameter theta
    '''
    magnitude = abs(theta)
    if magnitude < sqrt(finfo(float).eps):
        return 0
    tau = 1 - 4 / magnitude * (1 - debye_1(magnitude))
    if theta < 0:
        # Negative parameters describe negative dependence
        return -tau
    return tau

def debye_1(x):
    ''' Debye function D_n for n = 1
        Adopted from the GNU Scientific Library (GSL) version 1.9
        D_1(x) := 1/x Integrate[t/(e^t - 1), {t,0,x}]

        Negative arguments are handled with the reflection identity
        D_1(-x) = D_1(x) + x/2, which follows from the integral definition.

        Arguments:
         x - Scalar argument
        Returns:
         D_1(x)
    '''
    # Natural logarithm and square root of the double precision epsilon
    log_dbl_epsilon = -3.6043653389117154e+01
    sqrt_dbl_epsilon = 1.4901161193847656e-08
    # Value of the integral over [0, infinity), i.e. pi**2 / 6
    val_infinity = 1.64493406684822644
    # Beyond this argument exp(-x) no longer contributes in double precision
    xcut = 7.0839641853226408e+02
    if x < 0.0:
        # Without this branch every negative x would be fed to the small-x
        # polynomial below, which is only valid close to zero
        return debye_1(-x) - 0.5 * x
    if x < 2.0*sqrt_dbl_epsilon:
        # Leading terms of the power series around zero
        return 1.0 - 0.25*x + x*x/36.0
    if x <= 4.0:
        # Chebyshev expansion in the variable t, which covers [-1, 1] here
        t = x*x/8.0 - 1.0
        c = cheb_eval_e(t)
        return c - 0.25 * x
    if x < -(log(2) + log_dbl_epsilon):
        # Sum the exponential series backwards, from the smallest term to
        # the largest
        nexp = int(floor(xcut/x))
        ex = exp(-x)
        s = 0.0
        xk = nexp * x
        rk = float(nexp)
        for _ in range(nexp):
            s = s * ex
            s = s + (1.0 + 1.0/xk)/rk
            rk = rk - 1.0
            xk = xk - x
        return val_infinity/x - s*ex
    if x < xcut:
        # Only the first term of the exponential series is kept
        return (val_infinity - exp(-x)*(x+1.0)) / x
    return val_infinity/x

def cheb_eval_e(x):
    ''' Evaluate the order-16 Chebyshev series used by debye_1 at x.
        The series is defined on the interval [-1, 1] and is summed with a
        backward recurrence over its coefficients.
        Arguments:
         x - Scalar argument of the series
        Returns:
         Value of the series at x
    '''
    # Coefficients c_0 ... c_16 of the series
    coeffs = array([ 2.4006597190381410194,
                     0.1937213042189360089,
                    -0.62329124554895770e-02,
                     0.3511174770206480e-03,
                    -0.228222466701231e-04,
                     0.15805467875030e-05,
                    -0.1135378197072e-06,
                     0.83583361188e-08,
                    -0.6264424787e-09,
                     0.476033489e-10,
                    -0.36574154e-11,
                     0.2835431e-12,
                    -0.221473e-13,
                     0.17409e-14,
                    -0.1376e-15,
                     0.109e-16,
                    -0.9e-18])
    highest = 16
    lower = -1.0
    upper = 1.0
    # Map x from [lower, upper] onto the reference interval of the series
    y = (2*x - lower - upper) / (upper - lower)
    twice_y = 2 * y
    current = 0
    previous = 0
    # Walk the coefficients from c_16 down to c_1
    for coeff in coeffs[highest:0:-1]:
        previous, current = current, twice_y*current - previous + coeff
    # The constant coefficient enters with weight one half
    return y*current - previous + 0.5 * coeffs[0]

