##
# Description: This file implements the horseshoe bayesian model for feature selection 
#              
# Author: Daniel Hernández Lobato
#
# Year: 2012
#
#

##
# Function: horseShoeEPinternal
#
# Author: Daniel Hernández-Lobato
#
# Description: Trains a horse shoe bayesian model on the data 
#
# Arguments: 
#		X -> a matrix containing the attributes of the training data (one row per sample)
#		Y -> a vector with the targets
#		sigma2 -> Variance of the noise (in log scale)
#		Zu -> Pseudo-vectors of the correlation matrix for the u's.
#		Zv -> Pseudo-vectors of the correlation matrix for the v's. (typically both are the same)
#		cu -> Constant that multiplies the correlation matrix of the u's (in log scale)
#		cv -> Constant that multiplies the correlation matrix of the v's (in log scale)
#		initializeApproximation -> approximation used to initialize EP
#
# Returns:	Posterior Approximation
#

horseShoeEPinternal <- function(X, Y, sigma2 = log(.1),  
	Zu = matrix(0, nrow(X), ncol(X)), Zv = matrix(0, nrow(X), ncol(X)), cu = 0, cv = 0, 
	initialApprox = NULL, damping = 0.5) {

	# We initialize the posterior approximation

	a <- initializeApproximation(X, Y, sigma2, Zu, Zv, cu, cv)

	if (! is.null(initialApprox)) {
		a$f1Hat <- initialApprox$f1Hat
	}

	iters <- 0
	convergence <- FALSE

	while (iters < 1000 && !convergence) {

		aOld <- a

		aNew <- processPriorTermsFastCImplementationSameCode(a, damping)

		dampingAux <- damping

		while (! verifyPositveDefinitenessInner(aNew)) {
			dampingAux <- dampingAux * 0.5
#			cat("InnerLoop: Reducing damping for positive-definiteness!", "dampingAux:", dampingAux, "\n")
			aNew <- processPriorTermsFastCImplementationSameCode(a, dampingAux)
		}

		a <- aNew
	
		# We look for convergence in the EP algorithm

		convergence <- checkConvergence(aOld, a)

		iters <- iters + 1
		damping <- damping * 0.99

#		cat("Damping:", damping, "\n")
	}

	# We evaluate the models evidence

	logZ <- computeEvidence(a)

	# We compute an updated predictie distribution

	ret <- computeTitledDistribution(a) 

	gradInfo <- computeGradientEvidence(a)

	# We return the model evidence, the posterior approximation (means and variances)
	# and the gradient information for computing the gradients of the log model evidence

	list(a = a, logZ = logZ, mW = ret$mW, gradInfo = gradInfo, vV = ret$vV, vU = ret$vU, vW = ret$vW)
}

##
# Function: initializeApproximation
#
# Author: Daniel Hernández-Lobato
#
# Description: initializes the posterior approximation
#
# Arguments: 
#		X -> a list containing the attributes of the training data (one row per sample)
#		Y -> a list with the targets
#		sigma2 -> Variance of the noise (in log scale)
#		Zu -> Pseudo-vectors of the covariance matrix for the u's.
#		Zv -> Pseudo-vectors of the covariance matrix for the v's.
#		cu -> Constant that multiplies the covariance matrix of the u's (in log scale)
#		cv -> Constant that multiplies the covariance matrix of the v's (in log scale)
#
# Returns: 	The posterior approximation
#

initializeApproximation <- function(X, Y, sigma2, Zu, Zv, cu, cv) {

        # We set some useful constants. This is the number of tasks

	n <- nrow(X) 
	d <- ncol(X)

	# We define the paramters of the terms corresponding to the prior (Natural Parameters)
	# We initialize the variance of the prior to something flat

	eta1w <- eta2w <- eta2u <- eta2v <- rep(1e-3, d)
	eta1w <- eta1w * 0
	etas <- rep(0, d)

	f1Hat <- list(eta1w = eta1w, eta2w = eta2w, eta2u = eta2u, eta2v = eta2v, etas = etas)

	# This is some constant added to the diagonal of the covariance matrix to guarantee
	# positive definiteness

	diagC <- rep(exp(-6), d) 

	# We compute the l2 norm of each vector + the constant

	varsCu <- apply(Zu, 2, function(x) sum(x^2)) + diagC
	varsCv <- apply(Zv, 2, function(x) sum(x^2)) + diagC
	kappaU <- sqrt(varsCu)^-1 * sqrt(exp(cu))
	kappaV <- sqrt(varsCv)^-1 * sqrt(exp(cv))

	# We initialize the level of noise

	sigma2 <- exp(sigma2)

	list(X = X, Y = Y, sigma2 = sigma2, f1Hat = f1Hat, n = n, d = d, 
		cu = exp(cu), cv = exp(cv), diagC = diagC, kappaU = kappaU, kappaV = kappaV, 
		varsCu = varsCu, varsCv = varsCv, Zu = Zu, Zv = Zv)
}

##
# Function which rotates a matrix 180 degreees.
#

rot180 <- function(M) {
	matrix(rev(as.double(M)), nrow(M), ncol(M))
}

##
# Function which computes the cholesky decomposition of the inverse
# of a particular matrix.
#
# @param	M	m x m positive definite matrix.
#
# @return	L	m x m upper triangular matrix such that
#			M^-1 = L %*% t(L)
#

cholInverse <- function(M) {
	rot180(forwardsolve(t(chol(rot180(M))), diag(nrow(M))))
}

##
# Function: computeTitledDistribution
#
# Author: Daniel Hernández-Lobato
#
# Description: Computes the marginals of an updated posterior distribution
#
# Arguments: 
#		a : Posteiror approximation
#
# Returns: 	The updated posterior approximation
#

computeTitledDistribution <- function(a) {

	# We first compute the marginals of the weights and then, the marginals of the latent variables u and v

	lambda <- a$f1Hat$eta2w^-1
	lambdaX <- matrix(lambda, a$n, a$d, byrow = TRUE) * a$X
	eta <- (t(a$X) %*% a$Y / a$sigma2 + a$f1Hat$eta1w)
	
	inverseMatrix <- solve((diag(a$n) * a$sigma2 + a$X %*% t(lambdaX)))

	vW <- lambda - colSums((inverseMatrix %*% lambdaX) * lambdaX)
	mW <- lambda * eta - t(lambdaX) %*% inverseMatrix %*% (lambdaX %*% eta)

	# For u and v we only compute variances because the means are always 0
	# Each prior is ADA + AZ^T ZA, where A contains the normalization constant
	# and the proportionality constant
	
	DnewU <- a$diagC * a$kappaU^2 / (1 + (a$diagC * a$kappaU^2) * a$f1Hat$eta2u)
	PnewU <- matrix(1 / (1 + (a$diagC * a$kappaU^2) * a$f1Hat$eta2u) * a$kappaU, nrow(a$Zu), ncol(a$Zu), byrow = TRUE) * a$Zu

#	tryCatch(
#	RnewU <- cholInverse(diag(nrow(a$Zu)) + a$Zu %*% 
#		(matrix(1 / (a$f1Hat$eta2u^-1 * a$kappaU^-2  + a$diagC), ncol(a$Zu), nrow(a$Zu)) * t(a$Zu)))
#	, error = function(x) browser())

#	RnewUPnewU <- RnewU %*% PnewU
#	vU <- DnewU + colSums(RnewUPnewU^2)

	MnewU <- solve(diag(nrow(a$Zu)) + a$Zu %*% (matrix(1 / (a$f1Hat$eta2u^-1 * a$kappaU^-2  + a$diagC), ncol(a$Zu), nrow(a$Zu)) * t(a$Zu)))
	vU <- DnewU + colSums((MnewU %*% PnewU) * PnewU)

	DnewV <- a$diagC * a$kappaV^2 / (1 + (a$diagC * a$kappaV^2) * a$f1Hat$eta2v)
	PnewV <- matrix(1 / (1 + (a$diagC * a$kappaV^2) * a$f1Hat$eta2v) * a$kappaV, nrow(a$Zv), ncol(a$Zv), byrow = TRUE) * a$Zv

#	tryCatch(
#	RnewV <- cholInverse(diag(nrow(a$Zv)) + a$Zv %*% 
#		(matrix(1 / (a$f1Hat$eta2v^-1 * a$kappaV^-2  + a$diagC), ncol(a$Zv), nrow(a$Zv)) * t(a$Zv)))
#	, error  = function(x) browser())

#	RnewUPnewV <- RnewV %*% PnewV
#	vV <- DnewV + colSums(RnewUPnewV^2)

	MnewV <- solve(diag(nrow(a$Zv)) + a$Zv %*% (matrix(1 / (a$f1Hat$eta2v^-1 * a$kappaV^-2  + a$diagC), ncol(a$Zv), nrow(a$Zv)) * t(a$Zv)))
	vV <- DnewV + colSums((MnewV %*% PnewV) * PnewV)

	list(mW = mW, vW = vW, vU = vU, vV = vV, DnewU = DnewU, PnewU = PnewU, DnewV = DnewV, PnewV = PnewV, MnewU = MnewU, MnewV = MnewV)
}

##
# Function: processPriorTerms
#
# Author: Daniel Hernández-Lobato
#
# Description: process the terms of the prior
#
# Arguments: 
#		a -> Posterior approximation
#		damping -> Damping factor to improve convergence. 
#
# Returns: 	The posterior approximation
#

processPriorTerms <- function(a, damping) {

	# First we compute an updated posterior distribution

	ret <- computeTitledDistribution(a)

	# We get an old posterior distribution

	vWOld <- (ret$vW^-1 - a$f1Hat$eta2w)^-1
	mWOld <- vWOld * (ret$vW^-1 * ret$mW - a$f1Hat$eta1w)

	vUOld <- (ret$vU^-1 - a$f1Hat$eta2u)^-1
	vVOld <- (ret$vV^-1 - a$f1Hat$eta2v)^-1

	# We do the computations for each different component

	for (d in 1 : a$d) {

		if (vWOld[ d ] < 0 || vVOld[ d ] < 0 || vUOld[ d ] < 0) {
			next
		}

		# We compute the moments of the different latent variables using quadrature
		# The first moments u and v are zero, thus, they need not be computed

		tryCatch(
		Z <- integrate(function(ratio) {
			dnorm(mWOld[ d ], 0, sqrt(vWOld[ d ] + ratio^2)) * 1 / (pi * (vUOld[ d ] / vVOld[ d ] + ratio^2 )) 
			}, 0, +Inf, rel.tol = 1e-10, stop.on.error = FALSE)$value * 2 * sqrt(vUOld[ d ] / vVOld[ d ])
		, error = function(x) browser())

		if (is.na(Z) || ! is.finite(Z) || Z == 0) {
			next
		}

		tryCatch(
		dlogZdvUOld <-  integrate(function(ratio) {
			dnorm(mWOld[ d ], 0, sqrt(vWOld[ d ] + ratio^2)) * - 1 / (pi * (vUOld[ d ] / vVOld[ d ] + ratio^2 ))^2 * pi / vVOld[ d ]
			}, 0, +Inf, rel.tol = 1e-10, stop.on.error = FALSE)$value * 2 * sqrt(vUOld[ d ] / vVOld[ d ]) * Z^-1 + 0.5 / vUOld[ d ]
		, error = function(x) browser())

		dlogZdvUOldTimesvUOld <- vUOld[ d ] * dlogZdvUOld
		dlogZdvVOldTimesvVOld <- - dlogZdvUOldTimesvUOld

		tryCatch(
		Ew2 <- integrate(function(ratio) {
			dnorm(mWOld[ d ], 0, sqrt(vWOld[ d ] + ratio^2)) * 1 / (pi * (vUOld[ d ] / vVOld[ d ] + ratio^2 )) *
			(((ratio^-2 + vWOld[ d ]^-1)^-1 * mWOld[ d ] / vWOld[ d ])^2 + (ratio^-2 + vWOld[ d ]^-1)^-1)
		}, 0, +Inf, rel.tol = 1e-10, stop.on.error = FALSE)$value / Z * 2 * sqrt(vUOld[ d ] / vVOld[ d ])
		, error = function(x) browser())

		tryCatch(
		Ew <- integrate(function(ratio) {
			dnorm(mWOld[ d ], 0, sqrt(vWOld[ d ] + ratio^2)) * 1 / (pi * (vUOld[ d ] / vVOld[ d ] + ratio^2 )) *
			(ratio^-2 + vWOld[ d ]^-1)^-1  * mWOld[ d ] * vWOld[ d ]^-1
		}, 0, Inf, rel.tol = 1e-10, stop.on.error = FALSE)$value / Z * 2  * sqrt(vUOld[ d ] / vVOld[ d ])
		, error = function(x) browser())

		# This comes from DHL PhD Thesis (A.41)

#		Eu2 <- vUOld[ d ] * (1 + vUOld[ d ] * 2 * dlogZdvUOld)
#		Ev2 <- vVOld[ d ] * (1 + vVOld[ d ] * 2 * dlogZdvVOld)

		Eu2 <- vUOld[ d ] * (1 + 2 * dlogZdvUOldTimesvUOld)
		Ev2 <- vVOld[ d ] * (1 + 2 * dlogZdvVOldTimesvVOld)

		# We compute the updates

		eta2wNew <- (Ew2 - Ew^2)^-1 - vWOld[ d ]^-1
		eta1wNew <- ((Ew2 - Ew^2)^-1 * Ew - vWOld[ d ]^-1 * mWOld[ d ])

		eta2uNew <- Eu2^-1 - vUOld[ d ]^-1
		eta2vNew <- Ev2^-1 - vVOld[ d ]^-1

#		eta2uNew <- min(eta2uNew, 1e3)
#		eta2vNew <- min(eta2vNew, 1e3)

		# We do the actual update

		a$f1Hat$eta2w[ d ] <- damping * eta2wNew + (1 - damping) * a$f1Hat$eta2w[ d ]
		a$f1Hat$eta1w[ d ] <- damping * eta1wNew + (1 - damping) * a$f1Hat$eta1w[ d ]
		a$f1Hat$eta2u[ d ] <- damping * eta2uNew + (1 - damping) * a$f1Hat$eta2u[ d ]
		a$f1Hat$eta2v[ d ] <- damping * eta2vNew + (1 - damping) * a$f1Hat$eta2v[ d ]

		# We compute the normalization constant

		value <- log(Z)
		value <- value - (0.5 * log(ret$vW[ d ]) - 0.5 * log(vWOld[ d ]) - 0.5 * 
			(mWOld[ d ]^2 / vWOld[ d ] + a$f1Hat$eta1w[ d ]^2 / a$f1Hat$eta2w[ d ] - ret$mW[ d ]^2 / ret$vW[ d ]))
		value <- value - (0.5 * log(ret$vU[ d ]) - 0.5 * log(vUOld[ d ]))
		value <- value - (0.5 * log(ret$vV[ d ]) - 0.5 * log(vVOld[ d ]))
	
		a$f1Hat$etas[ d ] <- value
	}

	a
}

##
# Function: processPriorTermsFastCImplementation
#
# Author: Daniel Hernández-Lobato
#
# Description: process the terms of the prior
#
# Arguments: 
#		a -> Posterior approximation
#		damping -> Damping factor to improve convergence. 
#
# Returns: 	The posterior approximation
#

processPriorTermsFastCImplementation <- function(a, damping) {

	# First we compute an updated posterior distribution

	ret <- computeTitledDistribution(a)

	# We get an old posterior distribution

	vWOld <- (ret$vW^-1 - a$f1Hat$eta2w)^-1
	mWOld <- vWOld * (ret$vW^-1 * ret$mW - a$f1Hat$eta1w)

	vUOld <- (ret$vU^-1 - a$f1Hat$eta2u)^-1
	vVOld <- (ret$vV^-1 - a$f1Hat$eta2v)^-1

	# We build the auxiliary list which used to pass the arguments
	# to compute the normalization constant and the derivatives

	zeroes <- rep(1.0, a$d)

	m <- list(mWOld = mWOld, vWOld = vWOld, vUOld = vUOld, vVOld =vVOld, Z = zeroes, 
		dZdmWOld = zeroes, dZdvWOld = zeroes, dZdvUOld = zeroes, dZdvVOld = zeroes, 
		computed = as.integer(rep(0, a$d)), d = as.integer(a$d))

#	m <- .Call("computeNormalizationConstantAndMoments", m, dup = F)
#	m <- .Call("computeNormalizationConstantAndMomentsGSL", m, dup = F)
	m <- .Call("computeNormalizationConstantAndMomentsGSLnoTransform", m, dup = F)

	Z <- m$Z
	dZdmWOld <- m$dZdmWOld
	dZdvWOld <- m$dZdvWOld
	dZdvUOld <- m$dZdvUOld
	dZdvVOld <- m$dZdvVOld
	computed <- m$computed

#	Z <- rep(0, a$d)
#	dZdmWOld <- rep(0, a$d)
#	dZdvWOld <- rep(0, a$d)
#	dZdvUOld <- rep(0, a$d)
#	dZdvVOld <- rep(0, a$d)
#	computed <- as.integer(rep(0, a$d))

#	.C("computeNormalizationConstantAndMomentsLowLevel", mWOld, vWOld, vUOld, vVOld, Z, dZdmWOld,
#		dZdvWOld, dZdvUOld, dZdvVOld, computed, as.integer(a$d), DUP = FALSE)

	# We do the computations for each different component

	Ew <- mWOld + vWOld * dZdmWOld / Z
	Ew2 <- vWOld - vWOld * ((dZdmWOld / Z)^2 - 2 * dZdvWOld / Z) * vWOld + Ew^2
	Eu2 <- vUOld - vUOld * (0 - 2 * dZdvUOld / Z) * vUOld
	Ev2 <- vVOld - vVOld * (0 - 2 * dZdvVOld / Z) * vVOld

	# We compute the updates

	eta2wNew <- (Ew2 - Ew^2)^-1 - vWOld^-1
	eta1wNew <- ((Ew2 - Ew^2)^-1 * Ew - vWOld^-1 * mWOld)

	eta2uNew <- Eu2^-1 - vUOld^-1
	eta2vNew <- Ev2^-1 - vVOld^-1

	# We do the actual update

	a$f1Hat$eta2w[ computed != 0 ] <- (damping * eta2wNew + (1 - damping) * a$f1Hat$eta2w)[ computed != 0 ]
	a$f1Hat$eta1w[ computed != 0 ] <- (damping * eta1wNew + (1 - damping) * a$f1Hat$eta1w)[ computed != 0 ]
	a$f1Hat$eta2u[ computed != 0 ] <- (damping * eta2uNew + (1 - damping) * a$f1Hat$eta2u)[ computed != 0 ]
	a$f1Hat$eta2v[ computed != 0 ] <- (damping * eta2vNew + (1 - damping) * a$f1Hat$eta2v)[ computed != 0 ]

	# We compute the normalization constant

	vWOld[ vWOld <= 0 ] <- 1
	vUOld[ vUOld <= 0 ] <- 1
	vVOld[ vVOld <= 0 ] <- 1

	value <- log(Z)
	value <- value - (0.5 * log(ret$vW) - 0.5 * log(vWOld) - 0.5 * (mWOld^2 / vWOld + a$f1Hat$eta1w^2 / a$f1Hat$eta2w - ret$mW^2 / ret$vW))
	value <- value - (0.5 * log(ret$vU) - 0.5 * log(vUOld))
	value <- value - (0.5 * log(ret$vV) - 0.5 * log(vVOld))

	a$f1Hat$etas[ computed != 0 ] <- value[ computed != 0 ]

	a
}


##
# Function: processPriorTermsFast
#
# Author: Daniel Hernández-Lobato
#
# Description: process the terms of the prior
#
# Arguments: 
#		a -> Posterior approximation
#		damping -> Damping factor to improve convergence. 
#
# Returns: 	The posterior approximation
#

processPriorTermsFast <- function(a, damping) {

	# First we compute an updated posterior distribution

	ret <- computeTitledDistribution(a)

	# We get an old posterior distribution

	vWOld <- (ret$vW^-1 - a$f1Hat$eta2w)^-1
	mWOld <- vWOld * (ret$vW^-1 * ret$mW - a$f1Hat$eta1w)

	vUOld <- (ret$vU^-1 - a$f1Hat$eta2u)^-1
	vVOld <- (ret$vV^-1 - a$f1Hat$eta2v)^-1

	# We do the computations for each different component

	for (d in 1 : a$d) {

		if (vWOld[ d ] < 0 || vVOld[ d ] < 0 || vUOld[ d ] < 0) {
			next
		}

		retInt <- comptueFastDerivatives(mWOld[ d ], vWOld[ d ], vUOld[ d ], vVOld[ d ])

		Z <- retInt$Z

		if (is.na(Z) || ! is.finite(Z) || Z == 0) {
			next
		}

		Ew <- mWOld[ d ] + vWOld[ d ] * retInt$dZdmWOld / Z
		Ew2 <- vWOld[ d ] - vWOld[ d ] * ((retInt$dZdmWOld / Z)^2 - 2 * retInt$dZdvWOld / Z) * vWOld[ d ] + Ew^2
		Eu2 <- vUOld[ d ] - vUOld[ d ] * (0 - 2 * retInt$dZdvUOld / Z) * vUOld[ d ]
		Ev2 <- vVOld[ d ] - vVOld[ d ] * (0 - 2 * retInt$dZdvVOld / Z) * vVOld[ d ]

		# We compute the updates

		eta2wNew <- (Ew2 - Ew^2)^-1 - vWOld[ d ]^-1
		eta1wNew <- ((Ew2 - Ew^2)^-1 * Ew - vWOld[ d ]^-1 * mWOld[ d ])

		eta2uNew <- Eu2^-1 - vUOld[ d ]^-1
		eta2vNew <- Ev2^-1 - vVOld[ d ]^-1

		# We do the actual update

		a$f1Hat$eta2w[ d ] <- damping * eta2wNew + (1 - damping) * a$f1Hat$eta2w[ d ]
		a$f1Hat$eta1w[ d ] <- damping * eta1wNew + (1 - damping) * a$f1Hat$eta1w[ d ]
		a$f1Hat$eta2u[ d ] <- damping * eta2uNew + (1 - damping) * a$f1Hat$eta2u[ d ]
		a$f1Hat$eta2v[ d ] <- damping * eta2vNew + (1 - damping) * a$f1Hat$eta2v[ d ]

		# We compute the normalization constant

		value <- log(Z)
		value <- value - (0.5 * log(ret$vW[ d ]) - 0.5 * log(vWOld[ d ]) - 0.5 * 
			(mWOld[ d ]^2 / vWOld[ d ] + a$f1Hat$eta1w[ d ]^2 / a$f1Hat$eta2w[ d ] - ret$mW[ d ]^2 / ret$vW[ d ]))
		value <- value - (0.5 * log(ret$vU[ d ]) - 0.5 * log(vUOld[ d ]))
		value <- value - (0.5 * log(ret$vV[ d ]) - 0.5 * log(vVOld[ d ]))
	
		a$f1Hat$etas[ d ] <- value
	}

	a
}

##
# Function: processPriorTermsFastCImplementationSameCode
#
# Author: Daniel Hernández-Lobato
#
# Description: process the terms of the prior
#
# Arguments: 
#		a -> Posterior approximation
#		damping -> Damping factor to improve convergence. 
#
# Returns: 	The posterior approximation
#

processPriorTermsFastCImplementationSameCode <- function(a, damping) {

	# First we compute an updated posterior distribution

	ret <- computeTitledDistribution(a)

	# We get an old posterior distribution

	vWOld <- (ret$vW^-1 - a$f1Hat$eta2w)^-1
	mWOld <- vWOld * (ret$vW^-1 * ret$mW - a$f1Hat$eta1w)

	vUOld <- (ret$vU^-1 - a$f1Hat$eta2u)^-1
	vVOld <- (ret$vV^-1 - a$f1Hat$eta2v)^-1

	# We build the auxiliary list which used to pass the arguments
	# to compute the normalization constant and the derivatives

	zeroes <- rep(1.0, a$d)

	m <- list(mWOld = mWOld, vWOld = vWOld, vUOld = vUOld, vVOld =vVOld, Z = zeroes, 
		Ew = zeroes, Ew2 = zeroes, dlogZdvUOld = zeroes, 
		computed = as.integer(rep(0, a$d)), d = as.integer(a$d))

	m <- .Call("computeNormalizationConstantAndMomentsGSLnoTransformSameRCodeInternalCall", m, dup = F)

	Z <- m$Z
	Ew <- m$Ew
	Ew2 <- m$Ew2
	dlogZdvUOld <- m$dlogZdvUOld
	computed <- m$computed

	computed[ is.na(Z) | ! is.finite(Z) | Z == 0 ]  <- 0

	# We do the computations for each different component

	dlogZdvUOldTimesvUOld <- vUOld * dlogZdvUOld
	dlogZdvVOldTimesvVOld <- - dlogZdvUOldTimesvUOld

	Eu2 <- vUOld * (1 + 2 * dlogZdvUOldTimesvUOld)
	Ev2 <- vVOld * (1 + 2 * dlogZdvVOldTimesvVOld)

	# We compute the updates

	eta2wNew <- (Ew2 - Ew^2)^-1 - vWOld^-1
	eta1wNew <- ((Ew2 - Ew^2)^-1 * Ew - vWOld^-1 * mWOld)

	eta2uNew <- Eu2^-1 - vUOld^-1
	eta2vNew <- Ev2^-1 - vVOld^-1

	computed[ is.na(eta2uNew) | is.na(eta2vNew) | is.na(eta2wNew) | is.na(eta1wNew) ] <- 0
	computed[ is.nan(eta2uNew) | is.nan(eta2vNew) | is.nan(eta2wNew) | is.nan(eta1wNew) ] <- 0
	computed[ ! is.finite(eta2uNew) | ! is.finite(eta2vNew) | ! is.finite(eta2wNew) | ! is.finite(eta1wNew) ] <- 0

	# We do the actual update

	a$f1Hat$eta2w[ computed != 0 ] <- (damping * eta2wNew + (1 - damping) * a$f1Hat$eta2w)[ computed != 0 ]
	a$f1Hat$eta1w[ computed != 0 ] <- (damping * eta1wNew + (1 - damping) * a$f1Hat$eta1w)[ computed != 0 ]
	a$f1Hat$eta2u[ computed != 0 ] <- (damping * eta2uNew + (1 - damping) * a$f1Hat$eta2u)[ computed != 0 ]
	a$f1Hat$eta2v[ computed != 0 ] <- (damping * eta2vNew + (1 - damping) * a$f1Hat$eta2v)[ computed != 0 ]

	# We compute the normalization constant

	vWOld[ vWOld <= 0 ] <- 1
	vUOld[ vUOld <= 0 ] <- 1
	vVOld[ vVOld <= 0 ] <- 1

	value <- log(Z)
	value <- value - (0.5 * log(ret$vW) - 0.5 * log(vWOld) - 0.5 * (mWOld^2 / vWOld + a$f1Hat$eta1w^2 / a$f1Hat$eta2w - ret$mW^2 / ret$vW))
	value <- value - (0.5 * log(ret$vU) - 0.5 * log(vUOld))
	value <- value - (0.5 * log(ret$vV) - 0.5 * log(vVOld))

	a$f1Hat$etas[ computed != 0 ] <- value[ computed != 0 ]

	if (any(is.nan(a$f1Hat$eta2v))) browser()

	a
}


##
# Function: comptueFastDerivatives
#
# Author: Daniel Hernández-Lobato
#
# Description: computes the normalization constant Z using Newton Cotes formulas
#
# Arguments: 
#		Parameters of the old approximation
#		mWOld
#		vWOld
#		vUOld
#		vVOld
#
# Returns: 	Value of the normalization constant
#

comptueFastDerivatives <- function(mWOld, vWOld, vUOld, vVOld) {
	
	integrateFast <- function(f, minx, maxx) {

	        x <- seq(minx, maxx, length.out = 401)
	        h <- x[ 2 ] - x[ 1 ]

	        # We evaluate the function to be integrated
	
	        fvalues1 <- f(x)
	        fvalues2 <- t.default(matrix(fvalues1[ 1 : (length(fvalues1) - 1) ], ncol = length(x) / 4, nrow = 4))
	        fvalues3 <- cbind(fvalues2, c(fvalues2[ 2 : nrow(fvalues2) , 1 ], fvalues1[ length(fvalues1) ]))
	        aux <- fvalues3 %*% c(7, 32, 12, 32, 7) * 2 * h / 45
	        x2 <- seq(x[ 5 ], x[ length(x) ], 4 * h)
	        sum(aux)
	}

	f <- function(ratio) {
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) 
	}

	f_interval <- function(x) f(tan(pi * (x - 0.5))) * (1 + tan(pi * (x - 0.5))^2) * pi
	
	Z <- integrateFast(f_interval, 0, 1) 
#	tryCatch(
#	Z <- integrate(f_interval, 0, 1, stop.on.error = FALSE, rel.tol = 1e-10)$value
#	, error = function(x) browser())

	f <- function(ratio) {
		- mWOld / (vWOld + vUOld / vVOld * ratio^2) * 
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) 
	}

	f_interval <- function(x) f(tan(pi * (x - 0.5))) * (1 + tan(pi * (x - 0.5))^2) * pi

	dZdmWOld <- integrateFast(f_interval, 0, 1) 
#	tryCatch(
#	dZdmWOld <- integrate(f_interval, 0, 1, stop.on.error = FALSE, rel.tol = 1e-10)$value 
#	, error = function(x) browser())

	f <- function(ratio) {
		- 0.5 * (2 * pi * (vWOld + vUOld / vVOld * ratio^2))^(-1) * 2 * pi *
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) +
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) *
		0.5 * mWOld^2 / (vWOld + vUOld / vVOld * ratio^2)^2
	}

	f_interval <- function(x) f(tan(pi * (x - 0.5))) * (1 + tan(pi * (x - 0.5))^2) * pi

	dZdvWOld <- integrateFast(f_interval, 0, 1)
#	tryCatch(
#	dZdvWOld <- integrate(f_interval, 0, 1, stop.on.error = FALSE, rel.tol = 1e-10)$value
#	, error = function(x) browser())
	
	f <- function(ratio) {
		- 0.5 * (2 * pi * (vWOld + vUOld / vVOld * ratio^2))^(-1) * 2 * pi * ratio^2 / vVOld * 
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) +
		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) *
		0.5 * mWOld^2 / (vWOld + vUOld / vVOld * ratio^2)^2 * ratio^2 / vVOld
	}

	f_interval <- function(x) f(tan(pi * (x - 0.5))) * (1 + tan(pi * (x - 0.5))^2) * pi

	dZdvUOld <- integrateFast(f_interval, 0, 1)
#	tryCatch(
#	dZdvUOld <- integrate(f_interval, 0, 1, stop.on.error = FALSE, rel.tol = 1e-10)$value
#	, error = function(x) browser())

#	f <- function(ratio) {
#		0.5 * (2 * pi * (vWOld + vUOld / vVOld * ratio^2))^(-1) * 2 * pi * ratio^2 * vUOld / vVOld^2 * 
#		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) +
#		dnorm(mWOld, 0, sqrt(vWOld + vUOld / vVOld * ratio^2)) * 1 / (pi * (1 + ratio^2 )) *
#		- 0.5 * mWOld^2 / (vWOld + vUOld / vVOld * ratio^2)^2 * ratio^2 * vUOld / vVOld^2
#	}

#	f_interval <- function(x) f(tan(pi * (x - 0.5))) * (1 + tan(pi * (x - 0.5))^2) * pi
	
#	dZdvVOld <- integrateFast(f_interval, 0, 1)

	dZdvVOld <- - dZdvUOld * vUOld / vVOld

	list(Z = Z, dZdmWOld = dZdmWOld, dZdvWOld = dZdvWOld, dZdvUOld = dZdvUOld, dZdvVOld = dZdvVOld)
}


##
# Function: checkConvergence
#
# Author: Daniel Hernández-Lobato
#
# Description: looks for convergence in the EP algorithm
#
# Arguments: 
#		aOld -> posterior approximation
#		aNew -> posterior approximation
#
# Returns: 	A boolean indicating convergence
#

checkConvergence <- function(aOld, aNew) {

	# We evaluate the maximum change within the posterior approximation

	maxChange <- 0

	maxChange <- max(maxChange, abs(aNew$f1Hat$eta2w - aOld$f1Hat$eta2w) / abs(aOld$f1Hat$eta2w))
	maxChange <- max(maxChange, abs(aNew$f1Hat$eta1w - aOld$f1Hat$eta1w) / abs(aOld$f1Hat$eta1w))
	maxChange <- max(maxChange, abs(aNew$f1Hat$eta2u - aOld$f1Hat$eta2u) / abs(aOld$f1Hat$eta2u))
	maxChange <- max(maxChange, abs(aNew$f1Hat$eta2v - aOld$f1Hat$eta2v) / abs(aOld$f1Hat$eta2v))

#	retNew <- computeTitledDistribution(aNew)
#	retOld <- computeTitledDistribution(aOld)

#	maxChange <- 0

#	for (i in 1 : aOld$nTask) {
#		maxChange <- max(maxChange, abs(aNew$f1Hat$eta2w[[ i ]] - aOld$f1Hat$eta2w[[ i ]]) / abs(aOld$f1Hat$eta2w[[ i ]]))
#		maxChange <- max(maxChange, abs(aNew$f1Hat$eta1w[[ i ]] - aOld$f1Hat$eta1w[[ i ]]) / abs(aOld$f1Hat$eta1w[[ i ]]))
#		maxChange <- max(maxChange, abs(aNew$f1Hat$eta2u[[ i ]] - aOld$f1Hat$eta2u[[ i ]]) / abs(aOld$f1Hat$eta2u[[ i ]]))
#		maxChange <- max(maxChange, abs(aNew$f1Hat$eta2v[[ i ]] - aOld$f1Hat$eta2v[[ i ]]) / abs(aOld$f1Hat$eta2v[[ i ]]))
#	}
	
#	maxChange <- max(maxChange, abs(retNew$mW - retOld$mW))
#	maxChange <- max(maxChange, abs(retNew$vW - retOld$vW))
#	maxChange <- max(maxChange, abs(retNew$vV - retOld$vV))
#	maxChange <- max(maxChange, abs(retNew$vU - retOld$vU))


#	cat("EP: max change", maxChange, "\n")

#	if (maxChange < 1e-4)
	if (maxChange < 1e-2)
		TRUE
	else
		FALSE
}

##
# Function: computeEvidence
#
# Author: Daniel Hernández-Lobato
#
# Description: evaluates the models evidence provided by the EP algorithm
#
# Arguments: 
#		a -> Posterior approximation
#
# Returns: 	Log evidence of the model
#

computeEvidence <- function(a) {

	# XXX: This is computed as in the paper about group feature selection
	# We use systematically (A.45) from DHL Thesis.

	# We compute the normalization constant of each approximate factor

	# Each approximate factor is:
	# exp(-0.5 * (w - a$f1Hat$eta1 / a$f1Hat$eta2w)^2 * a$f1Hat$eta2w) * 
	# exp(-0.5 * u^2 * a$f1Hat$eta2u) * exp(-0.5 * u^2 * a$f1Hat$eta2v)

	# First we compute an updated posterior distribution

	ret <- computeTitledDistribution(a)

	# We add the contributions from the constants that multiply each approximate factor

	logZ <- sum(a$f1Hat$etas)

	# We add the normalization factor of the likelihood

	logZ <- logZ - a$n / 2 * log(2 * pi * a$sigma2) - 1 / (2 * a$sigma2) * sum(a$Y^2)

	# We add the normalization terms that appear as a consequence of integrating the approximate factors of w

	logZ <- logZ + a$d / 2 * log(2 * pi) 

	upsilon <- 1 / a$sigma2 * t(a$X) %*% a$Y + a$f1Hat$eta1w

	lambda <- a$f1Hat$eta2w^-1
	lambdaX <- matrix(lambda, a$n, a$d, byrow = TRUE) * a$X
	
	inverseMatrix <- solve(diag(a$n) * a$sigma2 + a$X %*% t(lambdaX))

	logZ <- logZ - 0.5 * sum(a$f1Hat$eta1w^2 / a$f1Hat$eta2w)

	logZ <- logZ + 0.5 * sum(sum(upsilon^2 * lambda) - sum(t(lambdaX %*% upsilon) %*% inverseMatrix %*% (lambdaX %*% upsilon)))

	logZ <- logZ - 0.5 * determinant(diag(a$n) + 1 / a$sigma2 * a$X %*% (t(a$X) * matrix(a$f1Hat$eta2w^-1, a$d, a$n)))$modulus[[ 1 ]]
	logZ <- logZ - 0.5 * sum(log(abs(a$f1Hat$eta2w)))

	# We add the normalization term that appears as a consequence of integrating the approximate factors of u

	logZ <- logZ + 0.5 * sum(log(ret$DnewU)) 
#	logZ <- logZ + 0.5 * determinant(diag(nrow(a$Zu)) + (ret$RnewU %*% ret$PnewU) %*% (t(ret$RnewU %*% ret$PnewU) * 
#		matrix(ret$DnewU^-1, ncol(a$Zu), nrow(a$Zu))))$modulus[[ 1 ]]
	logZ <- logZ + 0.5 * determinant(diag(nrow(a$Zu)) + (ret$MnewU %*% ret$PnewU) %*% (t(ret$PnewU) * 
		matrix(ret$DnewU^-1, ncol(a$Zu), nrow(a$Zu))))$modulus[[ 1 ]]

	logZ <- logZ - 0.5 * sum(log(a$diagC))
	logZ <- logZ - 0.5 * determinant(diag(nrow(a$Zu)) + (a$Zu) %*% (t(a$Zu) * matrix(a$diagC^-1, ncol(a$Zu), nrow(a$Zu))))$modulus[[ 1 ]]
	logZ <- logZ - sum(log(a$kappaU))

	# We add the normalization term that appears as a consequence of integrating the approximate factors of v
	
	logZ <- logZ + 0.5 * sum(log(ret$DnewV)) 
#	logZ <- logZ + 0.5 * determinant(diag(nrow(a$Zv)) + (ret$RnewV %*% ret$PnewV) %*% (t(ret$RnewV %*% ret$PnewV) * 
#		matrix(ret$DnewV^-1, ncol(a$Zv), nrow(a$Zv))))$modulus[[ 1 ]]
	logZ <- logZ + 0.5 * determinant(diag(nrow(a$Zv)) + (ret$MnewV %*% ret$PnewV) %*% (t(ret$PnewV) * 
		matrix(ret$DnewV^-1, ncol(a$Zv), nrow(a$Zv))))$modulus[[ 1 ]]

	logZ <- logZ - 0.5 * sum(log(a$diagC)) 
	logZ <- logZ - 0.5 * determinant(diag(nrow(a$Zv)) + (a$Zv) %*% (t(a$Zv) * matrix(a$diagC^-1, ncol(a$Zv), nrow(a$Zv))))$modulus[[ 1 ]]
	logZ <- logZ - sum(log(a$kappaV))

	logZ
}

##
# Function: dratio
#
# Description: Computes the pdf of the ratio between two gaussian distributions. 
#
# Arguments:  
#	z - Where to evaluate the density
#	meanU - Mean of the first gaussian
#	meanV - Mean of the second gaussian
#	varU - Var of the second gaussian
#	varV - Var of the second gaussian
#

dratio <- function(z, meanU, meanV, varU, varV) {
	
	a <- function(z) { sqrt(1 / varU * z^2 + 1 / varV) }
	b <- function(z) { meanU / varU * z + meanV / varV }
	log_c <- function(z) { 0.5 * b(z)^2 / a(z)^2 - 0.5 * (meanU^2 / varU + meanV^2 / varV) }

	sign(2 * pnorm(b(z) / a(z)) - 1) * sign(b(z)) * 
		exp(log(abs(b(z))) + log_c(z) - 3 * log(a(z)) - 0.5 * log(2 * pi) - 0.5 * log(varU) - 0.5 * log(varV) + 
		log(abs(2 * pnorm(b(z) / a(z)) - 1))) + exp(- 2 * log(a(z)) - log(pi) - 0.5 * log(varU) - 0.5 * log(varV) -
		0.5 * (meanU^2 / varU + meanV^2 / varV))
}

##
# Function: evaluateGradientEvidence
#
# Author: Jose Miguel Hernández-Lobato
#
# Description: evaluates the gradient of the model evidence provided by the EP algorithm
#
# Arguments: 
#		a -> Posterior approximation
#
# Returns: 	A list with the required gradients.
#

computeGradientEvidence <- function(a) {

	# First for the u's

	PRt <- t(a$Zu * matrix(a$kappaU, nrow(a$Zu), ncol(a$Zu), byrow = TRUE))
	D <- a$diagC * a$kappaU^2
	tauTilde <- a$f1Hat$eta2u
	m <- nrow(a$Zu)
	n <- ncol(a$Zu)

	A <- (1 + tauTilde * D)^-1 * tauTilde
#	B <- cholInverse(diag(m) + t(PRt) %*% (matrix(A, n, m) * PRt))
#	C <- matrix(A, n, m) * (PRt %*% t(B))

#	PRtR <- PRt
#	M1 <- (t(PRtR) %*% C) %*% t(C) - t(PRtR) * matrix(A, m, n, byrow = T)
#	v1 <- 1 / sqrt(2) * (as.double(C^2 %*% rep(1, m)) - A)
#	v2 <- rep(1 / sqrt(2), n)

	M <- solve(diag(m) + t(PRt) %*% (matrix(A, n, m) * PRt))
	APRt <- PRt * matrix(A, n, m)
	DC <- APRt %*% M
	rowSumsCtC <- rowSums(DC * APRt)

	PRtR <- PRt
	M1 <- (t(PRtR) %*% DC) %*% t(APRt) - t(PRtR) * matrix(A, m, n, byrow = T)
	v1 <- 1 / sqrt(2) * (rowSumsCtC - A)
	v2 <- rep(1 / sqrt(2), n)

	dLogZdcu <- sum(v1 * D * v2) + 0.5 * sum(colSums(M1 * t(PRt)))
	dLogZdZu <- matrix(v1 * v2 * - a$diagC / a$varsCu^2 * a$cu, nrow(a$Zu), ncol(a$Zu), byrow = TRUE) * 2 * a$Zu + 
		M1 * matrix(a$kappaU, nrow(a$Zu), ncol(a$Zu), byrow = TRUE) + 
		matrix(colSums(M1 * a$Zu), nrow(a$Zu), ncol(a$Zu), byrow = TRUE) * 
		(a$Zu * matrix(- 1 / (a$varsCu)^(3 / 2) * sqrt(a$cu), nrow(a$Zu), ncol(a$Zu), byrow = TRUE))

	# Now for the v's

	PRt <- t(a$Zv * matrix(a$kappaV, nrow(a$Zv), ncol(a$Zv), byrow = TRUE))
	D <- a$diagC * a$kappaV^2
	tauTilde <- a$f1Hat$eta2v
	m <- nrow(a$Zv)
	n <- ncol(a$Zv)

	A <- (1 + tauTilde * D)^-1 * tauTilde
#	B <- cholInverse(diag(m) + t(PRt) %*% (matrix(A, n, m) * PRt))
#	C <- matrix(A, n, m) * (PRt %*% t(B))

#	PRtR <- PRt
#	M1 <- (t(PRtR) %*% C) %*% t(C) - t(PRtR) * matrix(A, m, n, byrow = T)
#	v1 <- 1 / sqrt(2) * (as.double(C^2 %*% rep(1, m)) - A)
#	v2 <- rep(1 / sqrt(2), n)

	M <- solve(diag(m) + t(PRt) %*% (matrix(A, n, m) * PRt))
	APRt <- PRt * matrix(A, n, m)
	DC <- APRt %*% M
	rowSumsCtC <- rowSums(DC * APRt)

	PRtR <- PRt
	M1 <- (t(PRtR) %*% DC) %*% t(APRt) - t(PRtR) * matrix(A, m, n, byrow = T)
	v1 <- 1 / sqrt(2) * (rowSumsCtC - A)
	v2 <- rep(1 / sqrt(2), n)


	dLogZdcv <- sum(v1 * D * v2) + 0.5 * sum(colSums(M1 * t(PRt)))
	dLogZdZv <- matrix(v1 * v2 * - a$diagC / a$varsCv^2 * a$cv, nrow(a$Zv), ncol(a$Zv), byrow = TRUE) * 2 * a$Zv + 
		M1 * matrix(a$kappaV, nrow(a$Zv), ncol(a$Zv), byrow = TRUE) + 
		matrix(colSums(M1 * a$Zv), nrow(a$Zv), ncol(a$Zv), byrow = TRUE) * 
		(a$Zv * matrix(- 1 / (a$varsCv)^(3 / 2) * sqrt(a$cv), nrow(a$Zv), ncol(a$Zv), byrow = TRUE))

	# We compute the gradient with respect to the noise

	dLogZdsigma2 <- - a$n / 2 + 0.5 * 1 / a$sigma2 * sum(a$Y^2)

	lambda <- a$f1Hat$eta2w^-1
	lambdaX <- matrix(lambda, a$n, a$d, byrow = TRUE) * a$X
	invM <- solve(diag(a$n) * a$sigma2 + a$X %*% t(lambdaX))
	invMlambdaX <- invM %*% lambdaX
	upsilon <- 1 / a$sigma2 * t(a$X) %*% a$Y + a$f1Hat$eta1w
	dupsilondsigma2 <- - 1 / a$sigma2 * t(a$X) %*% a$Y

	dLogZdsigma2 <- dLogZdsigma2 + 0.5 * sum(rowSums((invMlambdaX %*% t(a$X))^2)) + 
		0.5 * sum(a$f1Hat$eta2w * a$sigma2 * colSums(invMlambdaX^2))

	dLogZdsigma2 <- dLogZdsigma2 + sum(sum(upsilon * dupsilondsigma2 * lambda) - 
			sum(t(lambdaX %*% upsilon) %*% invM %*% (lambdaX %*% dupsilondsigma2)))

	dLogZdsigma2 <- dLogZdsigma2 + 0.5 * a$sigma2 * sum((invMlambdaX %*% upsilon)^2)

	# We compute the gradient with respect to the X's

	dLogZdX <- a$X * 0

	list(dLogZdcu = dLogZdcu, dLogZdcv = dLogZdcv, dLogZdZu = dLogZdZu, 
		dLogZdZv = dLogZdZv, dLogZdsigma2 = dLogZdsigma2, dLogZdX = dLogZdX)
}

##
# Functions which verify if the approximation for the prior
# does lead to valid posterior covariance matrices.
#

verifyPositveDefinitenessInner <- function(a) {

#	M1 <- rot180(diag(nrow(a$Zu)) + a$Zu %*% (matrix(1 / (a$f1Hat$eta2u^-1 * a$kappaU^-2 + a$diagC), 
#		ncol(a$Zu), nrow(a$Zu)) * t(a$Zu)))
#	eigenValues1 <- eigen(M1)$values
#	eigenValues1[ abs(eigenValues1) < 1e-6 ] <- 0

#	M2 <- rot180(diag(nrow(a$Zv)) + a$Zv %*% (matrix(1 / (a$f1Hat$eta2v^-1 * a$kappaV^-2 + a$diagC), 
#		ncol(a$Zv), nrow(a$Zv)) * t(a$Zv)))
#	eigenValues2 <- eigen(M2)$values
#	eigenValues2[ abs(eigenValues2) < 1e-6 ] <- 0

#	if (any(eigenValues1 <= 0) || any(eigenValues2 <= 0))
#		F
#	else {

		ret <- computeTitledDistribution(a)

		if (all(ret$vV > 0) && all(ret$vU > 0) && all(ret$vW > 0))
			T
		else
			F
#	}
}

verifyPositveDefinitenessOuter <- function(aOld, sigma2New, ZNewU, ZNewV, cuNew, cvNew, Xnew) {

	# We initialize the posterior approximation

	a <- initializeApproximation(Xnew, aOld$Y, sigma2New, ZNewU, ZNewV, cuNew, cvNew)
	a$f1Hat <- aOld$f1Hat

#	M1 <- rot180(diag(nrow(a$Zu)) + a$Zu %*% (matrix(1 / (a$f1Hat$eta2u^-1 * a$kappaU^-2 + a$diagC), ncol(a$Zu), nrow(a$Zu)) * t(a$Zu)))
#	eigenValues1 <- eigen(M1)$values
#	eigenValues1[ abs(eigenValues1) < 1e-6 ] <- 0

#	M2 <- rot180(diag(nrow(a$Zv)) + a$Zv %*% (matrix(1 / (a$f1Hat$eta2v^-1 * a$kappaV^-2 + a$diagC), ncol(a$Zv), nrow(a$Zv)) * t(a$Zv)))
#	eigenValues2 <- eigen(M2)$values
#	eigenValues2[ abs(eigenValues2) < 1e-6 ] <- 0

#	if (any(eigenValues1 <= 0) || any(eigenValues2 <= 0))
#		F
#	else {
		ret <- computeTitledDistribution(a)

		if (all(ret$vV > 0) && all(ret$vU > 0) && all(ret$vW > 0))
			T
		else {
			F
		}
#	}
}

##
# Function: horseShoeEP
#
# Author: José Miguel Hernández-Lobato
#
# Description: Trains a horse shoe bayesian model on the data 
#
# Arguments: 
#		X -> a matrix containing the attributes of the training data (one row per sample)
#		Y -> a vector with the targets
#		sigma2 -> Variance of the noise (in log scale)
#		Z -> Pseudo-vectors of the covariance matrix 
#		cu -> Constant that multiplies the covariance matrix of the u's (in log scale)
#		cv -> Constant that multiplies the covariance matrix of the v's (in log scale)
#		learnDictionary -> Indicates whether we should learn the Xs too.
#
#
# Returns:	Posterior Approximation
#

horseShoeEP <- function(X, Y, sigma2 = log(.1), Z = matrix(0.1 * rnorm(ncol(X) * nrow(X)), nrow(X), ncol(X)), cu = -2, cv = 2,
	learnDictionary = FALSE) {

	ret <- horseShoeEPinternal(X, Y, sigma2, Z, Z, cu, cv)

	eps <- 0.01
	convergence <- F
	i <- 1

	while (! convergence && i < 1000 && eps > 1e-10) {

		cuNew <- cu +  eps * ret$gradInfo$dLogZdcu 
		cvNew <- cv +  eps * ret$gradInfo$dLogZdcv 
		ZNew <- Z + eps * (ret$gradInfo$dLogZdZu + ret$gradInfo$dLogZdZv)
		sigma2New <- sigma2 + eps * ret$gradInfo$dLogZdsigma2
		Xnew <- X + eps * ret$gradInfo$dLogZdX

		while (! verifyPositveDefinitenessOuter(ret$a, sigma2New, ZNew, ZNew, cuNew, cvNew)) {

			eps <- eps * 0.5

			cat("OuterLoop: Reducing eps to guarantee positive-definiteness!", "Eps:", eps, "\n")

			cuNew <- cu + eps * ret$gradInfo$dLogZdcu 
			cvNew <- cv + eps * ret$gradInfo$dLogZdcv 
			ZNew <- Z +  eps * (ret$gradInfo$dLogZdZu + ret$gradInfo$dLogZdZv)
			sigma2New <- sigma2 + eps * ret$gradInfo$dLogZdsigma2
			Xnew <- X + eps * ret$gradInfo$dLogZdX
		}

		cu <- cuNew
		cv <- cvNew
		Z <- ZNew
		sigma2 <- sigma2New

		if (learnDictionary == TRUE)
			X <- Xnew

		retNew <- horseShoeEPinternal(X, Y, sigma2, Z, Z, cu, cv, ret$a)

		cat("Iter:", i, "Eps:", eps, "cu:", cu, "cv:", cv, "Sigma2:", exp(sigma2), 
			"Evidence:", ret$logZ, "Change:", retNew$logZ - ret$logZ, "\n")

		if (abs(retNew$logZ - ret$logZ)< 1e-4)
			convergence <- T

		if (retNew$logZ > ret$logZ)
			eps <- eps * 1.2
		else
			eps <- eps * 0.5

		ret <- retNew

		i <- i + 1
	}

	ret
}


##
# Function: horseShoeEPMultiTask
#
# Author: Daniel Hernández-Lobato
#
# Description: Trains a horse shoe bayesian model on multiple tasks 
#
# Arguments: 
#		X -> a list with the matrices containing the attributes of the training data (one row per sample)
#		Y -> a list with the vectors with the targets
#		sigma2 -> Variance of the noise (in log scale)
#		Z -> Pseudo-vectors of the covariance matrix 
#		cu -> Constant that multiplies the covariance matrix of the u's (in log scale)
#		cv -> Constant that multiplies the covariance matrix of the v's (in log scale)
#		learnDictionary -> Indicates whether we should learn the Xs too.
#		relationYtoX -> a list which indicates for each task which Xs are actually used
#		sharedNoiseParam -> indicates if the noise is shared among tasks or if its task specific
#		sharedSparsityLevel -> indicates if the sparsity parameter is shared among tasks or if its task specific
#		multiCore -> Shall we use multiple cores for the computation?
#		flagsToLearn -> flags which indicate what to learn (sparsity, noise, Z, X)?
#
# Returns:	Posterior Approximation
#

horseShoeEPMultiTask <- function(X = NULL, Y, sigma2 = log(1), Z = matrix(0.1 * rnorm(ncol(X[[ 1 ]]) * 
	nrow(X[[ 1 ]])), nrow(X[[ 1 ]]), ncol(X[[ 1 ]])), cu = rep(-2, length(X)), cv = rep(2, length(X)), 
	learnDictionary = FALSE, Xinitial = NULL, relationYtoX = NULL, sharedNoiseParam = FALSE, 
	sharedSparsityLevel = FALSE, multiCore = FALSE, flagsToLearn = c(1, 1, 1, 1)) {

	learnSparsity <- flagsToLearn[ 1 ]
	learnNoise <- flagsToLearn[ 2 ]
	learnZ <- flagsToLearn[ 3 ]
	learnX <- flagsToLearn[ 4 ]

	if (multiCore == TRUE) {
		library(doMC)
		registerDoMC(cores = 4)
		print(getDoParWorkers())
	}

	Z <- Z * 1
	nTasks <- length(Y)

	retNewTask <- cuNew <- cvNew <- sigma2New <- retTask <- sigma2Task <- cuTask <- cvTask <- list()

	for (i in 1 : nTasks) {
		sigma2Task[[ i ]] <- sigma2
		cuTask[[ i ]] <- cu[ i ]
		cvTask[[ i ]] <- cv[ i ]
	}

	if (multiCore == TRUE) {

		retTask <- foreach(i = 1 : nTasks) %dopar%  {

			print(i)

			# This is to save memory in case of dictionary learning

			if (learnDictionary == TRUE) {

				model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], 
					Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])

				list(f1Hat = model$a$f1Hat, logZ = model$logZ)

			} else  {

				model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
				list(f1Hat = model$a$f1Hat, logZ = model$logZ)
			}
			
		}
	} else {
		for (i in 1 : nTasks) {
			print(i)

			# This is to save memory in case of dictionary learning

			if (learnDictionary == TRUE) {

				model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], 
					Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])

				retTask[[ i ]] <- list(f1Hat = model$a$f1Hat, logZ = model$logZ)

			} else  {

				model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
				retTask[[ i ]] <- list(f1Hat = model$a$f1Hat, logZ = model$logZ)
			}
		}
	}

	eps <- 0.01
	convergence <- F
	n <- 1

	while (! convergence && n < 1e3 && eps > 1e-10) {

		ZNew <- Z
		XinitialNew <- Xinitial

		for (i in 1 : nTasks)  {

			if (learnDictionary == TRUE)
				a <- initializeApproximation(Xinitial[ relationYtoX[[ i ]], ], 
						Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
			else
				a <- initializeApproximation(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
				
			a$f1Hat <- retTask[[ i ]]$f1Hat
			gradInfo <- computeGradientEvidence(a)
		
			if (sharedSparsityLevel == TRUE) {
				cuNew[[ 1 ]] <- cuTask[[ 1 ]] + learnSparsity * eps * gradInfo$dLogZdcu 
				cvNew[[ 1 ]] <- cvTask[[ 1 ]] + learnSparsity * eps * gradInfo$dLogZdcv 
			} else {
				cuNew[[ i ]] <- cuTask[[ i ]] + learnSparsity * eps * gradInfo$dLogZdcu 
				cvNew[[ i ]] <- cvTask[[ i ]] + learnSparsity * eps * gradInfo$dLogZdcv 
			}

			if (sharedNoiseParam == TRUE)
				sigma2New[[ 1 ]] <- sigma2Task[[ 1 ]] + learnNoise * eps * gradInfo$dLogZdsigma2 
			else 
				sigma2New[[ i ]] <- sigma2Task[[ i ]] + learnNoise * eps * gradInfo$dLogZdsigma2 

			ZNew <- ZNew + learnZ * eps * (gradInfo$dLogZdZu + gradInfo$dLogZdZv) 

			if (learnDictionary == TRUE) 
				XinitialNew[ relationYtoX[[ i ]], ] <- XinitialNew[ relationYtoX[[ i ]], ] + learnX * eps * gradInfo$dLogZdX 
		}

		if (sharedSparsityLevel == TRUE) {
			for (i in 1 : nTasks) {
				cuNew[[ i ]] <- cuNew[[ 1 ]]
				cvNew[[ i ]] <- cvNew[[ 1 ]]
			}
		}

		if (sharedNoiseParam == TRUE) {
			for (i in 1 : nTasks) 
				sigma2New[[ i ]] <- sigma2New[[ 1 ]] 
		} 

		# We check that all the tasks have positive defininte matrices

		while (TRUE) {

			flag <- TRUE

			for (i in 1 : nTasks) {

				if (learnDictionary == TRUE)
					a <- initializeApproximation(XinitialNew[ relationYtoX[[ i ]], ], 
						Y[[ i ]], sigma2New[[ i ]], ZNew, ZNew, cuNew[[ i ]], cvNew[[ i ]])
				else
					a <- initializeApproximation(X[[ i ]], Y[[ i ]], sigma2New[[ i ]], 
						ZNew, ZNew, cuNew[[ i ]], cvNew[[ i ]])
				
				a$f1Hat <- retTask[[ i ]]$f1Hat

				if (learnDictionary == TRUE)
					flag <- flag & verifyPositveDefinitenessOuter(a, sigma2New[[ i ]], ZNew, ZNew, 
						cuNew[[ i ]], cvNew[[ i ]], XinitialNew[ relationYtoX[[ i ]], ])
				else
					flag <- flag & verifyPositveDefinitenessOuter(a, sigma2New[[ i ]], ZNew, ZNew, 
						cuNew[[ i ]], cvNew[[ i ]], X[[ i ]])
			}

			# If there all matrices are positive defininte we break	

			if (flag)
				break

			eps <- eps * 0.5

			cat("OuterLoop: Reducing eps to guarantee positive-definiteness!", "Eps:", eps, "\n")

			ZNew <- Z
			sigma2New <- sigma2Task
			cuNew <- cuTask
			cvNew <- cvTask
			XinitialNew <- Xinitial

			for (i in 1 : nTasks)  {

				if (learnDictionary == TRUE)
					a <- initializeApproximation(Xinitial[ relationYtoX[[ i ]], ], 
							Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
				else
					a <- initializeApproximation(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
					
				a$f1Hat <- retTask[[ i ]]$f1Hat
				gradInfo <- computeGradientEvidence(a)
			
				if (sharedSparsityLevel == TRUE) {
					cuNew[[ 1 ]] <- cuTask[[ 1 ]] + learnSparsity * eps * gradInfo$dLogZdcu 
					cvNew[[ 1 ]] <- cvTask[[ 1 ]] + learnSparsity * eps * gradInfo$dLogZdcv 
				} else {
					cuNew[[ i ]] <- cuTask[[ i ]] + learnSparsity * eps * gradInfo$dLogZdcu 
					cvNew[[ i ]] <- cvTask[[ i ]] + learnSparsity * eps * gradInfo$dLogZdcv 
				}
	
				if (sharedNoiseParam == TRUE)
					sigma2New[[ 1 ]] <- sigma2Task[[ 1 ]] + learnNoise * eps * gradInfo$dLogZdsigma2 
				else 
					sigma2New[[ i ]] <- sigma2Task[[ i ]] + learnNoise * eps * gradInfo$dLogZdsigma2 
	
				ZNew <- ZNew + learnZ * eps * (gradInfo$dLogZdZu + gradInfo$dLogZdZv) 
	
				if (learnDictionary == TRUE) 
					XinitialNew[ relationYtoX[[ i ]], ] <- XinitialNew[ relationYtoX[[ i ]], ] + learnX * eps * gradInfo$dLogZdX 
			}
	
			if (sharedSparsityLevel == TRUE) {
				for (i in 1 : nTasks) {
					cuNew[[ i ]] <- cuNew[[ 1 ]]
					cvNew[[ i ]] <- cvNew[[ 1 ]]
				}
			}
	
			if (sharedNoiseParam == TRUE) {
				for (i in 1 : nTasks) 
					sigma2New[[ i ]] <- sigma2New[[ 1 ]] 
			} 

		} # while

		max_change_param <- 0

		for (i in 1 : nTasks)
			max_change_param <- max(max_change_param, c(abs(cuTask[[ i ]] - cuNew[[ i ]]), 
				abs(cvTask[[ i ]] - cvNew[[ i ]]), abs(sigma2Task[[ i ]] - sigma2New[[ i ]]), abs(Z - ZNew)))

		if (learnDictionary == TRUE)
			max_change_param <- max(max_change_param, abs(c(Xinitial[[ i ]] - XinitialNew[[ i ]])))

		cuTask <- cuNew
		cvTask <- cvNew
		Z <- ZNew
		sigma2Task <- sigma2New
		Xinitial <- XinitialNew

		# We retrain

		if (multiCore == TRUE) {
	
			retNewTask <- foreach(i = 1 : nTasks) %dopar%  {
	
				print(i)

				a <- list()
				a$f1Hat <- retTask[[ i ]]$f1Hat
	
				# This is to save memory in case of dictionary learning
	
				if (learnDictionary == TRUE) {
	
					model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], 
						Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]], a)

					if (any(computeTitledDistribution(model$a)$vU > 100) || is.nan(model$logZ)) {
						cat("Retraining task:", i, "\n")
						model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], Y[[ i ]], sigma2Task[[ i ]], Z, Z, 
							cuTask[[ i ]], cvTask[[ i ]], damping = 0.1)
					}

					list(f1Hat = model$a$f1Hat, logZ = model$logZ)
	
				} else  {
	
					model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]], a)

					if (any(computeTitledDistribution(model$a)$vU > 100) || is.nan(model$logZ)) {
						cat("Retraining task:", i, "\n")
						model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, 
							cuTask[[ i ]], cvTask[[ i ]], damping = 0.1)
					}

					list(f1Hat = model$a$f1Hat, logZ = model$logZ)
				}
				
			}
		} else {
			for (i in 1 : nTasks) {

				print(i)

				a <- list()
				a$f1Hat <- retTask[[ i ]]$f1Hat
	
				# This is to save memory in case of dictionary learning
	
				if (learnDictionary == TRUE) {
	
					model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], 
						Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]], a)
	
					if (any(computeTitledDistribution(model$a)$vU > 100) || is.nan(model$logZ)) {
						cat("Retraining task:", i, "\n")
						model <- horseShoeEPinternal(Xinitial[ relationYtoX[[ i ]], ], Y[[ i ]], sigma2Task[[ i ]], Z, Z, 
							cuTask[[ i ]], cvTask[[ i ]], damping = 0.1)
					}

					retNewTask[[ i ]] <- list(f1Hat = model$a$f1Hat, logZ = model$logZ)
	
				} else  {
	
					model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]], a)

					if (any(computeTitledDistribution(model$a)$vU > 100) || is.nan(model$logZ)) {
						cat("Retraining task:", i, "\n")
						model <- horseShoeEPinternal(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, 
							cuTask[[ i ]], cvTask[[ i ]], damping = 0.1)
					}

					retNewTask[[ i ]] <- list(f1Hat = model$a$f1Hat, logZ = model$logZ)
				}
			}
		}
	
		sum_logZ <- sum_logZNew <- 0

		for (i in 1 : nTasks)  {
			cat("Iter:", n, "Eps:", eps, "cu:", cuTask[[ i ]], "cv:", cvTask[[ i ]], "Sigma2:", exp(sigma2Task[[ i ]]), 
				"Evidence:", retTask[[ i ]]$logZ, "Change:", retNewTask[[ i ]]$logZ - retTask[[ i ]]$logZ, "\n")
			sum_logZ <- sum_logZ + retTask[[ i ]]$logZ
			sum_logZNew <- sum_logZNew + retNewTask[[ i ]]$logZ
		}

		cat("Iter:", n, "Eps:", eps, "MaxChangeParam:", max_change_param, 
			"Sum Evidence:", sum_logZ, "Change:", sum_logZNew - sum_logZ, "\n")

		if (abs(sum_logZNew - sum_logZ) < 1e-4 || max_change_param < 1e-5)
			convergence <- T

		if (sum_logZNew  > sum_logZ)
			eps <- eps * 1.2
		else
			eps <- eps * 0.5

		for (i in 1 : nTasks)  
			retTask[[ i ]] <- retNewTask[[ i ]]

		n <- n + 1

		result <- list(approx = retTask, Z = Z, cuTask = cuTask, cvTask = cvTask, Xinitial = Xinitial, sigma2Task = sigma2Task)

#		save(result, file = "partial_result_Approx.dat")
	}

	# We compute the means of each tasks

	if (learnDictionary == TRUE)
		d <- ncol(Xinitial)
	else
		d <- ncol(X[[ 1 ]])

	means <- matrix(0, d, nTasks)
	vars <- matrix(0, d, nTasks)

	for (i in 1 : nTasks)  {

		if (learnDictionary == TRUE)
			a <- initializeApproximation(Xinitial[ relationYtoX[[ i ]], ], 
				Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])
		else
			a <- initializeApproximation(X[[ i ]], Y[[ i ]], sigma2Task[[ i ]], Z, Z, cuTask[[ i ]], cvTask[[ i ]])

		a$f1Hat <- retTask[[ i ]]$f1Hat
		ret <- computeTitledDistribution(a)
		means[ , i ] <- ret$mW
		vars[ , i ] <- ret$vW
	}

	list(approx = retTask, Z = Z, cuTask = cuTask, cvTask = cvTask, Xinitial = Xinitial, sigma2Task = sigma2Task, means = means)
}
