#FUNCTIONS FOR THE ITERATIVE GD APPROXIMATION ALGORITHMS. Note that we also call the iterative GD algorithm "full GD" (it's a full order approximation of a numerical GD).)

#NOTE: all the presented algorithms for the full GD optimization, in 1D or in multiple dimensions, don't present any convergence or unstability problems. In any case we leave the 
#possibility for a stabilization procedure through Coordinate Descent, as explained below (see comments above "Full_GD_control_and_estimation_backward_optimization_coordinate_descend").

#(1) ONE-DIMENSIONAL CASE ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
#Here we directly present the algorithm for the most general case, that is allowing for multiplicative noise.

#(1.0): Propagation of the moments 

#The function propagates over time the moments needed to implement the iterative optimization of control and estimation using the full GD algorithm.
#The needed moments are <x_t*x_t>, <z_t*z_t> and <x_t*z_t> (they are the components, in the same order, of the vector w in this function), where x_t stands
#for the state at time t and z for the state estimate.
function propagation_of_moments(Σ_x_0, Σ_z_0, μ_x_0, μ_z_0, T, a, b, m, n, H, k_vec, l_vec, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)

    #NOTE that these variables: "σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise" are all STANDARD DEVIATIONS
    #while, these other variables: "Σ_x_0, Σ_z_0" are VARIANCES 

    #definition of cost vector

    moments_vec = zeros(3,T)

    #define the covariance vector, the updating matrix and the additive vector G for the discrete evolution of the system
    T_matrix = zeros(3,3)
    w = zeros(3,1)
    G = zeros(3,1)

    w = [μ_x_0^2 + Σ_x_0, μ_z_0^2 + Σ_z_0, μ_x_0*μ_z_0] #assuming that at the initial time step, the state and its estimate are independent
    moments_vec[:,1] = w[:]

    T_matrix[1,:] = [a^2, (b^2*l_vec[1]^2+σ_ϵ_control_dep_noise^2*l_vec[1]^2), 2*a*b*l_vec[1]]
    T_matrix[2,:] = [k_vec[1]^2*(H^2+σ_ρ_mult_sensory_noise^2), (m+n*l_vec[1]-k_vec[1]*H)^2, 2*(m+n*l_vec[1]-k_vec[1]*H)*k_vec[1]*H]
    T_matrix[3,:] = [a*k_vec[1]*H, b*l_vec[1]*(m+n*l_vec[1]-k_vec[1]*H), a*(m+n*l_vec[1]-k_vec[1]*H)+b*l_vec[1]*k_vec[1]*H]
    
    G = [σ_ξ^2, k_vec[1]^2*σ_ω_add_sensory_noise^2+σ_η_internal_noise^2, 0]

    for i in 2:T-1
        
        w = T_matrix*w .+ G    #dynamics
        
        moments_vec[:,i] = w[:]

        #update the vector G for the time evolution
        G[2] = k_vec[i]^2*σ_ω_add_sensory_noise^2+σ_η_internal_noise^2 
        
        #update the matrix T for the time evolution
        T_matrix[1,:] = [a^2, (b^2*l_vec[i]^2+σ_ϵ_control_dep_noise^2*l_vec[i]^2), 2*a*b*l_vec[i]]
        T_matrix[2,:] = [k_vec[i]^2*(H^2+σ_ρ_mult_sensory_noise^2), (m+n*l_vec[i]-k_vec[i]*H)^2, 2*(m+n*l_vec[i]-k_vec[i]*H)*k_vec[i]*H]
        T_matrix[3,:] = [a*k_vec[i]*H, b*l_vec[i]*(m+n*l_vec[i]-k_vec[i]*H), a*(m+n*l_vec[i]-k_vec[i]*H)+b*l_vec[i]*k_vec[i]*H]

    end

    w = T_matrix*w .+ G         #last evolution
    moments_vec[:,end] = w[:]

    return moments_vec

end

#(1.1): CONTROL OPTIMIZATION 

#The function gives as outputs the matrices A and B useful to compute the optimal controller at time t (the elements of the matrices that have to be considered 
#are therefore only the ones from t to T-2) [see math].
function A_B_propagation(t, moments_matrix_t, l_vec, k_vec, T, a, b, H, σ_ϵ_control_dep_noise, σ_ρ_mult_sensory_noise)

    A_matrix = zeros(3, T-2)
    B_matrix = zeros(3, T-2)
    F_matrix = zeros(3, T-1)
    G_matrix = zeros(3, T-1)
    H_matrix = zeros(3, T-1)

    #propagation of A and B 
    A_matrix[1,t] = 2*moments_matrix_t[2]*(b^2+σ_ϵ_control_dep_noise^2)
    A_matrix[2,t] = 2*b^2*moments_matrix_t[2]
    A_matrix[3,t] = 2*b^2*moments_matrix_t[2]
    
    B_matrix[1,t] = 2*a*b*moments_matrix_t[3]
    B_matrix[2,t] = 2*a*b*moments_matrix_t[2] - 2*b*k_vec[t]*H*(moments_matrix_t[2] - moments_matrix_t[3])
    B_matrix[3,t] = a*b*(moments_matrix_t[2] + moments_matrix_t[3]) - b*k_vec[t]*H*(moments_matrix_t[2] - moments_matrix_t[3])
    
    for i in t+1:T-2

        #Definition of F,G and H
        F_matrix[1, i] = a^2
        F_matrix[2, i] = (b^2+σ_ϵ_control_dep_noise^2)*l_vec[i]^2
        F_matrix[3, i] = 2*a*b*l_vec[i]

        G_matrix[1, i] = k_vec[i]^2*(H^2+σ_ρ_mult_sensory_noise^2)
        G_matrix[2, i] = (a+b*l_vec[i])^2 + k_vec[i]^2*H^2 - 2*a*k_vec[i]*H - 2*b*l_vec[i]*k_vec[i]*H
        G_matrix[3, i] = 2*b*l_vec[i]*k_vec[i]*H + 2*a*k_vec[i]*H - 2*k_vec[i]^2*H^2
        
        H_matrix[1, i] = a*k_vec[i]*H
        H_matrix[2, i] = a*b*l_vec[i] + b^2*l_vec[i]^2 - b*l_vec[i]*k_vec[i]*H
        H_matrix[3, i] = a^2 + a*b*l_vec[i] - a*k_vec[i]*H + b*l_vec[i]*k_vec[i]*H

        #propagation of A and B 
        A_matrix[1,i] = dot(F_matrix[:, i],A_matrix[:,i-1])
        A_matrix[2,i] = dot(G_matrix[:, i],A_matrix[:,i-1])
        A_matrix[3,i] = dot(H_matrix[:, i],A_matrix[:,i-1])
        
        B_matrix[1,i] = dot(F_matrix[:, i],B_matrix[:,i-1])
        B_matrix[2,i] = dot(G_matrix[:, i],B_matrix[:,i-1])
        B_matrix[3,i] = dot(H_matrix[:, i],B_matrix[:,i-1])

    end

    i = T-1
    F_matrix[1, i] = a^2
    F_matrix[2, i] = (b^2+σ_ϵ_control_dep_noise^2)*l_vec[i]^2
    F_matrix[3, i] = 2*a*b*l_vec[i]

    G_matrix[1, i] = k_vec[i]^2*(H^2+σ_ρ_mult_sensory_noise^2)
    G_matrix[2, i] = (a+b*l_vec[i])^2 + k_vec[i]^2*H^2 - 2*a*k_vec[i]*H - 2*b*l_vec[i]*k_vec[i]*H
    G_matrix[3, i] = 2*b*l_vec[i]*k_vec[i]*H + 2*a*k_vec[i]*H - 2*k_vec[i]^2*H^2
    
    H_matrix[1, i] = a*k_vec[i]*H
    H_matrix[2, i] = a*b*l_vec[i] + b^2*l_vec[i]^2 - b*l_vec[i]*k_vec[i]*H
    H_matrix[3, i] = a^2 + a*b*l_vec[i] - a*k_vec[i]*H + b*l_vec[i]*k_vec[i]*H
    
    return A_matrix, B_matrix, F_matrix, G_matrix

end

#The function computes the optimal control gain at time t.
function l_t_optimal_full_GD(n_order, t, A_matrix, B_matrix, x_hat_square_t, F_matrix, G_matrix, l_vec, q_vec, r_vec)
    
    sum_for_numerator = 0
    sum_for_denominator = 0

    for i in 2:n_order
        sum_for_numerator += (q_vec[t+i]*dot(F_matrix[:,t+i-1], B_matrix[:,t+i-2]) + r_vec[t+i]*l_vec[t+i]^2*dot(G_matrix[:,t+i-1], B_matrix[:,t+i-2]))
        sum_for_denominator += (q_vec[t+i]*dot(F_matrix[:,t+i-1], A_matrix[:,t+i-2]) + r_vec[t+i]*l_vec[t+i]^2*dot(G_matrix[:,t+i-1], A_matrix[:,t+i-2]))
    end

    numerator_l_t = q_vec[t+1]*B_matrix[1,t]+r_vec[t+1]*l_vec[t+1]^2*B_matrix[2,t] + sum_for_numerator
    denominator_l_t = 2*r_vec[t+1]*x_hat_square_t + q_vec[t+1]*A_matrix[1,t] + r_vec[t+1]*l_vec[t+1]^2*A_matrix[2,t] + sum_for_denominator
    l_t_optimal  = - numerator_l_t/denominator_l_t

    return l_t_optimal

end

#The function implements the backward iterative optimization where the optimal controller, at fixed filter, is computed at every time step. The function uses the functions 
#"A_B_propagation" and "l_t_optimal_full_GD". n_order_vec is a vector, whose length has to be T-2, that contains the order of approximation per each control gain optimization,
#at each time step t. Note that at time t, the maximum order is n = T-t and that the last control gain (at t = T-1) are already set by using a first order 
#approximation.
function control_optimization_full_GD_backward(n_order_vec, moments_matrix_all_t, k_vec, T, q, q_T, r, a, b, H, σ_ϵ_control_dep_noise, σ_ρ_mult_sensory_noise)

    q_vec = q .* ones(T)
    q_vec[T] = q_T

    r_vec = r .* ones(T)
    r_vec[T] = 0

    l_vec_optimal = zeros(T)

    #t=T-1
    l_vec_optimal[T-1] = -a*b*q_T/(q_T*(b^2+σ_ϵ_control_dep_noise^2)+r)*moments_matrix_all_t[3,T-1]/moments_matrix_all_t[2,T-1]

    for j in 2:T-1

        t = T-j

        #propagation of A and B
        A_matrix_t, B_matrix_t, F_matrix_t, G_matrix_t = A_B_propagation(t, moments_matrix_all_t[:,t], l_vec_optimal, k_vec, T, a, b, H, σ_ϵ_control_dep_noise, σ_ρ_mult_sensory_noise)
        
        #compute the optimal l_t
        x_hat_square_t = moments_matrix_all_t[2,t]
        #n_order = T-t
        n_order = Int(n_order_vec[t])
        l_vec_optimal[t] = l_t_optimal_full_GD(n_order, t, A_matrix_t, B_matrix_t, x_hat_square_t, F_matrix_t, G_matrix_t, l_vec_optimal, q_vec, r_vec)

    end

    return l_vec_optimal[1:T-1]

end

#(1.2): ESTIMATION OPTIMIZATION 

#(1.2.1): ESTIMATION OPTIMIZATION through full GD algorithm 

#The function propagates the coefficients for the filter gains optimization. 
function A_B_propagation_for_k_t(t, moments_matrix_t, l_vec, k_vec, T, a, b, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_ϵ_control_dep_noise)

    A_matrix = zeros(3, T-2)
    B_matrix = zeros(3, T-2)
    F_matrix = zeros(3, T-1)
    G_matrix = zeros(3, T-1)
    H_matrix = zeros(3, T-1)

    #propagation of A and B 
    A_matrix[1,t] = 0
    A_matrix[2,t] = 2*H^2*(moments_matrix_t[1] + moments_matrix_t[2] -2*moments_matrix_t[3])+ 2*σ_ω_add_sensory_noise^2 + 2*σ_ρ_mult_sensory_noise^2*moments_matrix_t[1]
    A_matrix[3,t] = 0
    
    B_matrix[1,t] = 0
    B_matrix[2,t] = -2*H*(a+b*l_vec[t])*(moments_matrix_t[2] - moments_matrix_t[3])
    B_matrix[3,t] = a*H*(moments_matrix_t[1] - moments_matrix_t[3]) - b*l_vec[t]*H*(moments_matrix_t[2] - moments_matrix_t[3])
    
    for i in t+1:T-2

        #Definition of F,G and H
        F_matrix[1, i] = a^2
        F_matrix[2, i] = (b^2+σ_ϵ_control_dep_noise^2)*l_vec[i]^2
        F_matrix[3, i] = 2*a*b*l_vec[i]

        G_matrix[1, i] = k_vec[i]^2*(H^2+σ_ρ_mult_sensory_noise^2)
        G_matrix[2, i] = (a+b*l_vec[i])^2 + k_vec[i]^2*H^2 - 2*a*k_vec[i]*H - 2*b*l_vec[i]*k_vec[i]*H
        G_matrix[3, i] = 2*b*l_vec[i]*k_vec[i]*H + 2*a*k_vec[i]*H - 2*k_vec[i]^2*H^2
        
        H_matrix[1, i] = a*k_vec[i]*H
        H_matrix[2, i] = a*b*l_vec[i] + b^2*l_vec[i]^2 - b*l_vec[i]*k_vec[i]*H
        H_matrix[3, i] = a^2 + a*b*l_vec[i] - a*k_vec[i]*H + b*l_vec[i]*k_vec[i]*H

        #propagation of A and B 
        A_matrix[1,i] = dot(F_matrix[:, i],A_matrix[:,i-1])
        A_matrix[2,i] = dot(G_matrix[:, i],A_matrix[:,i-1])
        A_matrix[3,i] = dot(H_matrix[:, i],A_matrix[:,i-1])
        
        B_matrix[1,i] = dot(F_matrix[:, i],B_matrix[:,i-1])
        B_matrix[2,i] = dot(G_matrix[:, i],B_matrix[:,i-1])
        B_matrix[3,i] = dot(H_matrix[:, i],B_matrix[:,i-1])

    end

    i = T-1
    F_matrix[1, i] = a^2
    F_matrix[2, i] = (b^2+σ_ϵ_control_dep_noise^2)*l_vec[i]^2
    F_matrix[3, i] = 2*a*b*l_vec[i]

    G_matrix[1, i] = k_vec[i]^2*(H^2+σ_ρ_mult_sensory_noise^2)
    G_matrix[2, i] = (a+b*l_vec[i])^2 + k_vec[i]^2*H^2 - 2*a*k_vec[i]*H - 2*b*l_vec[i]*k_vec[i]*H
    G_matrix[3, i] = 2*b*l_vec[i]*k_vec[i]*H + 2*a*k_vec[i]*H - 2*k_vec[i]^2*H^2
    
    H_matrix[1, i] = a*k_vec[i]*H
    H_matrix[2, i] = a*b*l_vec[i] + b^2*l_vec[i]^2 - b*l_vec[i]*k_vec[i]*H
    H_matrix[3, i] = a^2 + a*b*l_vec[i] - a*k_vec[i]*H + b*l_vec[i]*k_vec[i]*H
    
    return A_matrix, B_matrix, F_matrix, G_matrix

end

#The function computes the optimal filter gain at time t.
function k_t_optimal_full_GD(n_order, t, A_matrix, B_matrix, F_matrix, G_matrix, l_vec, q_vec, r_vec)
    
    sum_for_numerator = 0
    sum_for_denominator = 0

    for i in 2:n_order
        sum_for_numerator += (q_vec[t+i]*dot(F_matrix[:,t+i-1], B_matrix[:,t+i-2]) + r_vec[t+i]*l_vec[t+i]^2*dot(G_matrix[:,t+i-1], B_matrix[:,t+i-2]))
        sum_for_denominator += (q_vec[t+i]*dot(F_matrix[:,t+i-1], A_matrix[:,t+i-2]) + r_vec[t+i]*l_vec[t+i]^2*dot(G_matrix[:,t+i-1], A_matrix[:,t+i-2]))
    end

    numerator_k_t = r_vec[t+1]*l_vec[t+1]^2*B_matrix[2,t] + sum_for_numerator
    denominator_k_t = r_vec[t+1]*l_vec[t+1]^2*A_matrix[2,t] + sum_for_denominator
    k_t_optimal  = - numerator_k_t/denominator_k_t

    return k_t_optimal

end

#The function implements the backward iterative optimization where the optimal filter, at fixed filter, is computed at every time step.
function estimation_optimization_full_GD_backward(n_order_vec, moments_matrix_all_t, l_vec, T, q, q_T, r, a, b, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_ϵ_control_dep_noise)

    l_vec_tmp = zeros(T)
    l_vec_tmp[1:T-1] .= l_vec[:]
    
    q_vec = q .* ones(T)
    q_vec[T] = q_T

    r_vec = r .* ones(T)
    r_vec[T] = 0

    k_vec_optimal = zeros(T-1)
    
    α_tmp = (2*H^2+2*σ_ρ_mult_sensory_noise^2)*moments_matrix_all_t[1,T-2] + 2*H^2*moments_matrix_all_t[2,T-2] - 4*H^2*moments_matrix_all_t[3,T-2] + 2*σ_ω_add_sensory_noise^2
    β_tmp = -2*H*(a+b*l_vec[T-2])*(moments_matrix_all_t[2,T-2]- moments_matrix_all_t[3,T-2])
    num_k = r_vec[T-1]*l_vec[T-1]^2*β_tmp + q_vec[T]*((b^2+σ_ϵ_control_dep_noise^2)*l_vec[T-1]^2*β_tmp + 2*a*b*l_vec[T-1]*H*(a*(moments_matrix_all_t[1,T-2]-moments_matrix_all_t[3,T-2])-b*l_vec[T-2]*(moments_matrix_all_t[2,T-2]-moments_matrix_all_t[3,T-2])))
    den_k = r_vec[T-1]*l_vec[T-1]^2*α_tmp + q_vec[T]*(b^2+σ_ϵ_control_dep_noise^2)*l_vec[T-1]^2*α_tmp
    k_vec_optimal[T-2] = -num_k/den_k

    for j in 3:T-2

        t = T-j

        #propagation of A and B
        A_matrix_t, B_matrix_t, F_matrix_t, G_matrix_t = A_B_propagation_for_k_t(t, moments_matrix_all_t[:,t], l_vec_tmp, k_vec_optimal, T, a, b, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_ϵ_control_dep_noise)

        #compute the optimal k_t
        n_order = Int(n_order_vec[t])
        k_vec_optimal[t] = k_t_optimal_full_GD(n_order, t, A_matrix_t, B_matrix_t, F_matrix_t, G_matrix_t, l_vec_tmp, q_vec, r_vec)

    end

    return k_vec_optimal

end

#(1.2.2): ESTIMATION OPTIMIZATION through orthogonality princicple. 
#Here we optimize the filter gains by imposing the orthogonality principle, that is by solving wrt K_t the following iterative equation:
# <z_t^2> = <x_t*z_t>, where z_t stands for the state estimate and x_t the state at time t. Note that we're in 1D now, and we're using the fact that for t=1 (initial condition)
#we have <z_1^2> = <x_1*z_1> (this holds if the initial internal state it's known without uncertainty) [see math].
#Note that this optimization turns out to be optimal when no internal noise is present: therefore, the optimal filter can be computed without minizing the cost function, but 
#simply imposing the orthogonality princicple (which is indeed a necessary condition for an optimal estimator). Note that Todorov's algorithm acutally satisfies such a condition,
#even if it's derived by a cost minimization. When internal noise is present, this princicple cannot hold anymore (see math) and the optimal solution stops to ensure it. 
#Therefore, when σ_η > 0 we have to opt for the full GD optimization for K_t (point (1.2.1)) [see math].

#The function finds the optimal filter, at fixed controller, by imposing the orthogonality ptinciple, that is by solving <x_hat_t^2> = <x_t*x_hat_t> for each k_t. This turns out to
#be the optimal solution for zero internal noise.
function estimation_optimization_orthogonality_principle(moments_vec, T, a, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise)
    
    k_vec_optimal = zeros(T-1)

    for i in 2:T-1

        Γ_t = moments_vec[1,T-i]-moments_vec[3,T-i]
        D_3 = H^2*Γ_t + σ_ω_add_sensory_noise^2 + σ_ρ_mult_sensory_noise^2*moments_vec[1,T-i]

        k_vec_optimal[T-i] = a*H*Γ_t/D_3

    end

    return k_vec_optimal

end

#(1.3): ESTIMATION-CONTROL OPTIMIZATION 

#(1.3.1): Control with full GD and Estimation with orthogonality princicple

#The function implements the full GD algorithm for control and estimation iteratively. This algorithm turns out to be optimal in all cases, matching Todorov's solution for 
#σ_η = 0 and overperforming them (in terms of cost minimization) when σ_η > 0. To stabilize the algorithm, if needed, we implement a coordinate descent procedure, in which
#k_t is optimized at fixed l_t and viceverse. Each optimization step in one coordinate is composed by several iterations, to avoid possible unstabilities. 
function Iterative_GD_control_and_estimation_backward_optimization_coordinate_descend(n_order_vec, iterations_full_optimization, iterations_control_optimization, iterations_estimation_optimization, l_vec_start, k_vec_start, Σ_x_0, Σ_z_0, μ_z_0, T, q, q_T, r, a, b, H, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)

    l_vec_optimal = zeros(T-1)
    k_vec_optimal = zeros(T-1)

    k_vec_optimal[:] .= k_vec_start[:]
    l_vec_optimal[:] .= l_vec_start[:]

    for i in 1:iterations_full_optimization
        
        #control optimization 
        for j in 1:iterations_control_optimization  
            moments_vec = propagation_of_moments(Σ_x_0, Σ_z_0, μ_x_0, μ_z_0, T, a, b, m, n, H, k_vec_optimal, l_vec_optimal, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)
            l_vec_optimal[:] = control_optimization_full_GD_backward(n_order_vec, moments_vec, k_vec_optimal, T, q, q_T, r, a, b, H, σ_ϵ_control_dep_noise, σ_ρ_mult_sensory_noise)
        end

        #estimation optimization 
        for j in 1:iterations_estimation_optimization  
            moments_vec = propagation_of_moments(Σ_x_0, Σ_z_0, μ_x_0, μ_z_0, T, a, b, m, n, H, k_vec_optimal, l_vec_optimal, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)
            k_vec_optimal[:] = estimation_optimization_full_GD_backward(n_order_vec, moments_vec, l_vec_optimal, T, q, q_T, r, a, b, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_ϵ_control_dep_noise)
        end

    end

    return l_vec_optimal, k_vec_optimal

end

#(1.3.2): Control with full GD and Estimation with orthogonality princicple

#The function implements the overall optimization by iteratively using "control_optimization_full_GD_backward" and "estimation_optimization_orthogonality_principle".
#Here we get rid of the smoothing on the controller and we simply implement a coordinate descent solution (the coordinates would be control and estimation gains
#resepctively l_t and k_t). 
#This function finds the optimal solution for σ_η = 0. For σ_η > 0, it still overperforms Todorov's algorithm, without reaching the optimality of the Numerical (and analytical) GD.
#Indeed, as commented before, for σ_η > 0, the orthogonality principle is not optimal anymore (and cannot be exactly implemented [see math]). 
function Full_GD_control_backward_optimization_with_orthogonality_principle_filter_optimization_coordinate_descend(n_order_vec, iterations_full_optimization, iterations_control_optimization, iterations_estimation_optimization, l_vec_start, k_vec_start, Σ_x_0, Σ_z_0, μ_z_0, T, q, q_T, r, a, b, H, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)

    l_vec_optimal = zeros(T-1)
    k_vec_optimal = zeros(T-1)

    k_vec_optimal[:] .= k_vec_start[:]
    l_vec_optimal[:] .= l_vec_start[:]

    for i in 1:iterations_full_optimization
        
        #control optimization 
        for j in 1:iterations_control_optimization  
            moments_vec = propagation_of_moments(Σ_x_0, Σ_z_0, μ_x_0, μ_z_0, T, a, b, m, n, H, k_vec_optimal, l_vec_optimal, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)
            l_vec_optimal[:] = control_optimization_full_GD_backward(n_order_vec, moments_vec, k_vec_optimal, T, q, q_T, r, a, b, H, σ_ϵ_control_dep_noise, σ_ρ_mult_sensory_noise)
        end

        #estimation optimization 
        for j in 1:iterations_estimation_optimization  
            moments_vec = propagation_of_moments(Σ_x_0, Σ_z_0, μ_x_0, μ_z_0, T, a, b, m, n, H, k_vec_optimal, l_vec_optimal, σ_ξ, σ_ϵ_control_dep_noise, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise, σ_η_internal_noise)
            k_vec_optimal[:] = estimation_optimization_orthogonality_principle(moments_vec, T, a, H, σ_ω_add_sensory_noise, σ_ρ_mult_sensory_noise)
        end

    end

    return l_vec_optimal, k_vec_optimal

end

#(2) MULTI-DIMENSIONAL CASE ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#(2.1): Additive noise - the following functions implement the full GD algorithm in the absence of multiplicative noise [see the math].

#(2.1.1): CONTROL OPTIMIZATION 

#The function propagates the raw moments in time (it's also suited for multiplicative noise). S_0 is a block matrix containing the inital condition for the propagation and it's 
#defined this way: S_0 = [<x_1*transpose(x_1)>, <x_1*transpose(z_1)>; <z_1*transpose(x_1)>, <z_1*transpose(z_1)>]. By z we mean the state estimate.
function raw_moments_propagation_for_full_GD(S_0, K_matrix, L_matrix, N_steps, A, B, C, D, H, Ω_ξ, Ω_η, Ω_ω, dimension_of_state)

    xx = zeros(dimension_of_state, dimension_of_state, N_steps)
    xz = zeros(dimension_of_state, dimension_of_state, N_steps)
    zx = zeros(dimension_of_state, dimension_of_state, N_steps)
    zz = zeros(dimension_of_state, dimension_of_state, N_steps)

    xx_old = zeros(dimension_of_state, dimension_of_state, N_steps)
    xz_old = zeros(dimension_of_state, dimension_of_state, N_steps)
    zx_old = zeros(dimension_of_state, dimension_of_state, N_steps)
    zz_old = zeros(dimension_of_state, dimension_of_state, N_steps)

    xx[:,:,1] .= S_0[1,1]
    xz[:,:,1] .= S_0[1,2]
    zx[:,:,1] .= S_0[2,1]
    zz[:,:,1] .= S_0[2,2]

    xx_old[:,:,1] .= xx[:,:,1]
    xz_old[:,:,1] .= xz[:,:,1]
    zx_old[:,:,1] .= zx[:,:,1]
    zz_old[:,:,1] .= zz[:,:,1]

    for t in 1:N_steps-1
        
        xx[:,:,t+1] .= A*xx_old[:,:,t]*transpose(A) .+ A*xz_old[:,:,t]*transpose(B*L_matrix[:,:,t]) .+ B*L_matrix[:,:,t]*zx_old[:,:,t]*transpose(A) .+ B*L_matrix[:,:,t]*zz_old[:,:,t]*transpose(B*L_matrix[:,:,t]) .+ C*L_matrix[:,:,t]*zz_old[:,:,t]*transpose(C*L_matrix[:,:,t]) .+ Ω_ξ
        zx[:,:,t+1] .= K_matrix[:,:,t]*H*xx_old[:,:,t]*transpose(A) .+ K_matrix[:,:,t]*H*xz_old[:,:,t]*transpose(B*L_matrix[:,:,t]) .+ (A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H)*zx_old[:,:,t]*transpose(A) .+ (A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H)*zz_old[:,:,t]*transpose(B*L_matrix[:,:,t])
        xz[:,:,t+1] .= transpose(zx[:,:,t+1])
        zz[:,:,t+1] .= K_matrix[:,:,t]*H*xx_old[:,:,t]*transpose(K_matrix[:,:,t]*H) .+ K_matrix[:,:,t]*D*xx_old[:,:,t]*transpose(K_matrix[:,:,t]*D) .+ K_matrix[:,:,t]*H*xz_old[:,:,t]*transpose(A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H) .+ (A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H)*zx_old[:,:,t]*transpose(K_matrix[:,:,t]*H) .+ (A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H)*zz_old[:,:,t]*transpose(A .+ B*L_matrix[:,:,t] .- K_matrix[:,:,t]*H) .+ K_matrix[:,:,t]*Ω_ω*transpose(K_matrix[:,:,t]) .+ Ω_η
        
        xx_old[:,:,t+1] .= xx[:,:,t+1]
        xz_old[:,:,t+1] .= xz[:,:,t+1]
        zx_old[:,:,t+1] .= zx[:,:,t+1]
        zz_old[:,:,t+1] .= zz[:,:,t+1]

    end

    return xx,xz,zx,zz

end

#The function propagats the coefficients for the derivative of the cost function wrt L_t for the case without multiplicative noise [see math]. 
#Note that mu_vector is a vector of mxm matrices, where m is the dimension of the state, while alpha_matrix is simply a mxm matrix.
function alpha_matrix_and_mu_vectors_propagation(t, T, K_matrix, L_matrix, A, B, H, dimension_of_control, dimension_of_state)

    alpha_matrix = zeros(dimension_of_state, dimension_of_control, T-1)
    alpha_matrix[:,:,t] .= B[:,:]

    for i in 1:T-t-1
        alpha_matrix[:,:,t+i] .= (A[:,:] .+ B[:,:]*L_matrix[:,:,t+i]) * alpha_matrix[:,:,t+i-1]
    end

    #t = t+1 --> update using L_t = 0
    mu_vector_1_0 = zeros(2, dimension_of_state, dimension_of_state, T)
    mu_vector_0_1 = zeros(2, dimension_of_state, dimension_of_state, T)
   
    mu_vector_1_0_tmp = [zeros(dimension_of_state, dimension_of_state) for _ in 1:2]
    mu_vector_0_1_tmp = [zeros(dimension_of_state, dimension_of_state) for _ in 1:2]
    mu_vector_1_0_tmp[1] .= Diagonal(ones(dimension_of_state))
    mu_vector_0_1_tmp[2] .= Diagonal(ones(dimension_of_state))
    
    M_matrix = [ones(dimension_of_state, dimension_of_state) for _ in 1:2, _ in 1:2]
    
    M_matrix[1,1] .= A[:,:]
    M_matrix[1,2] .= zeros(dimension_of_state, dimension_of_state)
    M_matrix[2,1] .= K_matrix[:,:,t] * H[:,:]
    M_matrix[2,2] .= (A[:,:] .- K_matrix[:,:,t] * H[:,:])
    
    #in such a way we implement a matrix-vector multiplication, where M_matrix is a 4x4 block matrix, composed by mxm matrices, while mu_vector is a 2x1 vector of mxm matrices,
    #where m is the dimension of the state.
    for i in 1:2
        for j in 1:2
            mu_vector_1_0[i,:,:, t+1] .+= M_matrix[i, j] * mu_vector_1_0_tmp[j]
            mu_vector_0_1[i,:,:, t+1] .+= M_matrix[i, j] * mu_vector_0_1_tmp[j]
        end
    end
    
    mu_vector_1_0_tmp[1] .= mu_vector_1_0[1,:,:, t+1]
    mu_vector_1_0_tmp[2] .= mu_vector_1_0[2,:,:, t+1]
    mu_vector_0_1_tmp[1] .= mu_vector_0_1[1,:,:, t+1]
    mu_vector_0_1_tmp[2] .= mu_vector_0_1[2,:,:, t+1]

    #until t=T
    for k in 2:T-t 

        M_matrix[1,1] .= A[:,:]
        M_matrix[1,2] .= B*L_matrix[:,:,t+k-1]
        M_matrix[2,1] .= K_matrix[:,:,t+k-1] * H[:,:]
        M_matrix[2,2] .= (A[:,:] .+ B*L_matrix[:,:,t+k-1] .- K_matrix[:,:,t+k-1] * H[:,:])    

        for i in 1:2
            for j in 1:2
                mu_vector_1_0[i,:,:, t+k] .+= M_matrix[i, j] * mu_vector_1_0_tmp[j]
                mu_vector_0_1[i,:,:, t+k] .+= M_matrix[i, j] * mu_vector_0_1_tmp[j]
            end
        end
        
        mu_vector_1_0_tmp[1] .= mu_vector_1_0[1,:,:, t+k]
        mu_vector_1_0_tmp[2] .= mu_vector_1_0[2,:,:, t+k]
        mu_vector_0_1_tmp[1] .= mu_vector_0_1[1,:,:, t+k]
        mu_vector_0_1_tmp[2] .= mu_vector_0_1[2,:,:, t+k]

    end

    return alpha_matrix, mu_vector_1_0, mu_vector_0_1

end

#The function computes the optimal L_t given the raw moments (moments_matrix_x_z_t --> <x*transpose(z)>, moments_matrix_z_z_t --> <z*transpose(z)>, where x is the state and z the 
#state estimate) at time t, L_{t+1,...,T-1}, K_{t+1,...,T-1} (this dependence is hidden in the coefficients alpha_matrix and mu_vector) [see math].
function L_t_optimal_full_GD_multidimensional_case(n_order, t, alpha_matrix, mu_vector_1_0, mu_vector_0_1, moments_matrix_x_z_t, moments_matrix_z_z_t, L_matrix, Q_matrix, R_matrix, dimension_of_control, dimension_of_state)

    #compute the matrix J_nt --> pxp matrix, where p is the dimension of the control signal u(t)
    J_nt = zeros(dimension_of_control,dimension_of_control)
    J_nt[:,:] .+= 2 .* R_matrix[:,:,t]

    for i in 1:n_order
        J_nt[:,:] .+= 2 .* transpose(alpha_matrix[:,:,t+i-1]) *(Q_matrix[:,:,t+i] .+ transpose(L_matrix[:,:,t+i])*R_matrix[:,:,t+i]*L_matrix[:,:,t+i])* alpha_matrix[:,:,t+i-1]
    end

    #compute the matrix S_nt --> pxm matrix, where p is the dimension of the control signal u(t) and m is the dimensnion of state vector x_t (and therefore of the state estimate)
    S_nt = zeros(dimension_of_control,dimension_of_state)
    for i in 1:n_order
        S_nt[:,:] .+= 2 .* transpose(alpha_matrix[:,:,t+i-1]) *(Q_matrix[:,:,t+i] * mu_vector_1_0[1,:,:,t+i] .+ transpose(L_matrix[:,:,t+i])*R_matrix[:,:,t+i]*L_matrix[:,:,t+i] * mu_vector_1_0[2,:,:,t+i]) 
    end

    #compute the matrix P_nt --> pxm matrix, where p is the dimension of the control signal u(t) and m is the dimensnion of state vector x_t (and therefore of the state estimate)
    P_nt = zeros(dimension_of_control,dimension_of_state)
    for i in 1:n_order
        P_nt[:,:] .+= 2 .* transpose(alpha_matrix[:,:,t+i-1]) *(Q_matrix[:,:,t+i] * mu_vector_0_1[1,:,:,t+i] .+ transpose(L_matrix[:,:,t+i])*R_matrix[:,:,t+i]*L_matrix[:,:,t+i] * mu_vector_0_1[2,:,:,t+i]) 
    end

    L_t_optimal = .- inv(J_nt) * (S_nt * moments_matrix_x_z_t * pinv(moments_matrix_z_z_t) .+ P_nt)

    return L_t_optimal

end

#The function implements the backward control optimization, using the previous functions, "alpha_matrix_and_mu_vectors_propagation" and "L_t_optimal_full_GD_multidimensional_case",
#once the raw moments have been propagated using "raw_moments_propagation_for_full_GD" and given as an input. Note that in the function "L_t_optimal_full_GD_multidimensional_case"
#the oiptimal L_t is computed by inverting <z*transpose(z)>: such a matrix is invertible if det(<z*transpose(z)>) != 0. However, due to the initial conditions, if we don't set
#any intial uncertainty on z_1 (as it's done in Todorov's paper), we would have det(<z*transpose(z)>) = 0, and a quite small determinant for the time steps to come. This results in 
#a degeneracy of solutions, at least for the first time steps. The algorithm can still be run, as we use the pseudoinverse instead of the inverse: note that pinv() always exists, and,
#when a matrix is invertyible, corresponds to the actual inverse matrix. This degeneracy makes the solution be different from Todorov's one (when no internal noise is considered).
#To match the same control gains, it suffices to use Todorov's solutions for the first time steps (commented lines in this function). However, this procedure doesn't seem to be 
#necessary, as performance is already mathced without the need for further adjustments, even if the extracted optimal control gains don't look alike for an initial transient. 
#The degeneracy is due to the fact that z_1 will have only a few components different from zero: therefore, only a few components of L_t will be relevant, leaving the othres 
#free to assume different values, without affecting the performance. 
#This is a simple implementation of the control optimization:

# K_matrix = zeros(dimension_of_state, observation_dimension,N_steps)
# K_matrix[:,:,1:N_steps-1] .= K_TOD[:,:,:]
# L_matrix_optimal_full_GD = zeros(dimension_of_control, dimension_of_state, N_steps-1)
# L_matrix_optimal_full_GD[:,:,:] .= L_TOD[:,:,:] #L_matrix_optimal_full_GD start 

# for i in 1:iterations_control_optimization_full_GD

#     _,moments_matrix_x_z,_,moments_matrix_z_z = raw_moments_propagation_for_full_GD(S_0, K_matrix, L_matrix_optimal_full_GD, N_steps, A, B, C, D, H, Ω_ξ, Ω_η, Ω_ω, dimension_of_state)
#     L_matrix_optimal_full_GD[:,:,:] .= control_optimization_full_GD_multidimensional_case_backward(n_order_vec, K_matrix, moments_matrix_x_z, moments_matrix_z_z, Q_matrix, R_matrix, N_steps, A, B, H, dimension_of_control, dimension_of_state)
#     #L_matrix_optimal_full_GD[:,:,1] .= L_TOD[:,:,1]

# end
function control_optimization_full_GD_multidimensional_case_backward(n_order_vec, K_matrix, moments_matrix_x_z, moments_matrix_z_z, Q_matrix, R_matrix, T, A, B, H, dimension_of_control, dimension_of_state)

    L_matrix_optimal = zeros(dimension_of_control, dimension_of_state, T)
    moments_matrix_x_z_t = zeros(dimension_of_state, dimension_of_state)
    moments_matrix_z_z_t = zeros(dimension_of_state, dimension_of_state)

    for j in 1:T-1
    #for j in 1:T-2
        t = T-j

        #propagation of alpha matrix and mu vector
        alpha_matrix, mu_vector_1_0, mu_vector_0_1 = alpha_matrix_and_mu_vectors_propagation(t, T, K_matrix, L_matrix_optimal, A, B, H, dimension_of_control, dimension_of_state)
        
        #compute the optimal L_t
        n_order = Int(n_order_vec[t])
        moments_matrix_x_z_t[:,:] .= moments_matrix_x_z[:,:,t]
        moments_matrix_z_z_t[:,:] .= moments_matrix_z_z[:,:,t]
        L_matrix_optimal[:,:,t] .= L_t_optimal_full_GD_multidimensional_case(n_order, t, alpha_matrix, mu_vector_1_0, mu_vector_0_1, moments_matrix_x_z_t, moments_matrix_z_z_t, L_matrix_optimal, Q_matrix, R_matrix, dimension_of_control, dimension_of_state)

    end

    #return L_matrix_optimal[:,:,2:T-1]
    return L_matrix_optimal[:,:,1:T-1]

end
