# Functions used in the simulations
# The function for nonuniform Poisson sampling
function sample_poi(y, pi_P, n)
    # Poisson sampling: every observation is kept or dropped independently.
    # Observation i is kept when a fresh uniform draw does not exceed
    #     y[i] + (1 - y[i]) * n * pi_P[i],
    # so entries with y == 1 are always kept and entries with y == 0 are kept
    # with probability n * pi_P[i].
    # Returns the indices of the kept observations.
    accept_prob = y .+ (1 .- y) .* n .* pi_P
    # `rand.()` fused into the comparison draws one uniform per element.
    keep = rand.() .<= accept_prob
    return (1:length(y))[keep]
end


# The function of case-control sampling for pilot estimators
function cc(y::Vector{Bool}, Np::Int64)
    # Case-control sampling with expected size Np, split evenly between the
    # two classes: an observation with y == true is kept with probability
    # Np / (2 * N1) and one with y == false with probability Np / (2 * N0).
    #
    # Returns (idx, probs, N1, N0):
    #   idx   - indices of the sampled observations
    #   probs - the probability used for each sampled observation
    #   N1/N0 - number of true / false entries in y
    #
    # NOTE(review): the returned probabilities are not capped at 1, so they
    # exceed 1 whenever Np > 2 * N1 (or 2 * N0) -- confirm this is intended.
    N = length(y)
    N1 = sum(y)
    N0 = N - N1
    p0 = Np / 2N0            # probability for y == false
    pd = Np / 2N1 - p0       # extra probability added when y == true
    selected = rand.() .<= pd .* y .+ p0
    idx = (1:N)[selected]
    incl_prob = y[idx] .* pd .+ p0
    return idx, incl_prob, N1, N0
end


# The function to compute A-OS optimal subsampling probabilities
function optA(X, y, G, N, N0, beta_plt, alpha_plt, idx_plt, G_plt, π_plt)
    # X         : covariates multiplied by beta_plt to form the linear predictor
    # y         : Bool response (negated with `.!` below)
    # G         : design matrix whose rows are scored
    # N         : number of observations (length of the returned vector)
    # N0        : not used in this function
    # beta_plt, alpha_plt : pilot slope vector and intercept
    # idx_plt, G_plt, π_plt : pilot-sample indices, the matching rows of the
    #             design matrix and the pilot sampling probabilities
    #
    # Returns a length-N vector of scores, normalised so that the entries with
    # y == false sum to one.

    # Fitted probabilities for all observations under the pilot estimate.
    linpred = Vector{Float64}(undef, N)
    mul!(linpred, X, beta_plt)
    prob = 1 ./ (1 .+ exp.(-alpha_plt .- linpred))

    # H = G_plt' * diag(p (1 - p) / π_plt) * G_plt / N on the pilot sample.
    prob_plt = prob[idx_plt]
    weighted_G = G_plt .* prob_plt .* (1.0 .- prob_plt) ./ π_plt
    H = (G_plt' * weighted_G) ./ N

    # Iterating an SVD yields (U, S, V); V * diag(1 ./ S) * U' inverts H.
    U, sv, V = svd(H)
    Hinv = V * (1 ./ sv .* U')

    # Squared Euclidean norm of each row of G * Hinv.
    GHinv = similar(G)
    mul!(GHinv, G, Hinv)
    rownorm2 = vec(sum(GHinv .^ 2, dims=2))

    score = prob .* sqrt.((1 .- prob) .* rownorm2)
    control_total = sum(score[.!y])
    return score ./ control_total
end


# The function to compute P-OS optimal subsampling probabilities
function optIV(X, y, G, N, N0, beta_plt, alpha_plt, idx_plt, G_plt, π_plt)
    # Arguments are the same as for the other opt* functions in this file:
    # X (covariates), y (Bool response), G (design matrix to score), N (number
    # of observations), N0 (unused here), the pilot estimate
    # (beta_plt, alpha_plt) and the pilot sample (idx_plt, G_plt, π_plt).
    #
    # Rows of G are scored through the transform H^{-1} * S^{1/2}, where
    #   H = G_plt' * diag(p (1 - p)     / π_plt) * G_plt / N
    #   S = G_plt' * diag(p^2 (1 - p)^2 / π_plt) * G_plt / N
    # Returns a length-N vector normalised so that the entries with
    # y == false sum to one.

    # Fitted probabilities for all observations under the pilot estimate.
    linpred = Vector{Float64}(undef, N)
    mul!(linpred, X, beta_plt)
    prob = 1 ./ (1 .+ exp.(-alpha_plt .- linpred))

    prob_plt = prob[idx_plt]
    weighted_G = G_plt .* prob_plt .* (1.0 .- prob_plt) ./ π_plt
    H = (G_plt' * weighted_G) ./ N

    # Multiply the weights by p (1 - p) once more to build S, then take the
    # matrix square root of its Hermitian wrapper.
    weighted_G2 = weighted_G .* prob_plt .* (1.0 .- prob_plt)
    S = (G_plt' * weighted_G2) ./ N
    Shalf = sqrt(Hermitian(S))

    # Iterating an SVD yields (U, S, V); V * diag(1 ./ S) * U' inverts H.
    U, sv, V = svd(H)
    A = V * (1 ./ sv .* U') * Shalf

    # Squared Euclidean norm of each row of G * A.
    GA = similar(G)
    mul!(GA, G, A)
    rownorm2 = vec(sum(GA .^ 2, dims=2))

    score = prob .* sqrt.((1 .- prob) .* rownorm2)
    control_total = sum(score[.!y])
    return score ./ control_total
end


# The function to compute L-OS optimal subsampling probabilities
function optL(X, y, G, N, N0, beta_plt, alpha_plt, idx_plt, G_plt, π_plt)
    # Same signature as optA / optIV so the three can be called
    # interchangeably; N0, idx_plt, G_plt and π_plt are not used here.
    #
    # Each observation is scored as p * sqrt((1 - p) * ||g||^2), where p is
    # the fitted probability under the pilot estimate and g the matching row
    # of G. Returns a length-N vector normalised so that the entries with
    # y == false sum to one.
    linpred = Vector{Float64}(undef, N)
    mul!(linpred, X, beta_plt)
    prob = 1 ./ (1 .+ exp.(-alpha_plt .- linpred))

    rownorm2 = vec(sum(G .^ 2, dims=2))
    score = prob .* sqrt.((1 .- prob) .* rownorm2)
    return score ./ sum(score[.!y])
end



# The function to do pilot estimation.
# X: matrix of covariates
# y: vector of response
# Np: Expected size of pilot sample
# nlambda: parameter to control the number of lasso tuning parameters (lambda) to be selected
# eps: parameter to control the range of lasso tuning parameter
#      lambda_min = eps * lambda_max
# alpha: penalty = alpha*L1 + (1-alpha)*L2. When alpha=1, the penalty is lasso.
# standardize: option that determines if we standardize the covariates
function PilotEst2(X, y, Np; nlambda = 100, eps = 1e-4, criterion = "A-opt",
                   alpha = 1, standardize=true)
    # Pilot estimation on a case-control subsample.
    #
    # Steps:
    #   1. draw a pilot sample with `cc(y, Np)`;
    #   2. optionally centre/scale the pilot covariates;
    #   3. fit a penalised logistic path and pick the MinAIC coefficients;
    #   4. map the coefficients back to the original covariate scale and
    #      subtract log(N0 / N1) from the intercept;
    #   5. keep the covariates with non-zero slope (`fscr_idx`) and estimate
    #      their mean and standard deviation with weights 1 / π_plt.
    #
    # Keywords:
    #   eps         - passed to the fit as λminratio
    #   alpha       - passed to the fit as α
    #   standardize - centre/scale the pilot covariates before fitting
    #   nlambda     - accepted but not read by this function
    #   criterion   - accepted but not read by this function; it now has a
    #                 default so that callers may omit it
    #
    # Returns a NamedTuple with beta_plt, alpha_plt, fscr_idx, cv_plt, idx_plt,
    # π_plt, X_plt (pilot rows restricted to fscr_idx, original scale), y_plt,
    # stdXest, meanXest, N, N1, N0 and N_plt.
    (N, d) = size(X)

    idx_plt, π_plt, N1, N0 = cc(y, Np)
    y_plt = y[idx_plt]
    X_fit = X[idx_plt, :]

    if standardize == true
        # Only columns whose pilot maximum is neither 0 nor 1 are centred and
        # scaled (presumably this skips indicator columns -- confirm).
        colmax = maximum(X_fit, dims=1)
        colstd = vec(colmax .!= 1 .&& colmax .!= 0)
        Xstd = ones(d)
        Xstd[colstd] = vec(std(X_fit[:, colstd], dims=1))
        Xmean = zeros(d)
        Xmean[colstd] = vec(mean(X_fit[:, colstd], dims=1))
        X_fit = (X_fit .- Xmean') ./ Xstd'
    end

    cv_plt = fit(LassoPath, X_fit, y_plt, Binomial(); standardize=false,
                 irls_maxiter=100, λminratio=eps, α=alpha)

    # Select the coefficients once and split them into intercept and slopes.
    selected = Lasso.coef(cv_plt; select=MinAIC())
    beta_plt = Array(selected[2:end])
    alpha_plt = selected[1] - log(N0/N1)
    if standardize == true
        # Undo the centring/scaling so the estimates apply to the raw X.
        beta_plt = beta_plt ./ Xstd
        alpha_plt = alpha_plt - sum(Xmean .* beta_plt)
    end

    # First-stage screening and weighted moments of the retained covariates.
    fscr_idx = findall(!=(0), beta_plt)
    X_plt = X[idx_plt, fscr_idx]
    meanXest = vec(sum(X_plt ./ π_plt, dims=1)) / N
    varXest = vec(sum((X_plt .- meanXest') .^ 2 ./ π_plt, dims=1)) / N
    stdXest = sqrt.(varXest)

    return (beta_plt = beta_plt, alpha_plt = alpha_plt, fscr_idx = fscr_idx,
            cv_plt = cv_plt,
            idx_plt = idx_plt, π_plt = π_plt, X_plt = X_plt, y_plt = y_plt,
            stdXest = stdXest, meanXest = meanXest,
            N = N,  N1 = N1, N0 = N0, N_plt = length(idx_plt))
end


# The function to compute optimal subsampling probabilities
# X: matrix of covariates
# y: vector of response
# rho : expected sampling rate
# lthr: lower threshold for sampling probabilities to make algorithm more stable
# criterion: method used to compute the optimal probabilities ("A-opt", "P-opt" or "L-opt").
# standardize: option that determines if we standardize the covariates
function PiOpt(X, y, plt, rho, lthr, criterion; standardize=false)
    # Compute clipped subsampling probabilities for every observation.
    #
    # X           : covariates restricted to the screened columns
    # y           : response vector
    # plt         : pilot result; the fields read here are beta_plt, fscr_idx,
    #               alpha_plt, N, N0, idx_plt, π_plt and (when standardising)
    #               stdXest
    # rho         : expected sampling rate
    # lthr        : lower bound applied to the probabilities
    # criterion   : "A-opt", "P-opt" or "L-opt"
    # standardize : scale the columns of X by plt.stdXest before scoring
    #
    # Returns min.(max.(rho * N * π, lthr), 1), with π the normalised scores
    # from the chosen criterion.
    # Throws ArgumentError for an unknown criterion (previously the function
    # silently returned `nothing`).
    if criterion == "A-opt"
        scorer = optA
    elseif criterion == "P-opt"
        scorer = optIV
    elseif criterion == "L-opt"
        scorer = optL
    else
        throw(ArgumentError(
            "unknown criterion \"$criterion\"; expected \"A-opt\", \"P-opt\" or \"L-opt\""))
    end

    beta_plt = plt.beta_plt[plt.fscr_idx]
    N = plt.N
    idx_plt = plt.idx_plt

    # Design matrix: intercept column followed by the (optionally scaled) X.
    if standardize == true
        G = [ones(N) X ./ plt.stdXest']
    else
        G = [ones(N) X]
    end
    G_plt = G[idx_plt, :]

    scores = scorer(X, y, G, N, plt.N0, beta_plt, plt.alpha_plt, idx_plt, G_plt,
                    plt.π_plt)
    return min.(max.(rho .* N .* scores, lthr), 1)
end


# Main function to do two-step subsampling estimation
# X: matrix of covariates
# y: vector of response
# gamma: parameter of adaptive lasso penalty (1/|β|^γ)
# rho: expected sampling ratio
# lthr: a lower threshold of optimal sampling probabilities for algorithm stability
# nlambda: number of tuning parameter λ of adaptive lasso to be selected
# eps: value to control range of λ. We use lambda_min = eps*lambda_max
# lambda_max: max value of λ. When choose "auto", the max value is determined by data
#             User-specified value can also be given
# fld: number of fold of cross validation.
# method: method of model selection. Options: likelihood, BIC, AIC, and AUC
# criterion: method to compute optimal subsampling probabilities, Options: A-opt, L-opt, P-opt
# standardize: whether to standardize the data
function SubsampleEst2(X, y, plt, gamma, rho, lthr;
                       nlambda = 100, eps = 0.001, lambda_max = "auto", fld = 5,
                       method = "likelihood", criterion = "A-opt",
                       standardize=true)
    # Two-step subsampling estimator.
    #
    #   1. compute sampling probabilities `Pi` with PiOpt on the covariates
    #      retained by the pilot screening (plt.fscr_idx);
    #   2. keep every observation with y == 1 and each other observation with
    #      its probability in `Pi`;
    #   3. fit an adaptive lasso on the subsample through `lassocv`, with
    #      penalty weights 1 / |pilot slope|^gamma and `Pi` passed along;
    #   4. report the coefficients on the original covariate scale together
    #      with the full-data misclassification rate and AUC.
    #
    # A numeric `lambda_max` is divided by the subsample size in both the
    # standardised and the non-standardised case (the latter used to leave it
    # unscaled, unlike the standardised case and UniEst).
    #
    # Returns a NamedTuple (adpbetas, alpha, path, fscr_idx, auc, flsrt,
    # sscr_idx, lambda, subidx, Pi, splPi, message). When `lassocv` reports
    # "Not converge" the estimate-related fields are NaN.
    (N, p) = size(X)
    idx1 = y .== 1
    beta_plt = plt.beta_plt
    fscr_idx = plt.fscr_idx      # first-stage screening
    Xslct = @view X[:, fscr_idx]
    dostd = standardize == true

    # Sampling probabilities; observations with y == 1 are always retained.
    Pi = PiOpt(Xslct, y, plt, rho, lthr, criterion, standardize=dostd)
    splPi = copy(Pi)
    splPi[idx1] .= 1
    subidx = rand(N) .<= splPi
    ysub = y[subidx]
    Xsub = X[subidx, fscr_idx]
    nsub = length(ysub)

    # Largest penalty, expressed per observation.
    if lambda_max == "auto"
        lam_max = maximum(abs.(vec(sum(ysub .* Xsub, dims=1)))) / nsub
    else
        lam_max = lambda_max / nsub
    end

    # Column scales: subsample standard deviations, or ones when no scaling
    # is requested (multiplying/dividing by 1.0 leaves values unchanged).
    if dostd
        Xsstd = vec(std(Xsub, dims=1))
        Xsub = Xsub ./ Xsstd'
    else
        Xsstd = ones(size(Xsub, 2))
    end
    adp_weight = 1 ./ (abs.(beta_plt[fscr_idx] .* Xsstd) .^ gamma)

    cv = lassocv(Xsub, ysub, Pi=Pi[subidx], penalty_factor = adp_weight,
                 lambda_max = lam_max,
                 nlambda = nlambda, eps=eps, fld = fld,
                 method = method)
    if cv.message == "Not converge"
        return (adpbetas = NaN, alpha = NaN, path = NaN, fscr_idx = fscr_idx,
                auc = NaN, flsrt = NaN,
                sscr_idx = NaN, lambda = NaN, subidx = NaN, Pi = Pi, splPi = splPi,
                message = cv.message)
    end

    # Back to the original covariate scale, then evaluate on the full data.
    adpbetas = zeros(p)
    adpbetas[fscr_idx] .= cv.coef[2:end] ./ Xsstd
    sscr_idx = findall(!=(0), adpbetas)
    prb = 1 .- 1 ./ (1 .+ exp.(cv.coef[1] .+ X * adpbetas))
    clsf = prb .> 0.5
    flsrt = mean(abs.(y .- clsf))
    auc = AUC(y, prb, 2)

    return (adpbetas = adpbetas, alpha = cv.coef[1], path = cv.path, fscr_idx = fscr_idx,
            auc = auc, flsrt = flsrt,
            sscr_idx = sscr_idx, lambda = cv.lambda,
            subidx = subidx, Pi = Pi, splPi = splPi,
            message = cv.message)
end


# A function to compute AUC values
function AUC(y, P, method)
    # Area under the ROC curve from the rank-sum formula
    #     (sum of ranks of the y == 1 observations) / (N1 * N0)
    #         - (N1 + 1) / (2 * N0),
    # where ranks come from sorting the scores in ascending order.
    #
    # method == 1 : P is an N x k matrix of scores; returns a length-k vector
    #               with one AUC per column.
    # method == 2 : P is a vector of scores; returns a single AUC.
    # Any other `method` falls through and returns `nothing`.
    if method == 1
        (N, nlambda) = size(P)
        N1 = sum(y)
        N0 = N - N1
        auc = zeros(nlambda)
        for col in 1:nlambda
            order = sortperm(P[:, col])
            acc = 0.0
            for (rank, obs) in enumerate(order)
                if y[obs] == 1
                    acc += rank / (N1 * N0)
                end
            end
            auc[col] = acc - (N1 + 1) / (2N0)
        end
        return auc
    elseif method == 2
        N = length(P)
        N1 = sum(y)
        N0 = N - N1
        order = sortperm(P)
        acc = 0.0
        for (rank, obs) in enumerate(order)
            if y[obs] == 1
                acc += rank / (N1 * N0)
            end
        end
        return acc - (N1 + 1) / (2N0)
    end
end


# Main function to fit the adaptive lasso and apply cross-validation to select the tuning parameter λ
function lassocv(X, y;
                 Pi = ones(length(y)),
                 weight = ones(length(y)),
                 penalty_factor = ones(size(X,2)),
                 nlambda=100, lambda_max = 2000, eps=0.01,
                 fld = 5, method="likelihood")
    # Fit a penalised logistic path with offset -log(Pi) and choose λ by
    # fold-wise validation.
    #
    # Pi             : sampling probabilities; -log.(Pi) is used as offset
    # weight         : observation weights passed to the fit as `wts`
    # penalty_factor : per-coefficient penalty weights
    # nlambda, lambda_max, eps : the grid is lambda_max * eps^t for nlambda
    #                  equally spaced t in [0, 1]
    # fld            : number of folds; folds are contiguous blocks of
    #                  floor(N / fld) rows, so trailing rows that do not fill
    #                  a block are never held out
    # method         : "likelihood", "auc", "bic" or "aic"; any other value
    #                  logs a warning and stops the fold loop early
    #
    # Returns (coef, path, lambda, best, meanloss, message). If a fit throws,
    # the exception is printed and a tuple with the same keys, NaN values and
    # message "Not converge" is returned.
    N = size(X, 1)
    X1 = Matrix(X)
    m = floor(Int64, N / fld)
    loss = zeros(nlambda, fld)
    message = "Successful convergence"
    lambda = lambda_max .* eps .^ [0:1/(nlambda-1):1;]
    l = -log.(Pi)
    failure = (coef = NaN, path = NaN, lambda = NaN, best = NaN, meanloss = NaN,
               message = "Not converge")

    for i in 1:fld
        tst_idx = [i*m-m+1:i*m;]
        tr_idx = setdiff([1:N;], tst_idx)
        Xtr = X1[tr_idx, :]
        ytr = y[tr_idx]
        ltr = l[tr_idx]
        wtr = weight[tr_idx]
        est = try
            fit(LassoPath, Xtr, ytr, Binomial();
                offset=ltr, wts=wtr,
                standardize=false, stopearly=false,
                penalty_factor=penalty_factor,
                λ=lambda)
        catch e
            # Report the cause instead of discarding it.
            println(e)
            return failure
        end
        betas = est.coefs
        alpha0 = est.b0
        # Model size per λ: non-zero slopes plus the intercept.
        num_var = vec(sum(betas .!= 0, dims=1) .+ 1)

        ytst = y[tst_idx]
        ptst = Pi[tst_idx]
        # Held-out linear predictors, one column per λ (offset excluded).
        g = alpha0' .+ X1[tst_idx, :]*betas
        n = length(tr_idx)
        if method == "likelihood"
            loss[:, i] = -sum.(eachcol(ytst .* g .- log.(1 .+ exp.(g) ./ ptst))) ./ n
        elseif method == "auc"
            loss[:, i] = -AUC(ytst, 1 .- 1 ./ (1 .+ exp.(g)), 1)
        elseif method == "bic"
            loss[:, i] = -2 .* sum.(eachcol(ytst .* g .- log.(1 .+ exp.(g) ./ ptst))) .+ log(n) .* num_var
        elseif method == "aic"
            loss[:, i] = -2 .* sum.(eachcol(ytst .* g .- log.(1 .+ exp.(g) ./ ptst))) .+ 2 .* num_var
        else
            @warn("Cross Validation method not found")
            break
        end
    end

    # λ with the smallest loss averaged over folds, then refit on all rows.
    mnloss = mean.(eachrow(loss))
    (meanloss, best) = findmin(mnloss)
    cv = try
        fit(LassoPath, X1, y, Binomial();
             offset = l, wts = weight,
             standardize=false, stopearly=false,
             penalty_factor=penalty_factor,
             λ=[lambda[best]])
    catch e
        println(e)
        return failure
    end
    coef = [cv.b0[1]; (Matrix(cv.coefs)[:,1])]
    return (coef = coef, path = cv.coefs,lambda = lambda[best],
            best = best, meanloss = meanloss,
            message = message)
end


# Main function to do uniform sampling estimation
# X: matrix of covariates
# y: vector of response
# gamma: parameter of adaptive lasso penalty (1/|β|^γ)
# rho: expected sampling ratio
# lthr: not used by this function (uniform sampling has no probability threshold)
# nlambda: number of tuning parameter λ of adaptive lasso to be selected
# eps: value to control range of λ. We use lambda_min = eps*lambda_max
# lambda_max: max value of λ. When choose "auto", the max value is determined by data
#             User-specified value can also be given
# fld: number of fold of cross validation.
# method: method of model selection. Options: likelihood, BIC, AIC, and AUC
# criterion: not a parameter of this function (uniform sampling uses no optimality criterion)
# standardize: whether to standardize the data
function UniEst(X, y, plt, gamma, rho, lthr;
                nlambda = 100, eps = 0.001, fld = 5,
                lambda_max = "auto",
                method = "likelihood", standardize=false)
    # Uniform-subsampling estimator.
    #
    #   1. keep every observation with y != 0 and each observation with
    #      y == 0 with probability rho;
    #   2. fit an adaptive lasso on the subsample through `lassocv`, with
    #      penalty weights 1 / |pilot slope|^gamma on the covariates whose
    #      pilot slope is non-zero;
    #   3. add log(rho) to the fitted intercept and evaluate the
    #      misclassification rate and AUC on the full data.
    #
    # With standardize == true the subsample covariates are divided by their
    # standard deviations before fitting, and the fitted slopes are divided by
    # the same scales afterwards so they apply to the unscaled X (this
    # back-transformation used to be missing).
    #
    # `lthr` is accepted but not read.
    #
    # Returns a NamedTuple (adpbetas, alpha, path, fscr_idx, auc, flsrt,
    # sscr_idx, lambda, subidx, message). When `lassocv` reports
    # "Not converge" the estimate-related fields are NaN.
    (N, p) = size(X)
    beta_plt = plt.beta_plt
    fscr_idx = findall(!=(0), beta_plt)      # first-stage screening

    # Inclusion probabilities: 1 for y != 0, rho for y == 0.
    pi_uni = ones(N)
    pi_uni[findall(==(0), y)] .= rho
    subidx = rand(N) .<= pi_uni
    ysub = y[subidx]
    Xsub = X[subidx, fscr_idx]
    nsub = length(ysub)

    # Largest penalty, expressed per observation.
    if lambda_max == "auto"
        lam_max = maximum(abs.(vec(sum(ysub .* Xsub, dims=1)))) / nsub
    else
        lam_max = lambda_max / nsub
    end

    # Column scales: subsample standard deviations, or ones when no scaling
    # is requested (multiplying/dividing by 1.0 leaves values unchanged).
    if standardize == true
        Xsstd = vec(std(Xsub, dims=1))
        Xsub = Xsub ./ Xsstd'
    else
        Xsstd = ones(size(Xsub, 2))
    end
    adp_weight = 1 ./ (abs.(beta_plt[fscr_idx] .* Xsstd) .^ gamma)

    cv = lassocv(Xsub, ysub, penalty_factor = adp_weight,
                 lambda_max = lam_max,
                 nlambda = nlambda, eps=eps, fld = fld,
                 method = method)
    if cv.message == "Not converge"
        return (adpbetas = NaN, alpha = NaN, path = NaN, fscr_idx = fscr_idx,
                auc = NaN, flsrt = NaN,
                sscr_idx = NaN, lambda = NaN, subidx = NaN,
                message = cv.message)
    end

    # Slopes on the original covariate scale; intercept corrected for the
    # subsampling of the y == 0 observations.
    adpbetas = zeros(p)
    adpbetas[fscr_idx] .= cv.coef[2:end] ./ Xsstd
    sscr_idx = findall(!=(0), adpbetas)
    alpha = cv.coef[1] .+ log(rho)
    prb = 1 .- 1 ./ (1 .+ exp.(alpha .+ X * adpbetas))
    clsf = prb .> 0.5
    flsrt = mean(abs.(y .- clsf))
    auc = AUC(y, prb, 2)

    return (adpbetas = adpbetas, alpha = alpha, path = cv.path, fscr_idx = fscr_idx,
            auc = auc, flsrt = flsrt,
            sscr_idx = sscr_idx, lambda = cv.lambda,
            subidx = subidx,
            message = cv.message)
end


# A function to compute full data estimators and apply cross validation to select λ
function lassoBIC(X, y;
                  penalty_factor = ones(size(X,2)),
                  nlambda=100, lambda_max = 2000, eps=0.01,
                  fld = 5, method="likelihood")
    # Full-data penalised logistic fit with λ chosen by fold-wise validation.
    #
    # penalty_factor : per-coefficient penalty weights
    # nlambda, lambda_max, eps : the grid is lambda_max * eps^t for nlambda
    #                  equally spaced t in [0, 1]
    # fld            : number of folds; folds are contiguous blocks of
    #                  floor(N / fld) rows
    # method         : "likelihood", "auc", "bic" or "aic"; any other value
    #                  logs a warning and stops the fold loop early
    #
    # The final fit uses the two-point path [lambda_max, lambda[best]] and the
    # coefficients of its second entry are returned.
    # Returns (coef, path, best, meanloss, message).
    N, _ = size(X)
    Xdense = Matrix(X)
    foldsize = floor(Int64, N / fld)
    cvloss = zeros(nlambda, fld)
    lambda = lambda_max .* eps .^ [0:1/(nlambda-1):1;]

    for k in 1:fld
        held_out = collect((k*foldsize - foldsize + 1):(k*foldsize))
        kept = setdiff(collect(1:N), held_out)
        fold_fit = fit(LassoPath, Xdense[kept, :], y[kept], Binomial();
                       standardize=false, stopearly=false,
                       penalty_factor=penalty_factor,
                       λ=lambda)
        slopes = fold_fit.coefs
        intercepts = fold_fit.b0
        # Model size per λ: non-zero slopes plus the intercept.
        model_size = vec(sum(slopes .!= 0, dims=1) .+ 1)

        # Held-out linear predictors, one column per λ.
        g = intercepts' .+ Xdense[held_out, :]*slopes
        n = length(kept)
        if method == "likelihood"
            cvloss[:, k] = -sum.(eachcol(y[held_out] .* g .- log.(1 .+ exp.(g)))) ./ n
        elseif method == "auc"
            cvloss[:, k] = -AUC(y[held_out], 1 .- 1 ./ (1 .+ exp.(g)), 1)
        elseif method == "bic"
            cvloss[:, k] = -2 .* sum.(eachcol(y[held_out] .* g .- log.(1 .+ exp.(g)))) .+ log(n) .* model_size
        elseif method == "aic"
            cvloss[:, k] = -2 .* sum.(eachcol(y[held_out] .* g .- log.(1 .+ exp.(g)))) .+ 2 .* model_size
        else
            @warn("Cross Validation method not found")
            break
        end
    end

    # λ with the smallest loss averaged over folds, then refit on all rows.
    avgloss = mean.(eachrow(cvloss))
    (meanloss, best) = findmin(avgloss)
    final_fit = fit(LassoPath, Xdense, y, Binomial();
                    standardize=false, stopearly=false,
                    penalty_factor=penalty_factor,
                    λ=[lambda_max, lambda[best]])
    coef = [final_fit.b0[2]; (Matrix(final_fit.coefs)[:,2])]
    return (coef = coef, path = final_fit.coefs,
            best = best, meanloss = meanloss,
            message = "Successful convergence")
end


