import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
## Suppress warnings: replace warnings.warn with a no-op ##
def warn(*args, **kwargs):
pass
import warnings
warnings.warn = warn
def data_split(X_full, Y_full, train_size):
    '''
    Randomly sample train_size points from the full dataset as training data;
    the remaining points are returned as test (or validation) data.
    X_full: full X data matrix (for communities/meps/blog)
    Y_full: full Y vector (for communities/meps/blog)
    Return: list [X_train, X_test, Y_train, Y_test]
    '''
    n_tot = len(Y_full)
    idx = np.random.choice(n_tot, train_size, replace=False)
    # boolean mask selecting the held-out rows
    not_idx = np.ones(n_tot, dtype=bool)
    not_idx[idx] = False
    X_train = X_full[idx, :]
    X_test = X_full[not_idx, :]
    Y_train = Y_full[idx]
    Y_test = Y_full[not_idx]
    return [X_train, X_test, Y_train, Y_test]
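# Quick sanity check for data_split on synthetic data (a sketch; the sizes and
# seed below are illustrative and independent of the experiments that follow).
np.random.seed(0)
_X_demo = np.random.randn(50, 3)
_Y_demo = np.random.randn(50)
_Xtr, _Xte, _Ytr, _Yte = data_split(_X_demo, _Y_demo, train_size=10)
assert _Xtr.shape == (10, 3) and _Xte.shape == (40, 3)
assert len(_Ytr) == 10 and len(_Yte) == 40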
def generate_bootstrap_samples(n, m, B):
'''
Return: B-by-m matrix, where row b gives the indices for b-th bootstrap sample
'''
    samples_idx = np.zeros((B, m), dtype=int)
    for b in range(B):
        # draw m indices with replacement from {0, ..., n-1}
        samples_idx[b, :] = np.random.choice(n, m)
    return samples_idx
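# Sketch: each row of the returned matrix indexes one bootstrap sample
# (the n, m, B values below are illustrative).
_idx_demo = generate_bootstrap_samples(n=10, m=6, B=3)
assert _idx_demo.shape == (3, 6) and _idx_demo.max() < 10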
def fit_bootstrap_models(X_train, Y_train, X_predict, fit_muh_fun, n, m, B):
    '''
    Train B bootstrap estimators and compute their predictions on X_predict.
    Return: list [samples_idx, predictions], where
      samples_idx = B-by-m matrix; row b gives the indices of the b-th bootstrap sample
      predictions = B-by-n1 matrix; row b gives the b-th estimator's predictions
                    on X_predict (n1 = len(X_predict))
    '''
samples_idx = generate_bootstrap_samples(n, m, B)
n1 = len(X_predict)
# P holds the predictions from individual bootstrap estimators
predictions = np.zeros((B, n1), dtype=float)
for b in range(B):
predictions[b] = fit_muh_fun(X_train[samples_idx[b], :],\
Y_train[samples_idx[b], ], X_predict)
    return [samples_idx, predictions]
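# Sketch: fit_bootstrap_models with a trivial constant-mean base learner (the
# learner, sizes, and B below are illustrative assumptions, not the experiments').
def _mean_learner(Xtr, Ytr, Xte):
    # predict the training-set mean everywhere
    return np.repeat(Ytr.mean(), len(Xte))
_Xs = np.random.randn(50, 3)
_Ys = np.random.randn(50)
_idx_s, _preds_s = fit_bootstrap_models(_Xs, _Ys, _Xs[:5], _mean_learner, n=50, m=30, B=4)
assert _preds_s.shape == (4, 5)  # B rows, one column per row of X_predict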
# Define Baseline Regressor to be used later
def ridge2(X, Y, X1, ridge_mult=0.001):
    # Named ridge2 to distinguish it from sklearn's Ridge regressor. The penalty
    # lam scales with the squared largest singular value of X, and the intercept
    # is left unpenalized.
    lam = ridge_mult * np.linalg.svd(X, compute_uv=False).max()**2
    X_aug = np.c_[np.ones(X.shape[0]), X]                   # prepend an intercept column
    penalty = lam * np.diag(np.r_[0, np.ones(X.shape[1])])  # zero penalty on the intercept
    betahat = np.linalg.solve(X_aug.T.dot(X_aug) + penalty, X_aug.T.dot(Y))
    return betahat[0] + X1.dot(betahat[1:])
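# Optional cross-check (a sketch; it assumes sklearn's Ridge, which also leaves
# the intercept unpenalized, agrees with ridge2 at the same penalty lam):
from sklearn.linear_model import Ridge
_Xr = np.random.randn(30, 4)
_Yr = np.random.randn(30)
_lam = 0.001 * np.linalg.svd(_Xr, compute_uv=False).max()**2
assert np.allclose(ridge2(_Xr, _Yr, _Xr),
                   Ridge(alpha=_lam).fit(_Xr, _Yr).predict(_Xr), atol=1e-6)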
def neural_net(X,Y,X1):
nnet = MLPRegressor(solver='lbfgs',activation='logistic').fit(X,Y)
return nnet.predict(X1)
def random_forest_nB(X, Y, X1, ntree=20):
    # With bootstrap=False each tree is trained on all rows of X, so randomness
    # enters only through the feature (column) subsampling at the splits.
    # Note: scikit-learn >= 1.0 renames criterion 'mae' to 'absolute_error'.
    rf = RandomForestRegressor(n_estimators=ntree, criterion='mae', bootstrap=False).fit(X, Y)
    return rf.predict(X1)
Include mean, median, and trimmed mean aggregation
def compute_PIs_jacknife_plus_after_bootstrap(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
    '''
    Jackknife+-after-bootstrap (J+aB) prediction intervals.
    aggre selects the aggregation across bootstrap estimators:
    'mean', 'median', or 'tmean' (25% trimmed mean).
    '''
n=len(X)
n1 = len(X1)
[boot_samples_idx,boot_predictions] = \
fit_bootstrap_models(X, Y, np.vstack((X,X1)), fit_muh_fun, n, m, B)
in_boot_sample = np.zeros((B,n),dtype=bool)
for b in range(len(in_boot_sample)):
in_boot_sample[b,boot_samples_idx[b]] = True
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
b_keep = np.argwhere(~(in_boot_sample[:,i])).reshape(-1)
if(len(b_keep)>0):
if aggre=='mean':
resids_LOO[i] = np.abs(Y[i] - boot_predictions[b_keep,i].mean())
muh_LOO_vals_testpoint[i] = boot_predictions[b_keep,n:].mean(0)
elif aggre=='median':
resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[b_keep,i]))
muh_LOO_vals_testpoint[i] = np.median(boot_predictions[b_keep,n:],axis=0)
elif aggre=='tmean':
resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[b_keep,i],0.25))
muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[b_keep,n:],0.25,axis=0)
else: # if aggregating an empty set of models, predict zero everywhere
resids_LOO[i] = np.abs(Y[i])
muh_LOO_vals_testpoint[i] = np.zeros(n1)
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
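# Quick synthetic check of the J+aB intervals (a sketch: the constant-mean
# learner, sizes, seed, and B/m choices are illustrative, not the paper's setup).
def _mean_muh(Xtr, Ytr, Xte):
    return np.repeat(Ytr.mean(), len(Xte))
np.random.seed(1)
_Xc = np.random.randn(60, 2)
_Yc = np.random.randn(60)
_PIs_demo = compute_PIs_jacknife_plus_after_bootstrap(_Xc[:40], _Yc[:40], _Xc[40:],
    0.1, _mean_muh, 30, 20, 'mean')
print('J+aB demo coverage:',
      ((_PIs_demo['lower'] <= _Yc[40:]) & (_PIs_demo['upper'] >= _Yc[40:])).mean())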
Include mean, median, and trimmed mean aggregation
def compute_PIs_jacknife_plus_naive(X,Y,X1,alpha,fit_muh_fun,B,m,aggre):
    '''
    Naive jackknife+ ensemble: for each leave-one-out point i, refit B bootstrap
    estimators from scratch on the data with point i removed.
    aggre: 'mean', 'median', or 'tmean' (25% trimmed mean).
    '''
n=len(X)
n1 = len(X1)
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
        # remove row i from both X and Y so the bootstrap indices stay aligned
        boot_predictions = fit_bootstrap_models(np.delete(X,i,0), np.delete(Y,i),
                                                np.vstack((X,X1)), fit_muh_fun, n-1, m, B)[1]
if aggre=='mean':
resids_LOO[i] = np.abs(Y[i] - np.mean(boot_predictions[:,i]))
muh_LOO_vals_testpoint[i] = np.mean(boot_predictions[:,n:],axis=0)
elif aggre=='median':
resids_LOO[i] = np.abs(Y[i] - np.median(boot_predictions[:,i]))
muh_LOO_vals_testpoint[i] = np.median(boot_predictions[:,n:],axis=0)
elif aggre=='tmean':
resids_LOO[i] = np.abs(Y[i] - stats.trim_mean(boot_predictions[:,i],0.25))
muh_LOO_vals_testpoint[i] = stats.trim_mean(boot_predictions[:,n:],0.25,axis=0)
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
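# Cost sketch (illustrative numbers): the naive ensemble above needs about n*B
# model fits, while J+aB reuses a single set of B fits regardless of n.
_n_demo, _B_demo = 40, 20
print('naive J+ ensemble fits:', _n_demo*_B_demo, '; J+aB fits:', _B_demo)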
def compute_PIs_jacknife_plus(X,Y,X1,alpha,fit_muh_fun):
n = len(X)
n1 = len(X1)
resids_LOO = np.zeros(n)
muh_LOO_vals_testpoint = np.zeros((n,n1))
for i in range(n):
muh_vals_LOO = fit_muh_fun(np.delete(X,i,0),np.delete(Y,i),\
np.r_[X[i].reshape((1,-1)),X1])
resids_LOO[i] = np.abs(Y[i] - muh_vals_LOO[0])
muh_LOO_vals_testpoint[i] = muh_vals_LOO[1:]
ind_q = (np.ceil((1-alpha)*(n+1))).astype(int)
###############################
# construct prediction intervals
###############################
return pd.DataFrame(\
np.c_[np.sort(muh_LOO_vals_testpoint.T - resids_LOO,axis=1).T[-ind_q], \
np.sort(muh_LOO_vals_testpoint.T + resids_LOO,axis=1).T[ind_q-1]],\
columns = ['lower','upper'])
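# Sketch: jackknife+ with the ridge baseline on synthetic data (sizes and seed
# are illustrative; coverage should be near 1 - alpha = 0.9 on average).
np.random.seed(2)
_Xj = np.random.randn(60, 2)
_Yj = _Xj[:, 0] + 0.1*np.random.randn(60)
_PIs_j = compute_PIs_jacknife_plus(_Xj[:40], _Yj[:40], _Xj[40:], 0.1, ridge2)
print('jackknife+ demo coverage:',
      ((_PIs_j['lower'] <= _Yj[40:]) & (_PIs_j['upper'] >= _Yj[40:])).mean())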
# UCI Communities and Crime Data Set
# http://archive.ics.uci.edu/ml/datasets/communities+and+crime
communities_data = np.loadtxt('communities_data.csv',delimiter=',',dtype=str)
# remove categorical predictors
communities_data = np.delete(communities_data, np.arange(5), 1)
# remove predictors with missing values
communities_data = np.delete(communities_data, np.argwhere(
(communities_data == '?').sum(0) > 0).reshape(-1), 1)
communities_data = communities_data.astype(float)
X_communities = communities_data[:, :-1]
Y_communities = communities_data[:, -1]
# UCI BlogFeedback data set
# https://archive.ics.uci.edu/ml/datasets/BlogFeedback
blog_data = np.loadtxt('blogData_train.csv',delimiter=',')
X_blog = blog_data[:, :-1]
Y_blog = np.log(1+blog_data[:, -1])
# MEPS data set
# data downloaded (in .ssp format) from:
# https://meps.ahrq.gov/mepsweb/data_files/pufs/h192ssp.zip
# then processed with the get_meps_data.ipynb script, which performs feature
# selection and data cleaning and stores the result to a .txt file
meps_data = np.loadtxt('meps_data.txt')
X_meps = meps_data[:, :-1]
Y_meps = meps_data[:, -1]
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 40 # training size n
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
B_ = 20  # target number of bootstrap estimators; in J+aB, B is drawn at random
# (see the Binomial draw inside the trial loop below) so that about B_
# estimators survive the leave-one-out filter for each training point
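# Sanity sketch for the Binomial draw used below (values illustrative).
# p_thin = (1-1/(n+1))^m is the thinning probability from the J+aB theory;
# p_loo = (1-1/n)^m is the chance a bootstrap sample avoids a fixed training
# point. Drawing B ~ Binomial(int(B_/(p_thin*p_loo)), p_thin) gives
# E[B]*p_loo ~= B_, i.e. about B_ estimators survive the filter per point.
_n, _m = train_size, int(0.6*train_size)
_p_thin = (1 - 1./(_n + 1))**_m
_p_loo = (1 - 1./_n)**_m
print('expected surviving estimators per LOO point:',
      int(B_/(_p_thin*_p_loo)) * _p_thin * _p_loo)  # ~= B_ = 20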
# Run Mean
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Start clock
start_time=time.time()
# Run jackknife+
PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run the naive jackknife+ ensemble and jackknife+aB across all m values
for m in m_vals:
# Naive J+
start_time=time.time()
PIs = compute_PIs_jacknife_plus_naive(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'mean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
coverage,width,time.time()-start_time]
# J+aB
start_time=time.time()
                # draw B so that about B_ estimators survive the LOO filter on average
                B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'mean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('new-jack+aB_realdata_results-mean.csv',index=False)
# Run Median
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Start clock
start_time=time.time()
# Run jackknife+
PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run the naive jackknife+ ensemble and jackknife+aB across all m values
for m in m_vals:
# Naive J+
start_time=time.time()
PIs = compute_PIs_jacknife_plus_naive(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'median')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
coverage,width,time.time()-start_time]
# J+aB
start_time=time.time()
B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'median')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('new-jack+aB_realdata_results-median.csv',index=False)
# Run Trimmed Mean
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Start clock
start_time=time.time()
# Run jackknife+
PIs = compute_PIs_jacknife_plus(X_train,Y_train,X_test,alpha,fit_func)
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+',coverage,width,time.time()-start_time]
            # Run the naive jackknife+ ensemble and jackknife+aB across all m values
for m in m_vals:
# Naive J+
start_time=time.time()
PIs = compute_PIs_jacknife_plus_naive(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'tmean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+-naive (m='+str(m)+')',\
coverage,width,time.time()-start_time]
# J+aB
start_time=time.time()
B = int(np.random.binomial(int(B_/((1-1./(1+train_size))**m*(1-1./train_size)**m)),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'tmean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('new-jack+aB_realdata_results-tmean.csv',index=False)
Compare J+aB with random $B$ against J+aB with fixed $B'$, where the random draw is calibrated so that $E(B)=B'$.
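# Sketch: the random draw used in this comparison satisfies E(B) = B'
# (the n, m, B' values below are illustrative).
_n, _m, _Bp = 200, 100, 50
_p = (1 - 1./(_n + 1))**_m
print('E[B] =', int(_Bp/_p) * _p)  # ~= B' = 50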
# Initialize Parameters
alpha= 0.1
tot_trial = 10
train_size = 200 # training size n
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
B_ = 50 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
# Run Mean, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Run jackknife+aB, random B & non-random B
for m in m_vals:
# J+aB, random B
start_time=time.time()
                    # draw B with E[B] = B_ (the random-B variant)
                    B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'mean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-mean.csv',index=False)
# J+aB, fixed B
start_time=time.time()
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'mean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-mean.csv',index=False)
# Run Median, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Run jackknife+aB, random B & non-random B
for m in m_vals:
# J+aB, random B
start_time=time.time()
B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'median')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-median.csv',index=False)
# J+aB, fixed B
start_time=time.time()
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'median')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-median.csv',index=False)
# Run Trimmed Mean, Random vs. Fixed J+aB
fit_funcs = [ridge2,neural_net,random_forest_nB]
Data_name = ['meps','communities','blog']
results = pd.DataFrame(columns = ['itrial','dataset','muh_fun','method','coverage','width','time'])
for data_name in Data_name:
X = eval('X_'+data_name)
Y = eval('Y_'+data_name)
for ifunc in range(len(fit_funcs)):
fit_func = fit_funcs[ifunc]
print('running '+fit_func.__name__+' on '+data_name)
for itrial in range(tot_trial):
np.random.seed(98765+itrial)
[X_train, X_test, Y_train, Y_test] = data_split(X, Y, train_size)
# Run jackknife+aB, random B & non-random B
for m in m_vals:
# J+aB, random B
start_time=time.time()
B = int(np.random.binomial(int(B_/(1-1./(1+train_size))**m),(1-1./(1+train_size))**m,size=1))
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B,m,'tmean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'random-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-tmean.csv',index=False)
# J+aB, fixed B
start_time=time.time()
PIs = compute_PIs_jacknife_plus_after_bootstrap(\
X_train,Y_train,X_test,alpha,fit_func,B_,m,'tmean')
coverage = ((PIs['lower'] <= Y_test)&(PIs['upper'] >= Y_test)).mean()
width = (PIs['upper'] - PIs['lower']).mean()
results.loc[len(results)]=\
[itrial,data_name,fit_func.__name__,'fixed-jack+aB (m='+str(m)+')',\
coverage,width,time.time()-start_time]
results.to_csv('random_fixed-jack+aB_realdata_results-tmean.csv',index=False)
Results: average coverage and width plots for the MEPS data set, using the random forest that subsamples features only.
results=pd.read_csv('new-jack+aB_realdata_results-mean.csv')
plt.rcParams.update({'font.size': 14})
fig, (ax1, ax2) = plt.subplots(ncols=2)
fig.set_size_inches([36,8])
train_size = 40  # parameters of the first experiment, whose results are plotted here
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
Data=eval('X_meps')
d=Data.shape[1]
# Left Panel on Coverage
# Fast J+aB
coverage_AB_mean=[]
coverage_AB_SE=[]
for m in m_vals:
m=int(m)
coverage_AB_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].mean())
coverage_AB_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_AB_mean=np.array(coverage_AB_mean)
coverage_AB_SE=np.array(coverage_AB_SE)
ax1.plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB')
ax1.fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
# Naive J+aB
coverage_naive_mean=[]
coverage_naive_SE=[]
for m in m_vals:
m=int(m)
coverage_naive_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].mean())
coverage_naive_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_naive_mean=np.array(coverage_naive_mean)
coverage_naive_SE=np.array(coverage_naive_SE)
ax1.plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
ax1.fill_between(m_vals/train_size,coverage_naive_mean-coverage_naive_SE,coverage_naive_mean+coverage_naive_SE,alpha=0.35)
# J+
coverage_mean=[]
coverage_SE=[]
for m in m_vals:
m=int(m)
coverage_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].mean())
coverage_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+') &\
(results['muh_fun']=='random_forest_nB')]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_mean=np.array(coverage_mean)
coverage_SE=np.array(coverage_SE)
ax1.plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o',label='j+ non-ensemble')
ax1.fill_between(m_vals/train_size,coverage_mean-coverage_SE,coverage_mean+coverage_SE,alpha=0.35)
ax1.axhline(0.9,linestyle='-.',color='black')
ax1.set_ylim(0.8,1)
ax1.legend(loc='upper right')
# Right Panel on Width
# J+aB
width_AB_mean=[]
width_AB_SE=[]
for m in m_vals:
m=int(m)
width_AB_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['width'].mean())
width_AB_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['width'].std()\
/np.sqrt(tot_trial))
width_AB_mean=np.array(width_AB_mean)
width_AB_SE=np.array(width_AB_SE)
ax2.plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
# Naive J+aB
width_naive_mean=[]
width_naive_SE=[]
for m in m_vals:
m=int(m)
width_naive_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['width'].mean())
width_naive_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']=='random_forest_nB')]['width'].std()\
/np.sqrt(tot_trial))
width_naive_mean=np.array(width_naive_mean)
width_naive_SE=np.array(width_naive_SE)
ax2.plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_naive_mean-width_naive_SE,width_naive_mean+width_naive_SE,alpha=0.35)
# J+
width_mean=[]
width_SE=[]
for m in m_vals:
m=int(m)
width_mean.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+') &\
(results['muh_fun']=='random_forest_nB')]['width'].mean())
width_SE.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+') &\
(results['muh_fun']=='random_forest_nB')]['width'].std()\
/np.sqrt(tot_trial))
width_mean=np.array(width_mean)
width_SE=np.array(width_SE)
ax2.plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
ax2.fill_between(m_vals/train_size,width_mean-width_SE,width_mean+width_SE,alpha=0.35)
plt.tight_layout()
fig.savefig('Fig1_meps_random_forest_mean.pdf',dpi=300)
fig.show()
# For J+aB: average coverage and width at m = 0.6n
aggre = ['mean','median','tmean']
tot_trial = 10  # number of trials per configuration (matches the experiments above)
data_name = 'meps'  # the first experiment was run on MEPS only
m = int(0.6*train_size)
Data = eval('X_meps')
regressor = ['ridge2','random_forest_nB','neural_net']
for agg in aggre:
print('Average Coverage under '+agg)
results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
for regr in regressor:
cov_mean=results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(int(m))+')') &\
(results['muh_fun']==regr)]['coverage'].mean()
cov_std=results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(int(m))+')') &\
(results['muh_fun']==regr)]['coverage'].std()/np.sqrt(tot_trial)
width_mean=results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(int(m))+')') &\
(results['muh_fun']==regr)]['width'].mean()
width_std=results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(int(m))+')') &\
(results['muh_fun']==regr)]['width'].std()/np.sqrt(tot_trial)
        print('J+AB with '+regr+' on '+data_name+' has mean coverage '+str(cov_mean))
        print('J+AB with '+regr+' on '+data_name+' has coverage SE '+str(cov_std))
        print('J+AB with '+regr+' on '+data_name+' has mean width '+str(width_mean))
        print('J+AB with '+regr+' on '+data_name+' has width SE '+str(width_std))
# J+aB vs. J+ vs. naive J+ ensemble: average computing time on MEPS
# All three aggregation methods and all three regressors considered
# By row
aggre=['mean','median','tmean']
regressor = ['ridge2','random_forest_nB','neural_net']
m=int(0.6*train_size)
Data=eval('X_meps')
d=Data.shape[1]
for regr in regressor:
for agg in aggre:
print('Computing time under '+regr+'&'+agg)
results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
# J+aB
time_across_m_J_plusaB=[]
time_across_m_J_plusaB.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['time'].mean())
print('For J+aB')
print('')
print(time_across_m_J_plusaB[0])
# J+
time_across_m_J_plus=[]
time_across_m_J_plus.append(results[(results['dataset']=='meps') &\
(results['method']=='jack+') &\
(results['muh_fun']==regr)]['time'].mean())
print('For J+ Non-Ensemble')
print('')
print(time_across_m_J_plus[0])
        # Naive J+ ensemble, at the same m as the J+aB run above for a fair comparison
        time_NaiveJ_plus = results[(results['dataset']=='meps') &
                                   (results['method']=='jack+-naive (m='+str(m)+')') &
                                   (results['muh_fun']==regr)]['time'].mean()
        print('For J+ Ensemble')
        print('')
        print(time_NaiveJ_plus)
# For Supplementary Figure 2: re-initialize the first experiment's parameters
alpha= 0.1
tot_trial = 10
train_size = 40 # training size n
m_vals = np.round(train_size*np.linspace(0.2,1,num=5)).astype(int)
B_ = 20  # target number of bootstrap estimators (see the first experiment above)
# For Supplementary Figure 2: coverage panels
aggre=['mean','median','tmean']
Data_name = ['meps']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
plt.rcParams.update({'font.size': 14})
fig, ax1 = plt.subplots(3,3,figsize=(12,12))
Data=eval('X_'+data_name)
d=Data.shape[1]
j=0
for agg in aggre:
results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
i=0
for regr in regressor:
# J+AB, coverage
coverage_AB_mean=[]
coverage_AB_SE=[]
for m in m_vals:
coverage_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].mean())
coverage_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_AB_mean=np.array(coverage_AB_mean)
coverage_AB_SE=np.array(coverage_AB_SE)
if regr=='ridge2': # For legend
if agg=='tmean':
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
# Naive J+AB, coverage
coverage_naive_mean=[]
coverage_naive_SE=[]
for m in m_vals:
coverage_naive_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].mean())
coverage_naive_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_naive_mean=np.array(coverage_naive_mean)
coverage_naive_SE=np.array(coverage_naive_SE)
if regr=='ridge2': # For legend
if agg=='tmean':
ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
else:
ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o')
else:
ax1[i,j].plot(m_vals/train_size,coverage_naive_mean,linestyle='-',marker='o')
ax1[i,j].fill_between(m_vals/train_size,coverage_naive_mean-coverage_naive_SE,coverage_naive_mean+coverage_naive_SE,alpha=0.35)
# J+, coverage
coverage_mean=[]
coverage_SE=[]
for m in m_vals:
coverage_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+') &\
(results['muh_fun']==regr)]['coverage'].mean())
coverage_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+') &\
(results['muh_fun']==regr)]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_mean=np.array(coverage_mean)
coverage_SE=np.array(coverage_SE)
if regr=='ridge2': # For legend
if agg=='tmean':
ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o',label='j+ non-ensemble')
ax1[i,j].legend(loc='upper right',fontsize='small')
else:
ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o')
else:
ax1[i,j].plot(m_vals/train_size,coverage_mean,linestyle='-',marker='o')
ax1[i,j].fill_between(m_vals/train_size,coverage_mean-coverage_SE,coverage_mean+coverage_SE,alpha=0.35)
# Other Features
ax1[i,j].axhline(0.9,linestyle='-.',color='black')
ax1[i,j].set_ylim(0.8,1)
if agg=='mean': # For axis label
if regr=='neural_net':
                    ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
else:
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].xaxis.set_visible(False)
else:
if regr=='neural_net':
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].yaxis.set_visible(False)
else:
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].xaxis.set_visible(False)
ax1[i,j].yaxis.set_visible(False)
if i==0 and j==0: # Set Title and Label
ax1[i,j].set_title('Mean',fontsize=14)
ax1[i,j].set_ylabel('Ridge',fontsize=14)
if i==1 and j==0:
ax1[i,j].set_ylabel('RF',fontsize=14)
if i==2 and j==0:
ax1[i,j].set_ylabel('NN',fontsize=14)
if i==0 and j==1:
ax1[i,j].set_title('Median',fontsize=14)
if i==0 and j==2:
ax1[i,j].set_title('Trimmed Mean',fontsize=14)
fig.tight_layout()
i+=1
j+=1
fig.savefig('Supp2_Meps_Cov.pdf',dpi=300)
fig.show()
# For Supplementary Figure 2: width panels
aggre=['mean','median','tmean']
Data_name = ['meps']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
plt.rcParams.update({'font.size': 14})
fig, ax2 = plt.subplots(3,3,figsize=(12,12))
Data=eval('X_'+data_name)
d=Data.shape[1]
j=0
for agg in aggre:
results=pd.read_csv('new-jack+aB_realdata_results-'+agg+'.csv')
i=0
for regr in regressor:
# Fast J+AB, width
width_AB_mean=[]
width_AB_SE=[]
for m in m_vals:
m=int(m)
width_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].mean())
width_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].std()\
/np.sqrt(tot_trial))
width_AB_mean=np.array(width_AB_mean)
width_AB_SE=np.array(width_AB_SE)
if regr=='ridge2': # For legend
if agg == 'tmean':
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
# Naive J+AB, width
width_naive_mean=[]
width_naive_SE=[]
for m in m_vals:
m=int(m)
width_naive_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].mean())
width_naive_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+-naive (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].std()\
/np.sqrt(tot_trial))
width_naive_mean=np.array(width_naive_mean)
width_naive_SE=np.array(width_naive_SE)
if regr=='ridge2': # For legend
if agg == 'tmean':
ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o',label='j+ ensemble')
else:
ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
else:
ax2[i,j].plot(m_vals/train_size,width_naive_mean,linestyle='-',marker='o')
ax2[i,j].fill_between(m_vals/train_size,width_naive_mean-width_naive_SE,width_naive_mean+width_naive_SE,alpha=0.35)
# J+,width
width_mean=[]
width_SE=[]
for m in m_vals:
m=int(m)
width_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+') &\
(results['muh_fun']==regr)]['width'].mean())
width_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='jack+') &\
(results['muh_fun']==regr)]['width'].std()\
/np.sqrt(tot_trial))
width_mean=np.array(width_mean)
width_SE=np.array(width_SE)
if regr=='ridge2': # For legend
if agg == 'tmean':
ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o',label='j+ non-ensemble')
ax2[i,j].legend(loc='upper right',fontsize='small')
else:
ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
else:
ax2[i,j].plot(m_vals/train_size,width_mean,linestyle='-',marker='o')
ax2[i,j].fill_between(m_vals/train_size,width_mean-width_SE,width_mean+width_SE,alpha=0.35)
# Maintain appropriate spacing
if regr=='neural_net':
ax2[i,j].set_ylim(np.min(0.5*(width_mean+width_AB_mean)-2*(width_SE+width_AB_SE)),
np.max(0.5*(width_mean+width_AB_mean)+2*(width_SE+width_AB_SE)))
ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
else:
if regr=='ridge2':
ax2[i,j].set_ylim(np.min(0.5*(width_mean+width_AB_mean)-2*(width_SE+width_AB_SE)),
np.max(0.5*(width_mean+width_AB_mean)+2*(width_SE+width_AB_SE)))
ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax2[i,j].xaxis.set_visible(False)
# Other Features
if i==0 and j==0: # Set Title and Label
ax2[i,j].set_title('Mean',fontsize=14)
ax2[i,j].set_ylabel('Ridge',fontsize=14)
if i==1 and j==0:
ax2[i,j].set_ylabel('RF',fontsize=14)
if i==2 and j==0:
ax2[i,j].set_ylabel('NN',fontsize=14)
if i==0 and j==1:
ax2[i,j].set_title('Median',fontsize=14)
if i==0 and j==2:
ax2[i,j].set_title('Trimmed Mean',fontsize=14)
fig.tight_layout()
i+=1
j+=1
fig.savefig('Supp2_Meps_Width.pdf',dpi=300)
fig.show()
# For Supplementary Figure 1: re-initialize the random-vs-fixed-B experiment's parameters
alpha= 0.1
tot_trial = 10
train_size = 200 # training size n
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
B_ = 50 # number of bootstrap samples in J+aB is drawn as B ~ Binomial(int(B_/(1-1/(n+1))^m),(1-1/(n+1))^m)
# For Supplementary Figure 1: coverage panels
aggre=['mean','median','tmean']
Data_name = ['meps','communities','blog']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
plt.rcParams.update({'font.size': 14})
fig, ax1 = plt.subplots(3,3,figsize=(12,12))
m_vals = np.round(train_size*np.linspace(0.1,1,num=10)).astype(int)
Data=eval('X_'+data_name)
d=Data.shape[1]
j=0
for agg in aggre:
results=pd.read_csv('random_fixed-jack+aB_realdata_results-'+agg+'.csv')
i=0
for regr in regressor:
# Random J+AB, coverage
coverage_AB_mean=[]
coverage_AB_SE=[]
for m in m_vals:
coverage_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='random-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].mean())
coverage_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='random-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_AB_mean=np.array(coverage_AB_mean)
coverage_AB_SE=np.array(coverage_AB_SE)
if regr=='ridge2': # For legend
if agg=='tmean':
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB random B')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
# Fixed J+AB, coverage
coverage_AB_mean=[]
coverage_AB_SE=[]
for m in m_vals:
coverage_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='fixed-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].mean())
coverage_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='fixed-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['coverage'].std()\
/np.sqrt(tot_trial))
coverage_AB_mean=np.array(coverage_AB_mean)
coverage_AB_SE=np.array(coverage_AB_SE)
if regr=='ridge2': # For legend
if agg=='tmean':
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o',label='j+aB fixed B')
ax1[i,j].legend(loc='upper right',fontsize='small')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
else:
ax1[i,j].plot(m_vals/train_size,coverage_AB_mean,linestyle='-',marker='o')
ax1[i,j].fill_between(m_vals/train_size,coverage_AB_mean-coverage_AB_SE,coverage_AB_mean+coverage_AB_SE,alpha=0.35)
# Other Features
ax1[i,j].axhline(0.9,linestyle='-.',color='black')
ax1[i,j].set_ylim(0.8,1)
if agg=='mean': # For axis label
if regr=='neural_net':
                    ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
else:
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].xaxis.set_visible(False)
else:
if regr=='neural_net':
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].yaxis.set_visible(False)
else:
ax1[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax1[i,j].xaxis.set_visible(False)
ax1[i,j].yaxis.set_visible(False)
if i==0 and j==0: # Set Title and Label
ax1[i,j].set_title('Mean',fontsize=14)
ax1[i,j].set_ylabel('Ridge',fontsize=14)
if i==1 and j==0:
ax1[i,j].set_ylabel('RF',fontsize=14)
if i==2 and j==0:
ax1[i,j].set_ylabel('NN',fontsize=14)
if i==0 and j==1:
ax1[i,j].set_title('Median',fontsize=14)
if i==0 and j==2:
ax1[i,j].set_title('Trimmed Mean',fontsize=14)
fig.tight_layout()
i+=1
j+=1
fig.savefig('Supp1_'+data_name+'_Cov.pdf',dpi=300)
fig.show()
# For Supplementary Figure 1: width panels
aggre=['mean','median','tmean']
Data_name = ['meps','communities','blog']
regressor = ['ridge2','random_forest_nB','neural_net']
for data_name in Data_name:
plt.rcParams.update({'font.size': 14})
fig, ax2 = plt.subplots(3,3,figsize=(12,12))
Data=eval('X_'+data_name)
d=Data.shape[1]
j=0
for agg in aggre:
results=pd.read_csv('random_fixed-jack+aB_realdata_results-'+agg+'.csv')
i=0
for regr in regressor:
# Random J+AB, width
width_AB_mean=[]
width_AB_SE=[]
for m in m_vals:
m=int(m)
width_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='random-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].mean())
width_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='random-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].std()\
/np.sqrt(tot_trial))
width_AB_mean=np.array(width_AB_mean)
width_AB_SE=np.array(width_AB_SE)
if regr=='ridge2': # For legend
if agg == 'tmean':
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB random B')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
# Fixed J+AB, width
width_AB_mean=[]
width_AB_SE=[]
for m in m_vals:
m=int(m)
width_AB_mean.append(results[(results['dataset']==data_name) &\
(results['method']=='fixed-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].mean())
width_AB_SE.append(results[(results['dataset']==data_name) &\
(results['method']=='fixed-jack+aB (m='+str(m)+')') &\
(results['muh_fun']==regr)]['width'].std()\
/np.sqrt(tot_trial))
width_AB_mean=np.array(width_AB_mean)
width_AB_SE=np.array(width_AB_SE)
if regr=='ridge2': # For legend
if agg == 'tmean':
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o',label='j+aB fixed B')
ax2[i,j].legend(loc='upper right',fontsize='small')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
else:
ax2[i,j].plot(m_vals/train_size,width_AB_mean,linestyle='-',marker='o')
ax2[i,j].fill_between(m_vals/train_size,width_AB_mean-width_AB_SE,width_AB_mean+width_AB_SE,alpha=0.35)
# Maintain appropriate spacing
if regr=='neural_net':
ax2[i,j].set_ylim(np.min(width_AB_mean-3*width_AB_SE),
np.max(width_AB_mean+3*width_AB_SE))
ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
else:
ax2[i,j].set_ylim(np.min(width_AB_mean-3*width_AB_SE),
np.max(width_AB_mean+3*width_AB_SE))
ax2[i,j].xaxis.set_ticks(np.arange(0.2, 1.01, 0.2))
ax2[i,j].xaxis.set_visible(False)
# Other Features
if i==0 and j==0: # Set Title and Label
ax2[i,j].set_title('Mean',fontsize=14)
ax2[i,j].set_ylabel('Ridge',fontsize=14)
if i==1 and j==0:
ax2[i,j].set_ylabel('RF',fontsize=14)
if i==2 and j==0:
ax2[i,j].set_ylabel('NN',fontsize=14)
if i==0 and j==1:
ax2[i,j].set_title('Median',fontsize=14)
if i==0 and j==2:
ax2[i,j].set_title('Trimmed Mean',fontsize=14)
fig.tight_layout()
i+=1
j+=1
fig.savefig('Supp1_'+data_name+'_Width.pdf',dpi=300)
fig.show()