%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% batch_generate_sem:
%
% Generates a set of synthetic structural equation models as used in the 
% accompanying paper:
%
% * R. Silva "Thinning measurement models and questionnaire design". 
%   Advances in Neural Information Processing Systems, 2011.
%
% Description: we generate 80 synthetic models and the corresponding
% datasets as follows. The models have 10 latent variables and 40
% observations. Coefficients linking indicators to latent variables were 
% set to zero with probability 0.8, and sampled from a standard Gaussian 
% otherwise. If some latent variable ends up with no child, or an indicator 
% ends up with no parent, we uniformly choose one child/parent to be 
% linked to it.
%
% 40 of the models have as latent covariance matrix a matrix sampled from
% an inverse Wishart distribution with 10 degrees of freedom and a 10I
% shape matrix, "I" being a 10 x 10 identity matrix. We call these cases 
% the "weak coupling cases", since sampled covariance matrices tend to be 
% closer to the identity matrix. The remaining 40 cases are sampled using
% as the shape matrix the matrix 10wish_cov, where "wish_cov" is a 
% correlation matrix with the off-diagonal values set to 0.5. These are the 
% "strong coupling" cases.
%
% Within a given level of coupling, we also set the measurement error
% variance according to a pre-specified level of "signal-to-noise" ratio.
% This is given by the reliability index (see paper). For a batch of 40
% models with a given coupling strength, 20 of them have their
% signal-to-noise ratio parameters sampled uniformly from the interval 
% [0.2 0.4] (the "low signal" case). The other 20 have their ratios sampled
% from the interval [0.4 0.7] ("high signal"). 
%
% Given each model, we sample 1000 points which can then be used to 
% calculate measures of success for each approach.
%
% The final sampled model parameters and datasets are stored in two 
% directories, each case as an independent "*.mat" file. The directories 
% are split according to the corresponding levels of signal-to-noise ratio.
%
% Created by: Ricardo Silva, London, 21/04/2011
% University College London
%
% Current version: 24/09/2011

% Seed setting (for reproducibility) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% 'nips_seed.mat' must provide the variable 'sd', which is handed to rng.
% Every random draw below happens in a fixed order so that a given seed
% always produces the same models and datasets.

load('nips_seed.mat')
rng(sd)

% Free parameters %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Paths are assembled with fullfile so that the directory layout is the
% same on every platform (a hard-coded '\' only works on Windows).

DATA_NAME = 'data';                        % Name of data directory
SYNTH_NAME = fullfile(DATA_NAME, 'synth'); % Synthetic data subdirectory

num_y = 40;     % Number of observed variables
num_x = 10;     % Number of latent variables
N     = 1000;   % Number of data points

num_trials = 20; % Number of trials of each type

prob_edge = 0.2; % Probability of a X -> Y edge

% model_type: 1 = low signal / weak coupling,  2 = low signal / strong
%             3 = high signal / weak coupling, 4 = high signal / strong

for model_type = 1:4
 
  % Choose setup %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  
  % Range from which the signal-to-noise ratio of each indicator is drawn:
  % the fraction of the variance of an underlying variable that is due to
  % the latent variables (the measurement error variance is fixed at 1).
  
  if model_type <= 2 
    s2n_ratio = [0.2 0.4]; % "Low-signal" regime 
    SIGNAL_NAME = fullfile(SYNTH_NAME, 'low_signal');
  else
    s2n_ratio = [0.4 0.7]; % "High-signal" regime
    SIGNAL_NAME = fullfile(SYNTH_NAME, 'high_signal');
  end
  
  % mkdir creates any missing intermediate folders, so one call suffices.
  if ~exist(SIGNAL_NAME, 'dir')
    mkdir(SIGNAL_NAME)
  end
  
  % Decide here if coupling is strong or weak
  
  FILE_NAME = fullfile(SIGNAL_NAME, sprintf('SYNTH_%d_%d', num_y, num_x));
  if model_type == 1 || model_type == 3
    rho = 0;   % "Weak" coupling
    FILE_NAME = [FILE_NAME '_WC_'];
  else
    rho = 0.5; % "Strong" coupling
    FILE_NAME = [FILE_NAME '_SC_'];
  end

  % Scale matrix of the latent covariance sampler: unit diagonal, rho
  % everywhere else. It depends only on rho, so it is built (and factored)
  % once per model type rather than once per trial.
  
  wish_cov = rho * ones(num_x) + (1 - rho) * eye(num_x);
  chol_wish_cov = chol(wish_cov);

  % Generate model and data %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

  for t = 1:num_trials
          
    FILE_NAME_TRIAL = [FILE_NAME int2str(t) '.mat'];
    fprintf('Saving file %s...\n', FILE_NAME_TRIAL);
    
    % Generate graph (num_y x num_x logical, graph(y, x) is the edge
    % X_x -> Y_y), guaranteeing some minimal connections
    
    graph = rand(num_y, num_x) < prob_edge;
    zero_par = find(sum(graph, 2) == 0)'; % Row vector, so "for" iterates
    for y = zero_par
      % Every observable with at least one parent, chosen uniformly
      new_p = randsample(1:num_x, 1);
      graph(y, new_p) = true; 
    end
    zero_child = find(sum(graph, 1) == 0);
    for x = zero_child
      % Every latent with at least one child, chosen uniformly. (Indexing
      % with ones(num_y, 1) would always select indicator 1 instead.)
      new_c = randsample(1:num_y, 1);
      graph(new_c, x) = true; 
    end

    % Generate measurement model parameters: L(:, 1:num_x) are the
    % loadings (zero wherever there is no edge), L(:, num_x + 1) is the
    % intercept of each indicator
    
    L = randn(num_y, num_x + 1) .* [graph ones(num_y, 1)];

    % Generate latent covariance matrix: draw W = A * A', where A has 
    % num_x + 1 Gaussian columns with covariance wish_cov, then rescale W 
    % to unit diagonal.
    %
    % The rescaling is written out explicitly: the Statistics Toolbox 
    % function corr(W) would treat the rows of W as observations and 
    % return a rank-deficient matrix, breaking the Cholesky factorization 
    % below.
    %
    % NOTE(review): the file header describes an inverse Wishart draw,
    % while this code rescales a Wishart draw; confirm which is intended. 
    % If the original relied on a project-local corr.m, also confirm that 
    % it performed this same rescaling.
    
    preS = chol_wish_cov' * randn(num_x, num_x + 1); 
    W = preS * preS';
    w_sd = sqrt(diag(W));
    S = W ./ (w_sd * w_sd');
    S = (S + S') / 2;           % Remove round-off asymmetry
    S(1:(num_x + 1):end) = 1;   % Exact unit diagonal

    % Adjust coefficients to a reasonable signal-to-noise (S2N) ratio:
    % with unit error variance, a signal variance of s / (1 - s) gives a
    % ratio signal / (signal + noise) of exactly s
    
    current_v = diag(L(:, 1:num_x) * S * L(:, 1:num_x)');
    s2n = zeros(num_y, 1);
    for y = 1:num_y
      s2n(y) = rand() * (s2n_ratio(2) - s2n_ratio(1)) + s2n_ratio(1); % S2N ratio 
      v_target = s2n(y) / (1 - s2n(y));
      f = sqrt(v_target / current_v(y));
      L(y, 1:num_x) = L(y, 1:num_x) * f;
    end

    % Signal variances after rescaling; the displayed ratios should
    % reproduce s2n up to round-off
    
    disp('Total signal ratios:');
    s2n_i = diag(L(:, 1:num_x) * S * L(:, 1:num_x)');
    disp(s2n_i ./ (s2n_i + 1));

    % Generate data: latent variables ("X") + underlying latent variables
    % ("Y_star") + observed binary variables ("Y", coded as -1 / +1)

    chol_S = chol(S); % Shared by the training and test samples
    
    X = (chol_S' * randn(num_x, N))';
    Y_star = randn(N, num_y) + [X ones(N, 1)] * L';
    Y = 2 * (Y_star > 0) - 1;

    % Independent "test" data for sanity checks (not really used in our
    % benchmark)
    
    X_test = (chol_S' * randn(num_x, N))';
    Y_star_test = randn(N, num_y) + [X_test ones(N, 1)] * L';
    Y_test = 2 * (Y_star_test > 0) - 1;

    % Save information
    
    save(FILE_NAME_TRIAL, 'L', 'S', 's2n', 's2n_i', 'X', 'Y_star', 'Y', ...
         'X_test', 'Y_star_test', 'Y_test')
    
  end
  
end
