/*---------------------------------------------------
 * file:    main.c
 * purpose: Run GraphLDA from scratch
 * author:  ahollowa@uci.edu
 * date:    1/11/09
 *-------------------------------------------------*/

#include "mylib.h"     // Contains the C std. lib. headers and global #define statements
#include "sampler.h"   // Sampling routines
#include "graph.h"     // Graph data structure
#include "node.h"      // Node data structure
#include "myio.h"      // I/O routines
#include "alloc.h"     // Memory allocation routines
#include "checksum.h"  // Checksums for count matrices
#include "sparse.h"    // Miscellaneous routines for sparse matrix formats
#include "init.h"      // Initialization routines for Gibb's sampler


/*------------------------------------------
 * global variables
 *------------------------------------------ */
/* Corpus dimensions: derived in main() from the token files (max id + 1). */
static int W    = 0;   // Vocabulary size: largest word id seen + 1
static int D    = 0;   // Document count: largest doc id seen + 1
static int ntot = 0;   // Number of word tokens (line count of the doc-id file)

/* Graph limits: taken from the command line in main(). */
static int L    = 0;   // Maximum number of nodes in a path (node depths 0 .. L-1)
static int K    = 0;   // Maximum no. of nodes per level


/*==========================================
 * main
 *========================================== */
/*
 * Program entry point: run the GraphLDA sampler starting from a graph that
 * contains only a root node.
 *
 * Command line (exactly 13 arguments after the program name):
 *   alpha beta_prior eta_prior a b L K iter seed did wid docconcept output_dir
 *
 * Steps performed:
 *   1. Parse and sanity-check the command line, seed the random generators.
 *   2. Read the per-token doc ids / word ids and the doc-concept vector.
 *   3. Build the graph (equivalence classes, node array, root node).
 *   4. Allocate the count matrices and draw a random initial assignment.
 *   5. Run `iter` sweeps: paths, levels, level parameters, hyperparameters,
 *      recording the log likelihood after every sweep.
 *   6. Write the final state of the chain into output_dir.
 *
 * Returns 0 on success and -1 if ll.txt could not be written.
 * Exits with status -1 when the argument count is wrong.
 */
int main(int argc, char* argv[]){

  int iter, seed, i, p_estimate;
  int status = 0;
  int *d, *w, *docconcept, *path_lengths, *levels, *sum_levels, *Nd, *min_depth;
  int **dl, ***dwl, **paths;
  double alpha, beta_prior, eta_prior, a, b;
  double **ll, *pi_d, *beta, *eta;
  char *did_file, *wid_file, *docconcept_file, *dir, *outfile;
  FILE *fp;
  clock_t begin, end;
  Graph *graph;
  gsl_rng *rgen = gsl_rng_alloc(gsl_rng_taus);


  /******************************************************
   *					COMMAND LINE
   *******************************************************/
  if( argc != 14 ){
    fprintf(stderr, "usage: %s alpha beta_prior eta_prior a b L K iter seed did wid docconcept output_dir\n", argv[0]);
    exit(-1);
  }

  alpha       = atof(argv[1]);  // Hyperparameter for stick-breaking beta distribution
  beta_prior  = atof(argv[2]);  // Hyper-Hyperparameter for stick-breaking beta distribution
  eta_prior   = atof(argv[3]);  // Hyper-Hyperparameter for topic Dirichlet prior
  a           = atof(argv[4]);  // Hyperparameter for level distribution (either beta/geometric or gamma/poisson)
  b           = atof(argv[5]);  // Hyperparameter for level distribution (either beta/geometric or gamma/poisson)
  L           = atoi(argv[6]);  // Maximum number of nodes in a path (i.e. a node can have depth 0, 1, 2 ... L-1)
  K           = atoi(argv[7]);  // Maximum no. of nodes per level
  iter        = atoi(argv[8]);  // Number of sampling sweeps
  seed        = atoi(argv[9]);  // Seed for the random number generators
  did_file        = argv[10];   // One doc id per word token
  wid_file        = argv[11];   // One word id per word token
  docconcept_file = argv[12];
  dir             = argv[13];   // Directory that receives all output files

  assert(alpha>0);
  assert(beta_prior>0);
  assert(eta_prior>0);
  assert(a>0);
  assert(b>0);
  assert(L>0);
  assert(K>0);
  assert(iter>0);

  // p_estimate is doubled once per level below (2^L); keep it inside the
  // range of int so the multiplication cannot overflow.
  assert(L < (int)(8 * sizeof(int)) - 1);

  // Seed both generators from the command line. The GSL generator was
  // previously left on its default seed, so `seed` had no effect on the
  // routines that draw from rgen (pi, beta and eta sampling).
  seedMT(seed);
  gsl_rng_set(rgen, (unsigned long)seed);


  /******************************************************
   *				READ IN INPUT FILES
   *******************************************************/

  //Read in docword matrix: token i belongs to doc d[i] and is word w[i]
  ntot = countlines(did_file);
  assert(ntot>0);
  d = ivec(ntot);
  w = ivec(ntot);
  read_ivec(ntot, d, did_file);
  // NOTE(review): wid_file is assumed to hold at least ntot entries; nothing
  // here verifies that it is as long as did_file.
  read_ivec(ntot, w, wid_file);

  // Ids are used as array indices, so sizes are (largest id + 1)
  W = -1;
  D = -1;
  for(i=0; i<ntot; i++){
    assert(d[i] >= 0 && w[i] >= 0);  // negative ids would index out of bounds later
    if(d[i] > D){ D = d[i]; }
    if(w[i] > W){ W = w[i]; }
  }
  W++;
  D++;
  assert(W>0);
  assert(D>0);

  //Read in docconcept matrix
  docconcept = read_count_ivec(D,docconcept_file);


  /******************************************************
  *				CREATE GRAPH STRUCTURE
  ******************************************************/
  p_estimate = 1;
  graph = allocate_graph();

  //Initialize equivalence classes
  graph->num_equiv = ivec(L);
  graph->limit_equiv = ivec(L);
  graph->equivalence = imat(K, L);
  for(i=0; i < L; i++){
    graph->num_equiv[i] = 1;
    graph->limit_equiv[i] = K;
    graph->equivalence[0][i] = NEW_NODE;
    p_estimate *= 2;   // ends up as 2^L
  }
  graph->equiv_capacity = K;
  graph->max_depth = L;

  //Initialize nodes array with every slot marked as free
  graph->capacity = K;
  graph->nodes = nodevec(graph->capacity);
  graph->next_avail_id = 0;
  for(i=0; i < graph->capacity; i++){
    graph->nodes[i].id = NOT_IN_USE;
  }

  //Add a single root node. If you want to start with an existing graph structure,
  //then you need to add each node to its respective level using an add_new_node()
  //command and then call update_next_avail_id() at the end.
  add_new_node(graph, 0, 0, L, W, D);
  update_next_avail_id(graph);


  /******************************************************
  *			INITIALIZE COUNT MATRICES
  *******************************************************/
  levels = ivec(ntot);     // Word token level assignments
  paths  = imat(D,L);      // Document path assignments
  dl     = imat(D,L);      // Total no. words in doc d at level l
  dwl    = i3d( D, W, L);  // No. times in doc. d that word w occurs at level l
  Nd     = ivec(D);        // Total no. words in a document -- don't really need this
  pi_d   = dvec(D);        // Truncated Geometric level distribution parameter
  sum_levels   = ivec(D);  // Sum of word levels in a document
  min_depth    = ivec(D);  // The maximum level of the words in the doc -- all paths must be at least this depth
  path_lengths = ivec(D);  // Length of document paths

  // Count tokens per document.
  // NOTE(review): relies on ivec() returning zero-filled storage -- confirm in alloc.h.
  for(i=0; i<ntot; i++){
    Nd[d[i]]++;
  }


  /******************************************************
  *		INITIALIZATION AND PRINTING OF INITIAL STATE
  *******************************************************/
  randomassignment(L, D, W, ntot, w, d, paths, path_lengths, levels, sum_levels, dwl, dl, min_depth, pi_d, graph, docconcept);
  checksum_etot_k_agg(graph);
  checksum_dwl(dwl, dl, D, W, L);

  outfile = create_filename(dir,"init_path_assigns.txt");
  print_paths(D, path_lengths, paths ,outfile);
  free(outfile);

  outfile = create_filename(dir, "original_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  //Print program parameters
  printf("D  = %d\n", D);
  printf("W  = %d\n", W);
  printf("L  = %d\n", L);
  printf("K  = %d\n", K);
  printf("Graph capacity = %d\n", graph->capacity);
  printf("p_estimate     = %d\n", p_estimate);
  printf("ntot  = %d\n", ntot);
  printf("seed  = %d\n", seed);
  printf("iter  = %d\n", iter);
  printf("alpha = %f\n", alpha);
  printf("a     = %f\n", a);
  printf("b     = %f\n", b);
  printf("beta_prior  = %f\n", beta_prior);
  printf("eta_prior   = %f\n", eta_prior);


  /******************************************************
  *				GIBBS/MH SAMPLING
  *******************************************************/
  // Entry 0 of ll/beta/eta is the initial state; entry i+1 is the state after sweep i.
  // NOTE(review): each ll[i] is overwritten with the pointer returned by
  // log_likelihood(); if dmat() allocates the rows as well, those rows are
  // dropped -- confirm against alloc.h.
  ll   = dmat(iter + 1,4);
  beta = dvec(iter + 1);
  eta  = dvec(iter + 1);
  beta[0] = beta_prior;
  eta[0]  = eta_prior;
  ll[0]   = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[0], eta[0], a, b, docconcept);
  printf("\tLog Likelihood = %e\n",  ll[0][0]+ll[0][1]+ll[0][2]+ll[0][3]);


  for(i=0; i< iter; i++){
    printf("\n\n Iteration %d:\n", i);

    //Sample the paths
    begin = clock();
    sample_nonparam_z(L, W, D, p_estimate, alpha, beta[i], eta[i], ntot, d, w, paths, path_lengths, dwl, dl, min_depth, graph, TRUE, docconcept);
    end = clock();
    checksum_etot_k_agg(graph);
    // Subtract the tick counts before converting so precision is not lost on long runs
    printf("\tGibbs Sampling of Z: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    //Sample the levels (checksum kept outside the timed region, as for the paths)
    begin = clock();
    sample_level_assignments_geometric( L, W, D, ntot, eta[i], pi_d, paths, d, w, dl, dwl, path_lengths, levels, sum_levels, Nd, min_depth, docconcept, graph);
    end = clock();
    checksum_dwl(dwl, dl, D, W, L);
    printf("\tGibbs Sampling of L: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    //Sample the level parameters
    begin = clock();
    sample_nonparam_pi(D, L, ntot, a, b, d, pi_d, path_lengths, levels, sum_levels, Nd, dl, docconcept, graph, rgen);
    end = clock();
    printf("\tGibbs Sampling of Pi: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

    //Compute log likelihood of new configuration
    ll[i+1]   = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[i], eta[i], a, b, docconcept);
    printf("\tLog Likelihood = %e\n", ll[i+1][0]+ll[i+1][1]+ll[i+1][2]+ll[i+1][3]);

    //Sample new hyperparameters
    beta[i+1] = sample_beta( beta_prior, beta[i], alpha, graph, rgen);
    eta[i+1]  = sample_eta(eta_prior, eta[i], W, graph, rgen);

    //Print out state of Gibbs sampler every 50 sweeps (but not at sweep 0)
    if(i % 50 == 0  && i > 0){
      print_state(i, ntot, D, W, L, dir, paths, path_lengths, pi_d, levels, graph);
    }

    printf("\tGraph next avail. id: %d\n", graph->next_avail_id);
  }


  /******************************************************
  *				PRINT STATE OF CHAIN
  *******************************************************/

  outfile = create_filename(dir, "beta.txt");
  write_dvec(iter + 1, beta, outfile);
  free(outfile);

  outfile = create_filename(dir, "eta.txt");
  write_dvec(iter + 1, eta, outfile);
  free(outfile);

  // One line per recorded state with the four log-likelihood components
  outfile = create_filename(dir,"ll.txt");
  fp = fopen(outfile,"w");
  if( fp == NULL ){
    // Report the failure but keep going so the remaining outputs of a
    // (possibly long) run are still written.
    perror(outfile);
    status = -1;
  }
  else{
    for(i=0; i < (iter+1); i++){
      fprintf(fp,"%f %f %f %f\n", ll[i][0], ll[i][1], ll[i][2], ll[i][3]);
    }
    fclose(fp);
  }
  free(outfile);

  outfile = create_filename(dir, "pi.txt");
  write_dvec(D, pi_d, outfile);
  free(outfile);

  outfile = create_filename(dir, "path.txt");
  write_variable_line_i(outfile, paths, path_lengths, D);
  free(outfile);

  outfile = create_filename(dir,"levels.txt");
  write_count_ivec(ntot, levels, outfile);
  free(outfile);

  outfile = create_filename(dir, "cp.txt");
  print_cp(graph, W, L, outfile);
  free(outfile);

  outfile = create_filename(dir, "perm.txt");
  print_perm(graph, outfile);
  free(outfile);

  outfile = create_filename(dir, "final_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  gsl_rng_free(rgen);

  return status;
}
