
/*---------------------------------------------------
 * file:    main_resampler.c
 * purpose: Run nonparametric supervised lda on graph category structure.
 * author:  ahollowa@uci.edu
 * date:    1/11/09
 *-------------------------------------------------*/

#include "mylib.h"     // Contains the C std. lib. headers and global #define statements
#include "sampler.h"   // Sampling routines
#include "graph.h"     // Graph data structure
#include "node.h"      // Node data structure
#include "myio.h"      // I/O routines
#include "alloc.h"     // Memory allocation routines
#include "checksum.h"  // Checksums for count matrices
#include "sparse.h"    // Miscellaneous routines for sparse matrix formats
#include "init.h"      // Initialization routines for Gibb's sampler


/*------------------------------------------
 * global variables
 *------------------------------------------ */
static int W;     // Number of unique words in vocabulary (largest word id read in main, plus one)
static int D;     // Number of docs (largest doc id read in main, plus one)
static int L;     // Maximum depth of graph (command-line argument 6)
static int K;     // Command-line argument 7; main copies it into graph->limit_equiv[] for every level
static int ntot;  // Number of words in corpus (line count of the doc-id file)


/*==========================================
 * main
 *
 * Driver for the sampler.  In order, it:
 *   1. parses the 15 command-line arguments (usage is printed and the
 *      process exits with status -1 when the count is wrong),
 *   2. reads the parallel doc-id / word-id token files and derives D and W
 *      as (largest id + 1),
 *   3. builds a graph holding 15 initial nodes whose `keep_cp_fixed` flag
 *      is set to TRUE,
 *   4. loads the cp, perm and path files into that graph,
 *   5. allocates the count matrices and calls randomassignment(),
 *   6. runs `iter` sweeps (z, level assignments, pi, then beta and eta),
 *      writing graph/perm/etot snapshots when i % 50 == 0 and i > 0,
 *   7. writes the final chain state into the output directory.
 *
 * Returns 0 on completion.
 *========================================== */
int main(int argc, char* argv[]){

  /* Third argument passed to add_new_node() for initial nodes 0..14, in
   * that order (presumably the node's level in the graph -- confirm
   * against add_new_node). */
  static const int initial_node_arg[] = {0, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2};
  const int num_fixed_nodes = (int)(sizeof initial_node_arg / sizeof initial_node_arg[0]);

  int allow_new_nodes = TRUE; // Boolean variable
  int num_train_docs, iter, seed, i, j, k, p_estimate, init_capacity, W2, C, num_initial_nodes;
  int *d, *w, *docconcept, *path_lengths, *perm_lengths, *levels, *sum_levels, *Nd, *min_depth;
  int **dl, ***dwl, **cp, **perm, **paths;
  double alpha, beta_prior, eta_prior, a, b;
  double **ll, *pi_d, *beta, *eta;
  char *did_file, *wid_file, *perm_file, *cp_file, *path_file;
  char *dir, *filename, *outfile;
  FILE *fp;
  clock_t begin, end;
  Graph *graph;
  /* NOTE(review): rgen is never given `seed` in this function (only
   * seedMT(seed) is called below), so it runs from GSL's default seed.
   * Confirm whether it should be seeded as well. */
  gsl_rng *rgen = gsl_rng_alloc(gsl_rng_taus);


  /******************************************************
   *					COMMAND LINE
   *******************************************************/
  if( argc != 16 ){
	fprintf(stderr, "usage: %s alpha beta_prior eta_prior a b L K iter seed did wid perm cp path output_dir\n", argv[0]);
	exit(-1);
  }

  alpha       = atof(argv[1]);  // Hyperparameter for stick-breaking beta distribution
  beta_prior  = atof(argv[2]);  // Hyper-Hyperparameter for stick-breaking beta distribution
  eta_prior   = atof(argv[3]);  // Hyper-Hyperparameter for topic Dirichlet prior
  a           = atof(argv[4]);  // Hyperparameter for level Geometric prior
  b           = atof(argv[5]);  // Hyperparameter for level Geometric prior
  L           = atoi(argv[6]);  // Maximum number of nodes in a path (i.e. a node can have depth 0, 1, 2 ... L-1)
  K           = atoi(argv[7]);  // Stored below as the per-level limit_equiv value
  iter        = atoi(argv[8]);  // Number of sampling sweeps
  seed        = atoi(argv[9]);  // Passed to seedMT()
  did_file    = argv[10];
  wid_file    = argv[11];
  perm_file   = argv[12];
  cp_file     = argv[13];
  path_file   = argv[14];
  dir         = argv[15];

  assert(alpha>0);
  assert(beta_prior>0);
  assert(eta_prior>0);
  assert(a>0);
  assert(b>0);
  assert(L>0);
  assert(K>0);
  assert(iter>0);
  seedMT(seed);


 /******************************************************
 *				READ IN DOCWORD MATRIX
 *******************************************************/
  ntot = countlines(did_file);
  assert(ntot>0);
  d = ivec(ntot);
  w = ivec(ntot);
  read_ivec(ntot, d, did_file);
  read_ivec(ntot, w, wid_file);

  // D and W are one more than the largest doc id / word id seen
  D = -1;
  W = -1;
  for(i=0; i<ntot; i++){
	if(d[i] > D){ D = d[i]; }
	if(w[i] > W){ W = w[i]; }
  }
  D++;
  W++;
  assert(D>0);
  assert(W>0);

  /******************************************************
  *				CREATE GRAPH STRUCTURE
  ******************************************************/
  p_estimate = 1;
  init_capacity = 5;
  graph = allocate_graph();

  //Initialize equivalence classes
  graph->num_equiv = ivec(L);
  graph->limit_equiv = ivec(L);
  graph->equivalence = imat(init_capacity, L);
  for(i=0; i < L; i++){
	graph->num_equiv[i] = 1;
	graph->limit_equiv[i] = K;
	graph->equivalence[0][i] = NEW_NODE;
	p_estimate *= 2;   // doubles once per level, so it ends up as 2^L
  }
  graph->equiv_capacity = init_capacity;
  graph->max_depth = L;

  //Initialize nodes array
  graph->capacity = init_capacity;
  graph->nodes = nodevec(graph->capacity);
  for(i=0; i < graph->capacity; i++){
	graph->nodes[i].id = NOT_IN_USE;
  }

  // Create the initial nodes 0..14 in ascending order, exactly as the
  // original hand-written call sequence did.
  for(i=0; i < num_fixed_nodes; i++){
	add_new_node(graph, i, initial_node_arg[i], L, W, D);
  }
  update_next_avail_id(graph);

  // Flag every initial node so that its cp stays fixed
  for(i=0; i < num_fixed_nodes; i++){
	graph->nodes[i].keep_cp_fixed = TRUE;
  }

 /******************************************************
  *		READ IN REMAINING COMMAND LINE ARGUMENTS
  *******************************************************/

  cp = read_sparse(cp_file, &W2, &C);
  // nodes[i] is indexed by C below; abort instead of reading out of bounds
  assert(C <= graph->capacity);
  // NOTE(review): W2 comes from the cp file and is not compared with W;
  // confirm that each node's cp array can hold W2 entries.
  for(i=0; i < C; i++){
	if( graph->nodes[i].id == NOT_IN_USE){ continue; }
	for(j=0; j < W2; j++){
		graph->nodes[i].cp[j] = cp[j][i];
		graph->nodes[i].ztot += cp[j][i];
	}
  }

  num_initial_nodes = countlines(perm_file);
  perm_lengths = ivec(num_initial_nodes);
  perm = read_variable_line_i(perm_file, num_initial_nodes, -1, perm_lengths);
  for(i=0; i < num_initial_nodes; i++){
	// Lines with zero or one entry carry no ordering to restore
	if( perm_lengths[i] == 1 ){ continue; }
	if( perm_lengths[i] == 0 ){ continue; }
	assert(i < graph->capacity);
	assert(graph->nodes[i].num_feasible == perm_lengths[i]);
	for(j=0; j < perm_lengths[i]; j++){
	  graph->nodes[i].feasible[j] = perm[i][j];
	  graph->nodes[i].perm[perm[i][j]] = j;   // inverse mapping: entry -> position
	}
  }
  free(perm_lengths);
  free(perm[0]);
  free(perm);

  num_train_docs = countlines(path_file);
  path_lengths = ivec(num_train_docs);
  paths = read_variable_line_i(path_file, num_train_docs, -1, path_lengths);  // Paths
  for(i=0; i<num_train_docs; i++){
	increment_cnts_sb(graph, paths[i], i, path_lengths[i]-1);
  }
  free(paths[0]);
  free(paths);
  free(path_lengths);
  free(cp);



 /******************************************************
  *			INITIALIZE COUNT MATRICES
  *******************************************************/
  levels = ivec(ntot);     // Word token level assignments
  paths  = imat(D,L);      // Document path assignments
  dl     = imat(D,L);      // Total no. words in doc d at level l
  dwl    = i3d( D, W, L);  // No. times in doc. d that word w occurs at level l
  Nd     = ivec(D);        // Total no. words in a document -- don't really need this
  pi_d   = dvec(D);        // Truncated Geometric level distribution parameter
  sum_levels   = ivec(D);  // Sum of word levels in a document
  min_depth    = ivec(D);  // The maximum level of the words in the doc -- all paths must be at least this depth
  path_lengths = ivec(D);  // Length of document paths
  docconcept = ivec(D);
  for(i=0; i < D; i++){ docconcept[i] = FREE_PATH; }
  for(i=0; i<ntot; i++){ Nd[d[i]]++; }


  /******************************************************
  *		INITIALIZATION AND PRINTING OF INITIAL STATE
  *******************************************************/
  randomassignment(L, D, W, ntot, w, d, paths, path_lengths, levels, sum_levels, dwl, dl, min_depth, pi_d, graph, docconcept);
  checksum_etot_k_agg(graph);
  checksum_dwl(dwl, dl, D, W, L);

  outfile = create_filename(dir,"init_path_assigns.txt");
  print_paths(D, path_lengths, paths ,outfile);
  free(outfile);

  outfile = create_filename(dir, "original_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  //Print program parameters
  printf("D     = %d\n", D);
  printf("W     = %d\n", W);
  printf("L     = %d\n", L);
  printf("K     = %d\n", K);
  printf("ntot  = %d\n", ntot);
  printf("seed  = %d\n", seed);
  printf("iter  = %d\n", iter);
  printf("alpha = %f\n", alpha);
  printf("a     = %f\n", a);
  printf("b     = %f\n", b);
  printf("beta_prior     = %f\n", beta_prior);
  printf("eta_prior      = %f\n", eta_prior);
  printf("p_estimate     = %d\n", p_estimate);
  printf("Graph capacity = %d\n", graph->capacity);


 /******************************************************
  *				GIBBS/MH SAMPLING
  *******************************************************/
  // NOTE(review): every ll[i] used below is overwritten with the pointer
  // returned by log_likelihood(), so whatever row storage dmat() attached
  // is no longer reachable through ll.
  ll   = dmat(iter + 1, 4);
  beta = dvec(iter + 1);
  eta  = dvec(iter + 1);
  beta[0] = 0.0001;
  eta[0]  = 0.00001;
  ll[0]   = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[0], eta[0], a, b, docconcept);
  printf("\tLog Likelihood = %e %e %e %e sum=%e\n", ll[0][0], ll[0][1], ll[0][2], ll[0][3], ll[0][0]+ll[0][1]+ll[0][2]+ll[0][3]);


  for(i=0; i< iter; i++){
	printf("\n\n Iteration %d:\n", i);

	// Elapsed times subtract the clock_t values first and only then convert,
	// so precision is not lost once clock() has grown large.
	begin = clock();
	sample_nonparam_z(L, W, D, p_estimate, alpha, beta[i], eta[i], ntot, d, w, paths, path_lengths, dwl, dl, min_depth, graph, allow_new_nodes, docconcept);
	end = clock();
	checksum_etot_k_agg(graph);
	printf("\tGibbs Sampling of Z: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

	begin = clock();
	sample_level_assignments_geometric( L, W, D, ntot, eta[i], pi_d, paths, d, w, dl, dwl, path_lengths, levels, sum_levels, Nd, min_depth, docconcept, graph);
	end = clock();
	checksum_dwl(dwl, dl, D, W, L);   // kept outside the timed region, like the Z step
	printf("\tGibbs Sampling of L: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

	begin = clock();
	sample_nonparam_pi(D, L, ntot, a, b, d, pi_d, path_lengths, levels, sum_levels, Nd, dl, docconcept, graph, rgen);
	end = clock();
	printf("\tGibbs Sampling of Pi: %f seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);

	ll[i+1] = log_likelihood(graph, paths, path_lengths, levels, d, w, Nd, sum_levels, pi_d, ntot, D, L, W, alpha, beta[i], eta[i], a, b, docconcept);
	printf("\tLog Likelihood = %e %e %e %e sum=%e \n", ll[i+1][0], ll[i+1][1], ll[i+1][2], ll[i+1][3], ll[i+1][0]+ll[i+1][1]+ll[i+1][2]+ll[i+1][3]);

	// Next-iteration hyperparameters are drawn from the current ones
	beta[i+1] = sample_beta( beta_prior, beta[i], alpha, graph, rgen);
	eta[i+1]  = sample_eta(eta_prior, eta[i], W, graph, rgen);

	// Periodic snapshot: every 50th iteration, skipping iteration 0
	if( i % 50 == 0  && i > 0){

		filename = calloc(BUFF_SIZE+1, sizeof(char));
		assert(filename);

		snprintf(filename, BUFF_SIZE+1, "graph_iter%d.txt",i);
		outfile = create_filename(dir, filename);
		print_graph(graph, outfile);
		free(outfile);

		snprintf(filename, BUFF_SIZE+1, "perm_iter%d.txt",i);
		outfile = create_filename(dir, filename);
		print_perm(graph, outfile);
		free(outfile);

		// One line per in-use node: its etot_k entries up to k_capacity
		snprintf(filename, BUFF_SIZE+1, "etot_iter%d.txt",i);
		outfile = create_filename(dir,filename);
		fp = fopen(outfile, "w"); assert(fp);
		for(j=0; j < graph->capacity; j++){
			if( graph->nodes[j].id==NOT_IN_USE){continue;}
			for(k=0; k < graph->nodes[j].k_capacity; k++){
				fprintf(fp, "%d ", graph->nodes[j].etot_k[k]);
			}
			fprintf(fp,"\n");
		}
		fclose(fp);
		free(outfile);

		free(filename);
	}


	printf("\tGraph next avail. id: %d\n", graph->next_avail_id);
  }


  /******************************************************
  *				PRINT STATE OF CHAIN
  *******************************************************/

  outfile = create_filename(dir, "beta.txt");
  write_dvec(iter + 1, beta, outfile);
  free(outfile);

  outfile = create_filename(dir, "eta.txt");
  write_dvec(iter + 1, eta, outfile);
  free(outfile);

  // One line per recorded state: the four log-likelihood components
  outfile = create_filename(dir,"ll.txt");
  fp = fopen(outfile,"w"); assert(fp);
  for(i=0; i < (iter+1); i++){
	fprintf(fp,"%f %f %f %f\n", ll[i][0], ll[i][1], ll[i][2], ll[i][3]);
  }
  fclose(fp);
  free(outfile);

  outfile = create_filename(dir, "pi.txt");
  write_dvec(D, pi_d, outfile);
  free(outfile);

  outfile = create_filename(dir, "path.txt");
  write_variable_line_i(outfile, paths, path_lengths, D);
  free(outfile);

  outfile = create_filename(dir,"levels.txt");
  write_count_ivec(ntot, levels, outfile);
  free(outfile);

  outfile = create_filename(dir, "cp.txt");
  print_cp(graph, W, L, outfile);
  free(outfile);

  outfile = create_filename(dir, "perm.txt");
  print_perm(graph, outfile);
  free(outfile);

  outfile = create_filename(dir, "final_graph_structure.txt");
  print_graph(graph, outfile);
  free(outfile);

  return 0;
}
