#!/usr/bin/python

'''
	call using
 	python3.6 main.py -i myInputFile.csv -o myOutputFile.csv -k 3 --tau 3 --threshold 0.5 --minClusterSize 20 --verbose 0

		All variables are as in the paper except:
		- minClusterSize is L in the paper
		- threshold is theta in the paper
		- k is the value for k given to the SVD
		- tau is the value for k given to k-means

	If you specify an input parameter (not a file!) multiple times, the
	algorithm will iterate over all given values and return all right side
	clusters it has ever seen.

	Don't get confused by the slightly different terminology used here because
	this used to be a data mining paper until we turned to bipartite graphs
	(they are much better anyways):
		- patterns correspond to clusters on the right side
		- transactions correspond to vertices on the left side
	Generally, it might be helpful to think about the biadjacency matrix of the
	graph when reading the code.

	It could be that for large k compared to n the computation of the SVD
	crashes. This is not our fault but a problem of scikit-learn.
'''


import numpy
import random
import sys, getopt
import os

from time import gmtime, strftime

import SyntheticData

import MajorityVote
import ProjCluster

def main(argv):
	'''
	Command line entry point: parses the options, loads the input matrix,
	computes the clusters and writes the reconstructed patterns as csv.

	argv is the list of command line arguments without the program name.

	The options -k, --tau, --gamma and --threshold may be given several
	times. ProjCluster.ProjCluster is called once for every combination of
	k, tau and gamma and MajorityVote.BinaryMajorityVote once for every
	threshold; all results are accumulated.

	Exits with status 2 if the options are malformed, no input file was
	given or an unknown clustering method was requested. Exits with status 0
	after -h and after --generate.
	'''
	inputfile = ''
	outputfile = ''

	writeClusterOutput = False
	clusterOutputFile = ''

	ks = []
	taus = []
	gammas = []
	thresholds = []
	clustering = "kmeans"
	verbose = 0

	minClusterSize = 20
	setting = 'simple'

	longOptions = ["ifile=","ofile=","setting=","tau=","gamma=","threshold=","minClusterSize=","clustering=","verbose=","generate","clusterOutputFile="]
	try:
		opts, _ = getopt.getopt(argv, "hk:i:o:", longOptions)
	except getopt.GetoptError:
		printHelp()
		sys.exit(2)

	for opt, arg in opts:
		if opt == '-h':
			printHelp()
			sys.exit()
		elif opt in ("-i", "--ifile"):
			inputfile = arg
		elif opt in ("-o", "--ofile"):
			outputfile = arg
		elif opt == "--clusterOutputFile":
			writeClusterOutput = True
			clusterOutputFile = arg
		elif opt == "--setting": # should be "simple"
			setting = arg
		elif opt == "-k": # the clustering parameter "k" used for the SVD
			ks.append(int(arg))
		elif opt == "--tau": # the clustering parameter "k" used for k-means
			taus.append(float(arg))
		elif opt == "--gamma": # not used anymore
			gammas.append(float(arg))
		elif opt == "--threshold":
			thresholds.append(float(arg))
		elif opt == "--minClusterSize":
			minClusterSize = int(arg)
		elif opt == "--verbose":
			verbose = int(arg)
		elif opt == "--clustering":
			if arg == "kmeans":
				clustering = arg
			else:
				print('Unknown clustering method given.')
				sys.exit(2)
		elif opt == "--generate":
			# NOTE(review): generateData is not defined in the visible part of
			# this file -- confirm where it is supposed to come from.
			generateData()
			sys.exit()
		else:
			print("unknown option {0}".format(opt))
			printHelp()
			sys.exit(2)

	# Fall back to the default values for parameters that were not given.
	if not ks:
		ks.append(0)
	if not taus:
		taus.append(2.0)
	if not gammas:
		gammas.append(0.5)
	if not thresholds:
		thresholds.append(0.5)

	# Fail early with a usage hint instead of an error from numpy.loadtxt('').
	if not inputfile:
		print('No input file given.')
		printHelp()
		sys.exit(2)

	# ndmin=2 keeps the matrix two-dimensional even if the file contains only
	# a single row or a single column, so the shape can always be unpacked.
	B = numpy.loadtxt(inputfile, delimiter=',', ndmin=2)
	rows, columns = B.shape

	# Collect the clusters for every combination of the given parameters.
	transactionClusters = []
	for k in ks:
		for tau in taus:
			for gamma in gammas:
				currentTransactionClusters = ProjCluster.ProjCluster(B.T, k, tau, projection=setting, gamma=gamma, clustering=clustering)
				transactionClusters.extend(currentTransactionClusters)

	# Every threshold is applied to all clusters found above.
	reconstructedPatterns = []
	for threshold in thresholds:
		currentPatterns = MajorityVote.BinaryMajorityVote(B,transactionClusters,threshold=threshold,minClusterSize=minClusterSize)
		reconstructedPatterns.extend(currentPatterns)

	writePatternMatrix(outputfile, reconstructedPatterns, columns)
	if writeClusterOutput:
		writePatternMatrix(clusterOutputFile, transactionClusters, rows)

	if verbose > 0:
		print('number of reconstructedPatterns: {0}'.format(len(reconstructedPatterns)))
		print('reconstructedPatterns = {0}'.format(reconstructedPatterns))

	if verbose >= 2:
		print('number of transactionClusters: {0}. with lengths: '.format(len(transactionClusters)), end='')
		for cl in transactionClusters:
			print('{0}, '.format(len(cl)), end='')
		print('\b\b ')

	# Level 3 lists only the clusters with at least 20 members, level 4 and
	# above lists all of them.
	if verbose >= 3:
		print('transactionClusters = ',end='')
		for cl in transactionClusters:
			if len(cl) < 20 and verbose < 4:
				continue
			print('\n- {0}, '.format(sorted(cl)),end='')
		print('\b\b ')

def writePatternMatrix(outfile, patterns, columns):
	'''
	Turns the patterns into a 0/1 matrix and hands it to
	SyntheticData.exportToCSVFile together with outfile.

	Each non-empty pattern becomes one row of width columns in which the
	entries at the indices contained in the pattern are set to 1. Empty
	patterns do not get a row.
	'''
	# Drop the empty patterns first so that the matrix has exactly one row per
	# remaining pattern.
	nonEmptyPatterns = [members for members in patterns if len(members) != 0]

	matrix = numpy.zeros((len(nonEmptyPatterns), columns), dtype=numpy.float64)
	for rowIndex, members in enumerate(nonEmptyPatterns):
		for columnIndex in members:
			matrix[rowIndex, columnIndex] = 1

	SyntheticData.exportToCSVFile(outfile, matrix)

def printHelp():
	''' Prints a one-line usage hint pointing to the comments in main.py. '''
	usageHint = 'Usage: See comments on top of main.py.'
	print(usageHint)


# Run the command line interface only when this file is executed as a script;
# the program name (sys.argv[0]) is not passed on to main.
if __name__ == '__main__':
	main(sys.argv[1:])

