### playing around with node2vec

import argparse
import networkx as nx
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
# from sympy.utilities.iterables import multiset_permutations
import numpy.random as nprand
import numpy as np
import pandas as pd
from pathlib import Path
import os
os.environ['MKL_VERBOSE'] = '0'  # Suppress MKL verbosity
# from scipy import errstate
from datetime import date
from math import pi
import gzip
import re



### Parse arguments in from command line
## something like python 02_n2v_args.py --data=email --dim=64 --alpha=0.75 --sims=10

## Command-line interface. Every option has a default, so the script can be
## run with no arguments at all.
parser = argparse.ArgumentParser(
    description="Cluster a labelled network with node2vec + k-means and "
                "with spectral clustering, then score both against the "
                "true labels.")
parser.add_argument('--dim', dest='dim', type=int, default=64,
                    help="embedding dimension given to Node2Vec")
parser.add_argument('--sims', dest='sims', type=int, default=1,
                    help="number of repeated runs")
parser.add_argument('--alpha', dest='alpha', type=float, default=0.75,
                    help="negative-sampling exponent (ns_exponent) used "
                         "when fitting the embedding")
## restrict --data to the datasets the script knows how to load, so a typo
## fails here with a clear message instead of a NameError much later
parser.add_argument('--data', dest='data', type=str, default='email',
                    choices=['email', 'blogs'],
                    help="which dataset to load")
parser.add_argument('--notes', dest='notes', type=str, default='',
                    help="free-form note for this run")


args = parser.parse_args()


### unpack the parsed options into the module-level names used below

notes = args.notes
nsims = args.sims
alpha = args.alpha
dim = args.dim
data = args.data

## load in the data and the true community labels here
main_path = os.getcwd()

if data == 'email':
    ## number of clusters to recover (presumably one per department label)
    K = 42
    data_dir = Path(main_path) / "data"

    ## read the edge list: each line holds two integer node ids.
    ## the context manager closes the gzip handle once parsing is done.
    edgelist = []
    with gzip.open(data_dir / "email-Eu-core.txt.gz", "rt", encoding="utf-8") as f:
        for line in f:
            ## \d+ matches whole (multi-digit) ids
            ids = re.findall(r'\d+', line)
            if len(ids) < 2:
                ## skip blank or malformed lines instead of raising IndexError
                continue
            edgelist.append((int(ids[0]), int(ids[1])))

    ## convert to networkx format. from_edgelist builds an undirected Graph
    ## by default, so each edge only needs to be listed once.
    G = nx.from_edgelist(edgelist)
    largest_cc = max(nx.connected_components(G), key=len)
    N = G.subgraph(largest_cc)
    nodes = N.nodes()

    ## load in the true labels also, keyed by node id
    label_by_node = {}
    with gzip.open(data_dir / "email-Eu-core-department-labels.txt.gz", "rt",
                   encoding="utf-8") as f:
        for line in f:
            ids = re.findall(r'\d+', line)
            if len(ids) < 2:
                continue
            label_by_node[int(ids[0])] = int(ids[1])

    unlabelled = [n for n in nodes if n not in label_by_node]
    if unlabelled:
        raise ValueError(
            "no ground-truth label for {} node(s), e.g. {}".format(
                len(unlabelled), unlabelled[0]))

    ## emit the labels in N.nodes() order: the embedding rows and the
    ## adjacency matrix built later both follow that order, and the label
    ## file's own ordering is not guaranteed to match it
    truth = [label_by_node[n] for n in nodes]


if data == 'blogs':
    ## two clusters are fitted for this dataset
    K = 2
    data_path = main_path + "/data/polblogs/polblogs.gml"

    ## read the GML file, collapse the multigraph into a regular graph,
    ## and make sure the result is undirected
    G = nx.Graph(nx.read_gml(data_path)).to_undirected()

    ## keep only the largest connected component
    largest_cc = max(nx.connected_components(G), key=len)
    N = G.subgraph(largest_cc)

    ## the true label of each node is its "value" attribute,
    ## collected in N.nodes order
    truth = [attrs["value"] for _, attrs in N.nodes.data()]


## one record per (run, method) is appended here by the loop below
all_results = []

today = date.today()
curr_date = today.strftime("%d_%m_%Y")
## round() rather than int(): int() truncates floating-point error, so e.g.
## alpha = 0.29 (0.29 * 100 == 28.999999999999996) would be labelled 28.
## notes is appended so that runs tagged with different --notes do not
## overwrite each other; with the default empty note the name is unchanged.
file_name = "{}_dim{}_alpha{}_{}{}".format(
    data, int(dim), round(alpha * 100), curr_date, notes)


def _spectral_cluster_labels(graph, n_clusters):
    """Cluster the nodes of *graph* with normalised spectral clustering.

    Builds the symmetric normalised Laplacian I - D^{-1/2} A D^{-1/2} from
    the dense adjacency matrix, takes eigenvectors 1..n_clusters (eigh
    returns them in ascending eigenvalue order, so the first one is
    skipped), standardises the columns and runs k-means with a fixed seed.

    Returns one cluster label per node, in ``graph.nodes()`` order.
    Assumes every node has non-zero degree (no isolated nodes).
    """
    adj_matrix = nx.to_numpy_array(graph)
    ## broadcasting gives the same D^{-1/2} A D^{-1/2} as two dense matrix
    ## products, without materialising the n x n diagonal matrices
    inv_sqrt_deg = 1 / np.sqrt(adj_matrix.sum(axis=1))
    laplacian = (np.eye(adj_matrix.shape[0])
                 - inv_sqrt_deg[:, None] * adj_matrix * inv_sqrt_deg[None, :])

    _, eigenvectors = np.linalg.eigh(laplacian)
    v = eigenvectors[:, 1:n_clusters + 1]
    v_normalized = StandardScaler().fit_transform(v)
    return KMeans(n_clusters=n_clusters, random_state=0).fit(v_normalized).labels_


## the spectral baseline depends only on the graph and uses a fixed seed, so
## it is identical in every run: compute it once instead of once per run
sc_labels = _spectral_cluster_labels(N, K)
sc_ari = adjusted_rand_score(sc_labels, truth)
sc_ami = adjusted_mutual_info_score(sc_labels, truth)

for i in range(nsims):
    ## fresh random walks for every run
    node2vec = Node2Vec(N, dimensions=dim, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs

    # Embed nodes
    model = node2vec.fit(window=10, min_count=1, batch_words=4, ns_exponent=alpha)

    ## one embedding row per node, in N.nodes() order (node ids are looked
    ## up by their string form)
    emb_df = pd.DataFrame(
        [model.wv.get_vector(str(n)) for n in N.nodes()],
        index=N.nodes
    )
    X = emb_df.values

    ## then do k-means on that
    kmeans = KMeans(n_clusters=K, n_init=10).fit(X)

    ## then compare these to the truth
    ari_score = adjusted_rand_score(kmeans.labels_, truth)
    ## NOTE: the 'NMI' column holds the *adjusted* mutual information; the
    ## key is kept as-is so existing readers of the output still work
    nmi_score = adjusted_mutual_info_score(kmeans.labels_, truth)

    ## plain scalars, not one-element lists: all_results becomes a
    ## DataFrame from a list of dicts, and list values would end up as
    ## list-valued cells such as ['n2v'] in the output
    all_results.append({'method': "n2v", 'ARI': ari_score, 'NMI': nmi_score,
                        'alpha': alpha, 'sim': i, 'dim': dim})

    ## repeat the spectral clustering result for this run as well
    all_results.append({'method': "SC", 'ARI': sc_ari, 'NMI': sc_ami,
                        'alpha': None, 'sim': i, 'dim': None})



## gather every per-run record into a single table
df = pd.DataFrame(data=all_results)

## output goes to <cwd>/subfolder/<date>/<file_name>_out.csv
curr_dir = os.getcwd()
save_path = "".join([curr_dir, "/subfolder/", curr_date, '/'])
filepath = Path("".join([save_path, file_name, '_out.csv']))

## create the dated output folder (and any missing parents) if needed
out_dir = filepath.parent
out_dir.mkdir(parents=True, exist_ok=True)

df.to_csv(filepath)
