import torch
import torch.utils.benchmark as benchmark

import sys
sys.path.append("../../")

from hollow.models.fno import NeuralOperator2d
from hollow.models.t1 import T1_2d
from hollow.losses.relative_l2 import RelativeL2
from hollow.models.fno import NeuralOperator1d
from hollow.models.t1 import T1_1d
from hollow.models.layers.fdm1d import DFTConv1d, DCTConv1d

from hollow.utils.numerics import dct2d
from hollow.train import train

from omegaconf import DictConfig, OmegaConf, open_dict
import json
from tqdm.auto import tqdm
import os
import numpy as np

# Benchmark configuration: output location and the model variants being compared.
folder = '.'
file_name = "latent_1d_benchmark_results_layers_highres.json"
model_names = ['vanilla', 'latent', 'latent_dct']

# TODO: modify, enlarge
# Sweep axes, both ordered from largest to smallest value.
layer_bench = np.arange(1, 9, 1, dtype=int)[::-1].tolist()  # 8, 7, ..., 1
signal_bench = np.linspace(64, 1024, 16, dtype=int)[::-1].tolist()  # 1024, 960, ..., 64
width = 512  # keep fixed

threads = 32

# Nested results table: benchmark_results[model][nlayers][signal_res] -> list of timings.
# Plain dicts and lists are used because they are easy to save.
# Every leaf is its own fresh list, so samples can be accumulated into it independently.
benchmark_results = {
    model_name: {
        nlayers: {signal_res: [] for signal_res in signal_bench}
        for nlayers in layer_bench
    }
    for model_name in model_names
}


device = torch.device("cuda:2")

# Repeat the experiment n times
repetition_number = 5

for _ in range(repetition_number):

    # Sweep over the number of layers; fresh models are built for every depth.
    for nlayers in tqdm(layer_bench, desc="Layer iter"):

        # Hyperparameters shared by all three operators, so the models differ
        # only in their class / spectral transform.
        common_kwargs = dict(
            modes=12,
            padding=9,
            width=width,
            nlayers=nlayers,
            residual=True,
            keep_high=False,
            weight_init=4,
            signal_resolution=1,  # dummy res for init
        )

        latent_operator = T1_1d(
            perform_inverse=False,
            transform="dft",
            **common_kwargs
        ).to(device)

        latent_dct_operator = T1_1d(
            perform_inverse=False,
            transform="dct",
            **common_kwargs
        ).to(device)

        vanilla_operator = NeuralOperator1d(
            spectral_layer=DFTConv1d,
            **common_kwargs
        ).to(device)

        # Iterate over all signal resolutions for each layer count
        for signal_res in tqdm(signal_bench, leave=False, desc="Signal res iter"):

            # NOTE: callgrind-based profiling (wrapping the operators in
            # CopyIfCallgrind) was tried here experimentally and is not enabled.

            # Time the forward pass of each model on a random (32, signal_res)
            # input created in the untimed setup step. Looping over explicit
            # (name, model) pairs keeps each result key tied to the model that
            # is actually being measured.
            for name, model in (
                ('vanilla', vanilla_operator),
                ('latent', latent_operator),
                ('latent_dct', latent_dct_operator),
            ):
                timer = benchmark.Timer(
                    setup='x = torch.randn(32, signal_res).to(device)',
                    stmt='model(x)',
                    globals={'signal_res': signal_res, 'device': device, 'model': model},
                    num_threads=threads)

                # Accumulate the timings of this run on top of earlier repetitions.
                benchmark_results[name][nlayers][signal_res].extend(
                    timer.blocked_autorange().times)

# Save file
# Note: json.dump writes the integer layer / resolution keys as strings.
with open(os.path.join(folder, file_name), "w") as f:
    # The with-statement closes the file on exit, so no explicit close() is needed.
    json.dump(benchmark_results, f, indent=2)