"""
Reinforcement learning maze example.

Red rectangle:          explorer.
Black rectangles:       hells       [reward = -1].
Yellow bin circle:      paradise    [reward = +1].
All other states:       ground      [reward = 0].

This script is the main part, which controls the update loop of this example.
The RL agent (QLearningTable) is implemented in QLearning.py.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from QLearning import QLearningTable
from Maze_denseR import Maze
import argparse

import argparse

"""
Legacy training parameters, kept for reference only (this block is an inert string literal and is never executed).
#Configurations

# 4 * 4 maze
#from Maze_denseR import Maze

# 8 * 8 maze
from Maze_large1 import Maze


episode_total_reward = []
Total_Rewards = 0

mean_episode_reward_list = []
task_completed_step = []

total_episode = 10000
total_step = 10

"""


def get_args(argv=None):
    """Build the command-line parser and parse the experiment options.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what callers without arguments get.

    Returns:
        argparse.Namespace: one attribute per option defined below (dashes in
        option names become underscores, e.g. ``--maze-name`` ->
        ``args.maze_name``).
    """
    # The banner has to be passed by keyword: the first positional parameter
    # of ArgumentParser is ``prog``, so passing it positionally would make the
    # whole sentence show up as the program name in the usage line.
    parser = argparse.ArgumentParser(
        description="Reinforcement Learning experiments for multiagent environments"
    )

    # Environment
    parser.add_argument("--maze-name", type=str, default="./Maze_denseR.py", help="name of the scenario script")

    parser.add_argument("--maze-H", type=int, default=6, help="maze height")
    parser.add_argument("--maze-W", type=int, default=6, help="maze width")

    # The scenario name is only used as a label inside output file names.
    parser.add_argument("--scenario-name", type=str, default="3_agent_maze_6_6_run1", help="name of this run, used to label the output files")
    parser.add_argument("--max-episode-len", type=int, default=6, help="maximum episode length, which is the number of steps per episode")
    parser.add_argument("--num-episodes", type=int, default=10000, help="number of total episodes")

    # Sampling of action trajectories
    parser.add_argument("--sampling-method", type=str, default="SamplingActionTrajectoriesFor_500_Episodes", help="name of the sampling method for baselines DT, named by how many episodes sampled")
    parser.add_argument("--max-sampling-episode-len", type=int, default=3, help="maximum episode length, which is the number of steps per episode for sampling action trajectories")
    parser.add_argument("--num-sampling-episodes", type=int, default=1000, help="number of total sampling episodes for sampling action trajectories")

    # Core training parameters
    parser.add_argument("--num-clusters", type=int, default=4, help="number of clustering labels, how many different labels could be chosen from")
    parser.add_argument("--max-leaf-nodes", type=int, default=4, help="leaf nodes constraints in the Decision Tree")
    parser.add_argument("--max-depth", type=int, default=2, help="max depth constraints in the Decision Tree")
    parser.add_argument("--random-seed", type=int, default=55, help="random seed for different runs")

    # Outputs
    # Models
    parser.add_argument("--save-DTModel-dir", type=str, default="./outputs/model", help="directory in which trained decision tree models should be saved")
    parser.add_argument("--load-DTModel-dir", type=str, default="./outputs/model", help="directory in which trained decision tree models should be loaded")

    # Data
    parser.add_argument("--save-data-dir", type=str, default="./outputs/data", help="directory in which trained data should be saved")
    parser.add_argument("--load-data-dir", type=str, default="./outputs/data", help="directory in which trained data should be loaded")

    # Figs
    parser.add_argument("--save-fig-dir", type=str, default="./outputs/fig", help="directory in which generated fig should be saved")
    parser.add_argument("--load-fig-dir", type=str, default="./outputs/fig", help="directory in which generated fig should be loaded")

    # Evaluate in step 3
    parser.add_argument("--evaluate-episodes", type=int, default=200, help="number of episodes for evaluating")
    parser.add_argument("--evaluate-episode-len", type=int, default=3, help="length of episodes for evaluating")

    return parser.parse_args(argv)

import importlib.util

def make_env(args):
    """Load the maze module named by ``args.maze_name`` and instantiate its ``Maze``.

    Args:
        args: Parsed options. Only ``args.maze_name`` is read here; it is
            treated as a filesystem path to a Python source file.

    Returns:
        tuple: ``(env, args)`` where ``env`` is a fresh instance of the
        ``Maze`` class defined in the loaded module and ``args`` is the same
        object that was passed in.

    Raises:
        ImportError: If no import spec or loader can be built for the path
            (for example, the path has no recognised Python source suffix).
    """
    maze_module_path = args.maze_name
    spec = importlib.util.spec_from_file_location("Maze", maze_module_path)

    # spec_from_file_location returns None when it cannot pick a loader for
    # the path; fail with a clear message instead of an AttributeError on
    # the lines below.
    if spec is None or spec.loader is None:
        raise ImportError(
            "cannot load a maze module from {!r}".format(maze_module_path)
        )

    maze_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(maze_module)

    # The loaded module is expected to expose a class named ``Maze`` that can
    # be constructed without arguments.
    env = maze_module.Maze()

    return env, args


import random

# Parse the command line once at import time; the functions below read the
# resulting values through these module-level names.
args = get_args()

# Seed both the stdlib and NumPy generators from the same value so that runs
# with the same --random-seed draw the same random numbers from them.
random_seed = args.random_seed
random.seed(random_seed)
np.random.seed(random_seed)

# Run configuration taken from the command line.
scenario_name = args.scenario_name
total_sampling_episode, total_sampling_step = (
    args.num_sampling_episodes,
    args.max_sampling_episode_len,
)

# Accumulators filled in by run_maze().
Total_Rewards = 0
episode_total_reward = []
mean_episode_reward_list = []
task_completed_step = []



def _save_action_table(agent, agent_number):
    """Print one agent's action table and write it to ``Action<N>Table_*.csv``."""
    print(agent.action_table)
    table = pd.DataFrame(agent.action_table)
    # One column per action; this labelling assumes exactly four actions.
    table.columns = ['a_0', 'a_1', 'a_2', 'a_3']

    table_file = args.save_data_dir + "/Action{number}Table_{scenario}_{episode}_{step}.csv".format(
        number=agent_number, scenario=scenario_name,
        episode=total_sampling_episode, step=total_sampling_step)
    table.to_csv(table_file)


def run_maze():
    """Run the sampling episodes for the three agents and save the results.

    Relies on module-level state: ``env``, ``RL_1``, ``RL_2`` and ``RL_3``
    (created in the ``__main__`` block), ``args``, ``scenario_name``,
    ``total_sampling_episode``, ``total_sampling_step`` and the accumulator
    lists ``episode_total_reward``, ``mean_episode_reward_list`` and
    ``task_completed_step``.

    For each of ``total_sampling_episode`` episodes the environment is reset
    and stepped at most ``total_sampling_step`` times. On every step each
    agent picks an action from the string form of its own observation, the
    joint action list is passed to ``env.step``, and every agent learns from
    the same reward. After the last episode the running mean episode reward
    and each agent's action table are written as CSV files under
    ``args.save_data_dir``, and the environment is destroyed.
    """
    import os

    agents = (RL_1, RL_2, RL_3)
    mean_episode_reward = 0

    for episode in range(total_sampling_episode):
        # initial observation
        current_episode_reward = 0
        observation = env.reset()
        print("Start epsiode", episode)

        for s in range(total_sampling_step):
            # fresh env
            env.render()

            # Each agent chooses its action from its own observation, in
            # agent order (RL_1, RL_2, RL_3).
            action_n = [
                agent.choose_action(str(observation[i]))
                for i, agent in enumerate(agents)
            ]

            # Take the joint action and get the next observation and reward.
            observation_, reward, done = env.step(action_n)
            current_episode_reward += reward
            print('current episode reward:', current_episode_reward)

            # Every agent learns from this transition using the same reward.
            for i, agent in enumerate(agents):
                agent.learn(str(observation[i]), action_n[i], reward, str(observation_[i]))

            # swap observation
            observation = observation_

            # Stop stepping as soon as the environment reports completion.
            if done:
                print("task achieved: YES!!!!!!!!!!!!")
                # NOTE: ``s`` is the zero-based index of the finishing step.
                print("task achieved after ", s, " steps")
                task_completed_step.append(s)
                break

        # Every 200 episodes (never on episode 0) call update_epsilon() on
        # each agent whose epsilon is still at or below 0.8.
        if episode != 0 and episode % 200 == 0:
            for agent in agents:
                if agent.epsilon <= 0.8:
                    agent.update_epsilon()

        print('current episode reward when this episode ends:', current_episode_reward)
        episode_total_reward.append(current_episode_reward)
        print('Total reward List after', episode, "episode is:", episode_total_reward)

        # Running mean of the per-episode reward over all episodes so far.
        total_reward = np.sum(episode_total_reward)
        mean_episode_reward = total_reward / (episode + 1)
        mean_episode_reward_list.append(mean_episode_reward)

        print("Mean Episode Rewards after", episode, "episode is:", mean_episode_reward)
        print("Mean Episode Rewards List:", mean_episode_reward_list)
        print("End Episode :", episode)
        print("\n")

    print("training process over mean episode rewards:",
          mean_episode_reward)

    # end of game
    print('game over')

    # Make sure the output directory exists so the run's results are not
    # lost to a missing-directory error after all episodes have finished.
    if args.save_data_dir:
        os.makedirs(args.save_data_dir, exist_ok=True)

    # NOTE: only the mean-episode-reward series and the action tables are
    # exported; per-episode totals, task-completed steps and Q-tables are
    # kept in memory / printed but not written to disk.
    mean_reward_file = args.save_data_dir + "/SamplingAction_MeanEpisodeReward_{scenario}_{episode}_{step}.csv".format(
        scenario=scenario_name, episode=total_sampling_episode, step=total_sampling_step)
    pd.DataFrame(mean_episode_reward_list).to_csv(mean_reward_file)

    for agent in agents:
        print(agent.q_table)

    # Each agent's table goes to its own file; agent 3's file is built from
    # RL_3's table (it was previously written from RL_2's by mistake).
    for agent_number, agent in enumerate(agents, start=1):
        _save_action_table(agent, agent_number)

    env.destroy()

if __name__ == "__main__":
    import os

    # The command line was already parsed at module level; it is parsed again
    # here, which yields the same values.
    args = get_args()

    # NOTE(review): ``Maze`` comes from the static import at the top of the
    # file; ``--maze-name`` and make_env() are not used here — confirm that
    # this is intended.
    env = Maze()

    # One independent Q-learning table per agent, all over the same action set.
    RL_1 = QLearningTable(actions=list(range(env.n_actions)))
    RL_2 = QLearningTable(actions=list(range(env.n_actions)))
    RL_3 = QLearningTable(actions=list(range(env.n_actions)))

    # Schedule run_maze on the environment and hand control to its main loop
    # (presumably Tk's ``after``/``mainloop``; 100 would then be milliseconds).
    env.after(100, run_maze)
    env.mainloop()

    # Plot the running mean episode reward collected by run_maze().
    plt.plot(np.arange(len(mean_episode_reward_list)), mean_episode_reward_list)
    plt.xlabel('Episode')
    plt.ylabel('Mean Episode reward')
    fig_MeanEpisodeReward_file = args.save_fig_dir + "/SamplingAction_MeanEpisodeReward_{scenario}_{episode}_{step}.png".format(
        scenario=scenario_name, episode=total_sampling_episode, step=total_sampling_step)

    # Create the figure directory if needed so savefig does not fail after
    # the whole run has completed.
    if args.save_fig_dir:
        os.makedirs(args.save_fig_dir, exist_ok=True)
    plt.savefig(fig_MeanEpisodeReward_file, format='png')
    plt.show()


