"""
Reinforcement learning maze example.

Red rectangle:          explorer.
Black rectangles:       hells       [reward = -1].
Yellow bin circle:      paradise    [reward = +1].
All other states:       ground      [reward = 0].

This script is the main part which controls the update method of this example.
The RL agent (QLearningTable) is in the QLearning module.

View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from QLearning import QLearningTable
from Maze_denseR import Maze
from arguments.args import get_args
from arguments.utils import make_env
import argparse



# Parse the run configuration once; the values below are read from it.
args = get_args()

import random

# Run settings used by the training loop and embedded in output file names.
scenario_name = args.scenario_name
total_episode = args.num_episodes
total_step = args.max_episode_len

# Seed both Python's and NumPy's global RNGs for reproducibility.
random_seed = args.random_seed
random.seed(random_seed)
np.random.seed(random_seed)

# Accumulators filled in by run_maze() and plotted by the __main__ section.
Total_Rewards = 0
episode_total_reward = []        # total reward collected in each episode
mean_episode_reward_list = []    # running mean of episode rewards
task_completed_step = []         # step index at which `done` was reported


def _result_path(prefix):
    """Return the CSV path for *prefix* in ``args.save_data_dir``, tagged with the run settings."""
    return args.save_data_dir + "/{prefix}_{scenario}_{episode}_{step}.csv".format(
        prefix=prefix, scenario=scenario_name, episode=total_episode, step=total_step)


def _save_agent_table(table, prefix):
    """Write one per-agent table to CSV with its four action columns labelled."""
    frame = pd.DataFrame(table)
    frame.columns = ['a_0', 'a_1', 'a_2', 'a_3']
    frame.to_csv(_result_path(prefix))


def run_maze():
    """Train the three Q-learning agents and dump the results to CSV.

    Uses the module-level ``env``, ``RL_1``/``RL_2``/``RL_3``, ``args``,
    ``total_episode``, ``total_step`` and ``scenario_name``.  For every
    episode the environment is reset and stepped at most ``total_step``
    times; each agent picks an action from its own observation and learns
    from the shared ``reward`` returned by ``env.step``.

    Side effects:
        * appends to ``episode_total_reward``, ``mean_episode_reward_list``
          and ``task_completed_step``;
        * writes the reward/step histories, each agent's ``q_table`` and
          each agent's ``action_table`` as CSV files under
          ``args.save_data_dir``;
        * prints progress to stdout and finally calls ``env.destroy()``.
    """
    agents = (RL_1, RL_2, RL_3)
    mean_episode_reward = 0

    for episode in range(total_episode):
        # initial observation
        current_episode_reward = 0
        observation = env.reset()
        print("Start epsiode", episode)

        for s in range(total_step):
            # fresh env
            env.render()

            # Each agent chooses its action from its own slice of the observation.
            action_n = [
                agent.choose_action(str(observation[i]))
                for i, agent in enumerate(agents)
            ]

            # Take the joint action and get the next observation and reward.
            observation_, reward, done = env.step(action_n)
            current_episode_reward += reward

            # Every agent learns from its own transition with the shared reward.
            for i, agent in enumerate(agents):
                agent.learn(str(observation[i]), action_n[i], reward,
                            str(observation_[i]))

            # swap observation
            observation = observation_

            # Stop stepping as soon as the environment reports completion.
            if done:
                print("task achieved: YES!!!!!!!!!!!!")
                print("task achieved after ", s, " steps")
                task_completed_step.append(s)
                break

        # Every 200th episode (except episode 0), update epsilon for each
        # agent whose epsilon is still at or below 0.8.
        if episode != 0 and episode % 200 == 0:
            for agent in agents:
                if agent.epsilon <= 0.8:
                    agent.update_epsilon()

        print('current episode reward when this episode ends:', current_episode_reward)
        episode_total_reward.append(current_episode_reward)

        # Running mean over all episodes finished so far.
        total_rewards = np.sum(episode_total_reward)
        mean_episode_reward = total_rewards / (episode + 1)
        mean_episode_reward_list.append(mean_episode_reward)

        print("Mean Episode Rewards after", episode, "episode is:", mean_episode_reward)
        print("End Episode :", episode)
        print("\n")

    print("training process over mean episode rewards:",
          mean_episode_reward)

    # end of game
    print('game over')

    # Constructing file paths
    total_reward_file = _result_path("TotalReward")
    mean_reward_file = _result_path("MeanEpisodeReward")
    task_completed_file = _result_path("TaskCompletedStep")

    # Saving dataframes to CSV
    pd.DataFrame(episode_total_reward).to_csv(total_reward_file)
    pd.DataFrame(mean_episode_reward_list).to_csv(mean_reward_file)
    pd.DataFrame(task_completed_step).to_csv(task_completed_file)

    # Print all Q-tables first, then save them one file per agent.
    for agent in agents:
        print(agent.q_table)
    for index, agent in enumerate(agents, start=1):
        _save_agent_table(agent.q_table, "Q{}Table".format(index))

    # Print and save each agent's own action table.  Agent 3 previously
    # saved agent 2's table by mistake; each agent now writes its own.
    for index, agent in enumerate(agents, start=1):
        print(agent.action_table)
        _save_agent_table(agent.action_table, "ExpertRL_{}_Table".format(index))

    env.destroy()

if __name__ == "__main__":
    # Build the environment and one independent Q-table learner per agent.
    args = get_args()
    env = Maze()
    RL_1 = QLearningTable(actions=list(range(env.n_actions)))
    RL_2 = QLearningTable(actions=list(range(env.n_actions)))
    RL_3 = QLearningTable(actions=list(range(env.n_actions)))

    # Schedule training through the environment's `after` hook, then hand
    # control to its main loop (presumably Tk-style; the delay unit is
    # whatever Maze.after expects -- confirm against Maze).  run_maze()
    # ends by calling env.destroy().
    env.after(100, run_maze)
    env.mainloop()

    # (data series, y-axis label, output file prefix) for each figure.
    figures = (
        (episode_total_reward, 'Total reward', "EpisodeTotalReward"),
        (task_completed_step, 'Task Completed Steps in current episode', "TaskCompleteSteps"),
        (mean_episode_reward_list, 'Mean Episode reward', "MeanEpisodeReward"),
    )
    for series, ylabel, prefix in figures:
        # Start a fresh figure for every plot: when plt.show() does not
        # block and close the window (e.g. a non-interactive backend), the
        # curves would otherwise be drawn onto the same axes and every
        # saved PNG after the first would contain the earlier curves too.
        plt.figure()
        plt.plot(np.arange(len(series)), series)
        plt.xlabel('Episode')
        plt.ylabel(ylabel)
        fig_file = args.save_fig_dir + "/{prefix}_{scenario}_{episode}_{step}.png".format(
            prefix=prefix, scenario=scenario_name, episode=total_episode, step=total_step)
        plt.savefig(fig_file, format='png')
        plt.show()

