import urllib.request as request
import re, orjson, os
import numpy as np

# Directory that receives the generated answer file; main() writes
# "AIME_answer.jsonl" into it.
ANSWER_DIR = "/fs/cml-projects/E2H/AMC/problems"

# Maps the contest name as it appears in the wiki URL to a two-item list:
#   [0] short prefix used when building result names (e.g. "AIME1_2010_3"),
#   [1] the years to crawl. np.arange(2010, 2023) excludes the stop value, so
#       this covers 2010 through 2022; the elements are NumPy integers.
contests = {
    "AIME_I":["AIME1", list(np.arange(2010, 2023))],
    "AIME_II":["AIME2", list(np.arange(2010, 2023))]
}


def crawl_answer_key(url, name):
    page = request.urlopen(url)
    strings = str(page.read(), encoding='utf-8')

    answer_pattern = r"<div id=\"mw-content-text\" lang=\"en\" dir=\"ltr\" class=\"mw-content-ltr\"><div class=\"mw-parser-output\"><p>Return to.*?</li></ol>"
    answer_text = re.findall(answer_pattern, strings, re.DOTALL)[0]
    answer_lines = re.findall(r"<ol>.*?</ol>", answer_text, re.DOTALL)[0].split('\n')
    answer_keys = {}
    for index, answer_line in enumerate(answer_lines):
        answer_str = re.findall(r"<li>.*?</li>", answer_line)[0].replace('<li>', '').replace('</li>', '')
        answer_keys[f'{name}_{index+1}'] = answer_str

    return answer_keys


def crawl_data_process(contest, year):
    """Fetch the answer key for one contest in one year.

    Builds the wiki answer-key URL from ``year`` and ``contest``, and the
    result-name prefix from the contest's short name in ``contests`` plus the
    year, then delegates to ``crawl_answer_key``.

    Returns:
        Whatever ``crawl_answer_key`` returns for that page.
    """
    page_url = f"https://artofproblemsolving.com/wiki/index.php/{year}_{contest}_Answer_Key"
    short_name = contests[contest][0]
    return crawl_answer_key(page_url, f"{short_name}_{year}")
    

def main():
    """Crawl every configured contest and year and write the answers as JSONL.

    For each entry in ``contests`` and each of its years, fetches the answer
    key and appends one JSON object per problem to
    ``{ANSWER_DIR}/AIME_answer.jsonl``. Each line has the form
    ``{"result_name": <key>, "answer": <answer string>}``.

    The output file is truncated when opened, before any page is fetched.
    """
    # orjson produces UTF-8 bytes; pin the file encoding so the text written
    # is UTF-8 regardless of the platform's locale default.
    with open(f"{ANSWER_DIR}/AIME_answer.jsonl", "w", encoding="utf-8") as wf:
        for contest, contest_value in contests.items():
            # contest_value is [short prefix, list of years].
            for year in contest_value[1]:
                answer_key_dict = crawl_data_process(contest, year)
                for result_name, answer in answer_key_dict.items():
                    problem_dict = {
                        "result_name": result_name,
                        "answer": answer,
                    }
                    json_line = orjson.dumps(problem_dict, option=orjson.OPT_NAIVE_UTC | orjson.OPT_SERIALIZE_NUMPY)
                    wf.write(f"{json_line.decode('utf-8')}\n")


# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()