from datasets import load_dataset, DatasetDict, Dataset
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
from bs4 import BeautifulSoup


# Directory where the downloaded problem pages are stored, one HTML file per
# (contest, problem index) pair.
SAVE_PATH = "/fs/cml-projects/E2H/Code_new/codeforces/contest_problem_html"

# Load the "problem-v0" configuration of the Easy2Hard-Codeforces dataset,
# keeping the Hugging Face download cache on the shared project filesystem.
problem_dataset = load_dataset(
    "mcding-org/Easy2Hard-Codeforces",
    name="problem-v0",
    cache_dir="/fs/cml-projects/E2H/Huggingface_cache",
)

# Number of download attempts per problem before the page is reported as failed.
_MAX_FETCH_ATTEMPTS = 3
# Upper bound (seconds) on a single HTTP request so that one stalled connection
# cannot block the whole scrape forever.
_REQUEST_TIMEOUT_SECONDS = 30


def _is_cached_page_usable(html_path: str) -> bool:
    """Return True if *html_path* holds a page with a problem statement header.

    A page counts as usable when it can be read and parsed and contains a
    ``div.problem-statement`` element that itself contains a ``div.header``.
    A missing, unreadable, or otherwise malformed file yields False so the
    caller downloads the page again.
    """
    try:
        with open(html_path, "r", encoding="utf-8") as cached:
            soup = BeautifulSoup(cached.read(), features="html.parser")
        statement = soup.find(name="div", class_="problem-statement")
        if statement is None:
            return False
        return statement.find(name="div", class_="header") is not None
    except Exception:
        # Any read/decode/parse failure just means "not cached". Catching
        # Exception (not a bare except) lets Ctrl-C and SystemExit propagate.
        return False


def _download_page(url: str) -> str:
    """Fetch *url*, retrying on errors or empty bodies; return "" on failure."""
    for _ in range(_MAX_FETCH_ATTEMPTS):
        try:
            page = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS).text
        except requests.RequestException:
            # Network error or timeout: try again.
            continue
        if page:
            return page
    return ""


# Walk every problem in the train split and make sure a usable HTML copy of its
# statement page exists under SAVE_PATH, downloading it when it does not.
# Iterating the dataset directly (rather than a bare iterator) lets tqdm show
# the total number of problems.
for example in tqdm(problem_dataset["train"]):
    contestID = example["contestId"]
    index = example["index"]
    url = f"https://codeforces.com/contest/{contestID}/problem/{index}"
    html_path = f"{SAVE_PATH}/contestID_{contestID}_index_{index}.html"

    if _is_cached_page_usable(html_path):
        continue

    response = _download_page(url)
    if not response:
        print(f"Fail: {url}")
    # The file is written even when the download failed (empty content); an
    # empty file does not pass the cache check, so a later run retries it.
    with open(html_path, "w", encoding="utf-8") as wf:
        wf.write(response)
