from typing import Tuple
from html2axtree import print_tree

import gzip

# Accumulators for the record-splitting loop below:
#   data - list of (metadata dict, body text) tuples, one per parsed record
#   cur  - decoded text of the record currently being assembled
data, cur = [], ''

# Compressed WARC shard; iterating the handle yields bytes lines, which the
# loop below decodes one at a time.
f = gzip.open('./en0000-01.warc.gz', 'r')

def semicolon_to_dict(metadata_list):
    """Convert ``"Key: value"`` header lines into a dictionary.

    Each entry is split at its first colon; the text before it becomes the
    key and the text after it (minus any leading spaces/tabs) becomes the
    value. Entries that contain no colon, or that are not strings, are
    skipped. When a key occurs more than once, the last occurrence wins.

    Args:
        metadata_list: Iterable of header lines.

    Returns:
        A dict mapping each header name to its value.
    """
    metadata_dict = {}
    for element in metadata_list:
        # Non-string entries were silently ignored before (via a bare
        # ``except``); keep that best-effort behavior explicitly.
        if not isinstance(element, str):
            continue

        key, separator, value = element.partition(':')
        if not separator:
            # No colon at all, e.g. a version/status line.
            continue

        # Strip whatever whitespace follows the colon rather than assuming
        # exactly one space, so "Key:value" no longer loses its first
        # character and "Key:  value" does not keep a stray space.
        metadata_dict[key] = value.lstrip(' \t')

    return metadata_dict

# Split the decompressed stream into records. A record is considered complete
# once the accumulated text ends with three consecutive CRLFs; its first
# blank-line-delimited section is parsed as "Key: value" header lines and
# everything after that section is kept verbatim as the body.
try:
    for line in f:
        # Substitute undecodable bytes instead of raising, so one malformed
        # page cannot abort the whole run.
        cur += line.decode('utf-8', errors='replace')
        if cur.endswith('\r\n\r\n\r\n'):
            # partition() at the first blank line yields the same header/body
            # split as split()+join(), without splitting the record twice.
            metadata, _, body = cur.strip().partition('\r\n\r\n')
            metadata = semicolon_to_dict(metadata.split('\r\n'))
            data.append((metadata, body))
            cur = ''

        # Stop after 11 records: the first one is discarded later on
        # (`data = data[1:]`), leaving 10 to process.
        if len(data) > 10:
            break
finally:
    # The archive is not read again after this loop; release the handle.
    f.close()

from webarena.browser_env.processors import ObservationHandler
from playwright.sync_api import sync_playwright

# Discard the first parsed record; only the remaining ones are rendered.
# NOTE(review): presumably the first record is the archive-level info record
# rather than a page - confirm against the WARC file.
data = data[1:]

# Accessibility-tree text for each remaining record, in order.
ax_tree = []

# Start the Playwright driver; `p` is used below to launch Chromium.
p = sync_playwright().start()

from tqdm import tqdm
from pathlib import Path

# Scratch file each record body is written to before Chromium loads it.
# The URL passed to `page.goto` is derived from this same path, so the script
# no longer depends on being run from one specific directory on one machine.
tmp_html = Path('./tmp.html').resolve()

# Render every record body in a fresh Chromium instance and collect the text
# observation produced by ObservationHandler.
# NOTE(review): `p` (the Playwright driver) is never stopped in the visible
# script; call `p.stop()` once nothing later needs it.
for d in tqdm(data):
    # Explicit UTF-8 so page text outside the platform's default encoding
    # cannot make the write fail.
    with open(tmp_html, 'w', encoding='utf-8') as f:
        print(d[1], file=f)

    browser = p.chromium.launch()
    try:
        page = browser.new_page()
        page.goto(tmp_html.as_uri())
        cdp_client = page.context.new_cdp_session(page)

        # Turn on the CDP Accessibility domain for this session before the
        # handler is asked for an observation.
        cdp_client.send(
            "Accessibility.enable"
        )

        # NOTE(review): arguments are passed positionally exactly as before;
        # the dict looks like a viewport size - confirm against
        # ObservationHandler's signature.
        obs_handler = ObservationHandler(
            "text",
            "accessibility_tree",
            "",
            False,
            {"width": 1280, "height": 1080},
        )

        obs = obs_handler.get_observation(page, cdp_client)

        ax_tree.append(obs['text'])
    finally:
        # Always shut the browser down, even when navigation or observation
        # raises, so failed pages do not leak Chromium processes.
        browser.close()

import json
# Persist all collected accessibility-tree texts as a single JSON array.
with open('tmp.json', 'w') as f:
    f.write(json.dumps(ax_tree))
