import os.path
import re
import subprocess
from typing import Optional

from datasets import load_dataset, Dataset, DatasetDict, load_from_disk

import secret

# Load the dataset from the Hugging Face Hub. No ``split`` argument is given,
# so the result is indexed by split name below.
# NOTE: an alternative source left by the author is the local path
# "datasets/swe_bench_vanilla".
dataset = load_dataset("princeton-nlp/SWE-bench_oracle")


# Map each split name to the set of distinct repositories appearing in it.
# Only the "repo" column is read, so the remaining fields of each example are
# never decoded just to extract a single value.
repos = {split: set(dataset[split]["repo"]) for split in dataset}

# Report how many repositories are shared between each pair of splits.
# NOTE(review): this assumes the dataset exposes "dev", "train" and "test"
# splits; a missing split raises KeyError here.
print("Overlap dev and test splits: ", len(repos["dev"] & repos["test"]))
print("Overlap dev and train splits: ", len(repos["dev"] & repos["train"]))
print("Overlap train and test splits: ", len(repos["test"] & repos["train"]))

# Dump the full repository set of each split for manual inspection.
print(repos["dev"])
print(repos["train"])
print(repos["test"])
