import sys
sys.path.append("..")
from backend.gpt import query_gpt4

import os
import yaml
import json
from pprint import pprint

# load global config
# Paths are resolved relative to this file: the config is read from
# <parent of this file's directory>/config/global.yaml.
file_path = os.path.dirname(__file__)
project_path = os.path.dirname(file_path)
# Use a context manager so the config file handle is closed as soon as the
# YAML has been parsed instead of being left open for the whole run.
with open(os.path.join(project_path, "config/global.yaml"), "r") as config_file:
    global_config = yaml.safe_load(config_file)

# Shared prefix of this evaluation run: prediction files in the Friends
# directory are selected by it, and the result file is named
# `file_prefix + "result"`.
file_prefix = "FriendsV3_GPT3.5_Full_"

# Prompt template for the LLM judge. The three positional `{}` placeholders are
# filled, in order, with the question, the ground truth answer and the model
# prediction via `str.format`. The prompt asks the judge to reply with only
# 1 (correct) or 0 (incorrect).
# NOTE(review): this text is sent to the model verbatim, so any change to the
# wording (including the "a experienced" typo) may change judge behavior --
# confirm before editing.
llm_judge_prompt = """
You are a experienced human labeler for reading comprehension task.
Given a ground truth answer and a model prediction,
you have to judge whether the model prediction is correct.
The question is {}.
The ground truth answer is {}.
The model prediction is {}.

return 1 if the model prediction is correct else 0.
the model prediction may be a little different on the expression, as long as the meaning or key entity is correct, the answer can be regarded as correct.
ONLY RETURN THE NUMBER.
"""

# Collect the sample ids already recorded in the result file (first
# tab-separated column of every row) so that a re-run can skip them.
result_path = file_prefix + "result"
evaluated_set = set()
if os.path.exists(result_path):
    with open(result_path, "r") as result_file:
        evaluated_set.update(
            row.strip().split("\t")[0] for row in result_file
        )

# Judge every prediction file of this run that has not been evaluated yet and
# append one tab-separated row per sample to the result file:
#   sample_id, ground truth answer, predicted answer, judge verdict.
# The result file is opened in append mode so rows from earlier runs are kept.
# The context manager guarantees the file is closed even if a judge call raises.
with open(file_prefix + "result", "a") as fw:
    for filename in os.listdir("./Friends"):
        if file_prefix in filename and filename.endswith(".json"):
            print(filename)
            # Close each prediction file right after parsing it.
            with open(os.path.join("Friends", filename), "r") as sample_file:
                data = json.load(sample_file)
            # The resume set holds strings parsed from the result file, and
            # the row below is built with str.join, so normalise the id to str
            # before comparing and writing.
            sample_id = str(data['sample_id'])
            if sample_id in evaluated_set:
                continue
            question = data['question_v2']
            ground_truth = data['answer']
            predict = data['predicted_answer']
            query = llm_judge_prompt.format(question, ground_truth, predict)
            judge_result = query_gpt4(query)
            fw.write("\t".join([sample_id, ground_truth, predict, judge_result]) + "\n")
            # Flush after every row so finished (and paid-for) judgements are
            # on disk if a later iteration crashes; the next run skips them.
            fw.flush()