import json


def get_file_list(datafile, outfile):
    """Extract the ordered list of file paths referenced by a dataset.

    Reads a JSON document from ``datafile`` and iterates over its entries.
    Each entry must have a ``'unique_id'`` string; the part of that id
    before the first ``'.p0'`` is taken as the path. Runs of consecutive
    entries with the same path are collapsed into a single item. The
    resulting list is written as JSON to ``outfile`` and also returned.

    Args:
        datafile: Path of the JSON dataset file to read.
        outfile: Path of the JSON file to write the path list to
            (overwritten if it already exists).

    Returns:
        The list of paths, in order of first appearance within each run.

    Raises:
        KeyError: If an entry has no ``'unique_id'`` key.
    """
    # Context managers make sure both handles are closed (and the output
    # flushed) deterministically instead of relying on garbage collection.
    with open(datafile) as src:
        dataset = json.load(src)

    files = []
    for entry in dataset:
        path = entry['unique_id'].split('.p0')[0]
        # Only adjacent duplicates are dropped: a path that shows up again
        # after a different one is intentionally kept as a new item.
        if not files or files[-1] != path:
            files.append(path)

    with open(outfile, 'w') as dst:
        json.dump(files, dst)
    return files


# Build the path lists for the training and validation datasets. These calls
# run at import time, read the hard-coded dataset files below, and write the
# resulting lists into the current working directory.
# NOTE(review): the training output is named 'ref_train.train' while the
# validation output uses '.json' — confirm whether the '.train' suffix is
# intentional before relying on it.
train_files = get_file_list(
    datafile='/scratch/hengyuan/minirts/data/train.json_min10',
    outfile='ref_train.train',
)
valid_files = get_file_list(
    datafile='/scratch/hengyuan/minirts/data/valid.json_min10',
    outfile='ref_valid.json',
)
