import json
import pandas as pd
import librosa
import soundfile as sf
from pathlib import Path
from tqdm import tqdm
# Column buffers for the output table. Each list receives one entry per
# supervision segment found in the manifest, so they always stay aligned.
path = []
sentence = []
offset = []
duration = []
speaker = []


# The manifest is read as JSON Lines (one JSON object per line) and streamed
# line by line. UTF-8 is requested explicitly so that decoding does not depend
# on the locale of the machine running the script.
with open('/data2/libriheavy/libriheavy_cuts_large.jsonl', encoding='utf-8') as f:
    # `total` only sizes the progress bar; presumably it is the manifest's
    # line count -- TODO confirm whenever the manifest file changes.
    for line in tqdm(f, total=11156939, leave=False):
        cut = json.loads(line)
        global_start = cut['start']
        for obj in cut['supervisions']:
            # No audio is clipped here: each row stores the recording's file
            # name (relative to /data2/libriheavy/download/librilight/) plus
            # the offset/duration window to read from it.
            path.append(obj['recording_id'] + ".flac")
            # Only the first entry of the 'texts' list is kept.
            sentence.append(obj['custom']['texts'][0])
            # The stored offset is the supervision start plus the cut start;
            # presumably this yields a position within the full recording --
            # verify against the manifest format.
            offset.append(obj['start'] + global_start)
            duration.append(obj['duration'])
            speaker.append(obj['speaker'])

# One row per supervision, written as a tab-separated file without the index.
df = pd.DataFrame({'path': path, 'offset': offset, 'duration': duration, 'sentence': sentence, 'speaker': speaker})
df.to_csv('/data2/libriheavy/libriheavy_cuts_large.tsv', index=False, sep='\t')

# Build an index from each speaker value to the list of 0-based row positions
# in `df` that belong to that speaker, in row order.
speaker2line = {}
# Only the speaker column is needed, so it is extracted on its own instead of
# converting every row of the table into a dict. A separate loop name (`spk`)
# avoids rebinding the module-level `speaker` list.
for i, spk in enumerate(tqdm(df['speaker'].tolist())):
    speaker2line.setdefault(spk, []).append(i)

# Persist the index as JSON (`json` is already imported at the top of the file).
with open('/data2/libriheavy/libriheavy_large_speaker2line.json', 'w') as f:
    json.dump(speaker2line, f)



# import json
# import pandas as pd
# import librosa
# import soundfile as sf
# from pathlib import Path
# from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor

# def process_line(line):
#     line = json.loads(line)
#     global_start = line['start']
#     results = []
#     for obj in line['supervisions']:
#         wav_path = "/data2/libriheavy/download/librilight/" + obj['recording_id'] + ".flac"
#         clip_output_path = obj['id'] + ".wav"
#         wav = librosa.load(wav_path, offset=obj['start']+global_start, duration=obj['duration'], sr=16000)[0]
#         Path("/data2/libriheavy/download/librilight_clip/" + clip_output_path).parent.mkdir(parents=True, exist_ok=True)
#         sf.write("/data2/libriheavy/download/librilight_clip/" + clip_output_path, wav, 16000)
#         results.append((clip_output_path, obj['custom']['texts'][0], obj['duration']))
#     return results

# def main():
#     with open('/data2/libriheavy/libriheavy_cuts_large.jsonl') as f:
#         lines = f.readlines()

#     with ThreadPoolExecutor() as executor:
#         results = list(tqdm(executor.map(process_line, lines), total=len(lines)))

#     # Flatten the results and create the DataFrame
#     flattened_results = [item for sublist in results for item in sublist]
#     df = pd.DataFrame(flattened_results, columns=['path', 'sentence', 'duration'])
#     df.to_csv('/data2/libriheavy/libriheavy_cuts_large.tsv', index=False, sep='\t')

# if __name__ == "__main__":
#     main()
            
