# ---------------------------------------------------------------------------
# Process environment for the training run launched at the bottom of this file.
# ---------------------------------------------------------------------------

# Set which GPU devices are visible to the process; --num_processes should be
# adjusted accordingly. Example 4-GPU subsets: "0,1,2,3" or "4,5,6,7".
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export MASTER_PORT=5010
# Must be exported: a plain assignment stays local to this shell and would
# never be seen by the launched training process.
export ACCELERATE_LOG_LEVEL=info
# Set memory allocation configuration.
export PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.6,max_split_size_mb:32"
export NCCL_BLOCKING_WAIT=1
export NCCL_ALLREDUCE_TIMEOUT_MS=3600000
# Set the home directory for the Hugging Face transformers library cache.
# export HF_HOME="${your_hf_home}"

# ---------------------------------------------------------------------------
# Experiment identity. These values are forwarded to the training command and
# are also used to build the log file name and the checkpoint directory.
# ---------------------------------------------------------------------------
teacher='llama2'
student='llama2'
model_name='meta-llama/llama-2-7b-hf'
dataset='gsm8k'
data_path="gsm8k"
exp_name=LoT_llama2

prefix="LoT_llama"

# mkdir -p is a no-op when the directory already exists, so no -d guard needed.
mkdir -p -- "logs/${exp_name}"

# Run index embedded in the log / checkpoint names. Start from the caller's
# value when one is provided, otherwise from 0, so the first run never gets an
# empty index (e.g. "..._seed.log").
# NOTE(review): this value is only used for naming here; the training command
# below is not given a --seed flag. Confirm whether that is intended.
seed=${seed:-0}

# Advance to the first index whose log file does not exist yet, so an earlier
# run's log is never overwritten.
log_filename="logs/${exp_name}/${prefix}_${teacher}_${student}_${dataset}_seed${seed}.log"
while [ -f "${log_filename}" ]; do
    seed=$((seed + 1))
    log_filename="logs/${exp_name}/${prefix}_${teacher}_${student}_${dataset}_seed${seed}.log"
done

# Checkpoint directory for this run, keyed by the same index as the log file.
output_dir="ckpt/${exp_name}/${teacher}_${student}_${dataset}_${seed}_outputs"
mkdir -p -- "${output_dir}"

# Launch llama/src/train_lot.py through the deepspeed launcher. The job runs
# in the background under nohup, with stdout and stderr both written to
# ${log_filename}.
# All expansions are quoted so a value containing whitespace or glob
# characters is still passed as a single argument.
# NOTE(review): the meaning of --alpha, --T and --start_step is defined by
# train_lot.py, which is not visible from this script.
nohup deepspeed llama/src/train_lot.py \
    --model_name_or_path "${model_name}" \
    --data_path "${data_path}" \
    --output_dir "${output_dir}" \
    --teacher "${teacher}" \
    --student "${student}" \
    --alpha 0.01 \
    --T 2 \
    --start_step 80 \
    --num_train_epochs 2 \
    --model_max_length 512 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 80 \
    --save_total_limit 5 \
    --learning_rate 2e-5 \
    --warmup_steps 5 \
    --logging_steps 5 \
    --lr_scheduler_type "cosine" \
    --report_to "tensorboard" \
    --gradient_checkpointing True \
    --deepspeed "llama/src/configs/deepspeed_config_llama.json" \
    --fp16 False \
    --bf16 True \
    > "${log_filename}" 2>&1 &
