#!/bin/bash
#
# Slurm batch script for a single-node, 3-GPU training job.
# All "#SBATCH" directives must stay above the first executable command;
# sbatch stops parsing directives once it reaches one.

# Job name and the files that receive the job's stdout / stderr.
#SBATCH --job-name=mistral_full
#SBATCH --output=mistral_full.out
#SBATCH --error=mistral_full.err
# Disabled alternates for the "_scrp" run: the space after "#" means sbatch
# does not treat these lines as directives. Swap them with the three lines
# above when submitting that variant.
# SBATCH --job-name=mistral_full_scrp
# SBATCH --output=mistral_full_scrp.out
# SBATCH --error=mistral_full_scrp.err

# Resources: one node, one Slurm task, three A100 GPUs, 23 h wall-clock limit.
# NOTE(review): only 1 CPU is requested for the task while the launcher below
# starts 3 processes -- confirm this is intentional and not a bottleneck.
#SBATCH --partition=compute
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:A100:3
#SBATCH --time=23:00:00
# E-mail notifications for all job state changes.
#SBATCH --mail-type=ALL
#SBATCH --mail-user=bo@andrew.cmu.edu

# Load the user's shell configuration (presumably this is what defines the
# `conda` shell function -- confirm against the cluster setup). The exit
# status is deliberately not checked: it is merely the status of the last
# command in ~/.bashrc and says nothing about whether setup succeeded.
source ~/.bashrc

# Fail fast if the environment cannot be activated. Without this check the
# job would continue and run the training command with whatever interpreter
# happens to be on PATH, wasting the GPU allocation or failing much later
# with a confusing error.
if ! conda activate agent; then
    echo "ERROR: could not activate conda environment 'agent'; aborting job." >&2
    exit 1
fi

# Supervised fine-tuning run, started through `accelerate launch`.
# The command line is assembled from arrays so each option sits on its own
# line without backslash continuations; the resulting argv is unchanged.

# Options consumed by `accelerate launch` itself: rendezvous port, number of
# worker processes, number of machines.
launcher_opts=(
    --main_process_port 20501
    --num_processes 3
    --num_machines 1
)

# Training entry point handed to the launcher.
trainer_script=/data/b_ou/agent-model/LLaMA-Factory/src/train_bash.py

# Options forwarded to the training script.
trainer_opts=(
    # Stage, base checkpoint and tokenised-data cache location.
    --stage sft
    --model_name_or_path /data/b_ou/ckpts/mistral_flsh_attn/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/7ad5799710574ba1c1d953eba3077af582f3a773/
    --cache_path /data/b_ou/ckpts/data_cache/mistral
    --do_train
    --do_eval

    # Dataset selection and prompt template.
    --dataset m2w_text
    --train_size 16500
    --shuffle False
    --dataset_dir /data/b_ou/agent/data/text/
    --template mistral

    # Fine-tuning mode and output handling (existing output/cache are overwritten).
    --finetuning_type full
    --output_dir /data/b_ou/ckpts/output_16k_mistral_full/
    --overwrite_output_dir True
    --overwrite_cache

    # Batch sizes and gradient accumulation / checkpointing.
    --per_device_train_batch_size 1
    --per_device_eval_batch_size 1
    --gradient_accumulation_steps 16
    --gradient_checkpointing True

    # Schedule, evaluation / checkpoint / logging cadence.
    --lr_scheduler_type cosine
    --evaluation_strategy "steps"
    --save_strategy "epoch"
    --logging_steps "100"
    --save_total_limit 1
    --learning_rate 5e-5
    --num_train_epochs 5
    --cutoff_len 4096
    --plot_loss

    # Precision, FSDP settings (the two-word value must stay a single
    # argument) and experiment reporting.
    --bf16 True
    --fsdp "full_shard auto_wrap"
    --fsdp_transformer_layer_cls_to_wrap 'MistralDecoderLayer'
    --report_to 'wandb'
)

# The two WANDB_* variables are set only in the environment of this command.
WANDB__SERVICE_WAIT=300 WANDB_PROJECT=llama \
    accelerate launch "${launcher_opts[@]}" "$trainer_script" "${trainer_opts[@]}"

# Disabled flag (not part of the command above): --eval_steps 1




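# Disabled alternate run: same settings, but on the m2w_text_scrape dataset with its own cache/output directories and port 20500.
# WANDB__SERVICE_WAIT=300 WANDB_PROJECT=llama accelerate launch  --main_process_port 20500 --num_processes 3 --num_machines 1  /data/b_ou/agent-model/LLaMA-Factory/src/train_bash.py \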
#     --stage sft \
#     --model_name_or_path /data/b_ou/ckpts/mistral_flsh_attn/models--mistralai--Mistral-7B-Instruct-v0.1/snapshots/7ad5799710574ba1c1d953eba3077af582f3a773/ \
#     --cache_path /data/b_ou/ckpts/data_cache/mistral_scrp \
#     --do_train \
#     --do_eval \
#     --dataset m2w_text_scrape \
#     --train_size 7500 \
#     --shuffle False \
#     --dataset_dir /data/b_ou/agent/data/text_scrp/ \
#     --template mistral \
#     --finetuning_type full \
#     --output_dir /data/b_ou/ckpts/output_8k_mistral_full_scrp/ \
#     --overwrite_output_dir True \
#     --overwrite_cache \
#     --per_device_train_batch_size 1 \
#     --per_device_eval_batch_size 1 \
#     --gradient_accumulation_steps 16 \
#     --gradient_checkpointing True \
#     --lr_scheduler_type cosine \
#     --evaluation_strategy "steps" \
#     --save_strategy "epoch" \
#     --logging_steps "100" \
#     --save_total_limit 1 \
#     --cutoff_len 4096 \
#     --learning_rate 5e-5 \
#     --num_train_epochs 5 \
#     --plot_loss \
#     --bf16 True \
#     --fsdp "full_shard auto_wrap" \
#     --fsdp_transformer_layer_cls_to_wrap 'MistralDecoderLayer' \
#     --report_to 'wandb'
