always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 3537108992.0
cola_params: 48311112
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 2048
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 458752
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 196608
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 6
neurons: 0
non_emb_flops: 3511943168.0
non_emb_params: 47655752
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 4
num_ffn_experts: 1
opt_name: AdamW
out_dir: /chkpt/btt_norm_moe_para_e4_k2_ffe1k2_all_but_last_l6-dm2048-de-1-h-1-dh64-ttr1-010237
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: btt_norm_moe_para
timestamp: 2024-05-19_010237
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: btt_norm_moe_para_e4_k2_ffe1k2_all_but_last_l6-dm2048-de-1-h-1-dh64-ttr1-010237
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.1938 | P 3.269e+00 | Lt 1.2154 | Pt 3.340e+00 | H 4.4344 | ETA 11.02h
I 5000 | L 1.1204 | P 3.040e+00 | Lt 1.1352 | Pt 3.085e+00 | H 11.2034 | ETA 10.78h
I 7500 | L 1.0864 | P 2.939e+00 | Lt 1.0993 | Pt 2.976e+00 | H 18.2798 | ETA 10.51h
I 10000 | L 1.0648 | P 2.876e+00 | Lt 1.0858 | Pt 2.937e+00 | H 25.5325 | ETA 10.24h
I 12500 | L 1.0559 | P 2.851e+00 | Lt 1.0702 | Pt 2.892e+00 | H 32.9486 | ETA 9.96h
I 15000 | L 1.0403 | P 2.807e+00 | Lt 1.0579 | Pt 2.857e+00 | H 40.2649 | ETA 9.68h
I 17500 | L 1.0352 | P 2.793e+00 | Lt 1.0498 | Pt 2.834e+00 | H 47.5253 | ETA 9.40h
I 20000 | L 1.0298 | P 2.778e+00 | Lt 1.0449 | Pt 2.820e+00 | H 54.6853 | ETA 9.12h
I 22500 | L 1.0225 | P 2.758e+00 | Lt 1.0374 | Pt 2.799e+00 | H 62.4441 | ETA 8.83h
I 25000 | L 1.0168 | P 2.743e+00 | Lt 1.0348 | Pt 2.792e+00 | H 69.9040 | ETA 8.55h
I 27500 | L 1.0136 | P 2.734e+00 | Lt 1.0274 | Pt 2.772e+00 | H 78.1948 | ETA 8.27h
I 30000 | L 1.0130 | P 2.732e+00 | Lt 1.0228 | Pt 2.759e+00 | H 84.2713 | ETA 7.98h
I 32500 | L 1.0049 | P 2.710e+00 | Lt 1.0214 | Pt 2.755e+00 | H 92.9028 | ETA 7.70h
I 35000 | L 1.0041 | P 2.708e+00 | Lt 1.0228 | Pt 2.759e+00 | H 100.1092 | ETA 7.42h
I 37500 | L 1.0037 | P 2.707e+00 | Lt 1.0185 | Pt 2.747e+00 | H 108.6346 | ETA 7.13h
I 40000 | L 0.9979 | P 2.692e+00 | Lt 1.0170 | Pt 2.743e+00 | H 116.0104 | ETA 6.85h
I 42500 | L 0.9949 | P 2.684e+00 | Lt 1.0114 | Pt 2.728e+00 | H 123.3250 | ETA 6.56h
I 45000 | L 0.9966 | P 2.688e+00 | Lt 1.0084 | Pt 2.720e+00 | H 132.7997 | ETA 6.28h
I 47500 | L 0.9915 | P 2.675e+00 | Lt 1.0085 | Pt 2.720e+00 | H 138.4220 | ETA 5.99h
I 50000 | L 0.9906 | P 2.672e+00 | Lt 1.0073 | Pt 2.717e+00 | H 146.9906 | ETA 5.71h
I 52500 | L 0.9902 | P 2.671e+00 | Lt 1.0037 | Pt 2.707e+00 | H 154.5452 | ETA 5.42h
I 55000 | L 0.9862 | P 2.661e+00 | Lt 1.0020 | Pt 2.703e+00 | H 162.7639 | ETA 5.14h
I 57500 | L 0.9875 | P 2.664e+00 | Lt 1.0008 | Pt 2.700e+00 | H 170.9854 | ETA 4.85h
I 60000 | L 0.9848 | P 2.657e+00 | Lt 0.9983 | Pt 2.693e+00 | H 180.7983 | ETA 4.57h
I 62500 | L 0.9820 | P 2.650e+00 | Lt 0.9978 | Pt 2.691e+00 | H 189.6872 | ETA 4.28h
I 65000 | L 0.9827 | P 2.651e+00 | Lt 0.9948 | Pt 2.683e+00 | H 196.5364 | ETA 4.00h
I 67500 | L 0.9832 | P 2.653e+00 | Lt 0.9929 | Pt 2.678e+00 | H 203.6066 | ETA 3.71h
I 70000 | L 0.9790 | P 2.642e+00 | Lt 0.9952 | Pt 2.684e+00 | H 211.0779 | ETA 3.42h
I 72500 | L 0.9776 | P 2.638e+00 | Lt 0.9912 | Pt 2.674e+00 | H 223.2937 | ETA 3.14h
I 75000 | L 0.9791 | P 2.642e+00 | Lt 0.9919 | Pt 2.676e+00 | H 230.6140 | ETA 2.85h
I 77500 | L 0.9809 | P 2.647e+00 | Lt 0.9875 | Pt 2.664e+00 | H 242.1125 | ETA 2.57h
I 80000 | L 0.9769 | P 2.636e+00 | Lt 0.9889 | Pt 2.668e+00 | H 248.4719 | ETA 2.28h
I 82500 | L 0.9748 | P 2.631e+00 | Lt 0.9880 | Pt 2.665e+00 | H 257.4416 | ETA 2.00h
I 85000 | L 0.9728 | P 2.626e+00 | Lt 0.9817 | Pt 2.649e+00 | H 266.2040 | ETA 1.71h
I 87500 | L 0.9732 | P 2.627e+00 | Lt 0.9831 | Pt 2.653e+00 | H 277.0175 | ETA 1.43h
I 90000 | L 0.9713 | P 2.622e+00 | Lt 0.9831 | Pt 2.652e+00 | H 282.4663 | ETA 1.14h
I 92500 | L 0.9700 | P 2.618e+00 | Lt 0.9852 | Pt 2.658e+00 | H 294.9325 | ETA 0.86h
I 95000 | L 0.9732 | P 2.627e+00 | Lt 0.9821 | Pt 2.650e+00 | H 304.9352 | ETA 0.57h
I 97500 | L 0.9704 | P 2.619e+00 | Lt 0.9851 | Pt 2.658e+00 | H 310.0593 | ETA 0.29h
I 100000 | L 0.9733 | P 2.627e+00 | Lt 0.9844 | Pt 2.656e+00 | H 324.1381 | ETA 0.00h
I 100001 | L 0.9733 | P 2.627e+00 | Lt 0.9844 | Pt 2.656e+00
Finished training!
