always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 6218645504
cola_params: 311675520
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 512
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 114688
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 49152
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 9
neurons: 0
non_emb_flops: 6212354048
non_emb_params: 311511680
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 16
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe16k2_all_but_last_l9-dm512-de-1-h-1-dh64-ttr1-174305
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_174305
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe16k2_all_but_last_l9-dm512-de-1-h-1-dh64-ttr1-174305
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.1632 | P 3.171e+00 | Lt 1.1833 | Pt 3.235e+00 | H 42.7803 | ETA 10.34h
I 5000 | L 1.0867 | P 2.940e+00 | Lt 1.1077 | Pt 3.001e+00 | H 88.8092 | ETA 10.22h
I 7500 | L 1.0520 | P 2.840e+00 | Lt 1.0737 | Pt 2.902e+00 | H 113.8710 | ETA 10.01h
I 10000 | L 1.0360 | P 2.795e+00 | Lt 1.0509 | Pt 2.837e+00 | H 135.8756 | ETA 9.75h
I 12500 | L 1.0205 | P 2.753e+00 | Lt 1.0387 | Pt 2.803e+00 | H 153.1031 | ETA 9.51h
I 15000 | L 1.0115 | P 2.728e+00 | Lt 1.0292 | Pt 2.777e+00 | H 167.6929 | ETA 9.23h
I 17500 | L 1.0033 | P 2.706e+00 | Lt 1.0189 | Pt 2.748e+00 | H 183.6618 | ETA 8.95h
I 20000 | L 0.9929 | P 2.678e+00 | Lt 1.0064 | Pt 2.714e+00 | H 201.4441 | ETA 8.67h
I 22500 | L 0.9884 | P 2.666e+00 | Lt 1.0033 | Pt 2.706e+00 | H 214.4437 | ETA 8.41h
I 25000 | L 0.9854 | P 2.658e+00 | Lt 0.9998 | Pt 2.697e+00 | H 226.8675 | ETA 8.13h
I 27500 | L 0.9792 | P 2.642e+00 | Lt 0.9962 | Pt 2.687e+00 | H 242.6321 | ETA 7.85h
I 30000 | L 0.9751 | P 2.631e+00 | Lt 0.9868 | Pt 2.662e+00 | H 254.6015 | ETA 7.57h
I 32500 | L 0.9747 | P 2.630e+00 | Lt 0.9853 | Pt 2.658e+00 | H 267.6825 | ETA 7.31h
I 35000 | L 0.9699 | P 2.618e+00 | Lt 0.9817 | Pt 2.649e+00 | H 277.9328 | ETA 7.03h
I 37500 | L 0.9670 | P 2.610e+00 | Lt 0.9773 | Pt 2.637e+00 | H 295.1678 | ETA 6.75h
I 40000 | L 0.9626 | P 2.599e+00 | Lt 0.9754 | Pt 2.632e+00 | H 309.1584 | ETA 6.47h
I 42500 | L 0.9607 | P 2.594e+00 | Lt 0.9764 | Pt 2.635e+00 | H 325.1440 | ETA 6.20h
I 45000 | L 0.9584 | P 2.588e+00 | Lt 0.9713 | Pt 2.622e+00 | H 339.5075 | ETA 5.93h
I 47500 | L 0.9603 | P 2.593e+00 | Lt 0.9717 | Pt 2.623e+00 | H 350.7495 | ETA 5.65h
I 50000 | L 0.9535 | P 2.576e+00 | Lt 0.9666 | Pt 2.609e+00 | H 362.0437 | ETA 5.37h
I 52500 | L 0.9497 | P 2.566e+00 | Lt 0.9633 | Pt 2.601e+00 | H 383.4968 | ETA 5.10h
I 55000 | L 0.9531 | P 2.575e+00 | Lt 0.9598 | Pt 2.592e+00 | H 393.3452 | ETA 4.83h
I 57500 | L 0.9497 | P 2.566e+00 | Lt 0.9601 | Pt 2.593e+00 | H 408.6419 | ETA 4.55h
I 60000 | L 0.9465 | P 2.558e+00 | Lt 0.9578 | Pt 2.587e+00 | H 417.8508 | ETA 4.27h
I 62500 | L 0.9436 | P 2.551e+00 | Lt 0.9610 | Pt 2.595e+00 | H 434.8682 | ETA 4.00h
I 65000 | L 0.9427 | P 2.548e+00 | Lt 0.9555 | Pt 2.581e+00 | H 452.9860 | ETA 3.74h
I 67500 | L 0.9482 | P 2.562e+00 | Lt 0.9530 | Pt 2.574e+00 | H 463.7524 | ETA 3.47h
I 70000 | L 0.9429 | P 2.549e+00 | Lt 0.9599 | Pt 2.592e+00 | H 480.0281 | ETA 3.20h
I 72500 | L 0.9410 | P 2.544e+00 | Lt 0.9501 | Pt 2.567e+00 | H 495.3662 | ETA 2.92h
I 75000 | L 0.9403 | P 2.542e+00 | Lt 0.9496 | Pt 2.566e+00 | H 512.1424 | ETA 2.66h
I 77500 | L 0.9366 | P 2.533e+00 | Lt 0.9473 | Pt 2.560e+00 | H 523.1767 | ETA 2.39h
I 80000 | L 0.9403 | P 2.542e+00 | Lt 0.9485 | Pt 2.563e+00 | H 528.6934 | ETA 2.13h
I 82500 | L 0.9378 | P 2.536e+00 | Lt 0.9486 | Pt 2.563e+00 | H 557.8209 | ETA 1.86h
I 85000 | L 0.9361 | P 2.532e+00 | Lt 0.9411 | Pt 2.544e+00 | H 560.7196 | ETA 1.59h
I 87500 | L 0.9342 | P 2.527e+00 | Lt 0.9453 | Pt 2.555e+00 | H 586.7111 | ETA 1.33h
I 90000 | L 0.9362 | P 2.532e+00 | Lt 0.9436 | Pt 2.551e+00 | H 592.0150 | ETA 1.06h
I 92500 | L 0.9342 | P 2.527e+00 | Lt 0.9442 | Pt 2.552e+00 | H 616.7423 | ETA 0.79h
I 95000 | L 0.9292 | P 2.514e+00 | Lt 0.9426 | Pt 2.548e+00 | H 627.7857 | ETA 0.53h
I 97500 | L 0.9318 | P 2.521e+00 | Lt 0.9382 | Pt 2.537e+00 | H 645.6047 | ETA 0.26h
I 100000 | L 0.9268 | P 2.508e+00 | Lt 0.9373 | Pt 2.535e+00 | H 655.6570 | ETA 0.00h
I 100001 | L 0.9268 | P 2.508e+00 | Lt 0.9373 | Pt 2.535e+00
Finished training!
