always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 422199296
cola_params: 19527168
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 128
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 28672
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 12288
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 9
neurons: 0
non_emb_flops: 420626432
non_emb_params: 19486208
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 16
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe16k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-174021
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_174021
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe16k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-174021
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2262 | P 3.376e+00 | Lt 1.2498 | Pt 3.456e+00 | H 70.0858 | ETA 9.82h
I 5000 | L 1.1601 | P 3.162e+00 | Lt 1.1826 | Pt 3.233e+00 | H 187.6145 | ETA 9.52h
I 7500 | L 1.1317 | P 3.074e+00 | Lt 1.1547 | Pt 3.145e+00 | H 266.7681 | ETA 9.28h
I 10000 | L 1.1202 | P 3.039e+00 | Lt 1.1367 | Pt 3.089e+00 | H 337.9524 | ETA 9.01h
I 12500 | L 1.1038 | P 2.990e+00 | Lt 1.1212 | Pt 3.042e+00 | H 391.2703 | ETA 8.77h
I 15000 | L 1.0948 | P 2.963e+00 | Lt 1.1192 | Pt 3.036e+00 | H 445.6078 | ETA 8.54h
I 17500 | L 1.0893 | P 2.947e+00 | Lt 1.1084 | Pt 3.003e+00 | H 495.6542 | ETA 8.29h
I 20000 | L 1.0837 | P 2.931e+00 | Lt 1.1043 | Pt 2.991e+00 | H 535.4566 | ETA 8.12h
I 22500 | L 1.0781 | P 2.915e+00 | Lt 1.0992 | Pt 2.976e+00 | H 579.2473 | ETA 7.84h
I 25000 | L 1.0792 | P 2.918e+00 | Lt 1.0959 | Pt 2.967e+00 | H 628.8515 | ETA 7.60h
I 27500 | L 1.0716 | P 2.896e+00 | Lt 1.0861 | Pt 2.938e+00 | H 665.3822 | ETA 7.34h
I 30000 | L 1.0702 | P 2.892e+00 | Lt 1.0860 | Pt 2.938e+00 | H 706.7880 | ETA 7.07h
I 32500 | L 1.0630 | P 2.871e+00 | Lt 1.0853 | Pt 2.936e+00 | H 745.0711 | ETA 6.81h
I 35000 | L 1.0582 | P 2.858e+00 | Lt 1.0837 | Pt 2.931e+00 | H 787.2953 | ETA 6.55h
I 37500 | L 1.0599 | P 2.862e+00 | Lt 1.0793 | Pt 2.918e+00 | H 837.0805 | ETA 6.29h
I 40000 | L 1.0595 | P 2.861e+00 | Lt 1.0766 | Pt 2.910e+00 | H 870.8851 | ETA 6.03h
I 42500 | L 1.0599 | P 2.862e+00 | Lt 1.0803 | Pt 2.921e+00 | H 908.0772 | ETA 5.78h
I 45000 | L 1.0537 | P 2.845e+00 | Lt 1.0736 | Pt 2.902e+00 | H 955.0597 | ETA 5.53h
I 47500 | L 1.0561 | P 2.852e+00 | Lt 1.0714 | Pt 2.895e+00 | H 1000.0307 | ETA 5.28h
I 50000 | L 1.0515 | P 2.839e+00 | Lt 1.0686 | Pt 2.887e+00 | H 1040.8873 | ETA 5.02h
I 52500 | L 1.0496 | P 2.833e+00 | Lt 1.0687 | Pt 2.888e+00 | H 1079.6890 | ETA 4.77h
I 55000 | L 1.0480 | P 2.829e+00 | Lt 1.0660 | Pt 2.880e+00 | H 1122.7547 | ETA 4.52h
I 57500 | L 1.0479 | P 2.829e+00 | Lt 1.0632 | Pt 2.872e+00 | H 1170.6527 | ETA 4.27h
I 60000 | L 1.0487 | P 2.831e+00 | Lt 1.0670 | Pt 2.883e+00 | H 1211.7634 | ETA 4.02h
I 62500 | L 1.0471 | P 2.826e+00 | Lt 1.0621 | Pt 2.869e+00 | H 1258.7746 | ETA 3.77h
I 65000 | L 1.0463 | P 2.824e+00 | Lt 1.0635 | Pt 2.873e+00 | H 1290.3426 | ETA 3.52h
I 67500 | L 1.0460 | P 2.823e+00 | Lt 1.0601 | Pt 2.863e+00 | H 1338.1514 | ETA 3.27h
I 70000 | L 1.0399 | P 2.806e+00 | Lt 1.0654 | Pt 2.878e+00 | H 1377.5408 | ETA 3.02h
I 72500 | L 1.0451 | P 2.821e+00 | Lt 1.0617 | Pt 2.868e+00 | H 1419.3902 | ETA 2.77h
I 75000 | L 1.0415 | P 2.811e+00 | Lt 1.0569 | Pt 2.854e+00 | H 1468.6600 | ETA 2.51h
I 77500 | L 1.0374 | P 2.799e+00 | Lt 1.0586 | Pt 2.859e+00 | H 1501.5568 | ETA 2.26h
I 80000 | L 1.0361 | P 2.796e+00 | Lt 1.0560 | Pt 2.851e+00 | H 1547.7173 | ETA 2.01h
I 82500 | L 1.0391 | P 2.804e+00 | Lt 1.0566 | Pt 2.853e+00 | H 1595.0036 | ETA 1.76h
I 85000 | L 1.0370 | P 2.798e+00 | Lt 1.0569 | Pt 2.854e+00 | H 1659.0214 | ETA 1.51h
I 87500 | L 1.0404 | P 2.808e+00 | Lt 1.0589 | Pt 2.860e+00 | H 1683.3009 | ETA 1.26h
I 90000 | L 1.0389 | P 2.803e+00 | Lt 1.0526 | Pt 2.842e+00 | H 1735.5015 | ETA 1.00h
I 92500 | L 1.0379 | P 2.801e+00 | Lt 1.0535 | Pt 2.844e+00 | H 1771.2504 | ETA 0.75h
I 95000 | L 1.0323 | P 2.785e+00 | Lt 1.0534 | Pt 2.844e+00 | H 1828.7295 | ETA 0.50h
I 97500 | L 1.0365 | P 2.797e+00 | Lt 1.0536 | Pt 2.845e+00 | H 1857.2010 | ETA 0.25h
I 100000 | L 1.0328 | P 2.787e+00 | Lt 1.0509 | Pt 2.837e+00 | H 1921.0303 | ETA 0.00h
I 100001 | L 1.0328 | P 2.787e+00 | Lt 1.0509 | Pt 2.837e+00
Finished training!
