always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 6912278528
cola_params: 204689280
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 512
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 114688
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_params: 49152
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 3
neurons: 0
non_emb_flops: 6905987072
non_emb_params: 204525440
num_active_experts: 2
num_active_ffn_experts: 8
num_blocks: 4
num_experts: 0
num_ffn_experts: 32
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe32k8_all_but_last_l3-dm512-de-1-h-1-dh64-ttr1-191711
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-14_191711
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe32k8_all_but_last_l3-dm512-de-1-h-1-dh64-ttr1-191711
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.1821 | P 3.231e+00 | Lt 1.2040 | Pt 3.302e+00 | H 36.5741 | ETA 7.22h
I 5000 | L 1.1082 | P 3.003e+00 | Lt 1.1244 | Pt 3.052e+00 | H 71.5626 | ETA 7.26h
I 7500 | L 1.0794 | P 2.918e+00 | Lt 1.0964 | Pt 2.968e+00 | H 90.8678 | ETA 7.14h
I 10000 | L 1.0604 | P 2.864e+00 | Lt 1.0789 | Pt 2.917e+00 | H 102.5716 | ETA 7.02h
I 12500 | L 1.0451 | P 2.821e+00 | Lt 1.0692 | Pt 2.889e+00 | H 117.6233 | ETA 6.84h
I 15000 | L 1.0361 | P 2.796e+00 | Lt 1.0557 | Pt 2.851e+00 | H 134.3209 | ETA 6.65h
I 17500 | L 1.0251 | P 2.765e+00 | Lt 1.0475 | Pt 2.827e+00 | H 141.4268 | ETA 6.45h
I 20000 | L 1.0281 | P 2.774e+00 | Lt 1.0378 | Pt 2.800e+00 | H 151.9816 | ETA 6.25h
I 22500 | L 1.0157 | P 2.740e+00 | Lt 1.0320 | Pt 2.784e+00 | H 165.2947 | ETA 6.01h
I 25000 | L 1.0119 | P 2.729e+00 | Lt 1.0288 | Pt 2.776e+00 | H 179.0984 | ETA 5.82h
I 27500 | L 1.0137 | P 2.734e+00 | Lt 1.0252 | Pt 2.766e+00 | H 192.3237 | ETA 5.62h
I 30000 | L 1.0039 | P 2.708e+00 | Lt 1.0180 | Pt 2.746e+00 | H 201.9049 | ETA 5.40h
I 32500 | L 1.0091 | P 2.722e+00 | Lt 1.0198 | Pt 2.751e+00 | H 216.3264 | ETA 5.20h
I 35000 | L 1.0027 | P 2.705e+00 | Lt 1.0107 | Pt 2.726e+00 | H 229.4703 | ETA 4.98h
I 37500 | L 0.9973 | P 2.690e+00 | Lt 1.0090 | Pt 2.721e+00 | H 242.9337 | ETA 4.79h
I 40000 | L 0.9941 | P 2.682e+00 | Lt 1.0081 | Pt 2.719e+00 | H 254.8246 | ETA 4.59h
I 42500 | L 0.9903 | P 2.671e+00 | Lt 1.0031 | Pt 2.706e+00 | H 268.8926 | ETA 4.40h
I 45000 | L 0.9890 | P 2.668e+00 | Lt 1.0034 | Pt 2.706e+00 | H 281.8796 | ETA 4.20h
I 47500 | L 0.9854 | P 2.658e+00 | Lt 1.0025 | Pt 2.704e+00 | H 291.1327 | ETA 4.01h
I 50000 | L 0.9882 | P 2.666e+00 | Lt 0.9982 | Pt 2.693e+00 | H 305.4490 | ETA 3.82h
I 52500 | L 0.9831 | P 2.653e+00 | Lt 0.9986 | Pt 2.694e+00 | H 321.5882 | ETA 3.62h
I 55000 | L 0.9821 | P 2.650e+00 | Lt 0.9939 | Pt 2.681e+00 | H 338.5668 | ETA 3.43h
I 57500 | L 0.9796 | P 2.643e+00 | Lt 0.9946 | Pt 2.683e+00 | H 356.4603 | ETA 3.24h
I 60000 | L 0.9798 | P 2.644e+00 | Lt 0.9910 | Pt 2.673e+00 | H 368.0170 | ETA 3.04h
I 62500 | L 0.9817 | P 2.649e+00 | Lt 0.9881 | Pt 2.666e+00 | H 383.4428 | ETA 2.85h
I 65000 | L 0.9807 | P 2.646e+00 | Lt 0.9888 | Pt 2.668e+00 | H 399.8010 | ETA 2.65h
I 67500 | L 0.9775 | P 2.638e+00 | Lt 0.9903 | Pt 2.672e+00 | H 414.8594 | ETA 2.46h
I 70000 | L 0.9752 | P 2.632e+00 | Lt 0.9883 | Pt 2.666e+00 | H 421.9756 | ETA 2.27h
I 72500 | L 0.9735 | P 2.627e+00 | Lt 0.9856 | Pt 2.659e+00 | H 434.0841 | ETA 2.08h
I 75000 | L 0.9744 | P 2.630e+00 | Lt 0.9818 | Pt 2.649e+00 | H 448.0759 | ETA 1.90h
I 77500 | L 0.9735 | P 2.627e+00 | Lt 0.9846 | Pt 2.656e+00 | H 459.5046 | ETA 1.70h
I 80000 | L 0.9721 | P 2.624e+00 | Lt 0.9790 | Pt 2.642e+00 | H 471.5695 | ETA 1.51h
I 82500 | L 0.9722 | P 2.624e+00 | Lt 0.9826 | Pt 2.651e+00 | H 500.2914 | ETA 1.33h
I 85000 | L 0.9663 | P 2.609e+00 | Lt 0.9780 | Pt 2.639e+00 | H 500.3959 | ETA 1.13h
I 87500 | L 0.9685 | P 2.614e+00 | Lt 0.9776 | Pt 2.638e+00 | H 519.3560 | ETA 0.95h
I 90000 | L 0.9668 | P 2.610e+00 | Lt 0.9759 | Pt 2.634e+00 | H 540.7894 | ETA 0.76h
I 92500 | L 0.9688 | P 2.615e+00 | Lt 0.9754 | Pt 2.632e+00 | H 552.9046 | ETA 0.57h
I 95000 | L 0.9664 | P 2.609e+00 | Lt 0.9734 | Pt 2.627e+00 | H 565.8562 | ETA 0.38h
I 97500 | L 0.9674 | P 2.611e+00 | Lt 0.9755 | Pt 2.633e+00 | H 578.5656 | ETA 0.19h
I 100000 | L 0.9657 | P 2.607e+00 | Lt 0.9737 | Pt 2.628e+00 | H 588.7656 | ETA 0.00h
I 100001 | L 0.9657 | P 2.607e+00 | Lt 0.9737 | Pt 2.628e+00
Finished training!
