always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 549257216.0
cola_params: 24206664
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 512
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 114688
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 49152
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 6
neurons: 0
non_emb_flops: 542965760.0
non_emb_params: 24042824
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 16
num_ffn_experts: 1
opt_name: AdamW
out_dir: /chkpt/btt_norm_moe_para_e16_k2_ffe1k2_all_but_last_l6-dm512-de-1-h-1-dh64-ttr1-002836
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: btt_norm_moe_para
timestamp: 2024-05-19_002836
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: btt_norm_moe_para_e16_k2_ffe1k2_all_but_last_l6-dm512-de-1-h-1-dh64-ttr1-002836
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2263 | P 3.376e+00 | Lt 1.2467 | Pt 3.445e+00 | H 3.4418 | ETA 6.81h
I 5000 | L 1.1486 | P 3.126e+00 | Lt 1.1683 | Pt 3.187e+00 | H 6.8701 | ETA 6.64h
I 7500 | L 1.1197 | P 3.037e+00 | Lt 1.1399 | Pt 3.099e+00 | H 9.6354 | ETA 6.46h
I 10000 | L 1.1033 | P 2.988e+00 | Lt 1.1167 | Pt 3.028e+00 | H 12.2346 | ETA 6.30h
I 12500 | L 1.0896 | P 2.948e+00 | Lt 1.1046 | Pt 2.992e+00 | H 14.4107 | ETA 6.13h
I 15000 | L 1.0772 | P 2.912e+00 | Lt 1.0972 | Pt 2.970e+00 | H 16.9108 | ETA 5.95h
I 17500 | L 1.0707 | P 2.893e+00 | Lt 1.0881 | Pt 2.944e+00 | H 18.8597 | ETA 5.78h
I 20000 | L 1.0679 | P 2.885e+00 | Lt 1.0818 | Pt 2.925e+00 | H 21.1112 | ETA 5.60h
I 22500 | L 1.0598 | P 2.862e+00 | Lt 1.0786 | Pt 2.916e+00 | H 23.3188 | ETA 5.43h
I 25000 | L 1.0564 | P 2.853e+00 | Lt 1.0744 | Pt 2.904e+00 | H 25.3474 | ETA 5.25h
I 27500 | L 1.0535 | P 2.844e+00 | Lt 1.0705 | Pt 2.893e+00 | H 27.7773 | ETA 5.08h
I 30000 | L 1.0506 | P 2.836e+00 | Lt 1.0687 | Pt 2.887e+00 | H 29.4542 | ETA 4.90h
I 32500 | L 1.0442 | P 2.818e+00 | Lt 1.0622 | Pt 2.869e+00 | H 31.1847 | ETA 4.73h
I 35000 | L 1.0418 | P 2.812e+00 | Lt 1.0580 | Pt 2.857e+00 | H 33.2282 | ETA 4.55h
I 37500 | L 1.0431 | P 2.815e+00 | Lt 1.0572 | Pt 2.855e+00 | H 35.8104 | ETA 4.38h
I 40000 | L 1.0349 | P 2.792e+00 | Lt 1.0491 | Pt 2.832e+00 | H 37.9080 | ETA 4.20h
I 42500 | L 1.0372 | P 2.799e+00 | Lt 1.0510 | Pt 2.837e+00 | H 39.9610 | ETA 4.03h
I 45000 | L 1.0334 | P 2.788e+00 | Lt 1.0482 | Pt 2.829e+00 | H 41.7828 | ETA 3.85h
I 47500 | L 1.0303 | P 2.780e+00 | Lt 1.0458 | Pt 2.823e+00 | H 43.6137 | ETA 3.68h
I 50000 | L 1.0283 | P 2.774e+00 | Lt 1.0465 | Pt 2.825e+00 | H 46.4022 | ETA 3.50h
I 52500 | L 1.0271 | P 2.771e+00 | Lt 1.0450 | Pt 2.821e+00 | H 48.2781 | ETA 3.33h
I 55000 | L 1.0263 | P 2.769e+00 | Lt 1.0402 | Pt 2.807e+00 | H 50.7955 | ETA 3.15h
I 57500 | L 1.0232 | P 2.760e+00 | Lt 1.0412 | Pt 2.810e+00 | H 52.7221 | ETA 2.98h
I 60000 | L 1.0222 | P 2.757e+00 | Lt 1.0349 | Pt 2.792e+00 | H 54.9669 | ETA 2.80h
I 62500 | L 1.0226 | P 2.758e+00 | Lt 1.0383 | Pt 2.802e+00 | H 57.3607 | ETA 2.63h
I 65000 | L 1.0217 | P 2.756e+00 | Lt 1.0373 | Pt 2.799e+00 | H 60.1270 | ETA 2.45h
I 67500 | L 1.0234 | P 2.761e+00 | Lt 1.0396 | Pt 2.805e+00 | H 62.0636 | ETA 2.28h
I 70000 | L 1.0198 | P 2.751e+00 | Lt 1.0367 | Pt 2.797e+00 | H 64.0585 | ETA 2.10h
I 72500 | L 1.0172 | P 2.744e+00 | Lt 1.0346 | Pt 2.792e+00 | H 66.1886 | ETA 1.93h
I 75000 | L 1.0174 | P 2.744e+00 | Lt 1.0334 | Pt 2.788e+00 | H 69.3154 | ETA 1.75h
I 77500 | L 1.0164 | P 2.741e+00 | Lt 1.0331 | Pt 2.787e+00 | H 70.8401 | ETA 1.58h
I 80000 | L 1.0154 | P 2.739e+00 | Lt 1.0296 | Pt 2.778e+00 | H 73.5945 | ETA 1.40h
I 82500 | L 1.0131 | P 2.733e+00 | Lt 1.0299 | Pt 2.779e+00 | H 75.9089 | ETA 1.23h
I 85000 | L 1.0142 | P 2.735e+00 | Lt 1.0313 | Pt 2.782e+00 | H 78.5855 | ETA 1.05h
I 87500 | L 1.0132 | P 2.733e+00 | Lt 1.0302 | Pt 2.779e+00 | H 80.4499 | ETA 0.88h
I 90000 | L 1.0113 | P 2.728e+00 | Lt 1.0296 | Pt 2.778e+00 | H 82.8303 | ETA 0.70h
I 92500 | L 1.0101 | P 2.725e+00 | Lt 1.0267 | Pt 2.770e+00 | H 85.2128 | ETA 0.53h
I 95000 | L 1.0130 | P 2.732e+00 | Lt 1.0272 | Pt 2.771e+00 | H 88.2843 | ETA 0.35h
I 97500 | L 1.0116 | P 2.729e+00 | Lt 1.0254 | Pt 2.766e+00 | H 90.5530 | ETA 0.18h
I 100000 | L 1.0106 | P 2.726e+00 | Lt 1.0281 | Pt 2.773e+00 | H 93.3170 | ETA 0.00h
I 100001 | L 1.0106 | P 2.726e+00 | Lt 1.0281 | Pt 2.773e+00
Finished training!
