always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 1595834368
cola_params: 21330816
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 256
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 57344
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 24576
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 9
neurons: 0
non_emb_flops: 1592688640
non_emb_params: 21248896
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 4
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe4k2_all_but_last_l9-dm256-de-1-h-1-dh64-ttr1-174035
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_174035
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe4k2_all_but_last_l9-dm256-de-1-h-1-dh64-ttr1-174035
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2121 | P 3.329e+00 | Lt 1.2326 | Pt 3.398e+00 | H 42.5750 | ETA 4.46h
I 5000 | L 1.1408 | P 3.102e+00 | Lt 1.1589 | Pt 3.158e+00 | H 124.1895 | ETA 4.33h
I 7500 | L 1.1106 | P 3.010e+00 | Lt 1.1302 | Pt 3.069e+00 | H 184.2148 | ETA 4.20h
I 10000 | L 1.0922 | P 2.956e+00 | Lt 1.1181 | Pt 3.033e+00 | H 237.0366 | ETA 4.09h
I 12500 | L 1.0803 | P 2.921e+00 | Lt 1.1019 | Pt 2.984e+00 | H 285.6829 | ETA 3.96h
I 15000 | L 1.0741 | P 2.903e+00 | Lt 1.0885 | Pt 2.945e+00 | H 337.8850 | ETA 3.84h
I 17500 | L 1.0635 | P 2.873e+00 | Lt 1.0848 | Pt 2.934e+00 | H 380.4831 | ETA 3.72h
I 20000 | L 1.0623 | P 2.869e+00 | Lt 1.0805 | Pt 2.922e+00 | H 428.3861 | ETA 3.60h
I 22500 | L 1.0558 | P 2.851e+00 | Lt 1.0740 | Pt 2.903e+00 | H 475.4741 | ETA 3.49h
I 25000 | L 1.0492 | P 2.832e+00 | Lt 1.0703 | Pt 2.892e+00 | H 533.2970 | ETA 3.37h
I 27500 | L 1.0477 | P 2.828e+00 | Lt 1.0664 | Pt 2.881e+00 | H 582.4035 | ETA 3.25h
I 30000 | L 1.0426 | P 2.814e+00 | Lt 1.0613 | Pt 2.866e+00 | H 631.2936 | ETA 3.13h
I 32500 | L 1.0403 | P 2.807e+00 | Lt 1.0614 | Pt 2.867e+00 | H 687.7899 | ETA 3.02h
I 35000 | L 1.0411 | P 2.810e+00 | Lt 1.0559 | Pt 2.851e+00 | H 731.0022 | ETA 2.90h
I 37500 | L 1.0347 | P 2.792e+00 | Lt 1.0548 | Pt 2.848e+00 | H 796.0902 | ETA 2.78h
I 40000 | L 1.0357 | P 2.795e+00 | Lt 1.0525 | Pt 2.842e+00 | H 844.6001 | ETA 2.67h
I 42500 | L 1.0314 | P 2.783e+00 | Lt 1.0494 | Pt 2.833e+00 | H 909.0500 | ETA 2.55h
I 45000 | L 1.0329 | P 2.787e+00 | Lt 1.0483 | Pt 2.830e+00 | H 956.6830 | ETA 2.44h
I 47500 | L 1.0310 | P 2.782e+00 | Lt 1.0470 | Pt 2.826e+00 | H 1017.2104 | ETA 2.32h
I 50000 | L 1.0280 | P 2.773e+00 | Lt 1.0439 | Pt 2.817e+00 | H 1068.4811 | ETA 2.21h
I 52500 | L 1.0236 | P 2.761e+00 | Lt 1.0442 | Pt 2.818e+00 | H 1119.1544 | ETA 2.10h
I 55000 | L 1.0228 | P 2.759e+00 | Lt 1.0429 | Pt 2.815e+00 | H 1173.8644 | ETA 1.99h
I 57500 | L 1.0232 | P 2.760e+00 | Lt 1.0395 | Pt 2.805e+00 | H 1256.3340 | ETA 1.88h
I 60000 | L 1.0213 | P 2.755e+00 | Lt 1.0353 | Pt 2.793e+00 | H 1294.7342 | ETA 1.77h
I 62500 | L 1.0198 | P 2.751e+00 | Lt 1.0334 | Pt 2.788e+00 | H 1352.2849 | ETA 1.65h
I 65000 | L 1.0167 | P 2.742e+00 | Lt 1.0330 | Pt 2.787e+00 | H 1427.1322 | ETA 1.54h
I 67500 | L 1.0181 | P 2.746e+00 | Lt 1.0343 | Pt 2.791e+00 | H 1474.2001 | ETA 1.43h
I 70000 | L 1.0195 | P 2.750e+00 | Lt 1.0330 | Pt 2.787e+00 | H 1545.0807 | ETA 1.32h
I 72500 | L 1.0180 | P 2.746e+00 | Lt 1.0344 | Pt 2.791e+00 | H 1619.6311 | ETA 1.21h
I 75000 | L 1.0161 | P 2.741e+00 | Lt 1.0352 | Pt 2.793e+00 | H 1662.3246 | ETA 1.10h
I 77500 | L 1.0137 | P 2.734e+00 | Lt 1.0315 | Pt 2.783e+00 | H 1731.4730 | ETA 0.99h
I 80000 | L 1.0124 | P 2.731e+00 | Lt 1.0339 | Pt 2.789e+00 | H 1792.1605 | ETA 0.88h
I 82500 | L 1.0130 | P 2.732e+00 | Lt 1.0291 | Pt 2.776e+00 | H 1864.6896 | ETA 0.77h
I 85000 | L 1.0144 | P 2.736e+00 | Lt 1.0257 | Pt 2.767e+00 | H 1919.5142 | ETA 0.66h
I 87500 | L 1.0138 | P 2.734e+00 | Lt 1.0270 | Pt 2.771e+00 | H 1999.2367 | ETA 0.55h
I 90000 | L 1.0116 | P 2.729e+00 | Lt 1.0296 | Pt 2.778e+00 | H 2046.3993 | ETA 0.44h
I 92500 | L 1.0112 | P 2.727e+00 | Lt 1.0268 | Pt 2.770e+00 | H 2104.0594 | ETA 0.33h
I 95000 | L 1.0092 | P 2.722e+00 | Lt 1.0279 | Pt 2.773e+00 | H 2171.5028 | ETA 0.22h
I 97500 | L 1.0089 | P 2.721e+00 | Lt 1.0253 | Pt 2.766e+00 | H 2236.2305 | ETA 0.11h
I 100000 | L 1.0106 | P 2.726e+00 | Lt 1.0245 | Pt 2.764e+00 | H 2320.5601 | ETA 0.00h
I 100001 | L 1.0106 | P 2.726e+00 | Lt 1.0245 | Pt 2.764e+00
Finished training!
