always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 197558272.0
cola_params: 4128840
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 256
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 57344
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 24576
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 6
neurons: 0
non_emb_flops: 194412544.0
non_emb_params: 4046920
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 8
num_ffn_experts: 1
opt_name: AdamW
out_dir: /chkpt/btt_norm_moe_para_e8_k2_ffe1k2_all_but_last_l6-dm256-de-1-h-1-dh64-ttr1-001532
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: btt_norm_moe_para
timestamp: 2024-05-19_001532
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: btt_norm_moe_para_e8_k2_ffe1k2_all_but_last_l6-dm256-de-1-h-1-dh64-ttr1-001532
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2911 | P 3.601e+00 | Lt 1.3141 | Pt 3.684e+00 | H 3.5538 | ETA 4.65h
I 5000 | L 1.2156 | P 3.341e+00 | Lt 1.2309 | Pt 3.392e+00 | H 7.1686 | ETA 4.55h
I 7500 | L 1.1876 | P 3.249e+00 | Lt 1.2095 | Pt 3.321e+00 | H 9.5196 | ETA 4.42h
I 10000 | L 1.1714 | P 3.197e+00 | Lt 1.1885 | Pt 3.252e+00 | H 11.8518 | ETA 4.30h
I 12500 | L 1.1588 | P 3.158e+00 | Lt 1.1797 | Pt 3.224e+00 | H 13.8206 | ETA 4.17h
I 15000 | L 1.1488 | P 3.127e+00 | Lt 1.1700 | Pt 3.193e+00 | H 15.6730 | ETA 4.06h
I 17500 | L 1.1432 | P 3.109e+00 | Lt 1.1650 | Pt 3.177e+00 | H 17.5278 | ETA 3.94h
I 20000 | L 1.1348 | P 3.083e+00 | Lt 1.1596 | Pt 3.160e+00 | H 19.7065 | ETA 3.82h
I 22500 | L 1.1358 | P 3.086e+00 | Lt 1.1504 | Pt 3.132e+00 | H 21.5280 | ETA 3.69h
I 25000 | L 1.1301 | P 3.069e+00 | Lt 1.1482 | Pt 3.125e+00 | H 23.1660 | ETA 3.57h
I 27500 | L 1.1247 | P 3.053e+00 | Lt 1.1470 | Pt 3.121e+00 | H 25.0293 | ETA 3.44h
I 30000 | L 1.1240 | P 3.050e+00 | Lt 1.1490 | Pt 3.127e+00 | H 26.9773 | ETA 3.32h
I 32500 | L 1.1198 | P 3.038e+00 | Lt 1.1426 | Pt 3.107e+00 | H 28.8559 | ETA 3.20h
I 35000 | L 1.1205 | P 3.040e+00 | Lt 1.1410 | Pt 3.102e+00 | H 30.9971 | ETA 3.08h
I 37500 | L 1.1188 | P 3.035e+00 | Lt 1.1405 | Pt 3.101e+00 | H 32.8061 | ETA 2.96h
I 40000 | L 1.1163 | P 3.027e+00 | Lt 1.1377 | Pt 3.092e+00 | H 34.4939 | ETA 2.84h
I 42500 | L 1.1099 | P 3.008e+00 | Lt 1.1336 | Pt 3.080e+00 | H 36.4251 | ETA 2.72h
I 45000 | L 1.1154 | P 3.025e+00 | Lt 1.1282 | Pt 3.063e+00 | H 38.6449 | ETA 2.60h
I 47500 | L 1.1114 | P 3.013e+00 | Lt 1.1297 | Pt 3.068e+00 | H 40.5943 | ETA 2.48h
I 50000 | L 1.1079 | P 3.002e+00 | Lt 1.1292 | Pt 3.066e+00 | H 42.6656 | ETA 2.36h
I 52500 | L 1.1089 | P 3.005e+00 | Lt 1.1272 | Pt 3.060e+00 | H 44.1179 | ETA 2.24h
I 55000 | L 1.1112 | P 3.012e+00 | Lt 1.1248 | Pt 3.053e+00 | H 46.4551 | ETA 2.13h
I 57500 | L 1.1051 | P 2.994e+00 | Lt 1.1273 | Pt 3.060e+00 | H 47.8617 | ETA 2.01h
I 60000 | L 1.1065 | P 2.998e+00 | Lt 1.1262 | Pt 3.057e+00 | H 50.8857 | ETA 1.89h
I 62500 | L 1.1058 | P 2.996e+00 | Lt 1.1221 | Pt 3.045e+00 | H 52.2719 | ETA 1.77h
I 65000 | L 1.1010 | P 2.982e+00 | Lt 1.1258 | Pt 3.056e+00 | H 54.2892 | ETA 1.65h
I 67500 | L 1.1017 | P 2.984e+00 | Lt 1.1267 | Pt 3.059e+00 | H 55.8268 | ETA 1.53h
I 70000 | L 1.1006 | P 2.980e+00 | Lt 1.1225 | Pt 3.046e+00 | H 58.0015 | ETA 1.42h
I 72500 | L 1.1004 | P 2.980e+00 | Lt 1.1208 | Pt 3.041e+00 | H 60.2910 | ETA 1.30h
I 75000 | L 1.0978 | P 2.972e+00 | Lt 1.1213 | Pt 3.042e+00 | H 62.4876 | ETA 1.18h
I 77500 | L 1.0968 | P 2.969e+00 | Lt 1.1212 | Pt 3.042e+00 | H 63.6714 | ETA 1.06h
I 80000 | L 1.0993 | P 2.977e+00 | Lt 1.1177 | Pt 3.031e+00 | H 66.5114 | ETA 0.94h
I 82500 | L 1.0988 | P 2.975e+00 | Lt 1.1174 | Pt 3.031e+00 | H 67.9613 | ETA 0.83h
I 85000 | L 1.0953 | P 2.965e+00 | Lt 1.1129 | Pt 3.017e+00 | H 71.0082 | ETA 0.71h
I 87500 | L 1.0986 | P 2.975e+00 | Lt 1.1172 | Pt 3.030e+00 | H 73.2384 | ETA 0.59h
I 90000 | L 1.0948 | P 2.963e+00 | Lt 1.1168 | Pt 3.029e+00 | H 74.7215 | ETA 0.47h
I 92500 | L 1.0918 | P 2.954e+00 | Lt 1.1127 | Pt 3.016e+00 | H 76.7868 | ETA 0.35h
I 95000 | L 1.0915 | P 2.954e+00 | Lt 1.1149 | Pt 3.023e+00 | H 79.3541 | ETA 0.24h
I 97500 | L 1.0948 | P 2.963e+00 | Lt 1.1127 | Pt 3.016e+00 | H 80.7935 | ETA 0.12h
I 100000 | L 1.0927 | P 2.957e+00 | Lt 1.1115 | Pt 3.013e+00 | H 82.2941 | ETA 0.00h
I 100001 | L 1.0927 | P 2.957e+00 | Lt 1.1115 | Pt 3.013e+00
Finished training!
