always_save_checkpoint: False
arg: --wandb_project=moe_gpt
attempt: moe_gpt
aux_loss_weight: 0.01
axial: False
backend: nccl
base_d_embd: 768
base_d_head: 64
base_d_model: 768
base_ffn_expansion: 1
base_n_head: -1
batch_size: 64
beta1: 0.9
beta2: 0.95
bias: False
block_size: 128
ckpt_path:
cola_flops: 420429824
cola_params: 5357568
compile: False
config_file: config/train_open_small.py
d_embd: -1
d_head: 64
d_model: 128
data_dir: /storage/data
dataset: open
decay_lr: False
device: cuda
device_type: cuda
do_qk_ln: True
dropout: 0.0
dtype: bfloat16
emb_params: 28672
eval_interval: 2500
eval_iters: 200
eval_only: False
every_n_fwds: 200
expr:
ffn_expansion: 4
grad_clip: 1.0
gradient_accumulation_steps: 8
head_btt_case_proj: (n|d|n|d)
head_btt_case_qkv: (n|d|n|d)
head_params: 12288
init_lr: 0.003
input_lr_mult: 1.0
key: wandb_project
layers: all_but_last
lm_head_rank_frac: -1.0
lm_head_struct:
lm_head_tt_rank: -1
log_interval: 100
lr_decay_iters: 100000
max_iters: 100000
min_lr: 0.00030000000000000003
n_head: -1
n_layer: 9
neurons: 0
non_emb_flops: 418856960
non_emb_params: 5316608
num_active_experts: 2
num_active_ffn_experts: 2
num_blocks: 4
num_experts: 0
num_ffn_experts: 4
opt_name: AdamW
out_dir: /chkpt/dense_e0_k2_ffe4k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-173957
rank_frac: 0.2
spec_penalty_weight: 0.0
split_qkv: True
struct: dense
timestamp: 2024-05-18_173957
total_tokens: 6017506009
tt_dim: 2
tt_rank: 1
use_head_btt: False
val: moe_gpt
vocab_size: 96
wandb_log: True
wandb_project: moe_gpt
wandb_run_name: dense_e0_k2_ffe4k2_all_but_last_l9-dm128-de-1-h-1-dh64-ttr1-173957
warmup_iters: 2000
weight_decay: 0.0
I 2500 | L 1.2740 | P 3.540e+00 | Lt 1.2945 | Pt 3.613e+00 | H 49.4306 | ETA 4.37h
I 5000 | L 1.2075 | P 3.314e+00 | Lt 1.2275 | Pt 3.381e+00 | H 161.5724 | ETA 4.19h
I 7500 | L 1.1846 | P 3.239e+00 | Lt 1.1989 | Pt 3.286e+00 | H 260.3233 | ETA 4.07h
I 10000 | L 1.1636 | P 3.173e+00 | Lt 1.1915 | Pt 3.262e+00 | H 348.7771 | ETA 3.94h
I 12500 | L 1.1556 | P 3.148e+00 | Lt 1.1743 | Pt 3.207e+00 | H 438.5729 | ETA 3.82h
I 15000 | L 1.1515 | P 3.135e+00 | Lt 1.1687 | Pt 3.189e+00 | H 516.5079 | ETA 3.70h
I 17500 | L 1.1438 | P 3.111e+00 | Lt 1.1658 | Pt 3.180e+00 | H 593.4377 | ETA 3.58h
I 20000 | L 1.1370 | P 3.090e+00 | Lt 1.1576 | Pt 3.154e+00 | H 678.8765 | ETA 3.47h
I 22500 | L 1.1330 | P 3.078e+00 | Lt 1.1519 | Pt 3.136e+00 | H 754.8220 | ETA 3.36h
I 25000 | L 1.1321 | P 3.075e+00 | Lt 1.1501 | Pt 3.131e+00 | H 836.4904 | ETA 3.25h
I 27500 | L 1.1283 | P 3.064e+00 | Lt 1.1491 | Pt 3.127e+00 | H 912.8007 | ETA 3.14h
I 30000 | L 1.1240 | P 3.050e+00 | Lt 1.1457 | Pt 3.117e+00 | H 1000.6786 | ETA 3.03h
I 32500 | L 1.1178 | P 3.032e+00 | Lt 1.1410 | Pt 3.102e+00 | H 1075.1583 | ETA 2.92h
I 35000 | L 1.1202 | P 3.039e+00 | Lt 1.1381 | Pt 3.093e+00 | H 1161.0243 | ETA 2.82h
I 37500 | L 1.1155 | P 3.025e+00 | Lt 1.1388 | Pt 3.096e+00 | H 1243.2805 | ETA 2.71h
I 40000 | L 1.1141 | P 3.021e+00 | Lt 1.1381 | Pt 3.094e+00 | H 1348.0794 | ETA 2.60h
I 42500 | L 1.1157 | P 3.025e+00 | Lt 1.1329 | Pt 3.078e+00 | H 1432.9416 | ETA 2.49h
I 45000 | L 1.1113 | P 3.012e+00 | Lt 1.1320 | Pt 3.075e+00 | H 1529.2046 | ETA 2.38h
I 47500 | L 1.1114 | P 3.013e+00 | Lt 1.1296 | Pt 3.068e+00 | H 1603.4762 | ETA 2.27h
I 50000 | L 1.1118 | P 3.014e+00 | Lt 1.1314 | Pt 3.073e+00 | H 1733.3541 | ETA 2.16h
I 52500 | L 1.1138 | P 3.020e+00 | Lt 1.1294 | Pt 3.067e+00 | H 1851.9765 | ETA 2.06h
I 55000 | L 1.1091 | P 3.006e+00 | Lt 1.1245 | Pt 3.052e+00 | H 1927.8285 | ETA 1.95h
I 57500 | L 1.1088 | P 3.005e+00 | Lt 1.1296 | Pt 3.067e+00 | H 2013.0510 | ETA 1.84h
I 60000 | L 1.1055 | P 2.995e+00 | Lt 1.1266 | Pt 3.058e+00 | H 2115.6488 | ETA 1.73h
I 62500 | L 1.1066 | P 2.998e+00 | Lt 1.1283 | Pt 3.063e+00 | H 2230.7538 | ETA 1.63h
I 65000 | L 1.1038 | P 2.990e+00 | Lt 1.1219 | Pt 3.044e+00 | H 2305.8011 | ETA 1.52h
I 67500 | L 1.1022 | P 2.985e+00 | Lt 1.1266 | Pt 3.058e+00 | H 2365.1850 | ETA 1.41h
I 70000 | L 1.1048 | P 2.993e+00 | Lt 1.1231 | Pt 3.048e+00 | H 2512.8240 | ETA 1.30h
I 72500 | L 1.1036 | P 2.989e+00 | Lt 1.1248 | Pt 3.053e+00 | H 2652.6699 | ETA 1.19h
I 75000 | L 1.1039 | P 2.990e+00 | Lt 1.1215 | Pt 3.043e+00 | H 2752.1508 | ETA 1.09h
I 77500 | L 1.1041 | P 2.991e+00 | Lt 1.1229 | Pt 3.047e+00 | H 2862.9134 | ETA 0.98h
I 80000 | L 1.1020 | P 2.985e+00 | Lt 1.1201 | Pt 3.039e+00 | H 2935.8699 | ETA 0.87h
I 82500 | L 1.1025 | P 2.986e+00 | Lt 1.1169 | Pt 3.029e+00 | H 3078.4802 | ETA 0.76h
I 85000 | L 1.0998 | P 2.978e+00 | Lt 1.1151 | Pt 3.023e+00 | H 3159.3542 | ETA 0.65h
I 87500 | L 1.1008 | P 2.981e+00 | Lt 1.1184 | Pt 3.033e+00 | H 3270.6572 | ETA 0.54h
I 90000 | L 1.0956 | P 2.966e+00 | Lt 1.1154 | Pt 3.024e+00 | H 3400.9748 | ETA 0.44h
I 92500 | L 1.0971 | P 2.970e+00 | Lt 1.1170 | Pt 3.029e+00 | H 3484.8758 | ETA 0.33h
I 95000 | L 1.0942 | P 2.962e+00 | Lt 1.1134 | Pt 3.018e+00 | H 3589.9705 | ETA 0.22h
I 97500 | L 1.0967 | P 2.969e+00 | Lt 1.1140 | Pt 3.020e+00 | H 3702.9903 | ETA 0.11h
I 100000 | L 1.1002 | P 2.979e+00 | Lt 1.1136 | Pt 3.019e+00 | H 3831.5308 | ETA 0.00h
I 100001 | L 1.1002 | P 2.979e+00 | Lt 1.1136 | Pt 3.019e+00
Finished training!
