| | 3: W0902 18:31:19.695000 3352467 torch/distributed/run.py:792] |
| | 3: W0902 18:31:19.695000 3352467 torch/distributed/run.py:792] ***************************************** |
| | 3: W0902 18:31:19.695000 3352467 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
| | 3: W0902 18:31:19.695000 3352467 torch/distributed/run.py:792] ***************************************** |
| | 2: W0902 18:31:19.695000 2766555 torch/distributed/run.py:792] |
| | 2: W0902 18:31:19.695000 2766555 torch/distributed/run.py:792] ***************************************** |
| | 2: W0902 18:31:19.695000 2766555 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
| | 2: W0902 18:31:19.695000 2766555 torch/distributed/run.py:792] ***************************************** |
| | 0: W0902 18:31:19.696000 801759 torch/distributed/run.py:792] |
| | 0: W0902 18:31:19.696000 801759 torch/distributed/run.py:792] ***************************************** |
| | 0: W0902 18:31:19.696000 801759 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
| | 0: W0902 18:31:19.696000 801759 torch/distributed/run.py:792] ***************************************** |
| | 1: W0902 18:31:19.696000 3191159 torch/distributed/run.py:792] |
| | 1: W0902 18:31:19.696000 3191159 torch/distributed/run.py:792] ***************************************** |
| | 1: W0902 18:31:19.696000 3191159 torch/distributed/run.py:792] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
| | 1: W0902 18:31:19.696000 3191159 torch/distributed/run.py:792] ***************************************** |
| | 0: [2025-09-02 18:31:52,971] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:801836] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
| | 2: [2025-09-02 18:31:52,971] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:2766631] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
| | 1: [2025-09-02 18:31:52,971] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3191235] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
| | 0: [2025-09-02 18:31:52,972] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:801836] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
| | 2: [2025-09-02 18:31:52,972] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:2766631] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
| | 3: [2025-09-02 18:31:52,972] [INFO] [axolotl.utils.schemas.validation.check_eval_packing:119] [PID:3352543] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`[39m |
| | 1: [2025-09-02 18:31:52,972] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3191235] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
| | 3: [2025-09-02 18:31:52,972] [INFO] [axolotl.utils.schemas.validation.hint_sample_packing_padding:218] [PID:3352543] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing[39m |
| | 0: [2025-09-02 18:31:57,072] [INFO] [axolotl.cli.config.load_cfg:245] [PID:801836] [RANK:0] config: |
| | 0: { |
| | 0: "activation_offloading": false, |
| | 0: "auto_resume_from_checkpoints": true, |
| | 0: "axolotl_config_path": "/lustre/fswork/projects/rech/dgo/udv55np/train/tmp/1756826505612058881.yaml", |
| | 0: "base_model": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-1.5B_ift", |
| | 0: "base_model_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-1.5B_ift", |
| | 0: "batch_size": 16, |
| | 0: "bf16": true, |
| | 0: "capabilities": { |
| | 0: "bf16": true, |
| | 0: "compute_capability": "sm_90", |
| | 0: "fp8": false, |
| | 0: "n_gpu": 16, |
| | 0: "n_node": 1 |
| | 0: }, |
| | 0: "chat_template": "qwen_25", |
| | 0: "context_parallel_size": 1, |
| | 0: "dataloader_num_workers": 16, |
| | 0: "dataloader_pin_memory": true, |
| | 0: "dataloader_prefetch_factor": 256, |
| | 0: "dataset_prepared_path": "/lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/0", |
| | 0: "dataset_processes": 192, |
| | 0: "datasets": [ |
| | 0: { |
| | 0: "chat_template": "tokenizer_default", |
| | 0: "field_messages": "conversations", |
| | 0: "message_property_mappings": { |
| | 0: "content": "content", |
| | 0: "role": "role" |
| | 0: }, |
| | 0: "path": "/lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/no_thinking_text/generator/default-d32b2cae8ea7e541/0.0.0", |
| | 0: "trust_remote_code": false, |
| | 0: "type": "chat_template" |
| | 0: } |
| | 0: ], |
| | 0: "ddp": true, |
| | 0: "deepspeed": { |
| | 0: "bf16": { |
| | 0: "enabled": true |
| | 0: }, |
| | 0: "gradient_accumulation_steps": "auto", |
| | 0: "gradient_clipping": "auto", |
| | 0: "train_batch_size": "auto", |
| | 0: "train_micro_batch_size_per_gpu": "auto", |
| | 0: "wall_clock_breakdown": false, |
| | 0: "zero_optimization": { |
| | 0: "contiguous_gradients": true, |
| | 0: "overlap_comm": true, |
| | 0: "reduce_bucket_size": "auto", |
| | 0: "stage": 3, |
| | 0: "stage3_gather_16bit_weights_on_model_save": true, |
| | 0: "stage3_param_persistence_threshold": "auto", |
| | 0: "stage3_prefetch_bucket_size": "auto", |
| | 0: "sub_group_size": 0 |
| | 0: } |
| | 0: }, |
| | 0: "device": "cuda:0", |
| | 0: "device_map": { |
| | 0: "": 0 |
| | 0: }, |
| | 0: "dion_rank_fraction": 1.0, |
| | 0: "dion_rank_multiple_of": 1, |
| | 0: "env_capabilities": { |
| | 0: "torch_version": "2.6.0" |
| | 0: }, |
| | 0: "eval_batch_size": 1, |
| | 0: "eval_causal_lm_metrics": [ |
| | 0: "sacrebleu", |
| | 0: "comet", |
| | 0: "ter", |
| | 0: "chrf" |
| | 0: ], |
| | 0: "eval_max_new_tokens": 128, |
| | 0: "eval_sample_packing": true, |
| | 0: "eval_table_size": 0, |
| | 0: "evals_per_epoch": 0, |
| | 0: "flash_attention": true, |
| | 0: "fp16": false, |
| | 0: "gradient_accumulation_steps": 1, |
| | 0: "gradient_checkpointing": true, |
| | 0: "gradient_checkpointing_kwargs": { |
| | 0: "use_reentrant": true |
| | 0: }, |
| | 0: "learning_rate": 7e-06, |
| | 0: "lisa_layers_attribute": "model.layers", |
| | 0: "load_best_model_at_end": false, |
| | 0: "load_in_4bit": false, |
| | 0: "load_in_8bit": false, |
| | 0: "local_rank": 0, |
| | 0: "logging_steps": 10, |
| | 0: "lora_dropout": 0.0, |
| | 0: "loraplus_lr_embedding": 1e-06, |
| | 0: "lr_scheduler": "warmup_stable_decay", |
| | 0: "lr_scheduler_kwargs": { |
| | 0: "min_lr_ratio": 0.1, |
| | 0: "num_decay_steps": 300 |
| | 0: }, |
| | 0: "max_prompt_len": 512, |
| | 0: "mean_resizing_embeddings": false, |
| | 0: "micro_batch_size": 1, |
| | 0: "model_config_type": "qwen2", |
| | 0: "num_epochs": 1.0, |
| | 0: "optimizer": "adamw_torch_fused", |
| | 0: "output_dir": "/lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0", |
| | 0: "pad_to_sequence_len": true, |
| | 0: "pretrain_multipack_attn": true, |
| | 0: "pretrain_multipack_buffer_size": 10000, |
| | 0: "profiler_steps_start": 0, |
| | 0: "qlora_sharded_model_loading": false, |
| | 0: "ray_num_workers": 1, |
| | 0: "resources_per_worker": { |
| | 0: "GPU": 1 |
| | 0: }, |
| | 0: "sample_packing": true, |
| | 0: "sample_packing_bin_size": 200, |
| | 0: "sample_packing_group_size": 100000, |
| | 0: "save_only_model": false, |
| | 0: "save_safetensors": true, |
| | 0: "save_steps": 0.2, |
| | 0: "save_total_limit": 20, |
| | 0: "sequence_len": 16384, |
| | 0: "shuffle_before_merging_datasets": false, |
| | 0: "shuffle_merged_datasets": true, |
| | 0: "skip_prepare_dataset": false, |
| | 0: "special_tokens": { |
| | 0: "bos_token": "<|im_start|>", |
| | 0: "eos_token": "<|im_end|>", |
| | 0: "pad_token": "<|endoftext|>" |
| | 0: }, |
| | 0: "strict": false, |
| | 0: "tensor_parallel_size": 1, |
| | 0: "tf32": false, |
| | 0: "tiled_mlp_use_original_mlp": true, |
| | 0: "tokenizer_config": "/lustre/fswork/projects/rech/qwv/udv55np/Qwen/Qwen2.5-1.5B_ift", |
| | 0: "torch_dtype": "torch.bfloat16", |
| | 0: "train_on_inputs": false, |
| | 0: "trl": { |
| | 0: "log_completions": false, |
| | 0: "mask_truncated_completions": false, |
| | 0: "ref_model_mixup_alpha": 0.9, |
| | 0: "ref_model_sync_steps": 64, |
| | 0: "scale_rewards": true, |
| | 0: "sync_ref_model": false, |
| | 0: "use_vllm": false, |
| | 0: "vllm_server_host": "0.0.0.0", |
| | 0: "vllm_server_port": 8000 |
| | 0: }, |
| | 0: "use_ray": false, |
| | 0: "use_tensorboard": true, |
| | 0: "val_set_size": 0.0, |
| | 0: "vllm": { |
| | 0: "device": "auto", |
| | 0: "dtype": "auto", |
| | 0: "gpu_memory_utilization": 0.9, |
| | 0: "host": "0.0.0.0", |
| | 0: "port": 8000 |
| | 0: }, |
| | 0: "warmup_steps": 150, |
| | 0: "weight_decay": 0.0, |
| | 0: "world_size": 16 |
| | 0: }[39m |
| | 0: [2025-09-02 18:31:57,074] [INFO] [axolotl.cli.checks.check_user_token:35] [PID:801836] [RANK:0] Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used.[39m |
| | 3: [2025-09-02 18:31:57,658] [INFO] [axolotl.utils.data.sft._load_raw_datasets:314] [PID:3352543] [RANK:0] Loading raw datasets...[39m |
| | 3: [2025-09-02 18:31:57,926] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:88] [PID:3352543] [RANK:0] Loading dataset: /lustre/fswork/projects/rech/qwv/udv55np/dataset/math/hf/no_thinking_text/generator/default-d32b2cae8ea7e541/0.0.0 with base_type: chat_template and prompt_style: None[39m |
| | 3:
Tokenizing Prompts (num_proc=192): 0%| | 0/321773 [00:00<?, ? examples/s]
Tokenizing Prompts (num_proc=192): 0%| | 1000/321773 [00:07<41:25, 129.06 examples/s]
Tokenizing Prompts (num_proc=192): 1%| | 2000/321773 [00:08<18:32, 287.45 examples/s]
Tokenizing Prompts (num_proc=192): 1%| | 3000/321773 [00:08<10:32, 504.34 examples/s]
Tokenizing Prompts (num_proc=192): 2%|β | 5000/321773 [00:08<04:51, 1084.88 examples/s]
Tokenizing Prompts (num_proc=192): 2%|β | 7000/321773 [00:08<03:01, 1731.38 examples/s]
Tokenizing Prompts (num_proc=192): 2%|β | 8000/321773 [00:09<02:33, 2047.72 examples/s]
Tokenizing Prompts (num_proc=192): 3%|β | 10000/321773 [00:09<01:40, 3116.78 examples/s]
Tokenizing Prompts (num_proc=192): 3%|β | 11000/321773 [00:09<01:37, 3171.82 examples/s]
Tokenizing Prompts (num_proc=192): 4%|β | 14000/321773 [00:09<00:56, 5450.93 examples/s]
Tokenizing Prompts (num_proc=192): 5%|β |
| | 3: | 17000/321773 [00:09<00:39, 7709.86 examples/s]
Tokenizing Prompts (num_proc=192): 6%|β | 19000/321773 [00:10<00:42, 7127.15 examples/s]
Tokenizing Prompts (num_proc=192): 7%|β | 22000/321773 [00:10<00:30, 9898.27 examples/s]
Tokenizing Prompts (num_proc=192): 7%|β | 24000/321773 [00:10<00:26, 11183.95 examples/s]
Tokenizing Prompts (num_proc=192): 8%|β | 26000/321773 [00:10<00:24, 12209.62 examples/s]
Tokenizing Prompts (num_proc=192): 9%|β | 28000/321773 [00:11<00:35, 8209.11 examples/s]
Tokenizing Prompts (num_proc=192): 9%|β | 30000/321773 [00:11<00:42, 6838.15 examples/s]
Tokenizing Prompts (num_proc=192): 11%|β | 34000/321773 [00:11<00:27, 10590.54 examples/s]
Tokenizing Prompts (num_proc=192): 11%|ββ | 37000/321773 [00:11<00:21, 13142.37 examples/s]
Tokenizing Prompts (num_proc=192): 12%|ββ | 39000/321773 [00:11<00:20, 14089.58 examples/s]
Tokenizing Prompts (num_proc=192): 13%|ββ |
| | 3: | 41000/321773 [00:11<00:19, 14656.19 examples/s]
Tokenizing Prompts (num_proc=192): 13%|ββ | 43000/321773 [00:12<00:17, 15518.82 examples/s]
Tokenizing Prompts (num_proc=192): 14%|ββ | 45676/321773 [00:12<00:39, 6929.08 examples/s]
Tokenizing Prompts (num_proc=192): 15%|ββ | 47676/321773 [00:13<00:34, 7888.30 examples/s]
Tokenizing Prompts (num_proc=192): 15%|ββ | 49676/321773 [00:13<00:31, 8763.53 examples/s]
Tokenizing Prompts (num_proc=192): 16%|ββ | 52676/321773 [00:13<00:29, 9083.16 examples/s]
Tokenizing Prompts (num_proc=192): 17%|ββ | 54676/321773 [00:13<00:27, 9796.75 examples/s]
Tokenizing Prompts (num_proc=192): 18%|ββ | 56352/321773 [00:13<00:26, 9896.88 examples/s]
Tokenizing Prompts (num_proc=192): 18%|ββ | 58352/321773 [00:13<00:24, 10627.36 examples/s]
Tokenizing Prompts (num_proc=192): 19%|ββ | 59704/321773 [00:14<00:26, 10006.86 examples/s]
Tokenizing Prompts (num_proc=192): 1 |
| | 3: 9%|ββ | 61704/321773 [00:14<00:30, 8478.52 examples/s]
Tokenizing Prompts (num_proc=192): 20%|ββ | 64380/321773 [00:14<00:24, 10349.69 examples/s]
Tokenizing Prompts (num_proc=192): 21%|ββ | 66732/321773 [00:14<00:22, 11441.90 examples/s]
Tokenizing Prompts (num_proc=192): 21%|βββ | 68408/321773 [00:14<00:22, 11133.35 examples/s]
Tokenizing Prompts (num_proc=192): 22%|βββ | 70084/321773 [00:15<00:22, 11054.40 examples/s]
Tokenizing Prompts (num_proc=192): 22%|βββ | 72084/321773 [00:15<00:21, 11433.73 examples/s]
Tokenizing Prompts (num_proc=192): 23%|βββ | 73760/321773 [00:15<00:28, 8671.94 examples/s]
Tokenizing Prompts (num_proc=192): 23%|βββ | 75112/321773 [00:15<00:35, 6965.05 examples/s]
Tokenizing Prompts (num_proc=192): 24%|βββ | 77788/321773 [00:16<00:26, 9088.82 examples/s]
Tokenizing Prompts (num_proc=192): 25%|βββ | 80464/321773 [00:16<00:22, 10968.34 examples/s]
Tokenizin |
| | 3: g Prompts (num_proc=192): 27%|βββ | 85816/321773 [00:16<00:14, 16672.80 examples/s]
Tokenizing Prompts (num_proc=192): 28%|βββ | 88492/321773 [00:16<00:17, 13055.51 examples/s]
Tokenizing Prompts (num_proc=192): 28%|βββ | 90844/321773 [00:16<00:17, 13538.06 examples/s]
Tokenizing Prompts (num_proc=192): 29%|βββ | 92520/321773 [00:16<00:17, 12750.13 examples/s]
Tokenizing Prompts (num_proc=192): 29%|βββ | 94872/321773 [00:17<00:16, 13495.26 examples/s]
Tokenizing Prompts (num_proc=192): 31%|βββ | 98900/321773 [00:17<00:12, 18153.54 examples/s]
Tokenizing Prompts (num_proc=192): 32%|ββββ | 101900/321773 [00:17<00:10, 20426.88 examples/s]
Tokenizing Prompts (num_proc=192): 32%|ββββ | 104576/321773 [00:17<00:10, 21175.21 examples/s]
Tokenizing Prompts (num_proc=192): 33%|ββββ | 106928/321773 [00:17<00:10, 19557.61 examples/s]
Tokenizing Prompts (num_proc=192): 34%|ββββ | 109280/321 |
| | 3: 773 [00:17<00:11, 18027.02 examples/s]
Tokenizing Prompts (num_proc=192): 35%|ββββ | 111280/321773 [00:18<00:20, 10169.94 examples/s]
Tokenizing Prompts (num_proc=192): 35%|ββββ | 113308/321773 [00:18<00:24, 8659.59 examples/s]
Tokenizing Prompts (num_proc=192): 36%|ββββ | 114660/321773 [00:18<00:28, 7336.72 examples/s]
Tokenizing Prompts (num_proc=192): 36%|ββββ | 116012/321773 [00:19<00:27, 7480.22 examples/s]
Tokenizing Prompts (num_proc=192): 37%|ββββ | 117688/321773 [00:19<00:24, 8295.67 examples/s]
Tokenizing Prompts (num_proc=192): 37%|ββββ | 119716/321773 [00:19<00:21, 9466.16 examples/s]
Tokenizing Prompts (num_proc=192): 38%|ββββ | 121392/321773 [00:19<00:29, 6781.76 examples/s]
Tokenizing Prompts (num_proc=192): 39%|ββββ | 124420/321773 [00:19<00:20, 9798.48 examples/s]
Tokenizing Prompts (num_proc=192): 40%|ββββ | 128096/321773 [00:19<00:13, 13992.17 examples/s]
Tokenizing |
| | 3: Prompts (num_proc=192): 41%|ββββ | 130448/321773 [00:20<00:13, 14306.37 examples/s]
Tokenizing Prompts (num_proc=192): 42%|βββββ | 134124/321773 [00:20<00:14, 13155.96 examples/s]
Tokenizing Prompts (num_proc=192): 42%|βββββ | 135800/321773 [00:20<00:18, 10140.66 examples/s]
Tokenizing Prompts (num_proc=192): 43%|βββββ | 139476/321773 [00:20<00:14, 12956.31 examples/s]
Tokenizing Prompts (num_proc=192): 44%|βββββ | 141152/321773 [00:21<00:14, 12452.86 examples/s]
Tokenizing Prompts (num_proc=192): 45%|βββββ | 143504/321773 [00:21<00:17, 10482.67 examples/s]
Tokenizing Prompts (num_proc=192): 45%|βββββ | 145180/321773 [00:21<00:17, 10380.40 examples/s]
Tokenizing Prompts (num_proc=192): 46%|βββββ | 147180/321773 [00:21<00:15, 11028.80 examples/s]
Tokenizing Prompts (num_proc=192): 46%|βββββ | 148532/321773 [00:21<00:16, 10261.03 examples/s]
Tokenizing Prompts (num_proc=192): 4 |
| | 3: 7%|βββββ | 152560/321773 [00:22<00:11, 14278.03 examples/s]
Tokenizing Prompts (num_proc=192): 48%|βββββ | 154236/321773 [00:22<00:12, 13315.22 examples/s]
Tokenizing Prompts (num_proc=192): 48%|βββββ | 155912/321773 [00:22<00:13, 12684.46 examples/s]
Tokenizing Prompts (num_proc=192): 49%|βββββ | 157588/321773 [00:22<00:13, 12104.25 examples/s]
Tokenizing Prompts (num_proc=192): 49%|βββββ | 159264/321773 [00:22<00:13, 11670.35 examples/s]
Tokenizing Prompts (num_proc=192): 50%|βββββ | 160940/321773 [00:22<00:14, 11283.84 examples/s]
Tokenizing Prompts (num_proc=192): 51%|βββββ | 163616/321773 [00:22<00:12, 12964.34 examples/s]
Tokenizing Prompts (num_proc=192): 52%|ββββββ | 167968/321773 [00:23<00:08, 17147.06 examples/s]
Tokenizing Prompts (num_proc=192): 53%|ββββββ | 171996/321773 [00:23<00:07, 20097.64 examples/s]
Tokenizing Prompts (num_proc=192): 54%|ββββββ |
| | 3: | 174348/321773 [00:23<00:08, 16916.63 examples/s]
Tokenizing Prompts (num_proc=192): 55%|ββββββ | 177024/321773 [00:23<00:09, 15704.11 examples/s]
Tokenizing Prompts (num_proc=192): 56%|ββββββ | 178700/321773 [00:23<00:10, 14114.51 examples/s]
Tokenizing Prompts (num_proc=192): 56%|ββββββ | 180376/321773 [00:24<00:10, 13461.92 examples/s]
Tokenizing Prompts (num_proc=192): 57%|ββββββ | 182052/321773 [00:24<00:12, 10749.16 examples/s]
Tokenizing Prompts (num_proc=192): 57%|ββββββ | 184052/321773 [00:24<00:12, 11135.40 examples/s]
Tokenizing Prompts (num_proc=192): 58%|ββββββ | 186052/321773 [00:24<00:12, 10583.35 examples/s]
Tokenizing Prompts (num_proc=192): 59%|ββββββ | 188404/321773 [00:24<00:11, 11790.53 examples/s]
Tokenizing Prompts (num_proc=192): 59%|ββββββ | 190756/321773 [00:24<00:09, 13102.67 examples/s]
Tokenizing Prompts (num_proc=192): 60%|ββββββ | 1 |
| | 3: 92432/321773 [00:25<00:09, 13422.19 examples/s]
Tokenizing Prompts (num_proc=192): 61%|ββββββ | 195460/321773 [00:25<00:10, 11613.09 examples/s]
Tokenizing Prompts (num_proc=192): 61%|βββββββ | 197812/321773 [00:25<00:09, 13512.01 examples/s]
Tokenizing Prompts (num_proc=192): 62%|βββββββ | 199488/321773 [00:25<00:08, 13593.92 examples/s]
Tokenizing Prompts (num_proc=192): 63%|βββββββ | 201840/321773 [00:25<00:08, 13750.64 examples/s]
Tokenizing Prompts (num_proc=192): 64%|βββββββ | 206544/321773 [00:26<00:07, 14651.83 examples/s]
Tokenizing Prompts (num_proc=192): 65%|βββββββ | 208220/321773 [00:26<00:09, 12434.73 examples/s]
Tokenizing Prompts (num_proc=192): 65%|βββββββ | 209896/321773 [00:26<00:08, 12693.20 examples/s]
Tokenizing Prompts (num_proc=192): 66%|βββββββ | 211572/321773 [00:26<00:08, 12889.40 examples/s]
Tokenizing Prompts (num_proc=192): 67%|ββββββ |
| | 3: β | 214248/321773 [00:26<00:07, 14194.53 examples/s]
Tokenizing Prompts (num_proc=192): 67%|βββββββ | 216276/321773 [00:26<00:07, 15061.91 examples/s]
Tokenizing Prompts (num_proc=192): 68%|βββββββ | 218628/321773 [00:27<00:08, 12262.17 examples/s]
Tokenizing Prompts (num_proc=192): 68%|βββββββ | 220304/321773 [00:27<00:09, 10696.70 examples/s]
Tokenizing Prompts (num_proc=192): 69%|βββββββ | 222304/321773 [00:27<00:08, 11855.66 examples/s]
Tokenizing Prompts (num_proc=192): 71%|βββββββ | 227008/321773 [00:27<00:05, 18143.36 examples/s]
Tokenizing Prompts (num_proc=192): 71%|ββββββββ | 229360/321773 [00:27<00:06, 14199.06 examples/s]
Tokenizing Prompts (num_proc=192): 73%|ββββββββ | 235712/321773 [00:27<00:03, 21816.01 examples/s]
Tokenizing Prompts (num_proc=192): 74%|ββββββββ | 239388/321773 [00:28<00:03, 22456.78 examples/s]
Tokenizing Prompts (num_proc=192): 76%|οΏ½ |
| | 3: οΏ½οΏ½βββββββ | 243064/321773 [00:28<00:03, 23999.96 examples/s]
Tokenizing Prompts (num_proc=192): 76%|ββββββββ | 246092/321773 [00:28<00:03, 21010.93 examples/s]
Tokenizing Prompts (num_proc=192): 77%|ββββββββ | 248444/321773 [00:28<00:04, 14670.92 examples/s]
Tokenizing Prompts (num_proc=192): 78%|ββββββββ | 251120/321773 [00:29<00:06, 10872.04 examples/s]
Tokenizing Prompts (num_proc=192): 79%|ββββββββ | 253120/321773 [00:29<00:06, 9999.79 examples/s]
Tokenizing Prompts (num_proc=192): 80%|ββββββββ | 256472/321773 [00:29<00:05, 12446.17 examples/s]
Tokenizing Prompts (num_proc=192): 80%|ββββββββ | 258148/321773 [00:29<00:06, 9825.03 examples/s]
Tokenizing Prompts (num_proc=192): 81%|βββββββββ | 261500/321773 [00:29<00:04, 12948.51 examples/s]
Tokenizing Prompts (num_proc=192): 82%|βββββββββ | 263528/321773 [00:30<00:04, 13505.24 examples/s]
Tokenizi |
| | 3: ng Prompts (num_proc=192): 83%|βββββββββ | 265556/321773 [00:30<00:04, 12073.28 examples/s]
Tokenizing Prompts (num_proc=192): 83%|βββββββββ | 268232/321773 [00:30<00:03, 14660.58 examples/s]
Tokenizing Prompts (num_proc=192): 84%|βββββββββ | 270908/321773 [00:30<00:03, 13163.71 examples/s]
Tokenizing Prompts (num_proc=192): 85%|βββββββββ | 273936/321773 [00:31<00:04, 11479.23 examples/s]
Tokenizing Prompts (num_proc=192): 87%|βββββββββ | 279964/321773 [00:31<00:02, 18840.31 examples/s]
Tokenizing Prompts (num_proc=192): 88%|βββββββββ | 282964/321773 [00:31<00:01, 20146.33 examples/s]
Tokenizing Prompts (num_proc=192): 89%|βββββββββ | 286316/321773 [00:31<00:01, 19035.77 examples/s]
Tokenizing Prompts (num_proc=192): 90%|βββββββββ | 288668/321773 [00:31<00:02, 16189.10 examples/s]
Tokenizing Prompts (num_proc=192): 90%|βββββββββ | 290696/321 |
| | 3: 773 [00:32<00:02, 11163.90 examples/s]
Tokenizing Prompts (num_proc=192): 91%|βββββββββ | 292724/321773 [00:32<00:02, 11719.21 examples/s]
Tokenizing Prompts (num_proc=192): 92%|ββββββββββ| 294752/321773 [00:32<00:03, 8280.15 examples/s]
Tokenizing Prompts (num_proc=192): 92%|ββββββββββ| 296104/321773 [00:32<00:02, 8643.73 examples/s]
Tokenizing Prompts (num_proc=192): 93%|ββββββββββ| 298132/321773 [00:32<00:02, 9182.47 examples/s]
Tokenizing Prompts (num_proc=192): 93%|ββββββββββ| 299484/321773 [00:33<00:02, 9735.08 examples/s]
Tokenizing Prompts (num_proc=192): 93%|ββββββββββ| 300836/321773 [00:33<00:02, 9589.84 examples/s]
Tokenizing Prompts (num_proc=192): 94%|ββββββββββ| 302188/321773 [00:33<00:02, 8482.04 examples/s]
Tokenizing Prompts (num_proc=192): 94%|ββββββββββ| 303540/321773 [00:33<00:02, 8937.65 examples/s]
Tokenizing Prompts (num_p |
| | 3: roc=192): 95%|ββββββββββ| 304892/321773 [00:33<00:02, 7545.40 examples/s]
Tokenizing Prompts (num_proc=192): 95%|ββββββββββ| 306243/321773 [00:33<00:02, 7636.62 examples/s]
Tokenizing Prompts (num_proc=192): 96%|ββββββββββ| 307593/321773 [00:34<00:01, 7924.16 examples/s]
Tokenizing Prompts (num_proc=192): 96%|ββββββββββ| 309620/321773 [00:34<00:01, 7315.62 examples/s]
Tokenizing Prompts (num_proc=192): 97%|ββββββββββ| 311646/321773 [00:34<00:01, 5983.22 examples/s]
Tokenizing Prompts (num_proc=192): 97%|ββββββββββ| 312997/321773 [00:35<00:01, 5833.55 examples/s]
Tokenizing Prompts (num_proc=192): 98%|ββββββββββ| 315698/321773 [00:35<00:00, 7810.36 examples/s]
Tokenizing Prompts (num_proc=192): 99%|ββββββββββ| 317048/321773 [00:35<00:00, 7275.24 examples/s]
Tokenizing Prompts (num_proc=192): 99%|ββββββββββ| 318398/321773 [00 |
| | 3: :35<00:00, 7960.74 examples/s]
Tokenizing Prompts (num_proc=192): 99%|ββββββββββ| 319748/321773 [00:36<00:00, 6454.31 examples/s]
Tokenizing Prompts (num_proc=192): 100%|ββββββββββ| 321098/321773 [00:36<00:00, 6991.26 examples/s]
Tokenizing Prompts (num_proc=192): 100%|ββββββββββ| 321773/321773 [00:36<00:00, 8790.55 examples/s] |
| | 3:
Dropping Long Sequences (>16384) (num_proc=192): 0%| | 0/321773 [00:00<?, ? examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 0%| | 1000/321773 [00:01<09:09, 583.81 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 4%|β | 13000/321773 [00:01<00:31, 9748.54 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 8%|β | 24676/321773 [00:02<00:16, 18505.00 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 10%|β | 31436/321773 [00:02<00:16, 17386.79 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 11%|ββ | 36492/321773 [00:02<00:15, 18238.78 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 13%|ββ | 40844/321773 [00:02<00:13, 20542.41 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 14%|ββ | 45196/321773 [00:02<00:11, 23529.08 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 15%|ββ | 49224/321773 [00:03<00:10, 25848.89 exampl |
| | 3: es/s]
Dropping Long Sequences (>16384) (num_proc=192): 17%|ββ | 53224/321773 [00:03<00:09, 27406.33 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 50%|βββββ | 160968/321773 [00:03<00:00, 242231.63 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 64%|βββββββ | 205600/321773 [00:03<00:00, 283879.55 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 76%|ββββββββ | 243444/321773 [00:03<00:00, 242398.75 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 86%|βββββββββ | 275640/321773 [00:03<00:00, 227367.38 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 95%|ββββββββββ| 304213/321773 [00:03<00:00, 181908.04 examples/s]
Dropping Long Sequences (>16384) (num_proc=192): 100%|ββββββββββ| 321773/321773 [00:04<00:00, 69680.11 examples/s] |
| | 3:
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 0/315947 [00:00<?, ? examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 0%| | 1000/315947 [00:01<09:07, 574.76 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 3%|β | 9000/315947 [00:01<00:46, 6579.69 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 5%|β | 17000/315947 [00:01<00:21, 13663.54 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 7%|β | 23292/315947 [00:02<00:15, 19157.30 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 9%|β | 29230/315947 [00:02<00:13, 21824.77 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 11%|β | 34814/315947 [00:02<00:11, 24900.39 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 13%|ββ | 40044/315947 [00:02<00:11, 24024.45 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 15%| |
| | 3: ββ | 45982/315947 [00:02<00:09, 29154.70 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 16%|ββ | 51274/315947 [00:02<00:08, 31939.93 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 18%|ββ | 56212/315947 [00:03<00:08, 30464.92 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 19%|ββ | 60796/315947 [00:03<00:08, 29937.43 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 21%|ββ | 66088/315947 [00:03<00:07, 33742.94 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 23%|βββ | 71380/315947 [00:03<00:06, 37325.43 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 24%|βββ | 76318/315947 [00:03<00:06, 35668.58 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 26%|βββ | 81256/315947 [00:03<00:09, 23980.92 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 27%|βββ | |
| | 3: 84840/315947 [00:04<00:10, 23099.84 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 28%|βββ | 89132/315947 [00:04<00:08, 25831.01 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 30%|βββ | 95716/315947 [00:04<00:06, 33324.19 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 84%|βββββββββ | 265082/315947 [00:04<00:00, 394437.69 examples/s]
Drop Samples with Zero Trainable Tokens (num_proc=192): 100%|ββββββββββ| 315947/315947 [00:05<00:00, 57226.91 examples/s] |
| | 3:
Add position_id column (Sample Packing) (num_proc=192): 0%| | 0/315947 [00:00<?, ? examples/s]
Add position_id column (Sample Packing) (num_proc=192): 0%| | 1000/315947 [00:02<10:54, 481.25 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 4%|β | 13000/315947 [00:02<00:37, 8147.50 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 7%|β | 23000/315947 [00:02<00:18, 15678.25 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 10%|β | 32000/315947 [00:02<00:12, 23384.21 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 14%|ββ | 44000/315947 [00:02<00:07, 35667.24 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 17%|ββ | 55000/315947 [00:02<00:05, 46194.47 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 20%|ββ | 64292/315947 [00:02<00:05, 49525.45 examples/s]
Add position_id column (Sample Packing) (num_proc=192): |
| | 3: 25%|βββ | 77876/315947 [00:02<00:03, 63396.95 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 29%|βββ | 90460/315947 [00:03<00:03, 73841.86 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 32%|ββββ | 100690/315947 [00:03<00:03, 71582.31 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 35%|ββββ | 111504/315947 [00:03<00:02, 77234.71 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 39%|ββββ | 123964/315947 [00:03<00:02, 87113.03 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 43%|βββββ | 134424/315947 [00:03<00:02, 84181.32 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 48%|βββββ | 152238/315947 [00:03<00:01, 107063.91 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 53%|ββββββ | 168114/315947 [00:03<00:01, 119038.24 examples/s]
Add position_id column (Sample Packi |
| | 3: ng) (num_proc=192): 57%|ββββββ | 181574/315947 [00:03<00:01, 121647.10 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 63%|βββββββ | 198326/315947 [00:03<00:00, 133616.17 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 67%|βββββββ | 212847/315947 [00:04<00:00, 117655.05 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 73%|ββββββββ | 230660/315947 [00:04<00:00, 130519.18 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 78%|ββββββββ | 247115/315947 [00:04<00:00, 139062.35 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 83%|βββββββββ | 262212/315947 [00:04<00:00, 135531.58 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 88%|βββββββββ | 276470/315947 [00:04<00:00, 131531.90 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 92%|ββββββββββ |
| | 3: | 290147/315947 [00:04<00:00, 119964.83 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 96%|ββββββββββ| 303047/315947 [00:04<00:00, 108030.56 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|ββββββββββ| 314657/315947 [00:05<00:00, 73525.53 examples/s]
Add position_id column (Sample Packing) (num_proc=192): 100%|ββββββββββ| 315947/315947 [00:05<00:00, 57407.99 examples/s] |
| | 3:
Saving the dataset (0/192 shards): 0%| | 0/315947 [00:00<?, ? examples/s]
Saving the dataset (0/192 shards): 1%| | 1646/315947 [00:01<05:32, 944.80 examples/s]
Saving the dataset (1/192 shards): 1%| | 1646/315947 [00:01<05:32, 944.80 examples/s]
Saving the dataset (2/192 shards): 1%| | 3292/315947 [00:01<05:30, 944.80 examples/s]
Saving the dataset (3/192 shards): 2%|β | 4938/315947 [00:01<05:29, 944.80 examples/s]
Saving the dataset (4/192 shards): 2%|β | 6584/315947 [00:01<05:27, 944.80 examples/s]
Saving the dataset (5/192 shards): 3%|β | 8230/315947 [00:01<05:25, 944.80 examples/s]
Saving the dataset (6/192 shards): 3%|β | 9876/315947 [00:01<05:23, 944.80 examples/s]
Saving the dataset (7/192 shards): 4%|β | 11522/315947 [00:01<05:22, 944.80 examples/s]
Saving the dataset (8/192 shards): 4%|β | 13168/315947 [00:01<05:20, 944.80 examples/s]
Saving the dataset (9/192 shards): 5%|β |
| | 3: | 14814/315947 [00:01<05:18, 944.80 examples/s]
Saving the dataset (10/192 shards): 5%|β | 16460/315947 [00:01<05:16, 944.80 examples/s]
Saving the dataset (11/192 shards): 6%|β | 19752/315947 [00:01<05:13, 944.80 examples/s]
Saving the dataset (12/192 shards): 6%|β | 19752/315947 [00:01<05:13, 944.80 examples/s]
Saving the dataset (13/192 shards): 7%|β | 21398/315947 [00:01<05:11, 944.80 examples/s]
Saving the dataset (14/192 shards): 7%|β | 23044/315947 [00:01<05:10, 944.80 examples/s]
Saving the dataset (15/192 shards): 8%|β | 24690/315947 [00:01<05:08, 944.80 examples/s]
Saving the dataset (16/192 shards): 8%|β | 26336/315947 [00:01<05:06, 944.80 examples/s]
Saving the dataset (17/192 shards): 9%|β | 27982/315947 [00:01<05:04, 944.80 examples/s]
Saving the dataset (18/192 shards): 9%|β | 29628/315947 [00:01<05:03, 944.80 examples/s]
Saving the dataset (19/192 shards): 10%|β | 32920/31594 |
| | 3: 7 [00:01<04:59, 944.80 examples/s]
Saving the dataset (20/192 shards): 10%|β | 32920/315947 [00:01<04:59, 944.80 examples/s]
Saving the dataset (21/192 shards): 11%|ββ | 36212/315947 [00:01<04:56, 944.80 examples/s]
Saving the dataset (22/192 shards): 11%|ββ | 36212/315947 [00:01<04:56, 944.80 examples/s]
Saving the dataset (23/192 shards): 12%|ββ | 37858/315947 [00:01<04:54, 944.80 examples/s]
Saving the dataset (24/192 shards): 13%|ββ | 41150/315947 [00:01<04:50, 944.80 examples/s]
Saving the dataset (25/192 shards): 13%|ββ | 41150/315947 [00:01<04:50, 944.80 examples/s]
Saving the dataset (26/192 shards): 14%|ββ | 42796/315947 [00:01<04:49, 944.80 examples/s]
Saving the dataset (27/192 shards): 14%|ββ | 44442/315947 [00:01<04:47, 944.80 examples/s]
Saving the dataset (28/192 shards): 15%|ββ | 46088/315947 [00:01<04:45, 944.80 examples/s]
Saving the dataset (29/192 shards): 15%|ββ | 47734/3 |
| | 3: 15947 [00:01<04:43, 944.80 examples/s]
Saving the dataset (30/192 shards): 16%|ββ | 51026/315947 [00:01<04:40, 944.80 examples/s]
Saving the dataset (31/192 shards): 16%|ββ | 51026/315947 [00:01<04:40, 944.80 examples/s]
Saving the dataset (32/192 shards): 17%|ββ | 52672/315947 [00:01<04:38, 944.80 examples/s]
Saving the dataset (33/192 shards): 17%|ββ | 54318/315947 [00:01<04:36, 944.80 examples/s]
Saving the dataset (34/192 shards): 18%|ββ | 55964/315947 [00:01<04:35, 944.80 examples/s]
Saving the dataset (35/192 shards): 18%|ββ | 57610/315947 [00:01<04:33, 944.80 examples/s]
Saving the dataset (36/192 shards): 19%|ββ | 59256/315947 [00:01<04:31, 944.80 examples/s]
Saving the dataset (37/192 shards): 20%|ββ | 62548/315947 [00:01<04:28, 944.80 examples/s]
Saving the dataset (38/192 shards): 20%|ββ | 62548/315947 [00:01<04:28, 944.80 examples/s]
Saving the dataset (39/192 shards): 21%|ββ | 6 |
| | 3: 5840/315947 [00:01<04:24, 944.80 examples/s]
Saving the dataset (40/192 shards): 21%|βββ | 67486/315947 [00:01<04:22, 944.80 examples/s]
Saving the dataset (41/192 shards): 21%|βββ | 67486/315947 [00:01<04:22, 944.80 examples/s]
Saving the dataset (42/192 shards): 22%|βββ | 69132/315947 [00:01<04:21, 944.80 examples/s]
Saving the dataset (43/192 shards): 22%|βββ | 70778/315947 [00:01<04:19, 944.80 examples/s]
Saving the dataset (44/192 shards): 23%|βββ | 72424/315947 [00:01<04:17, 944.80 examples/s]
Saving the dataset (45/192 shards): 23%|βββ | 74070/315947 [00:01<04:16, 944.80 examples/s]
Saving the dataset (46/192 shards): 24%|βββ | 75716/315947 [00:01<04:14, 944.80 examples/s]
Saving the dataset (47/192 shards): 24%|βββ | 77362/315947 [00:01<04:12, 944.80 examples/s]
Saving the dataset (48/192 shards): 25%|βββ | 79008/315947 [00:01<04:10, 944.80 examples/s]
Saving the dataset (49/192 shards) |
| | 3: : 26%|βββ | 80654/315947 [00:01<04:09, 944.80 examples/s]
Saving the dataset (50/192 shards): 27%|βββ | 83946/315947 [00:01<04:05, 944.80 examples/s]
Saving the dataset (51/192 shards): 27%|βββ | 83946/315947 [00:01<04:05, 944.80 examples/s]
Saving the dataset (52/192 shards): 27%|βββ | 85592/315947 [00:01<04:03, 944.80 examples/s]
Saving the dataset (53/192 shards): 28%|βββ | 87238/315947 [00:01<04:02, 944.80 examples/s]
Saving the dataset (54/192 shards): 28%|βββ | 88884/315947 [00:01<04:00, 944.80 examples/s]
Saving the dataset (55/192 shards): 29%|βββ | 90530/315947 [00:01<03:58, 944.80 examples/s]
Saving the dataset (56/192 shards): 29%|βββ | 92176/315947 [00:01<03:56, 944.80 examples/s]
Saving the dataset (57/192 shards): 30%|βββ | 93822/315947 [00:01<03:55, 944.80 examples/s]
Saving the dataset (58/192 shards): 30%|βββ | 95468/315947 [00:01<03:53, 944.80 examples/s]
Saving t |
| | 3: he dataset (59/192 shards): 31%|βββ | 97114/315947 [00:01<03:51, 944.80 examples/s]
Saving the dataset (60/192 shards): 31%|ββββ | 98760/315947 [00:01<03:49, 944.80 examples/s]
Saving the dataset (61/192 shards): 32%|ββββ | 100406/315947 [00:01<03:48, 944.80 examples/s]
Saving the dataset (62/192 shards): 33%|ββββ | 103698/315947 [00:01<03:44, 944.80 examples/s]
Saving the dataset (63/192 shards): 33%|ββββ | 103698/315947 [00:01<03:44, 944.80 examples/s]
Saving the dataset (64/192 shards): 33%|ββββ | 105344/315947 [00:01<03:42, 944.80 examples/s]
Saving the dataset (65/192 shards): 34%|ββββ | 106990/315947 [00:01<03:41, 944.80 examples/s]
Saving the dataset (66/192 shards): 35%|ββββ | 110282/315947 [00:01<03:37, 944.80 examples/s]
Saving the dataset (67/192 shards): 35%|ββββ | 110282/315947 [00:01<03:37, 944.80 examples/s]
Saving the dataset (68/192 shards): 35%|ββββ | 111 |
| | 3: 928/315947 [00:01<03:35, 944.80 examples/s]
Saving the dataset (69/192 shards): 36%|ββββ | 115220/315947 [00:01<03:32, 944.80 examples/s]
Saving the dataset (70/192 shards): 36%|ββββ | 115220/315947 [00:01<03:32, 944.80 examples/s]
Saving the dataset (71/192 shards): 37%|ββββ | 116866/315947 [00:01<03:30, 944.80 examples/s]
Saving the dataset (72/192 shards): 38%|ββββ | 118512/315947 [00:01<03:28, 944.80 examples/s]
Saving the dataset (73/192 shards): 39%|ββββ | 121804/315947 [00:01<03:25, 944.80 examples/s]
Saving the dataset (74/192 shards): 39%|ββββ | 121804/315947 [00:01<03:25, 944.80 examples/s]
Saving the dataset (75/192 shards): 39%|ββββ | 123450/315947 [00:01<03:23, 944.80 examples/s]
Saving the dataset (76/192 shards): 40%|ββββ | 125096/315947 [00:01<03:22, 944.80 examples/s]
Saving the dataset (77/192 shards): 40%|ββββ | 126742/315947 [00:01<03:20, 944.80 examples/s]
Saving t |
| | 3: he dataset (78/192 shards): 41%|ββββ | 128388/315947 [00:01<03:18, 944.80 examples/s]
Saving the dataset (79/192 shards): 41%|ββββ | 130034/315947 [00:01<03:16, 944.80 examples/s]
Saving the dataset (80/192 shards): 42%|βββββ | 131680/315947 [00:01<03:15, 944.80 examples/s]
Saving the dataset (81/192 shards): 42%|βββββ | 133326/315947 [00:01<03:13, 944.80 examples/s]
Saving the dataset (82/192 shards): 43%|βββββ | 134972/315947 [00:01<03:11, 944.80 examples/s]
Saving the dataset (83/192 shards): 43%|βββββ | 136618/315947 [00:01<03:09, 944.80 examples/s]
Saving the dataset (84/192 shards): 44%|βββββ | 138264/315947 [00:01<03:08, 944.80 examples/s]
Saving the dataset (85/192 shards): 45%|βββββ | 141556/315947 [00:01<03:04, 944.80 examples/s]
Saving the dataset (86/192 shards): 45%|βββββ | 141556/315947 [00:01<03:04, 944.80 examples/s]
Saving the dataset (87/192 shards): 45%|βοΏ½ |
| | 3: οΏ½βββ | 143202/315947 [00:01<03:02, 944.80 examples/s]
Saving the dataset (88/192 shards): 46%|βββββ | 146494/315947 [00:01<02:59, 944.80 examples/s]
Saving the dataset (89/192 shards): 46%|βββββ | 146494/315947 [00:01<02:59, 944.80 examples/s]
Saving the dataset (90/192 shards): 47%|βββββ | 148140/315947 [00:01<02:57, 944.80 examples/s]
Saving the dataset (91/192 shards): 47%|βββββ | 149786/315947 [00:01<02:55, 944.80 examples/s]
Saving the dataset (92/192 shards): 48%|βββββ | 151432/315947 [00:01<02:54, 944.80 examples/s]
Saving the dataset (93/192 shards): 48%|βββββ | 153078/315947 [00:01<02:52, 944.80 examples/s]
Saving the dataset (94/192 shards): 49%|βββββ | 154724/315947 [00:01<02:50, 944.80 examples/s]
Saving the dataset (95/192 shards): 49%|βββββ | 156370/315947 [00:01<02:48, 944.80 examples/s]
Saving the dataset (96/192 shards): 50%|βββββ | 158016/315947 [00 |
| | 3: :01<02:47, 944.80 examples/s]
Saving the dataset (97/192 shards): 51%|βββββ | 159662/315947 [00:01<02:45, 944.80 examples/s]
Saving the dataset (98/192 shards): 52%|ββββββ | 162954/315947 [00:01<02:41, 944.80 examples/s]
Saving the dataset (99/192 shards): 52%|ββββββ | 162954/315947 [00:01<02:41, 944.80 examples/s]
Saving the dataset (100/192 shards): 52%|ββββββ | 164600/315947 [00:01<02:40, 944.80 examples/s]
Saving the dataset (101/192 shards): 53%|ββββββ | 167892/315947 [00:01<02:36, 944.80 examples/s]
Saving the dataset (102/192 shards): 53%|ββββββ | 167892/315947 [00:01<02:36, 944.80 examples/s]
Saving the dataset (103/192 shards): 54%|ββββββ | 169538/315947 [00:01<02:34, 944.80 examples/s]
Saving the dataset (104/192 shards): 54%|ββββββ | 171184/315947 [00:01<02:33, 944.80 examples/s]
Saving the dataset (105/192 shards): 55%|ββββββ | 174476/315947 [00:01<02:29, 9 |
| | 3: 44.80 examples/s]
Saving the dataset (106/192 shards): 56%|ββββββ | 176121/315947 [00:01<02:27, 944.80 examples/s]
Saving the dataset (107/192 shards): 56%|ββββββ | 176121/315947 [00:01<02:27, 944.80 examples/s]
Saving the dataset (108/192 shards): 56%|ββββββ | 177767/315947 [00:01<02:26, 944.80 examples/s]
Saving the dataset (109/192 shards): 57%|ββββββ | 179412/315947 [00:01<02:24, 944.80 examples/s]
Saving the dataset (110/192 shards): 57%|ββββββ | 181057/315947 [00:01<02:22, 944.80 examples/s]
Saving the dataset (111/192 shards): 58%|ββββββ | 182702/315947 [00:01<02:21, 944.80 examples/s]
Saving the dataset (112/192 shards): 58%|ββββββ | 184347/315947 [00:01<02:19, 944.80 examples/s]
Saving the dataset (113/192 shards): 59%|ββββββ | 185992/315947 [00:01<02:17, 944.80 examples/s]
Saving the dataset (114/192 shards): 59%|ββββββ | 187637/315947 [00:01<02:15, 944.80 e |
| | 3: xamples/s]
Saving the dataset (115/192 shards): 60%|ββββββ | 189282/315947 [00:01<02:14, 944.80 examples/s]
Saving the dataset (116/192 shards): 60%|ββββββ | 190927/315947 [00:01<02:12, 944.80 examples/s]
Saving the dataset (117/192 shards): 61%|ββββββ | 192572/315947 [00:01<02:10, 944.80 examples/s]
Saving the dataset (118/192 shards): 61%|βββββββ | 194217/315947 [00:01<02:08, 944.80 examples/s]
Saving the dataset (119/192 shards): 62%|βββββββ | 195862/315947 [00:01<02:07, 944.80 examples/s]
Saving the dataset (120/192 shards): 63%|βββββββ | 197507/315947 [00:01<02:05, 944.80 examples/s]
Saving the dataset (121/192 shards): 63%|βββββββ | 199152/315947 [00:01<02:03, 944.80 examples/s]
Saving the dataset (122/192 shards): 64%|βββββββ | 200797/315947 [00:01<02:01, 944.80 examples/s]
Saving the dataset (123/192 shards): 64%|βββββββ | 202442/315947 [00:01<02:00, 944 |
| | 3: .80 examples/s]
Saving the dataset (124/192 shards): 65%|βββββββ | 204087/315947 [00:01<01:58, 944.80 examples/s]
Saving the dataset (125/192 shards): 65%|βββββββ | 205732/315947 [00:01<01:56, 944.80 examples/s]
Saving the dataset (126/192 shards): 66%|βββββββ | 207377/315947 [00:01<01:54, 944.80 examples/s]
Saving the dataset (127/192 shards): 66%|βββββββ | 209022/315947 [00:01<01:53, 944.80 examples/s]
Saving the dataset (128/192 shards): 67%|βββββββ | 210667/315947 [00:01<01:51, 944.80 examples/s]
Saving the dataset (129/192 shards): 67%|βββββββ | 212312/315947 [00:01<01:49, 944.80 examples/s]
Saving the dataset (130/192 shards): 68%|βββββββ | 215602/315947 [00:01<01:46, 944.80 examples/s]
Saving the dataset (131/192 shards): 68%|βββββββ | 215602/315947 [00:01<01:46, 944.80 examples/s]
Saving the dataset (132/192 shards): 69%|βββββββ | 218892/315947 [00:01 |
| | 3: <01:42, 944.80 examples/s]
Saving the dataset (133/192 shards): 70%|βββββββ | 220537/315947 [00:01<01:40, 944.80 examples/s]
Saving the dataset (134/192 shards): 70%|βββββββ | 220537/315947 [00:01<01:40, 944.80 examples/s]
Saving the dataset (135/192 shards): 71%|βββββββ | 223827/315947 [00:01<01:37, 944.80 examples/s]
Saving the dataset (136/192 shards): 71%|ββββββββ | 225472/315947 [00:01<01:35, 944.80 examples/s]
Saving the dataset (137/192 shards): 71%|ββββββββ | 225472/315947 [00:01<01:35, 944.80 examples/s]
Saving the dataset (138/192 shards): 72%|ββββββββ | 227117/315947 [00:01<01:34, 944.80 examples/s]
Saving the dataset (139/192 shards): 72%|ββββββββ | 228762/315947 [00:01<01:32, 944.80 examples/s]
Saving the dataset (140/192 shards): 73%|ββββββββ | 230407/315947 [00:01<01:30, 944.80 examples/s]
Saving the dataset (141/192 shards): 73%|ββββββββ |
| | 3: | 232052/315947 [00:01<01:28, 944.80 examples/s]
Saving the dataset (142/192 shards): 74%|ββββββββ | 233697/315947 [00:01<01:27, 944.80 examples/s]
Saving the dataset (143/192 shards): 74%|ββββββββ | 235342/315947 [00:01<01:25, 944.80 examples/s]
Saving the dataset (144/192 shards): 75%|ββββββββ | 236987/315947 [00:01<01:23, 944.80 examples/s]
Saving the dataset (145/192 shards): 76%|ββββββββ | 238632/315947 [00:01<01:21, 944.80 examples/s]
Saving the dataset (146/192 shards): 76%|ββββββββ | 240277/315947 [00:01<01:20, 944.80 examples/s]
Saving the dataset (147/192 shards): 77%|ββββββββ | 241922/315947 [00:01<01:18, 944.80 examples/s]
Saving the dataset (148/192 shards): 78%|ββββββββ | 245212/315947 [00:01<01:14, 944.80 examples/s]
Saving the dataset (149/192 shards): 78%|ββββββββ | 245212/315947 [00:01<01:14, 944.80 examples/s]
Saving the dataset (150/192 shards): |
| | 3: 79%|ββββββββ | 248502/315947 [00:01<01:11, 944.80 examples/s]
Saving the dataset (151/192 shards): 79%|ββββββββ | 248502/315947 [00:01<01:11, 944.80 examples/s]
Saving the dataset (152/192 shards): 79%|ββββββββ | 250147/315947 [00:01<01:09, 944.80 examples/s]
Saving the dataset (153/192 shards): 80%|ββββββββ | 253437/315947 [00:01<01:06, 944.80 examples/s]
Saving the dataset (154/192 shards): 80%|ββββββββ | 253437/315947 [00:01<01:06, 944.80 examples/s]
Saving the dataset (155/192 shards): 81%|ββββββββ | 255082/315947 [00:01<01:04, 944.80 examples/s]
Saving the dataset (156/192 shards): 81%|βββββββββ | 256727/315947 [00:01<01:02, 944.80 examples/s]
Saving the dataset (157/192 shards): 82%|βββββββββ | 260017/315947 [00:01<00:59, 944.80 examples/s]
Saving the dataset (158/192 shards): 82%|βββββββββ | 260017/315947 [00:01<00:59, 944.80 examples/s]
Sav |
| | 3: ing the dataset (159/192 shards): 83%|βββββββββ | 263307/315947 [00:01<00:55, 944.80 examples/s]
Saving the dataset (160/192 shards): 83%|βββββββββ | 263307/315947 [00:01<00:55, 944.80 examples/s]
Saving the dataset (161/192 shards): 84%|βββββββββ | 264952/315947 [00:01<00:53, 944.80 examples/s]
Saving the dataset (162/192 shards): 84%|βββββββββ | 266597/315947 [00:01<00:52, 944.80 examples/s]
Saving the dataset (163/192 shards): 85%|βββββββββ | 268242/315947 [00:01<00:50, 944.80 examples/s]
Saving the dataset (164/192 shards): 86%|βββββββββ | 273177/315947 [00:01<00:45, 944.80 examples/s]
Saving the dataset (165/192 shards): 86%|βββββββββ | 273177/315947 [00:01<00:45, 944.80 examples/s]
Saving the dataset (166/192 shards): 87%|βββββββββ | 274822/315947 [00:01<00:43, 944.80 examples/s]
Saving the dataset (167/192 shards): 87%|βββββββββ | 274 |
| | 3: 822/315947 [00:01<00:43, 944.80 examples/s]
Saving the dataset (168/192 shards): 88%|βββββββββ | 276467/315947 [00:01<00:41, 944.80 examples/s]
Saving the dataset (169/192 shards): 88%|βββββββββ | 278112/315947 [00:01<00:40, 944.80 examples/s]
Saving the dataset (170/192 shards): 89%|βββββββββ | 281402/315947 [00:01<00:36, 944.80 examples/s]
Saving the dataset (171/192 shards): 89%|βββββββββ | 281402/315947 [00:01<00:36, 944.80 examples/s]
Saving the dataset (172/192 shards): 90%|βββββββββ | 283047/315947 [00:01<00:34, 944.80 examples/s]
Saving the dataset (173/192 shards): 91%|βββββββββ | 286337/315947 [00:01<00:31, 944.80 examples/s]
Saving the dataset (174/192 shards): 91%|βββββββββ | 286337/315947 [00:01<00:31, 944.80 examples/s]
Saving the dataset (175/192 shards): 91%|βββββββββ | 287982/315947 [00:01<00:29, 944.80 examples/s]
Saving the dataset (176/192 |
| | 3: shards): 92%|ββββββββββ| 289627/315947 [00:01<00:27, 944.80 examples/s]
Saving the dataset (177/192 shards): 92%|ββββββββββ| 291272/315947 [00:01<00:26, 944.80 examples/s]
Saving the dataset (178/192 shards): 93%|ββββββββββ| 292917/315947 [00:01<00:24, 944.80 examples/s]
Saving the dataset (179/192 shards): 93%|ββββββββββ| 294562/315947 [00:01<00:22, 944.80 examples/s]
Saving the dataset (180/192 shards): 94%|ββββββββββ| 296207/315947 [00:01<00:20, 944.80 examples/s]
Saving the dataset (181/192 shards): 94%|ββββββββββ| 297852/315947 [00:01<00:19, 944.80 examples/s]
Saving the dataset (182/192 shards): 95%|ββββββββββ| 299497/315947 [00:01<00:17, 944.80 examples/s]
Saving the dataset (183/192 shards): 95%|ββββββββββ| 301142/315947 [00:01<00:15, 944.80 examples/s]
Saving the dataset (184/192 shards): 96%|ββββββββββ| 302787/315 |
| | 3: 947 [00:01<00:13, 944.80 examples/s]
Saving the dataset (185/192 shards): 96%|ββββββββββ| 304432/315947 [00:01<00:12, 944.80 examples/s]
Saving the dataset (186/192 shards): 97%|ββββββββββ| 306077/315947 [00:01<00:10, 944.80 examples/s]
Saving the dataset (187/192 shards): 97%|ββββββββββ| 307722/315947 [00:01<00:08, 944.80 examples/s]
Saving the dataset (188/192 shards): 98%|ββββββββββ| 309367/315947 [00:01<00:06, 944.80 examples/s]
Saving the dataset (189/192 shards): 98%|ββββββββββ| 311012/315947 [00:01<00:05, 944.80 examples/s]
Saving the dataset (190/192 shards): 99%|ββββββββββ| 312657/315947 [00:01<00:03, 944.80 examples/s]
Saving the dataset (191/192 shards): 100%|ββββββββββ| 315947/315947 [00:01<00:00, 944.80 examples/s]
Saving the dataset (192/192 shards): 100%|ββββββββββ| 315947/315947 [00:01<00:00, 944.80 examples/s]
Saving the dataset |
| | 3: (192/192 shards): 100%|ββββββββββ| 315947/315947 [00:01<00:00, 166814.88 examples/s] |
| | 0: [2025-09-02 18:33:00,133] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:472] [PID:801836] [RANK:0] Loading prepared dataset from disk at /lustre/fsn1/projects/rech/dgo/udv55np/dataset_math/Qwen3-235B-A22B/0/df4594deeb64977bb186d11562029017...[39m |
| | 0: [2025-09-02 18:33:37,212] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:801836] [RANK:0] gather_len_batches: [25939, 25940, 25939, 25938, 25938, 25939, 25939, 25939, 25939, 25938, 25939, 25939, 25939, 25939, 25939, 25938][39m |
| | 0: [2025-09-02 18:33:37,417] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:495] [PID:801836] [RANK:0] sample_packing_eff_est across ranks: [0.9965550303459167, 0.9965550303459167, 0.9965550303459167, 0.9965550303459167, 0.9965166449546814, 0.9965550303459167, 0.996631920337677, 0.9965550303459167, 0.9965934753417969, 0.9965934753417969, 0.9965550303459167, 0.9965550303459167, 0.9965550303459167, 0.9965550303459167, 0.9965934753417969, 0.9965550303459167][39m |
| | 0: [2025-09-02 18:33:37,422] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:127] [PID:801836] [RANK:0] Maximum number of steps set at 1621[39m |
| | 0: [2025-09-02 18:33:37,749] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:110] [PID:801836] [RANK:0] Patched Trainer.evaluation_loop with nanmean loss calculation[39m |
| | 0: [2025-09-02 18:33:37,750] [INFO] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:164] [PID:801836] [RANK:0] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation[39m |
| | 0: [2025-09-02 18:33:50,442] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:801836] [RANK:0] Converting modules to torch.bfloat16[39m |
| | 0: [2025-09-02 18:34:01,367] [INFO] [axolotl.train.save_initial_configs:416] [PID:801836] [RANK:0] Pre-saving tokenizer to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0...[39m |
| | 0: [2025-09-02 18:34:01,793] [INFO] [axolotl.train.save_initial_configs:419] [PID:801836] [RANK:0] Pre-saving model config to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0...[39m |
| | 0: [2025-09-02 18:34:01,800] [INFO] [axolotl.train.execute_training:203] [PID:801836] [RANK:0] Starting trainer...[39m |
| | 0: [2025-09-02 18:36:07,470] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:436] [PID:801836] [RANK:0] gather_len_batches: [25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939, 25939][39m |
| | 0: Parameter Offload - Persistent parameters statistics: param_count = 141, numel = 144896 |
| | 0: {'loss': 0.3291, 'grad_norm': 0.3215025832173914, 'learning_rate': 1.0779999999999999e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.01} |
| | 0:
0%| | 0/1621 [00:00<?, ?it/s]
0%| | 1/1621 [03:15<87:56:27, 195.42s/it]
0%| | 2/1621 [03:18<36:53:28, 82.03s/it]
0%| | 3/1621 [03:18<20:12:28, 44.96s/it]
0%| | 4/1621 [03:19<12:22:43, 27.56s/it]
0%| | 5/1621 [03:20<8:03:17, 17.94s/it]
0%| | 6/1621 [03:21<5:26:43, 12.14s/it]
0%| | 7/1621 [03:22<3:47:13, 8.45s/it]
0%| | 8/1621 [03:23<2:42:07, 6.03s/it]
1%| | 9/1621 [03:24<1:58:26, 4.41s/it]
1%| | 10/1621 [03:24<1:29:01, 3.32s/it]
1%| | 10/1621 [03:25<1:29:01, 3.32s/it]
1%| | 11/1621 [03:25<1:09:22, 2.59s/it]
1%| | 12/1621 [03:26<55:27, 2.07s/it]
1%| | 13/1621 [03:27<47:03, 1.76s/it]
1%| | 14/1621 [03:28<39:48, 1.49s/it]
1%| | 15/1621 [03:29<34:37, 1.29s/it]
1%| | 16/1621 [03:30<31:04, 1.16s/it]
1%| | 17/1621 [03:31<28:54, 1.08s/it]
1%| |
| | 0: {'loss': 0.329, 'grad_norm': 0.2975306152955043, 'learning_rate': 1.498e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.01} |
| | 0: {'loss': 0.332, 'grad_norm': 0.2988797728667972, 'learning_rate': 1.918e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.02} |
| | 0: | 18/1621 [03:32<28:10, 1.05s/it]
1%| | 19/1621 [03:33<26:28, 1.01it/s]
1%| | 20/1621 [03:34<25:36, 1.04it/s]
1%| | 20/1621 [03:34<25:36, 1.04it/s]
1%|β | 21/1621 [03:34<24:55, 1.07it/s]
1%|β | 22/1621 [03:35<24:19, 1.10it/s]
1%|β | 23/1621 [03:36<23:58, 1.11it/s]
1%|β | 24/1621 [03:37<23:28, 1.13it/s]
2%|β | 25/1621 [03:38<23:19, 1.14it/s]
2%|β | 26/1621 [03:39<23:58, 1.11it/s]
2%|β | 27/1621 [03:40<23:39, 1.12it/s]
2%|β | 28/1621 [03:41<23:17, 1.14it/s]
2%|β | 29/1621 [03:41<23:13, 1.14it/s]
2%|β | 30/1621 [03:42<23:56, 1.11it/s]
2%|β | 30/1621 [03:42<23:56, 1.11it/s]
2%|β | 31/1621 [03:43<23:34, 1.12it/s]
2%|β | 32/1621 [03:44<23:23, 1.13it/s]
2%|β | 33/1621 [03:45<23:01, 1.15it/s] |
| | 0: {'loss': 0.3293, 'grad_norm': 0.31013818468501975, 'learning_rate': 2.338e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.02} |
| | 0:
2%|β | 34/1621 [03:46<22:52, 1.16it/s]
2%|β | 35/1621 [03:47<22:41, 1.16it/s]
2%|β | 36/1621 [03:47<22:37, 1.17it/s]
2%|β | 37/1621 [03:48<22:30, 1.17it/s]
2%|β | 38/1621 [03:49<23:02, 1.14it/s]
2%|β | 39/1621 [03:50<22:53, 1.15it/s]
2%|β | 40/1621 [03:51<22:52, 1.15it/s]
2%|β | 40/1621 [03:51<22:52, 1.15it/s]
3%|β | 41/1621 [03:52<23:20, 1.13it/s]
3%|β | 42/1621 [03:53<23:09, 1.14it/s]
3%|β | 43/1621 [03:54<23:12, 1.13it/s]
3%|β | 44/1621 [03:54<23:03, 1.14it/s]
3%|β | 45/1621 [03:55<22:56, 1.15it/s]
3%|β | 46/1621 [03:56<22:47, 1.15it/s]
3%|β | 47/1621 [03:57<22:51, 1.15it/s]
3%|β | 48/1621 [03:58<22:36, 1.16it/s]
3%|β | 49/1621 [03:59<22:55, 1.14it/s]
3%|β | 50/1621 [04:00<23:14, 1.13it/s]
|
| | 0: {'loss': 0.3215, 'grad_norm': 0.30101641303027743, 'learning_rate': 2.758e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.03} |
| | 0: {'loss': 0.3283, 'grad_norm': 0.28711923264082334, 'learning_rate': 3.1779999999999995e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.04} |
| | 0:
3%|β | 50/1621 [04:00<23:14, 1.13it/s]
3%|β | 51/1621 [04:01<23:02, 1.14it/s]
3%|β | 52/1621 [04:01<22:45, 1.15it/s]
3%|β | 53/1621 [04:02<22:47, 1.15it/s]
3%|β | 54/1621 [04:03<22:50, 1.14it/s]
3%|β | 55/1621 [04:04<22:59, 1.14it/s]
3%|β | 56/1621 [04:05<22:56, 1.14it/s]
4%|β | 57/1621 [04:06<22:42, 1.15it/s]
4%|β | 58/1621 [04:07<22:38, 1.15it/s]
4%|β | 59/1621 [04:08<22:32, 1.15it/s]
4%|β | 60/1621 [04:08<22:22, 1.16it/s]
4%|β | 60/1621 [04:08<22:22, 1.16it/s]
4%|β | 61/1621 [04:09<23:12, 1.12it/s]
4%|β | 62/1621 [04:10<22:57, 1.13it/s]
4%|β | 63/1621 [04:11<22:45, 1.14it/s]
4%|β | 64/1621 [04:12<22:35, 1.15it/s]
4%|β | 65/1621 [04:13<23:04, 1.12it/s]
4%|β | 66/1621 [04:14<23:12, 1.12it/s]
4%|β | 67/ |
| | 0: {'loss': 0.3187, 'grad_norm': 0.3018654823455164, 'learning_rate': 3.598e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.04} |
| | 0: {'loss': 0.3181, 'grad_norm': 0.2990949595796023, 'learning_rate': 4.0179999999999995e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.05} |
| | 0: 1621 [04:15<22:58, 1.13it/s]
4%|β | 68/1621 [04:16<23:10, 1.12it/s]
4%|β | 69/1621 [04:17<24:41, 1.05it/s]
4%|β | 70/1621 [04:18<23:52, 1.08it/s]
4%|β | 70/1621 [04:18<23:52, 1.08it/s]
4%|β | 71/1621 [04:18<23:48, 1.09it/s]
4%|β | 72/1621 [04:19<23:25, 1.10it/s]
5%|β | 73/1621 [04:20<23:07, 1.12it/s]
5%|β | 74/1621 [04:21<22:42, 1.14it/s]
5%|β | 75/1621 [04:22<22:30, 1.15it/s]
5%|β | 76/1621 [04:23<22:22, 1.15it/s]
5%|β | 77/1621 [04:24<22:16, 1.16it/s]
5%|β | 78/1621 [04:24<22:13, 1.16it/s]
5%|β | 79/1621 [04:25<22:14, 1.16it/s]
5%|β | 80/1621 [04:26<23:04, 1.11it/s]
5%|β | 80/1621 [04:26<23:04, 1.11it/s]
5%|β | 81/1621 [04:27<23:33, 1.09it/s]
5%|β | 82/1621 [04:28<23:02, 1.11it/s]
5%|β |
| | 0: {'loss': 0.318, 'grad_norm': 0.29019982626500473, 'learning_rate': 4.438e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.06} |
| | 0: | 83/1621 [04:29<22:51, 1.12it/s]
5%|β | 84/1621 [04:30<22:45, 1.13it/s]
5%|β | 85/1621 [04:31<22:46, 1.12it/s]
5%|β | 86/1621 [04:32<23:35, 1.08it/s]
5%|β | 87/1621 [04:33<23:29, 1.09it/s]
5%|β | 88/1621 [04:34<23:00, 1.11it/s]
5%|β | 89/1621 [04:34<22:40, 1.13it/s]
6%|β | 90/1621 [04:35<22:27, 1.14it/s]
6%|β | 90/1621 [04:35<22:27, 1.14it/s]
6%|β | 91/1621 [04:36<22:21, 1.14it/s]
6%|β | 92/1621 [04:37<22:09, 1.15it/s]
6%|β | 93/1621 [04:38<21:59, 1.16it/s]
6%|β | 94/1621 [04:39<21:56, 1.16it/s]
6%|β | 95/1621 [04:40<22:02, 1.15it/s]
6%|β | 96/1621 [04:40<21:59, 1.16it/s]
6%|β | 97/1621 [04:41<22:24, 1.13it/s]
6%|β | 98/1621 [04:42<22:17, 1.14it/s]
6%|β | 99/1621 [04:43<22:11, 1.14it/s]
6%|β | 100/1621 [04:44<21:54, 1. |
| | 0: {'loss': 0.3195, 'grad_norm': 0.30163478964963014, 'learning_rate': 4.858000000000001e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.06} |
| | 0: {'loss': 0.3174, 'grad_norm': 0.31809809662317745, 'learning_rate': 5.278e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.07} |
| | 0: 16it/s]
6%|β | 100/1621 [04:44<21:54, 1.16it/s]
6%|β | 101/1621 [04:45<22:56, 1.10it/s]
6%|β | 102/1621 [04:46<22:32, 1.12it/s]
6%|β | 103/1621 [04:47<22:12, 1.14it/s]
6%|β | 104/1621 [04:47<22:01, 1.15it/s]
6%|β | 105/1621 [04:48<21:49, 1.16it/s]
7%|β | 106/1621 [04:49<21:45, 1.16it/s]
7%|β | 107/1621 [04:50<21:36, 1.17it/s]
7%|β | 108/1621 [04:51<21:34, 1.17it/s]
7%|β | 109/1621 [04:52<22:47, 1.11it/s]
7%|β | 110/1621 [04:53<22:25, 1.12it/s]
7%|β | 110/1621 [04:53<22:25, 1.12it/s]
7%|β | 111/1621 [04:54<22:11, 1.13it/s]
7%|β | 112/1621 [04:54<22:01, 1.14it/s]
7%|β | 113/1621 [04:55<21:55, 1.15it/s]
7%|β | 114/1621 [04:56<22:06, 1.14it/s]
7%|β | 115/1621 [04:57<21:55, 1.15it/s]
7%|β |
| | 0: {'loss': 0.3304, 'grad_norm': 0.30854213193532004, 'learning_rate': 5.6979999999999995e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.07} |
| | 0: {'loss': 0.32, 'grad_norm': 0.3074992484407681, 'learning_rate': 6.118e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.08} |
| | 0: | 116/1621 [04:58<21:48, 1.15it/s]
7%|β | 117/1621 [04:59<21:59, 1.14it/s]
7%|β | 118/1621 [05:00<21:48, 1.15it/s]
7%|β | 119/1621 [05:01<22:59, 1.09it/s]
7%|β | 120/1621 [05:02<22:38, 1.10it/s]
7%|β | 120/1621 [05:02<22:38, 1.10it/s]
7%|β | 121/1621 [05:02<22:11, 1.13it/s]
8%|β | 122/1621 [05:03<21:59, 1.14it/s]
8%|β | 123/1621 [05:04<21:46, 1.15it/s]
8%|β | 124/1621 [05:05<21:40, 1.15it/s]
8%|β | 125/1621 [05:06<21:33, 1.16it/s]
8%|β | 126/1621 [05:07<21:32, 1.16it/s]
8%|β | 127/1621 [05:08<21:53, 1.14it/s]
8%|β | 128/1621 [05:09<21:47, 1.14it/s]
8%|β | 129/1621 [05:09<21:36, 1.15it/s]
8%|β | 130/1621 [05:10<21:35, 1.15it/s]
8%|β | 130/1621 [05:10<21:35, 1.15it/s]
8%|β | 131/1621 [0 |
| | 0: {'loss': 0.3198, 'grad_norm': 0.3132505507666493, 'learning_rate': 6.538e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.09} |
| | 0: 5:11<21:38, 1.15it/s]
8%|β | 132/1621 [05:12<21:26, 1.16it/s]
8%|β | 133/1621 [05:13<21:22, 1.16it/s]
8%|β | 134/1621 [05:14<21:17, 1.16it/s]
8%|β | 135/1621 [05:15<21:14, 1.17it/s]
8%|β | 136/1621 [05:15<21:12, 1.17it/s]
8%|β | 137/1621 [05:16<21:09, 1.17it/s]
9%|β | 138/1621 [05:17<21:05, 1.17it/s]
9%|β | 139/1621 [05:18<21:00, 1.18it/s]
9%|β | 140/1621 [05:19<21:00, 1.18it/s]
9%|β | 140/1621 [05:19<21:00, 1.18it/s]
9%|β | 141/1621 [05:20<21:16, 1.16it/s]
9%|β | 142/1621 [05:21<22:16, 1.11it/s]
9%|β | 143/1621 [05:22<21:49, 1.13it/s]
9%|β | 144/1621 [05:22<21:36, 1.14it/s]
9%|β | 145/1621 [05:23<21:27, 1.15it/s]
9%|β | 146/1621 [05:24<21:23, 1.15it/s]
9%|β | 147/1621 [05:25<21:28, 1.14it/s]
9%|β | 148/1621 [05:26<21:15, 1.15i |
| | 0: {'loss': 0.3218, 'grad_norm': 0.3246947676119254, 'learning_rate': 6.958e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.09} |
| | 0: {'loss': 0.3094, 'grad_norm': 0.28139341545138713, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.1} |
| | 0: t/s]
9%|β | 149/1621 [05:27<21:22, 1.15it/s]
9%|β | 150/1621 [05:28<21:14, 1.15it/s]
9%|β | 150/1621 [05:28<21:14, 1.15it/s]
9%|β | 151/1621 [05:28<21:08, 1.16it/s]
9%|β | 152/1621 [05:29<22:18, 1.10it/s]
9%|β | 153/1621 [05:30<22:14, 1.10it/s]
10%|β | 154/1621 [05:31<22:51, 1.07it/s]
10%|β | 155/1621 [05:32<22:11, 1.10it/s]
10%|β | 156/1621 [05:33<21:56, 1.11it/s]
10%|β | 157/1621 [05:34<21:34, 1.13it/s]
10%|β | 158/1621 [05:35<21:22, 1.14it/s]
10%|β | 159/1621 [05:36<21:13, 1.15it/s]
10%|β | 160/1621 [05:37<21:05, 1.15it/s]
10%|β | 160/1621 [05:37<21:05, 1.15it/s]
10%|β | 161/1621 [05:37<21:02, 1.16it/s]
10%|β | 162/1621 [05:38<20:52, 1.16it/s]
10%|β | 163/1621 [05:39<20:52, 1.16it/s]
10%|β |
| | 0: {'loss': 0.3101, 'grad_norm': 0.30704712278746976, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.1} |
| | 0: | 164/1621 [05:40<20:55, 1.16it/s]
10%|β | 165/1621 [05:41<21:43, 1.12it/s]
10%|β | 166/1621 [05:42<21:24, 1.13it/s]
10%|β | 167/1621 [05:43<21:11, 1.14it/s]
10%|β | 168/1621 [05:43<20:57, 1.16it/s]
10%|β | 169/1621 [05:44<20:50, 1.16it/s]
10%|β | 170/1621 [05:45<21:45, 1.11it/s]
10%|β | 170/1621 [05:45<21:45, 1.11it/s]
11%|β | 171/1621 [05:46<21:25, 1.13it/s]
11%|β | 172/1621 [05:47<21:08, 1.14it/s]
11%|β | 173/1621 [05:48<21:05, 1.14it/s]
11%|β | 174/1621 [05:49<21:42, 1.11it/s]
11%|β | 175/1621 [05:50<21:18, 1.13it/s]
11%|β | 176/1621 [05:51<21:08, 1.14it/s]
11%|β | 177/1621 [05:51<20:56, 1.15it/s]
11%|β | 178/1621 [05:52<21:13, 1.13it/s]
11%|β | 179/1621 [05:53<21:15, 1.13it/s]
11%|β | 180/1621 [05:54<21:03, 1.14it/s]
|
| | 0: {'loss': 0.3146, 'grad_norm': 0.3287206705570871, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.11} |
| | 0: {'loss': 0.3208, 'grad_norm': 0.3252333795917378, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.12} |
| | 0:
11%|β | 180/1621 [05:54<21:03, 1.14it/s]
11%|β | 181/1621 [05:55<22:21, 1.07it/s]
11%|β | 182/1621 [05:56<21:47, 1.10it/s]
11%|ββ | 183/1621 [05:57<21:57, 1.09it/s]
11%|ββ | 184/1621 [05:58<21:32, 1.11it/s]
11%|ββ | 185/1621 [05:59<21:13, 1.13it/s]
11%|ββ | 186/1621 [06:00<20:55, 1.14it/s]
12%|ββ | 187/1621 [06:00<20:47, 1.15it/s]
12%|ββ | 188/1621 [06:01<20:36, 1.16it/s]
12%|ββ | 189/1621 [06:02<20:33, 1.16it/s]
12%|ββ | 190/1621 [06:03<20:50, 1.14it/s]
12%|ββ | 190/1621 [06:03<20:50, 1.14it/s]
12%|ββ | 191/1621 [06:04<20:39, 1.15it/s]
12%|ββ | 192/1621 [06:05<20:42, 1.15it/s]
12%|ββ | 193/1621 [06:06<20:37, 1.15it/s]
12%|ββ | 194/1621 [06:06<20:52, 1.14it/s]
12%|ββ | 195/1621 [06:07<20:39, 1.15it/s]
12%|ββ | 19 |
| | 0: {'loss': 0.3109, 'grad_norm': 0.3167312472541867, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.12} |
| | 0: {'loss': 0.3198, 'grad_norm': 0.34127287905643694, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.13} |
| | 0: 6/1621 [06:08<20:36, 1.15it/s]
12%|ββ | 197/1621 [06:09<20:58, 1.13it/s]
12%|ββ | 198/1621 [06:10<21:28, 1.10it/s]
12%|ββ | 199/1621 [06:11<21:13, 1.12it/s]
12%|ββ | 200/1621 [06:12<21:13, 1.12it/s]
12%|ββ | 200/1621 [06:12<21:13, 1.12it/s]
12%|ββ | 201/1621 [06:13<20:58, 1.13it/s]
12%|ββ | 202/1621 [06:14<20:46, 1.14it/s]
13%|ββ | 203/1621 [06:14<21:03, 1.12it/s]
13%|ββ | 204/1621 [06:15<20:52, 1.13it/s]
13%|ββ | 205/1621 [06:16<20:53, 1.13it/s]
13%|ββ | 206/1621 [06:17<20:35, 1.15it/s]
13%|ββ | 207/1621 [06:18<20:29, 1.15it/s]
13%|ββ | 208/1621 [06:19<20:23, 1.15it/s]
13%|ββ | 209/1621 [06:20<20:15, 1.16it/s]
13%|ββ | 210/1621 [06:20<20:14, 1.16it/s]
13%|ββ | 210/1621 [06:20<20:14, 1.16it/s]
13%|β |
| | 0: {'loss': 0.3123, 'grad_norm': 0.3156643302673956, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.14} |
| | 0: β | 211/1621 [06:21<20:15, 1.16it/s]
13%|ββ | 212/1621 [06:22<20:11, 1.16it/s]
13%|ββ | 213/1621 [06:23<20:13, 1.16it/s]
13%|ββ | 214/1621 [06:24<20:14, 1.16it/s]
13%|ββ | 215/1621 [06:25<20:05, 1.17it/s]
13%|ββ | 216/1621 [06:26<20:06, 1.16it/s]
13%|ββ | 217/1621 [06:27<20:04, 1.17it/s]
13%|ββ | 218/1621 [06:27<20:03, 1.17it/s]
14%|ββ | 219/1621 [06:28<21:09, 1.10it/s]
14%|ββ | 220/1621 [06:29<20:51, 1.12it/s]
14%|ββ | 220/1621 [06:29<20:51, 1.12it/s]
14%|ββ | 221/1621 [06:30<20:35, 1.13it/s]
14%|ββ | 222/1621 [06:31<20:27, 1.14it/s]
14%|ββ | 223/1621 [06:32<21:34, 1.08it/s]
14%|ββ | 224/1621 [06:33<21:13, 1.10it/s]
14%|ββ | 225/1621 [06:34<20:46, 1.12it/s]
14%|ββ | 226/1621 [06:35<21:16, 1.09it/s]
14%|ββ | 227/1621 [06:36<20:50, |
| | 0: {'loss': 0.3132, 'grad_norm': 0.3383769344006066, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.14} |
| | 0: {'loss': 0.3167, 'grad_norm': 0.31419076691138864, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.15} |
| | 0: 1.12it/s]
14%|ββ | 228/1621 [06:36<20:38, 1.12it/s]
14%|ββ | 229/1621 [06:37<20:47, 1.12it/s]
14%|ββ | 230/1621 [06:38<21:32, 1.08it/s]
14%|ββ | 230/1621 [06:38<21:32, 1.08it/s]
14%|ββ | 231/1621 [06:39<21:01, 1.10it/s]
14%|ββ | 232/1621 [06:40<20:34, 1.13it/s]
14%|ββ | 233/1621 [06:41<20:23, 1.13it/s]
14%|ββ | 234/1621 [06:42<20:09, 1.15it/s]
14%|ββ | 235/1621 [06:43<20:04, 1.15it/s]
15%|ββ | 236/1621 [06:44<21:20, 1.08it/s]
15%|ββ | 237/1621 [06:45<20:51, 1.11it/s]
15%|ββ | 238/1621 [06:45<20:27, 1.13it/s]
15%|ββ | 239/1621 [06:46<20:24, 1.13it/s]
15%|ββ | 240/1621 [06:47<20:07, 1.14it/s]
15%|ββ | 240/1621 [06:47<20:07, 1.14it/s]
15%|ββ | 241/1621 [06:48<20:01, 1.15it/s]
15%|ββ | 242/1621 |
| | 0: {'loss': 0.3046, 'grad_norm': 0.2902941154120115, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.15} |
| | 0: [06:49<19:53, 1.16it/s]
15%|ββ | 243/1621 [06:50<20:02, 1.15it/s]
15%|ββ | 244/1621 [06:51<20:00, 1.15it/s]
15%|ββ | 245/1621 [06:51<19:54, 1.15it/s]
15%|ββ | 246/1621 [06:52<19:48, 1.16it/s]
15%|ββ | 247/1621 [06:53<19:42, 1.16it/s]
15%|ββ | 248/1621 [06:54<19:44, 1.16it/s]
15%|ββ | 249/1621 [06:55<19:47, 1.16it/s]
15%|ββ | 250/1621 [06:56<19:41, 1.16it/s]
15%|ββ | 250/1621 [06:56<19:41, 1.16it/s]
15%|ββ | 251/1621 [06:57<19:39, 1.16it/s]
16%|ββ | 252/1621 [06:57<19:30, 1.17it/s]
16%|ββ | 253/1621 [06:58<19:31, 1.17it/s]
16%|ββ | 254/1621 [06:59<19:26, 1.17it/s]
16%|ββ | 255/1621 [07:00<19:38, 1.16it/s]
16%|ββ | 256/1621 [07:01<19:30, 1.17it/s]
16%|ββ | 257/1621 [07:02<20:22, 1.12it/s]
16%|ββ | 258/1621 [07:03<20:24, 1.11it/s]
16%|ββ |
| | 0: {'loss': 0.3108, 'grad_norm': 0.33968570459412806, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.16} |
| | 0: {'loss': 0.3141, 'grad_norm': 0.3104904238715551, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.17} |
| | 0: | 259/1621 [07:04<20:18, 1.12it/s]
16%|ββ | 260/1621 [07:05<20:03, 1.13it/s]
16%|ββ | 260/1621 [07:05<20:03, 1.13it/s]
16%|ββ | 261/1621 [07:05<19:53, 1.14it/s]
16%|ββ | 262/1621 [07:06<19:44, 1.15it/s]
16%|ββ | 263/1621 [07:07<19:33, 1.16it/s]
16%|ββ | 264/1621 [07:08<19:29, 1.16it/s]
16%|ββ | 265/1621 [07:09<20:03, 1.13it/s]
16%|ββ | 266/1621 [07:10<19:52, 1.14it/s]
16%|ββ | 267/1621 [07:11<20:18, 1.11it/s]
17%|ββ | 268/1621 [07:12<20:00, 1.13it/s]
17%|ββ | 269/1621 [07:13<20:30, 1.10it/s]
17%|ββ | 270/1621 [07:13<20:13, 1.11it/s]
17%|ββ | 270/1621 [07:13<20:13, 1.11it/s]
17%|ββ | 271/1621 [07:14<19:53, 1.13it/s]
17%|ββ | 272/1621 [07:15<20:32, 1.09it/s]
17%|ββ | 273/1621 [07:16<20:58, 1.07it/ |
| | 0: {'loss': 0.3052, 'grad_norm': 0.29884303980196186, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.17} |
| | 0: s]
17%|ββ | 274/1621 [07:17<20:56, 1.07it/s]
17%|ββ | 275/1621 [07:18<20:25, 1.10it/s]
17%|ββ | 276/1621 [07:19<19:58, 1.12it/s]
17%|ββ | 277/1621 [07:20<19:40, 1.14it/s]
17%|ββ | 278/1621 [07:21<19:29, 1.15it/s]
17%|ββ | 279/1621 [07:21<19:39, 1.14it/s]
17%|ββ | 280/1621 [07:22<19:46, 1.13it/s]
17%|ββ | 280/1621 [07:22<19:46, 1.13it/s]
17%|ββ | 281/1621 [07:23<19:37, 1.14it/s]
17%|ββ | 282/1621 [07:24<19:21, 1.15it/s]
17%|ββ | 283/1621 [07:25<19:15, 1.16it/s]
18%|ββ | 284/1621 [07:26<19:13, 1.16it/s]
18%|ββ | 285/1621 [07:27<19:08, 1.16it/s]
18%|ββ | 286/1621 [07:27<19:00, 1.17it/s]
18%|ββ | 287/1621 [07:28<20:07, 1.10it/s]
18%|ββ | 288/1621 [07:29<19:48, 1.12it/s]
18%|ββ | 289/1621 [07:30<19:32, 1.14it/s]
18%|ββ | 290/1621 [07: |
| | 0: {'loss': 0.3096, 'grad_norm': 0.32007162423550123, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.18} |
| | 0: {'loss': 0.3081, 'grad_norm': 0.36455586630760095, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.19} |
| | 0: 31<19:16, 1.15it/s]
18%|ββ | 290/1621 [07:31<19:16, 1.15it/s]
18%|ββ | 291/1621 [07:32<19:11, 1.15it/s]
18%|ββ | 292/1621 [07:33<19:07, 1.16it/s]
18%|ββ | 293/1621 [07:34<19:04, 1.16it/s]
18%|ββ | 294/1621 [07:34<19:10, 1.15it/s]
18%|ββ | 295/1621 [07:35<19:11, 1.15it/s]
18%|ββ | 296/1621 [07:36<19:03, 1.16it/s]
18%|ββ | 297/1621 [07:37<19:06, 1.15it/s]
18%|ββ | 298/1621 [07:38<19:35, 1.13it/s]
18%|ββ | 299/1621 [07:39<19:20, 1.14it/s]
19%|ββ | 300/1621 [07:40<19:09, 1.15it/s]
19%|ββ | 300/1621 [07:40<19:09, 1.15it/s]
19%|ββ | 301/1621 [07:41<19:04, 1.15it/s]
19%|ββ | 302/1621 [07:41<18:57, 1.16it/s]
19%|ββ | 303/1621 [07:42<18:56, 1.16it/s]
19%|ββ | 304/1621 [07:43<19:05, 1.15it/s]
19%|ββ |
| | 0: {'loss': 0.309, 'grad_norm': 0.3243556638380136, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.19} |
| | 0: {'loss': 0.3023, 'grad_norm': 0.3265912530967524, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.2} |
| | 0: | 305/1621 [07:44<19:03, 1.15it/s]
19%|ββ | 306/1621 [07:45<18:58, 1.15it/s]
19%|ββ | 307/1621 [07:46<18:53, 1.16it/s]
19%|ββ | 308/1621 [07:47<19:06, 1.15it/s]
19%|ββ | 309/1621 [07:48<18:52, 1.16it/s]
19%|ββ | 310/1621 [07:48<18:46, 1.16it/s]
19%|ββ | 310/1621 [07:48<18:46, 1.16it/s]
19%|ββ | 311/1621 [07:49<19:02, 1.15it/s]
19%|ββ | 312/1621 [07:50<18:50, 1.16it/s]
19%|ββ | 313/1621 [07:51<18:59, 1.15it/s]
19%|ββ | 314/1621 [07:52<19:00, 1.15it/s]
19%|ββ | 315/1621 [07:53<19:26, 1.12it/s]
19%|ββ | 316/1621 [07:54<19:19, 1.13it/s]
20%|ββ | 317/1621 [07:55<19:05, 1.14it/s]
20%|ββ | 318/1621 [07:55<19:22, 1.12it/s]
20%|ββ | 319/1621 [07:56<19:11, 1.13it/s]
20%|ββ | 320/1621 [07:57<19:16, 1.12it/s]
20% |
| | 0: [2025-09-02 18:44:13,130] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0/checkpoint-325[39m |
| | 0: [2025-09-02 18:44:15,654] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: {'loss': 0.3072, 'grad_norm': 0.32927349219682783, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.2} |
| | 0: |ββ | 320/1621 [07:57<19:16, 1.12it/s]
20%|ββ | 321/1621 [07:58<19:02, 1.14it/s]
20%|ββ | 322/1621 [07:59<19:29, 1.11it/s]
20%|ββ | 323/1621 [08:00<19:23, 1.12it/s]
20%|ββ | 324/1621 [08:01<19:05, 1.13it/s]
20%|ββ | 325/1621 [08:02<18:55, 1.14it/s]
20%|ββ | 326/1621 [08:08<52:04, 2.41s/it]
20%|ββ | 327/1621 [08:09<42:02, 1.95s/it]
20%|ββ | 328/1621 [08:10<36:32, 1.70s/it]
20%|ββ | 329/1621 [08:10<31:12, 1.45s/it]
20%|ββ | 330/1621 [08:11<27:23, 1.27s/it]
20%|ββ | 330/1621 [08:11<27:23, 1.27s/it]
20%|ββ | 331/1621 [08:12<25:40, 1.19s/it]
20%|ββ | 332/1621 [08:13<23:27, 1.09s/it]
21%|ββ | 333/1621 [08:14<21:53, 1.02s/it]
21%|ββ | 334/1621 [08:15<21:08, 1.01it/s]
21%|ββ | 335/1621 [08:16<20:16, 1.06it/s]
21%|ββ | 336/1621 [08:17<19:4 |
| | 0: {'loss': 0.3022, 'grad_norm': 0.3046050968707454, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.21} |
| | 0: {'loss': 0.3001, 'grad_norm': 0.3260898742784537, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.22} |
| | 0: 6, 1.08it/s]
21%|ββ | 337/1621 [08:18<20:27, 1.05it/s]
21%|ββ | 338/1621 [08:19<19:51, 1.08it/s]
21%|ββ | 339/1621 [08:19<19:20, 1.10it/s]
21%|ββ | 340/1621 [08:20<19:01, 1.12it/s]
21%|ββ | 340/1621 [08:20<19:01, 1.12it/s]
21%|ββ | 341/1621 [08:21<18:53, 1.13it/s]
21%|ββ | 342/1621 [08:22<18:59, 1.12it/s]
21%|ββ | 343/1621 [08:23<18:46, 1.13it/s]
21%|ββ | 344/1621 [08:24<18:29, 1.15it/s]
21%|βββ | 345/1621 [08:25<18:32, 1.15it/s]
21%|βββ | 346/1621 [08:26<18:25, 1.15it/s]
21%|βββ | 347/1621 [08:26<18:39, 1.14it/s]
21%|βββ | 348/1621 [08:27<18:26, 1.15it/s]
22%|βββ | 349/1621 [08:28<19:38, 1.08it/s]
22%|βββ | 350/1621 [08:29<19:08, 1.11it/s]
22%|βββ | 350/1621 [08:29<19:08, 1.11it/s]
22%|ββοΏ½ |
| | 0: {'loss': 0.3054, 'grad_norm': 0.3054229700445813, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.22} |
| | 0: οΏ½οΏ½ | 351/1621 [08:30<18:50, 1.12it/s]
22%|βββ | 352/1621 [08:31<18:33, 1.14it/s]
22%|βββ | 353/1621 [08:32<18:27, 1.14it/s]
22%|βββ | 354/1621 [08:33<18:21, 1.15it/s]
22%|βββ | 355/1621 [08:33<18:12, 1.16it/s]
22%|βββ | 356/1621 [08:34<18:14, 1.16it/s]
22%|βββ | 357/1621 [08:35<18:10, 1.16it/s]
22%|βββ | 358/1621 [08:36<18:01, 1.17it/s]
22%|βββ | 359/1621 [08:37<17:57, 1.17it/s]
22%|βββ | 360/1621 [08:38<18:23, 1.14it/s]
22%|βββ | 360/1621 [08:38<18:23, 1.14it/s]
22%|βββ | 361/1621 [08:39<18:30, 1.13it/s]
22%|βββ | 362/1621 [08:40<18:19, 1.15it/s]
22%|βββ | 363/1621 [08:40<18:08, 1.16it/s]
22%|βββ | 364/1621 [08:41<18:06, 1.16it/s]
23%|βββ | 365/1621 [08:42<18:02, 1.16it/s]
23%|βββ | 366/1621 [08:43<17:59, 1.16it/s]
23%|βββ |
| | 0: {'loss': 0.3062, 'grad_norm': 0.3148357041588486, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.23} |
| | 0: {'loss': 0.307, 'grad_norm': 0.31739638733721465, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.23} |
| | 0: | 367/1621 [08:44<18:03, 1.16it/s]
23%|βββ | 368/1621 [08:45<18:00, 1.16it/s]
23%|βββ | 369/1621 [08:46<18:14, 1.14it/s]
23%|βββ | 370/1621 [08:46<18:03, 1.16it/s]
23%|βββ | 370/1621 [08:46<18:03, 1.16it/s]
23%|βββ | 371/1621 [08:47<17:58, 1.16it/s]
23%|βββ | 372/1621 [08:48<18:06, 1.15it/s]
23%|βββ | 373/1621 [08:49<17:58, 1.16it/s]
23%|βββ | 374/1621 [08:50<18:02, 1.15it/s]
23%|βββ | 375/1621 [08:51<17:52, 1.16it/s]
23%|βββ | 376/1621 [08:52<18:30, 1.12it/s]
23%|βββ | 377/1621 [08:53<18:33, 1.12it/s]
23%|βββ | 378/1621 [08:53<18:19, 1.13it/s]
23%|βββ | 379/1621 [08:54<18:23, 1.13it/s]
23%|βββ | 380/1621 [08:55<18:05, 1.14it/s]
23%|βββ | 380/1621 [08:55<18:05, 1.14it/s]
24%|βββ | |
| | 0: {'loss': 0.3102, 'grad_norm': 0.3375465905994638, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.24} |
| | 0: 381/1621 [08:56<17:57, 1.15it/s]
24%|βββ | 382/1621 [08:57<17:49, 1.16it/s]
24%|βββ | 383/1621 [08:58<17:41, 1.17it/s]
24%|βββ | 384/1621 [08:59<17:44, 1.16it/s]
24%|βββ | 385/1621 [08:59<17:36, 1.17it/s]
24%|βββ | 386/1621 [09:00<17:34, 1.17it/s]
24%|βββ | 387/1621 [09:01<17:33, 1.17it/s]
24%|βββ | 388/1621 [09:02<17:43, 1.16it/s]
24%|βββ | 389/1621 [09:03<17:55, 1.15it/s]
24%|βββ | 390/1621 [09:04<17:50, 1.15it/s]
24%|βββ | 390/1621 [09:04<17:50, 1.15it/s]
24%|βββ | 391/1621 [09:05<17:48, 1.15it/s]
24%|βββ | 392/1621 [09:06<17:46, 1.15it/s]
24%|βββ | 393/1621 [09:06<17:39, 1.16it/s]
24%|βββ | 394/1621 [09:07<18:14, 1.12it/s]
24%|βββ | 395/1621 [09:08<17:57, 1.14it/s]
24%|βββ | 396/1621 [09:09<17:43, 1.15it/s]
24%|βββ | 397 |
| | 0: {'loss': 0.3147, 'grad_norm': 0.34474370193771653, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.25} |
| | 0: {'loss': 0.3014, 'grad_norm': 0.3087293542715505, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.25} |
| | 0: /1621 [09:10<18:02, 1.13it/s]
25%|βββ | 398/1621 [09:11<17:52, 1.14it/s]
25%|βββ | 399/1621 [09:12<17:48, 1.14it/s]
25%|βββ | 400/1621 [09:13<17:39, 1.15it/s]
25%|βββ | 400/1621 [09:13<17:39, 1.15it/s]
25%|βββ | 401/1621 [09:13<17:42, 1.15it/s]
25%|βββ | 402/1621 [09:14<17:39, 1.15it/s]
25%|βββ | 403/1621 [09:15<17:32, 1.16it/s]
25%|βββ | 404/1621 [09:16<17:46, 1.14it/s]
25%|βββ | 405/1621 [09:17<17:43, 1.14it/s]
25%|βββ | 406/1621 [09:18<18:20, 1.10it/s]
25%|βββ | 407/1621 [09:19<18:00, 1.12it/s]
25%|βββ | 408/1621 [09:20<17:53, 1.13it/s]
25%|βββ | 409/1621 [09:21<18:27, 1.09it/s]
25%|βββ | 410/1621 [09:21<18:09, 1.11it/s]
25%|βββ | 410/1621 [09:21<18:09, 1.11it/s]
25%|βββ | 411/1621 [0 |
| | 0: {'loss': 0.3032, 'grad_norm': 0.3212422046835313, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.26} |
| | 0: 9:22<17:56, 1.12it/s]
25%|βββ | 412/1621 [09:23<17:48, 1.13it/s]
25%|βββ | 413/1621 [09:24<17:41, 1.14it/s]
26%|βββ | 414/1621 [09:25<17:29, 1.15it/s]
26%|βββ | 415/1621 [09:26<17:32, 1.15it/s]
26%|βββ | 416/1621 [09:27<18:28, 1.09it/s]
26%|βββ | 417/1621 [09:28<18:01, 1.11it/s]
26%|βββ | 418/1621 [09:29<17:48, 1.13it/s]
26%|βββ | 419/1621 [09:29<17:33, 1.14it/s]
26%|βββ | 420/1621 [09:30<17:30, 1.14it/s]
26%|βββ | 420/1621 [09:30<17:30, 1.14it/s]
26%|βββ | 421/1621 [09:31<17:23, 1.15it/s]
26%|βββ | 422/1621 [09:32<17:15, 1.16it/s]
26%|βββ | 423/1621 [09:33<17:11, 1.16it/s]
26%|βββ | 424/1621 [09:34<17:06, 1.17it/s]
26%|βββ | 425/1621 [09:35<17:34, 1.13it/s]
26%|βββ | 426/1621 [09:36<17:35, 1.13it/s]
26%|βββ | 427/1621 [09:3 |
| | 0: {'loss': 0.2992, 'grad_norm': 0.30649618094104975, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.27} |
| | 0: {'loss': 0.3028, 'grad_norm': 0.3402735147626528, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.27} |
| | 0: 7<18:16, 1.09it/s]
26%|βββ | 428/1621 [09:37<17:54, 1.11it/s]
26%|βββ | 429/1621 [09:38<17:34, 1.13it/s]
27%|βββ | 430/1621 [09:39<17:23, 1.14it/s]
27%|βββ | 430/1621 [09:39<17:23, 1.14it/s]
27%|βββ | 431/1621 [09:40<17:11, 1.15it/s]
27%|βββ | 432/1621 [09:41<17:06, 1.16it/s]
27%|βββ | 433/1621 [09:42<17:14, 1.15it/s]
27%|βββ | 434/1621 [09:43<17:36, 1.12it/s]
27%|βββ | 435/1621 [09:43<17:31, 1.13it/s]
27%|βββ | 436/1621 [09:44<17:19, 1.14it/s]
27%|βββ | 437/1621 [09:45<17:16, 1.14it/s]
27%|βββ | 438/1621 [09:46<17:21, 1.14it/s]
27%|βββ | 439/1621 [09:47<17:29, 1.13it/s]
27%|βββ | 440/1621 [09:48<17:14, 1.14it/s]
27%|βββ | 440/1621 [09:48<17:14, 1.14it/s]
27%|βββ | 441/1621 [09:49<17:08, |
| | 0: {'loss': 0.3009, 'grad_norm': 0.30752642276161624, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.28} |
| | 0: 1.15it/s]
27%|βββ | 442/1621 [09:50<17:03, 1.15it/s]
27%|βββ | 443/1621 [09:50<16:57, 1.16it/s]
27%|βββ | 444/1621 [09:51<16:55, 1.16it/s]
27%|βββ | 445/1621 [09:52<16:48, 1.17it/s]
28%|βββ | 446/1621 [09:53<16:54, 1.16it/s]
28%|βββ | 447/1621 [09:54<16:47, 1.16it/s]
28%|βββ | 448/1621 [09:55<16:52, 1.16it/s]
28%|βββ | 449/1621 [09:56<16:46, 1.16it/s]
28%|βββ | 450/1621 [09:56<17:01, 1.15it/s]
28%|βββ | 450/1621 [09:56<17:01, 1.15it/s]
28%|βββ | 451/1621 [09:57<16:53, 1.15it/s]
28%|βββ | 452/1621 [09:58<16:46, 1.16it/s]
28%|βββ | 453/1621 [09:59<16:50, 1.16it/s]
28%|βββ | 454/1621 [10:00<17:19, 1.12it/s]
28%|βββ | 455/1621 [10:01<17:11, 1.13it/s]
28%|βββ | 456/1621 [10:02<16:57, 1.15it/s]
28%|βββ | 457/1621 [10:03<16:49, 1 |
| | 0: {'loss': 0.2975, 'grad_norm': 0.2988914632135753, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.28} |
| | 0: {'loss': 0.3047, 'grad_norm': 0.3029109045844161, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.29} |
| | 0: .15it/s]
28%|βββ | 458/1621 [10:03<16:52, 1.15it/s]
28%|βββ | 459/1621 [10:04<16:46, 1.15it/s]
28%|βββ | 460/1621 [10:05<16:49, 1.15it/s]
28%|βββ | 460/1621 [10:05<16:49, 1.15it/s]
28%|βββ | 461/1621 [10:06<17:07, 1.13it/s]
29%|βββ | 462/1621 [10:07<17:13, 1.12it/s]
29%|βββ | 463/1621 [10:08<17:41, 1.09it/s]
29%|βββ | 464/1621 [10:09<17:19, 1.11it/s]
29%|βββ | 465/1621 [10:10<17:01, 1.13it/s]
29%|βββ | 466/1621 [10:11<18:08, 1.06it/s]
29%|βββ | 467/1621 [10:12<17:39, 1.09it/s]
29%|βββ | 468/1621 [10:13<17:26, 1.10it/s]
29%|βββ | 469/1621 [10:13<17:14, 1.11it/s]
29%|βββ | 470/1621 [10:14<16:54, 1.13it/s]
29%|βββ | 470/1621 [10:14<16:54, 1.13it/s]
29%|βββ | 471/1621 [10:15<16:47, 1.14it/s] |
| | 0: {'loss': 0.304, 'grad_norm': 0.30933769716252907, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.3} |
| | 0:
29%|βββ | 472/1621 [10:16<16:55, 1.13it/s]
29%|βββ | 473/1621 [10:17<17:04, 1.12it/s]
29%|βββ | 474/1621 [10:18<16:49, 1.14it/s]
29%|βββ | 475/1621 [10:19<17:08, 1.11it/s]
29%|βββ | 476/1621 [10:20<17:00, 1.12it/s]
29%|βββ | 477/1621 [10:20<16:47, 1.14it/s]
29%|βββ | 478/1621 [10:21<16:36, 1.15it/s]
30%|βββ | 479/1621 [10:22<16:41, 1.14it/s]
30%|βββ | 480/1621 [10:23<16:35, 1.15it/s]
30%|βββ | 480/1621 [10:23<16:35, 1.15it/s]
30%|βββ | 481/1621 [10:24<17:18, 1.10it/s]
30%|βββ | 482/1621 [10:25<16:58, 1.12it/s]
30%|βββ | 483/1621 [10:26<16:58, 1.12it/s]
30%|βββ | 484/1621 [10:27<16:43, 1.13it/s]
30%|βββ | 485/1621 [10:28<17:17, 1.09it/s]
30%|βββ | 486/1621 [10:28<16:54, 1.12it/s]
30%|βββ | 487/1621 [10:29<17:25, 1.08it/s]
3 |
| | 0: {'loss': 0.304, 'grad_norm': 0.3140435122849643, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.3} |
| | 0: {'loss': 0.3019, 'grad_norm': 0.299751756066974, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.31} |
| | 0: 0%|βββ | 488/1621 [10:30<17:02, 1.11it/s]
30%|βββ | 489/1621 [10:31<16:43, 1.13it/s]
30%|βββ | 490/1621 [10:32<16:30, 1.14it/s]
30%|βββ | 490/1621 [10:32<16:30, 1.14it/s]
30%|βββ | 491/1621 [10:33<16:56, 1.11it/s]
30%|βββ | 492/1621 [10:34<16:38, 1.13it/s]
30%|βββ | 493/1621 [10:35<16:24, 1.15it/s]
30%|βββ | 494/1621 [10:36<16:32, 1.14it/s]
31%|βββ | 495/1621 [10:36<16:26, 1.14it/s]
31%|βββ | 496/1621 [10:37<16:31, 1.13it/s]
31%|βββ | 497/1621 [10:38<17:00, 1.10it/s]
31%|βββ | 498/1621 [10:39<16:40, 1.12it/s]
31%|βββ | 499/1621 [10:40<16:25, 1.14it/s]
31%|βββ | 500/1621 [10:41<16:18, 1.15it/s]
31%|βββ | 500/1621 [10:41<16:18, 1.15it/s]
31%|βββ | 501/1621 [10:42<16:15, 1.15it/s]
31%|βοΏ½ |
| | 0: {'loss': 0.3022, 'grad_norm': 0.31890034279759916, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.31} |
| | 0: οΏ½β | 502/1621 [10:43<16:09, 1.15it/s]
31%|βββ | 503/1621 [10:43<16:21, 1.14it/s]
31%|βββ | 504/1621 [10:44<16:09, 1.15it/s]
31%|βββ | 505/1621 [10:45<16:05, 1.16it/s]
31%|βββ | 506/1621 [10:46<16:57, 1.10it/s]
31%|ββββ | 507/1621 [10:47<16:51, 1.10it/s]
31%|ββββ | 508/1621 [10:48<16:33, 1.12it/s]
31%|ββββ | 509/1621 [10:49<16:17, 1.14it/s]
31%|ββββ | 510/1621 [10:50<16:03, 1.15it/s]
31%|ββββ | 510/1621 [10:50<16:03, 1.15it/s]
32%|ββββ | 511/1621 [10:51<16:37, 1.11it/s]
32%|ββββ | 512/1621 [10:52<16:27, 1.12it/s]
32%|ββββ | 513/1621 [10:52<16:36, 1.11it/s]
32%|ββββ | 514/1621 [10:53<16:20, 1.13it/s]
32%|ββββ | 515/1621 [10:54<16:25, 1.12it/s]
32%|ββββ | 516/1621 [10:55<16:12, 1.14it/s]
32%|ββββ | 517/1621 [10:56<16:00, |
| | 0: {'loss': 0.3021, 'grad_norm': 0.31952587601461685, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.32} |
| | 0: {'loss': 0.2955, 'grad_norm': 0.3094649616845461, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.33} |
| | 0: 1.15it/s]
32%|ββββ | 518/1621 [10:57<16:01, 1.15it/s]
32%|ββββ | 519/1621 [10:58<17:06, 1.07it/s]
32%|ββββ | 520/1621 [10:59<16:49, 1.09it/s]
32%|ββββ | 520/1621 [10:59<16:49, 1.09it/s]
32%|ββββ | 521/1621 [11:00<16:36, 1.10it/s]
32%|ββββ | 522/1621 [11:01<16:48, 1.09it/s]
32%|ββββ | 523/1621 [11:01<16:30, 1.11it/s]
32%|ββββ | 524/1621 [11:02<17:10, 1.06it/s]
32%|ββββ | 525/1621 [11:03<16:50, 1.08it/s]
32%|ββββ | 526/1621 [11:04<16:42, 1.09it/s]
33%|ββββ | 527/1621 [11:05<16:49, 1.08it/s]
33%|ββββ | 528/1621 [11:06<16:41, 1.09it/s]
33%|ββββ | 529/1621 [11:07<16:28, 1.10it/s]
33%|ββββ | 530/1621 [11:08<16:13, 1.12it/s]
33%|ββββ | 530/1621 [11:08<16:13, 1.12it/s]
33%|ββββ | |
| | 0: {'loss': 0.2986, 'grad_norm': 0.3112632339780639, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.33} |
| | 0: 531/1621 [11:09<16:42, 1.09it/s]
33%|ββββ | 532/1621 [11:10<16:54, 1.07it/s]
33%|ββββ | 533/1621 [11:11<16:29, 1.10it/s]
33%|ββββ | 534/1621 [11:11<16:16, 1.11it/s]
33%|ββββ | 535/1621 [11:12<16:01, 1.13it/s]
33%|ββββ | 536/1621 [11:13<16:04, 1.13it/s]
33%|ββββ | 537/1621 [11:14<15:51, 1.14it/s]
33%|ββββ | 538/1621 [11:15<15:40, 1.15it/s]
33%|ββββ | 539/1621 [11:16<15:32, 1.16it/s]
33%|ββββ | 540/1621 [11:17<15:36, 1.15it/s]
33%|ββββ | 540/1621 [11:17<15:36, 1.15it/s]
33%|ββββ | 541/1621 [11:17<15:29, 1.16it/s]
33%|ββββ | 542/1621 [11:18<15:51, 1.13it/s]
33%|ββββ | 543/1621 [11:19<15:49, 1.14it/s]
34%|ββββ | 544/1621 [11:20<15:40, 1.15it/s]
34%|ββββ | 545/1621 [11:21<16:22, 1.09it/s]
34%|ββββ | 546/1621 [11:22<16:06, 1.1 |
| | 0: {'loss': 0.306, 'grad_norm': 0.32623249352975026, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.34} |
| | 0: {'loss': 0.303, 'grad_norm': 0.3212950878739604, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.35} |
| | 0: 1it/s]
34%|ββββ | 547/1621 [11:23<15:54, 1.13it/s]
34%|ββββ | 548/1621 [11:24<15:43, 1.14it/s]
34%|ββββ | 549/1621 [11:25<15:45, 1.13it/s]
34%|ββββ | 550/1621 [11:25<15:33, 1.15it/s]
34%|ββββ | 550/1621 [11:25<15:33, 1.15it/s]
34%|ββββ | 551/1621 [11:26<15:27, 1.15it/s]
34%|ββββ | 552/1621 [11:27<15:29, 1.15it/s]
34%|ββββ | 553/1621 [11:28<15:23, 1.16it/s]
34%|ββββ | 554/1621 [11:29<15:26, 1.15it/s]
34%|ββββ | 555/1621 [11:30<15:21, 1.16it/s]
34%|ββββ | 556/1621 [11:31<16:22, 1.08it/s]
34%|ββββ | 557/1621 [11:32<16:02, 1.11it/s]
34%|ββββ | 558/1621 [11:33<15:53, 1.12it/s]
34%|ββββ | 559/1621 [11:34<16:40, 1.06it/s]
35%|ββββ | 560/1621 [11:34<16:09, 1.09it/s]
35%|ββββ | 560 |
| | 0: {'loss': 0.2953, 'grad_norm': 0.2767562705424428, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.35} |
| | 0: /1621 [11:35<16:09, 1.09it/s]
35%|ββββ | 561/1621 [11:35<15:52, 1.11it/s]
35%|ββββ | 562/1621 [11:36<15:35, 1.13it/s]
35%|ββββ | 563/1621 [11:37<15:25, 1.14it/s]
35%|ββββ | 564/1621 [11:38<15:17, 1.15it/s]
35%|ββββ | 565/1621 [11:39<15:18, 1.15it/s]
35%|ββββ | 566/1621 [11:40<15:10, 1.16it/s]
35%|ββββ | 567/1621 [11:40<15:06, 1.16it/s]
35%|ββββ | 568/1621 [11:41<15:03, 1.16it/s]
35%|ββββ | 569/1621 [11:42<15:55, 1.10it/s]
35%|ββββ | 570/1621 [11:43<15:38, 1.12it/s]
35%|ββββ | 570/1621 [11:43<15:38, 1.12it/s]
35%|ββββ | 571/1621 [11:44<15:26, 1.13it/s]
35%|ββββ | 572/1621 [11:45<15:14, 1.15it/s]
35%|ββββ | 573/1621 [11:46<15:08, 1.15it/s]
35%|ββββ | 574/1621 [11:47<15:01, 1.16it/s]
35%|ββββ | 575/1621 [11:47<14:59, 1.16it/ |
| | 0: {'loss': 0.2946, 'grad_norm': 0.285031036391949, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.36} |
| | 0: s]
36%|ββββ | 576/1621 [11:48<15:28, 1.13it/s]
36%|ββββ | 577/1621 [11:49<15:16, 1.14it/s]
36%|ββββ | 578/1621 [11:50<15:21, 1.13it/s]
36%|ββββ | 579/1621 [11:51<15:13, 1.14it/s]
36%|ββββ | 580/1621 [11:52<15:05, 1.15it/s]
36%|ββββ | 580/1621 [11:52<15:05, 1.15it/s]
36%|ββββ | 581/1621 [11:53<15:31, 1.12it/s]
36%|ββββ | 582/1621 [11:54<15:17, 1.13it/s]
36%|ββββ | 583/1621 [11:55<16:08, 1.07it/s]
36%|ββββ | 584/1621 [11:56<15:40, 1.10it/s]
36%|ββββ | 585/1621 [11:56<15:23, 1.12it/s]
36%|ββββ | 586/1621 [11:57<15:09, 1.14it/s]
36%|ββββ | 587/1621 [11:58<15:01, 1.15it/s]
36%|ββββ | 588/1621 [11:59<14:52, 1.16it/s]
36%|ββββ | 589/1621 [12:00<14:53, 1.15it/s]
36%|ββββ | 590/1621 [12:01<15:04, 1.14it/s]
|
| | 0: {'loss': 0.2978, 'grad_norm': 0.3116693864450672, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.36} |
| | 0: {'loss': 0.2942, 'grad_norm': 0.29553887428879677, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.37} |
| | 0:
36%|ββββ | 590/1621 [12:01<15:04, 1.14it/s]
36%|ββββ | 591/1621 [12:02<14:56, 1.15it/s]
37%|ββββ | 592/1621 [12:03<14:55, 1.15it/s]
37%|ββββ | 593/1621 [12:03<14:58, 1.14it/s]
37%|ββββ | 594/1621 [12:04<14:50, 1.15it/s]
37%|ββββ | 595/1621 [12:05<15:31, 1.10it/s]
37%|ββββ | 596/1621 [12:06<15:20, 1.11it/s]
37%|ββββ | 597/1621 [12:07<15:14, 1.12it/s]
37%|ββββ | 598/1621 [12:08<15:00, 1.14it/s]
37%|ββββ | 599/1621 [12:09<15:04, 1.13it/s]
37%|ββββ | 600/1621 [12:10<15:11, 1.12it/s]
37%|ββββ | 600/1621 [12:10<15:11, 1.12it/s]
37%|ββββ | 601/1621 [12:11<14:59, 1.13it/s]
37%|ββββ | 602/1621 [12:11<14:57, 1.14it/s]
37%|ββββ | 603/1621 [12:12<14:57, 1.13it/s]
37%|ββββ | 604/1621 [12:13<15:10, 1.12it/s]
|
| | 0: {'loss': 0.2993, 'grad_norm': 0.32060551637366747, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.38} |
| | 0: 37%|ββββ | 605/1621 [12:14<14:53, 1.14it/s]
37%|ββββ | 606/1621 [12:15<14:59, 1.13it/s]
37%|ββββ | 607/1621 [12:16<14:53, 1.13it/s]
38%|ββββ | 608/1621 [12:17<15:27, 1.09it/s]
38%|ββββ | 609/1621 [12:18<15:33, 1.08it/s]
38%|ββββ | 610/1621 [12:19<15:09, 1.11it/s]
38%|ββββ | 610/1621 [12:19<15:09, 1.11it/s]
38%|ββββ | 611/1621 [12:20<15:39, 1.08it/s]
38%|ββββ | 612/1621 [12:20<15:20, 1.10it/s]
38%|ββββ | 613/1621 [12:21<15:12, 1.11it/s]
38%|ββββ | 614/1621 [12:22<14:57, 1.12it/s]
38%|ββββ | 615/1621 [12:23<15:13, 1.10it/s]
38%|ββββ | 616/1621 [12:24<16:41, 1.00it/s]
38%|ββββ | 617/1621 [12:25<15:55, 1.05it/s]
38%|ββββ | 618/1621 [12:26<15:27, 1.08it/s]
38%|ββββ | 619/1621 [12:27<15:43, 1.06it/s]
38%|ββββ | 620/ |
| | 0: {'loss': 0.2881, 'grad_norm': 0.3005663461419241, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.38} |
| | 0: {'loss': 0.2984, 'grad_norm': 0.30549174693463893, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.39} |
| | 0: 1621 [12:28<15:19, 1.09it/s]
38%|ββββ | 620/1621 [12:28<15:19, 1.09it/s]
38%|ββββ | 621/1621 [12:29<15:01, 1.11it/s]
38%|ββββ | 622/1621 [12:30<14:45, 1.13it/s]
38%|ββββ | 623/1621 [12:30<14:32, 1.14it/s]
38%|ββββ | 624/1621 [12:31<14:27, 1.15it/s]
39%|ββββ | 625/1621 [12:32<14:20, 1.16it/s]
39%|ββββ | 626/1621 [12:33<15:02, 1.10it/s]
39%|ββββ | 627/1621 [12:34<14:42, 1.13it/s]
39%|ββββ | 628/1621 [12:35<15:30, 1.07it/s]
39%|ββββ | 629/1621 [12:36<15:40, 1.05it/s]
39%|ββββ | 630/1621 [12:37<15:18, 1.08it/s]
39%|ββββ | 630/1621 [12:37<15:18, 1.08it/s]
39%|ββββ | 631/1621 [12:38<15:01, 1.10it/s]
39%|ββββ | 632/1621 [12:39<14:59, 1.10it/s]
39%|ββββ | 633/1621 [12:40<14:41, 1.12it/s]
39%| |
| | 0: {'loss': 0.2927, 'grad_norm': 0.3779583398963665, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.39} |
| | 0: ββββ | 634/1621 [12:41<15:25, 1.07it/s]
39%|ββββ | 635/1621 [12:42<15:20, 1.07it/s]
39%|ββββ | 636/1621 [12:42<14:59, 1.10it/s]
39%|ββββ | 637/1621 [12:43<14:49, 1.11it/s]
39%|ββββ | 638/1621 [12:44<14:36, 1.12it/s]
39%|ββββ | 639/1621 [12:45<14:31, 1.13it/s]
39%|ββββ | 640/1621 [12:46<14:52, 1.10it/s]
39%|ββββ | 640/1621 [12:46<14:52, 1.10it/s]
40%|ββββ | 641/1621 [12:47<15:10, 1.08it/s]
40%|ββββ | 642/1621 [12:48<15:05, 1.08it/s]
40%|ββββ | 643/1621 [12:49<14:44, 1.11it/s]
40%|ββββ | 644/1621 [12:50<14:25, 1.13it/s]
40%|ββββ | 645/1621 [12:50<14:16, 1.14it/s]
40%|ββββ | 646/1621 [12:51<14:13, 1.14it/s]
40%|ββββ | 647/1621 [12:52<14:06, 1.15it/s]
40%|ββββ | 648/1621 [12:53<14:03, 1.15it/s]
40%|ββββ | 649/1621 |
| | 0: {'loss': 0.2996, 'grad_norm': 0.29989530504483425, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.4} |
| | 0: [2025-09-02 18:49:05,656] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0/checkpoint-650[39m |
| | 0: [2025-09-02 18:49:08,181] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: {'loss': 0.2977, 'grad_norm': 0.30302027522488867, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.41} |
| | 0: [12:54<13:56, 1.16it/s]
40%|ββββ | 650/1621 [12:55<13:52, 1.17it/s]
40%|ββββ | 650/1621 [12:55<13:52, 1.17it/s]
40%|ββββ | 651/1621 [13:00<35:49, 2.22s/it]
40%|ββββ | 652/1621 [13:01<29:12, 1.81s/it]
40%|ββββ | 653/1621 [13:02<24:30, 1.52s/it]
40%|ββββ | 654/1621 [13:03<21:21, 1.33s/it]
40%|ββββ | 655/1621 [13:04<19:03, 1.18s/it]
40%|ββββ | 656/1621 [13:04<17:25, 1.08s/it]
41%|ββββ | 657/1621 [13:05<16:16, 1.01s/it]
41%|ββββ | 658/1621 [13:06<15:29, 1.04it/s]
41%|ββββ | 659/1621 [13:07<15:09, 1.06it/s]
41%|ββββ | 660/1621 [13:08<14:41, 1.09it/s]
41%|ββββ | 660/1621 [13:08<14:41, 1.09it/s]
41%|ββββ | 661/1621 [13:09<14:27, 1.11it/s]
41%|ββββ | 662/1621 [13:10<14:10, 1.13it/s]
41%|βοΏ½ |
| | 0: {'loss': 0.2976, 'grad_norm': 0.2892331257316936, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.41} |
| | 0: οΏ½οΏ½ββ | 663/1621 [13:10<14:01, 1.14it/s]
41%|ββββ | 664/1621 [13:11<14:20, 1.11it/s]
41%|ββββ | 665/1621 [13:12<14:03, 1.13it/s]
41%|ββββ | 666/1621 [13:13<14:12, 1.12it/s]
41%|ββββ | 667/1621 [13:14<14:02, 1.13it/s]
41%|ββββ | 668/1621 [13:15<13:59, 1.14it/s]
41%|βββββ | 669/1621 [13:16<13:52, 1.14it/s]
41%|βββββ | 670/1621 [13:17<13:46, 1.15it/s]
41%|βββββ | 670/1621 [13:17<13:46, 1.15it/s]
41%|βββββ | 671/1621 [13:18<14:02, 1.13it/s]
41%|βββββ | 672/1621 [13:18<13:54, 1.14it/s]
42%|βββββ | 673/1621 [13:19<13:46, 1.15it/s]
42%|βββββ | 674/1621 [13:20<13:38, 1.16it/s]
42%|βββββ | 675/1621 [13:21<13:34, 1.16it/s]
42%|βββββ | 676/1621 [13:22<13:38, 1.15it/s]
42%|βββββ | 677/1621 [13:23<13:34, 1.16it/s]
42%|ββββ |
| | 0: {'loss': 0.2914, 'grad_norm': 0.30196767512453576, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.42} |
| | 0: {'loss': 0.2932, 'grad_norm': 0.29700083045306425, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.43} |
| | 0: β | 678/1621 [13:24<14:00, 1.12it/s]
42%|βββββ | 679/1621 [13:25<14:12, 1.10it/s]
42%|βββββ | 680/1621 [13:25<13:56, 1.13it/s]
42%|βββββ | 680/1621 [13:25<13:56, 1.13it/s]
42%|βββββ | 681/1621 [13:26<13:44, 1.14it/s]
42%|βββββ | 682/1621 [13:27<13:34, 1.15it/s]
42%|βββββ | 683/1621 [13:28<13:33, 1.15it/s]
42%|βββββ | 684/1621 [13:29<13:38, 1.14it/s]
42%|βββββ | 685/1621 [13:30<13:34, 1.15it/s]
42%|βββββ | 686/1621 [13:31<13:52, 1.12it/s]
42%|βββββ | 687/1621 [13:32<13:38, 1.14it/s]
42%|βββββ | 688/1621 [13:33<14:33, 1.07it/s]
43%|βββββ | 689/1621 [13:33<14:07, 1.10it/s]
43%|βββββ | 690/1621 [13:34<14:09, 1.10it/s]
43%|βββββ | 690/1621 [13:34<14:09, 1.10it/s]
43%|βββββ |
| | 0: {'loss': 0.3008, 'grad_norm': 0.2990684455792348, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.43} |
| | 0: | 691/1621 [13:35<13:51, 1.12it/s]
43%|βββββ | 692/1621 [13:36<13:45, 1.13it/s]
43%|βββββ | 693/1621 [13:37<13:32, 1.14it/s]
43%|βββββ | 694/1621 [13:38<13:25, 1.15it/s]
43%|βββββ | 695/1621 [13:39<13:19, 1.16it/s]
43%|βββββ | 696/1621 [13:39<13:12, 1.17it/s]
43%|βββββ | 697/1621 [13:40<13:16, 1.16it/s]
43%|βββββ | 698/1621 [13:41<13:16, 1.16it/s]
43%|βββββ | 699/1621 [13:42<13:13, 1.16it/s]
43%|βββββ | 700/1621 [13:43<13:10, 1.17it/s]
43%|βββββ | 700/1621 [13:43<13:10, 1.17it/s]
43%|βββββ | 701/1621 [13:44<13:20, 1.15it/s]
43%|βββββ | 702/1621 [13:45<13:15, 1.16it/s]
43%|βββββ | 703/1621 [13:46<14:01, 1.09it/s]
43%|βββββ | 704/1621 [13:47<13:39, 1.12it/s]
43%|βββββ | 705/1621 [13:48<14:14, 1.07it/s]
44%|ββββοΏ½ |
| | 0: {'loss': 0.2935, 'grad_norm': 0.30645808053615586, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.44} |
| | 0: οΏ½οΏ½ | 706/1621 [13:48<13:52, 1.10it/s]
44%|βββββ | 707/1621 [13:49<14:19, 1.06it/s]
44%|βββββ | 708/1621 [13:50<13:56, 1.09it/s]
44%|βββββ | 709/1621 [13:51<13:40, 1.11it/s]
44%|βββββ | 710/1621 [13:52<13:35, 1.12it/s]
44%|βββββ | 710/1621 [13:52<13:35, 1.12it/s]
44%|βββββ | 711/1621 [13:53<13:28, 1.13it/s]
44%|βββββ | 712/1621 [13:54<13:17, 1.14it/s]
44%|βββββ | 713/1621 [13:55<13:18, 1.14it/s]
44%|βββββ | 714/1621 [13:56<13:56, 1.08it/s]
44%|βββββ | 715/1621 [13:57<14:25, 1.05it/s]
44%|βββββ | 716/1621 [13:58<14:03, 1.07it/s]
44%|βββββ | 717/1621 [13:58<13:40, 1.10it/s]
44%|βββββ | 718/1621 [13:59<13:27, 1.12it/s]
44%|βββββ | 719/1621 [14:00<13:21, 1.13it/s]
44%|βββββ | 720/1621 [14:01<13:10, 1.14it/s]
|
| | 0: {'loss': 0.2951, 'grad_norm': 0.34070474789654825, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.44} |
| | 0: {'loss': 0.3011, 'grad_norm': 0.3202334487749824, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.45} |
| | 0:
44%|βββββ | 720/1621 [14:01<13:10, 1.14it/s]
44%|βββββ | 721/1621 [14:02<13:13, 1.13it/s]
45%|βββββ | 722/1621 [14:03<13:02, 1.15it/s]
45%|βββββ | 723/1621 [14:04<13:25, 1.12it/s]
45%|βββββ | 724/1621 [14:05<13:14, 1.13it/s]
45%|βββββ | 725/1621 [14:05<13:11, 1.13it/s]
45%|βββββ | 726/1621 [14:06<13:00, 1.15it/s]
45%|βββββ | 727/1621 [14:07<12:53, 1.16it/s]
45%|βββββ | 728/1621 [14:08<12:52, 1.16it/s]
45%|βββββ | 729/1621 [14:09<12:49, 1.16it/s]
45%|βββββ | 730/1621 [14:10<12:47, 1.16it/s]
45%|βββββ | 730/1621 [14:10<12:47, 1.16it/s]
45%|βββββ | 731/1621 [14:11<12:46, 1.16it/s]
45%|βββββ | 732/1621 [14:11<12:43, 1.16it/s]
45%|βββββ | 733/1621 [14:12<12:39, 1.17it/s]
45%|ββββοΏ½ |
| | 0: {'loss': 0.2972, 'grad_norm': 0.3019359123796448, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.46} |
| | 0: οΏ½ | 734/1621 [14:13<12:36, 1.17it/s]
45%|βββββ | 735/1621 [14:14<12:34, 1.17it/s]
45%|βββββ | 736/1621 [14:15<12:32, 1.18it/s]
45%|βββββ | 737/1621 [14:16<12:29, 1.18it/s]
46%|βββββ | 738/1621 [14:17<12:28, 1.18it/s]
46%|βββββ | 739/1621 [14:17<12:29, 1.18it/s]
46%|βββββ | 740/1621 [14:18<12:32, 1.17it/s]
46%|βββββ | 740/1621 [14:18<12:32, 1.17it/s]
46%|βββββ | 741/1621 [14:19<12:33, 1.17it/s]
46%|βββββ | 742/1621 [14:20<12:28, 1.17it/s]
46%|βββββ | 743/1621 [14:21<12:41, 1.15it/s]
46%|βββββ | 744/1621 [14:22<12:39, 1.15it/s]
46%|βββββ | 745/1621 [14:23<12:33, 1.16it/s]
46%|βββββ | 746/1621 [14:23<12:33, 1.16it/s]
46%|βββββ | 747/1621 [14:24<12:31, 1.16it/s]
46%|βββββ | 748/1621 [14:25<12:35, 1.16it/s]
46%|βββοΏ½ |
| | 0: {'loss': 0.2934, 'grad_norm': 0.29846589457849965, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.46} |
| | 0: {'loss': 0.3048, 'grad_norm': 0.31089870103073103, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.47} |
| | 0: οΏ½οΏ½β | 749/1621 [14:26<12:32, 1.16it/s]
46%|βββββ | 750/1621 [14:27<12:29, 1.16it/s]
46%|βββββ | 750/1621 [14:27<12:29, 1.16it/s]
46%|βββββ | 751/1621 [14:28<12:37, 1.15it/s]
46%|βββββ | 752/1621 [14:29<12:29, 1.16it/s]
46%|βββββ | 753/1621 [14:29<12:24, 1.17it/s]
47%|βββββ | 754/1621 [14:30<12:22, 1.17it/s]
47%|βββββ | 755/1621 [14:31<12:19, 1.17it/s]
47%|βββββ | 756/1621 [14:32<12:15, 1.18it/s]
47%|βββββ | 757/1621 [14:33<12:12, 1.18it/s]
47%|βββββ | 758/1621 [14:34<12:11, 1.18it/s]
47%|βββββ | 759/1621 [14:35<12:35, 1.14it/s]
47%|βββββ | 760/1621 [14:35<12:32, 1.14it/s]
47%|βββββ | 760/1621 [14:35<12:32, 1.14it/s]
47%|βββββ | 761/1621 [14:36<12:26, 1.15it/s]
47%|βββββ |
| | 0: {'loss': 0.2954, 'grad_norm': 0.3055451871683868, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.48} |
| | 0: | 762/1621 [14:37<12:19, 1.16it/s]
47%|βββββ | 763/1621 [14:38<12:25, 1.15it/s]
47%|βββββ | 764/1621 [14:39<12:37, 1.13it/s]
47%|βββββ | 765/1621 [14:40<13:43, 1.04it/s]
47%|βββββ | 766/1621 [14:41<13:13, 1.08it/s]
47%|βββββ | 767/1621 [14:42<12:51, 1.11it/s]
47%|βββββ | 768/1621 [14:43<12:36, 1.13it/s]
47%|βββββ | 769/1621 [14:44<12:26, 1.14it/s]
48%|βββββ | 770/1621 [14:44<12:24, 1.14it/s]
48%|βββββ | 770/1621 [14:44<12:24, 1.14it/s]
48%|βββββ | 771/1621 [14:45<12:21, 1.15it/s]
48%|βββββ | 772/1621 [14:46<12:16, 1.15it/s]
48%|βββββ | 773/1621 [14:47<12:12, 1.16it/s]
48%|βββββ | 774/1621 [14:48<12:15, 1.15it/s]
48%|βββββ | 775/1621 [14:49<12:09, 1.16it/s]
48%|βββββ | 776/1621 [14:50<12:05, 1.17it/s]
48%|βββοΏ½ |
| | 0: {'loss': 0.2953, 'grad_norm': 0.310251278561544, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.48} |
| | 0: {'loss': 0.2877, 'grad_norm': 0.32857198895345724, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.49} |
| | 0: οΏ½β | 777/1621 [14:50<12:04, 1.17it/s]
48%|βββββ | 778/1621 [14:51<12:07, 1.16it/s]
48%|βββββ | 779/1621 [14:52<12:04, 1.16it/s]
48%|βββββ | 780/1621 [14:53<12:02, 1.16it/s]
48%|βββββ | 780/1621 [14:53<12:02, 1.16it/s]
48%|βββββ | 781/1621 [14:54<12:03, 1.16it/s]
48%|βββββ | 782/1621 [14:55<12:05, 1.16it/s]
48%|βββββ | 783/1621 [14:56<12:06, 1.15it/s]
48%|βββββ | 784/1621 [14:57<12:20, 1.13it/s]
48%|βββββ | 785/1621 [14:57<12:14, 1.14it/s]
48%|βββββ | 786/1621 [14:58<12:10, 1.14it/s]
49%|βββββ | 787/1621 [14:59<12:09, 1.14it/s]
49%|βββββ | 788/1621 [15:00<12:02, 1.15it/s]
49%|βββββ | 789/1621 [15:01<12:32, 1.11it/s]
49%|βββββ | 790/1621 [15:02<12:21, 1.12it/s]
49%|βββββ |
| | 0: {'loss': 0.2924, 'grad_norm': 0.2994049813187105, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.49} |
| | 0: | 790/1621 [15:02<12:21, 1.12it/s]
49%|βββββ | 791/1621 [15:03<12:13, 1.13it/s]
49%|βββββ | 792/1621 [15:04<12:04, 1.14it/s]
49%|βββββ | 793/1621 [15:04<12:14, 1.13it/s]
49%|βββββ | 794/1621 [15:05<12:06, 1.14it/s]
49%|βββββ | 795/1621 [15:06<12:02, 1.14it/s]
49%|βββββ | 796/1621 [15:07<11:54, 1.16it/s]
49%|βββββ | 797/1621 [15:08<11:48, 1.16it/s]
49%|βββββ | 798/1621 [15:09<11:44, 1.17it/s]
49%|βββββ | 799/1621 [15:10<11:43, 1.17it/s]
49%|βββββ | 800/1621 [15:10<11:40, 1.17it/s]
49%|βββββ | 800/1621 [15:10<11:40, 1.17it/s]
49%|βββββ | 801/1621 [15:11<11:41, 1.17it/s]
49%|βββββ | 802/1621 [15:12<11:37, 1.17it/s]
50%|βββββ | 803/1621 [15:13<11:40, 1.17it/s]
50%|βββββ | 804/1621 [15:14<11:39, 1.17it/s]
50%|ββββ |
| | 0: {'loss': 0.2911, 'grad_norm': 0.32458055128509955, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.5} |
| | 0: β | 805/1621 [15:15<11:45, 1.16it/s]
50%|βββββ | 806/1621 [15:16<11:40, 1.16it/s]
50%|βββββ | 807/1621 [15:16<11:36, 1.17it/s]
50%|βββββ | 808/1621 [15:17<12:20, 1.10it/s]
50%|βββββ | 809/1621 [15:18<12:07, 1.12it/s]
50%|βββββ | 810/1621 [15:19<12:08, 1.11it/s]
50%|βββββ | 810/1621 [15:19<12:08, 1.11it/s]
50%|βββββ | 811/1621 [15:20<12:00, 1.12it/s]
50%|βββββ | 812/1621 [15:21<12:01, 1.12it/s]
50%|βββββ | 813/1621 [15:22<11:51, 1.14it/s]
50%|βββββ | 814/1621 [15:23<11:47, 1.14it/s]
50%|βββββ | 815/1621 [15:24<11:40, 1.15it/s]
50%|βββββ | 816/1621 [15:24<11:38, 1.15it/s]
50%|βββββ | 817/1621 [15:25<11:34, 1.16it/s]
50%|βββββ | 818/1621 [15:26<11:32, 1.16it/s]
51%|βββββ | 819/1621 [15:27<11:34, 1.15it/s]
51%|ββοΏ½ |
| | 0: {'loss': 0.2926, 'grad_norm': 0.3005265468669908, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.51} |
| | 0: {'loss': 0.3011, 'grad_norm': 0.3180393440770147, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.51} |
| | 0: οΏ½ββ | 820/1621 [15:28<12:00, 1.11it/s]
51%|βββββ | 820/1621 [15:28<12:00, 1.11it/s]
51%|βββββ | 821/1621 [15:29<11:49, 1.13it/s]
51%|βββββ | 822/1621 [15:30<11:39, 1.14it/s]
51%|βββββ | 823/1621 [15:31<11:32, 1.15it/s]
51%|βββββ | 824/1621 [15:32<11:43, 1.13it/s]
51%|βββββ | 825/1621 [15:32<11:37, 1.14it/s]
51%|βββββ | 826/1621 [15:33<11:30, 1.15it/s]
51%|βββββ | 827/1621 [15:34<11:48, 1.12it/s]
51%|βββββ | 828/1621 [15:35<11:46, 1.12it/s]
51%|βββββ | 829/1621 [15:36<11:37, 1.14it/s]
51%|βββββ | 830/1621 [15:37<11:32, 1.14it/s]
51%|βββββ | 830/1621 [15:37<11:32, 1.14it/s]
51%|ββββββ | 831/1621 [15:38<11:50, 1.11it/s]
51%|ββββββ | 832/1621 [15:39<13:17, 1.01s/it]
51%|βββ |
| | 0: {'loss': 0.2884, 'grad_norm': 0.359753097877781, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.52} |
| | 0: βββ | 833/1621 [15:40<12:39, 1.04it/s]
51%|ββββββ | 834/1621 [15:41<12:10, 1.08it/s]
52%|ββββββ | 835/1621 [15:42<11:50, 1.11it/s]
52%|ββββββ | 836/1621 [15:42<11:51, 1.10it/s]
52%|ββββββ | 837/1621 [15:43<11:40, 1.12it/s]
52%|ββββββ | 838/1621 [15:44<12:19, 1.06it/s]
52%|ββββββ | 839/1621 [15:45<12:03, 1.08it/s]
52%|ββββββ | 840/1621 [15:46<11:58, 1.09it/s]
52%|ββββββ | 840/1621 [15:46<11:58, 1.09it/s]
52%|ββββββ | 841/1621 [15:47<11:48, 1.10it/s]
52%|ββββββ | 842/1621 [15:48<11:35, 1.12it/s]
52%|ββββββ | 843/1621 [15:49<11:53, 1.09it/s]
52%|ββββββ | 844/1621 [15:50<11:37, 1.11it/s]
52%|ββββββ | 845/1621 [15:51<11:26, 1.13it/s]
52%|ββββββ | 846/1621 [15:51<11:29, 1.12it/s]
52%|ββββββ | 847/1621 [15 |
| | 0: {'loss': 0.2858, 'grad_norm': 0.307823856703404, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.52} |
| | 0: {'loss': 0.2923, 'grad_norm': 0.2996148986851413, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.53} |
| | 0: :52<11:19, 1.14it/s]
52%|ββββββ | 848/1621 [15:53<11:27, 1.12it/s]
52%|ββββββ | 849/1621 [15:54<11:18, 1.14it/s]
52%|ββββββ | 850/1621 [15:55<11:10, 1.15it/s]
52%|ββββββ | 850/1621 [15:55<11:10, 1.15it/s]
52%|ββββββ | 851/1621 [15:56<11:23, 1.13it/s]
53%|ββββββ | 852/1621 [15:57<11:22, 1.13it/s]
53%|ββββββ | 853/1621 [15:58<11:14, 1.14it/s]
53%|ββββββ | 854/1621 [15:58<11:07, 1.15it/s]
53%|ββββββ | 855/1621 [15:59<11:03, 1.15it/s]
53%|ββββββ | 856/1621 [16:00<11:13, 1.14it/s]
53%|ββββββ | 857/1621 [16:01<11:35, 1.10it/s]
53%|ββββββ | 858/1621 [16:02<11:21, 1.12it/s]
53%|ββββββ | 859/1621 [16:03<11:13, 1.13it/s]
53%|ββββββ | 860/1621 [16:04<11:04, 1.14it/s]
53%|βββοΏ½ |
| | 0: {'loss': 0.2932, 'grad_norm': 0.2810728882724132, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.54} |
| | 0: οΏ½ββ | 860/1621 [16:04<11:04, 1.14it/s]
53%|ββββββ | 861/1621 [16:05<11:34, 1.09it/s]
53%|ββββββ | 862/1621 [16:06<11:19, 1.12it/s]
53%|ββββββ | 863/1621 [16:07<11:12, 1.13it/s]
53%|ββββββ | 864/1621 [16:07<11:06, 1.14it/s]
53%|ββββββ | 865/1621 [16:08<10:58, 1.15it/s]
53%|ββββββ | 866/1621 [16:09<11:07, 1.13it/s]
53%|ββββββ | 867/1621 [16:10<10:58, 1.14it/s]
54%|ββββββ | 868/1621 [16:11<10:59, 1.14it/s]
54%|ββββββ | 869/1621 [16:12<10:52, 1.15it/s]
54%|ββββββ | 870/1621 [16:13<11:33, 1.08it/s]
54%|ββββββ | 870/1621 [16:13<11:33, 1.08it/s]
54%|ββββββ | 871/1621 [16:14<11:23, 1.10it/s]
54%|ββββββ | 872/1621 [16:15<11:20, 1.10it/s]
54%|ββββββ | 873/1621 [16:15<11:04, 1.12it/s]
54%|ββββββ | 874/1621 [16:1 |
| | 0: {'loss': 0.2937, 'grad_norm': 0.30282456722647566, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.54} |
| | 0: 6<10:57, 1.14it/s]
54%|ββββββ | 875/1621 [16:17<10:56, 1.14it/s]
54%|ββββββ | 876/1621 [16:18<10:48, 1.15it/s]
54%|ββββββ | 877/1621 [16:19<10:44, 1.15it/s]
54%|ββββββ | 878/1621 [16:20<10:44, 1.15it/s]
54%|ββββββ | 879/1621 [16:21<10:40, 1.16it/s]
54%|ββββββ | 880/1621 [16:22<11:14, 1.10it/s]
54%|ββββββ | 880/1621 [16:22<11:14, 1.10it/s]
54%|ββββββ | 881/1621 [16:22<11:04, 1.11it/s]
54%|ββββββ | 882/1621 [16:23<10:57, 1.12it/s]
54%|ββββββ | 883/1621 [16:24<10:55, 1.13it/s]
55%|ββββββ | 884/1621 [16:25<10:49, 1.14it/s]
55%|ββββββ | 885/1621 [16:26<10:44, 1.14it/s]
55%|ββββββ | 886/1621 [16:27<10:37, 1.15it/s]
55%|ββββββ | 887/1621 [16:28<10:33, 1.16it/s]
55%|ββββββ | 888/1621 [16:29<10:34, 1.16it/s]
55%|οΏ½ |
| | 0: {'loss': 0.2934, 'grad_norm': 0.3068512318705077, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.55} |
| | 0: {'loss': 0.2955, 'grad_norm': 0.303815459875142, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.56} |
| | 0: οΏ½βββββ | 889/1621 [16:29<10:28, 1.16it/s]
55%|ββββββ | 890/1621 [16:30<10:28, 1.16it/s]
55%|ββββββ | 890/1621 [16:30<10:28, 1.16it/s]
55%|ββββββ | 891/1621 [16:31<10:36, 1.15it/s]
55%|ββββββ | 892/1621 [16:32<10:36, 1.15it/s]
55%|ββββββ | 893/1621 [16:33<10:43, 1.13it/s]
55%|ββββββ | 894/1621 [16:34<10:36, 1.14it/s]
55%|ββββββ | 895/1621 [16:35<10:29, 1.15it/s]
55%|ββββββ | 896/1621 [16:35<10:30, 1.15it/s]
55%|ββββββ | 897/1621 [16:36<10:31, 1.15it/s]
55%|ββββββ | 898/1621 [16:37<10:28, 1.15it/s]
55%|ββββββ | 899/1621 [16:38<10:25, 1.15it/s]
56%|ββββββ | 900/1621 [16:39<10:23, 1.16it/s]
56%|ββββββ | 900/1621 [16:39<10:23, 1.16it/s]
56%|ββββββ | 901/1621 [16:40< |
| | 0: {'loss': 0.2879, 'grad_norm': 0.30668180739641193, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.56} |
| | 0: 10:23, 1.16it/s]
56%|ββββββ | 902/1621 [16:41<10:56, 1.10it/s]
56%|ββββββ | 903/1621 [16:42<10:47, 1.11it/s]
56%|ββββββ | 904/1621 [16:43<10:41, 1.12it/s]
56%|ββββββ | 905/1621 [16:43<10:33, 1.13it/s]
56%|ββββββ | 906/1621 [16:44<10:32, 1.13it/s]
56%|ββββββ | 907/1621 [16:45<10:29, 1.13it/s]
56%|ββββββ | 908/1621 [16:46<10:22, 1.14it/s]
56%|ββββββ | 909/1621 [16:47<10:16, 1.15it/s]
56%|ββββββ | 910/1621 [16:48<10:10, 1.16it/s]
56%|ββββββ | 910/1621 [16:48<10:10, 1.16it/s]
56%|ββββββ | 911/1621 [16:49<10:08, 1.17it/s]
56%|ββββββ | 912/1621 [16:49<10:08, 1.17it/s]
56%|ββββββ | 913/1621 [16:50<10:11, 1.16it/s]
56%|ββββββ | 914/1621 [16:51<10:15, 1.15it/s]
56%|ββββββ | 915/1621 [16:52<10:50, 1.09it/s]
57%|βοΏ½ |
| | 0: {'loss': 0.2898, 'grad_norm': 0.30259565260728083, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.57} |
| | 0: οΏ½οΏ½ββββ | 916/1621 [16:53<10:35, 1.11it/s]
57%|ββββββ | 917/1621 [16:54<10:25, 1.13it/s]
57%|ββββββ | 918/1621 [16:55<10:17, 1.14it/s]
57%|ββββββ | 919/1621 [16:56<10:18, 1.13it/s]
57%|ββββββ | 920/1621 [16:57<10:16, 1.14it/s]
57%|ββββββ | 920/1621 [16:57<10:16, 1.14it/s]
57%|ββββββ | 921/1621 [16:58<10:28, 1.11it/s]
57%|ββββββ | 922/1621 [16:58<10:17, 1.13it/s]
57%|ββββββ | 923/1621 [16:59<10:31, 1.10it/s]
57%|ββββββ | 924/1621 [17:00<10:50, 1.07it/s]
57%|ββββββ | 925/1621 [17:01<10:32, 1.10it/s]
57%|ββββββ | 926/1621 [17:02<10:20, 1.12it/s]
57%|ββββββ | 927/1621 [17:03<10:47, 1.07it/s]
57%|ββββββ | 928/1621 [17:04<10:32, 1.10it/s]
57%|ββββββ | 929/1621 [17:05<10:19, 1.12it/s]
57%|ββββββ | 930/162 |
| | 0: {'loss': 0.2892, 'grad_norm': 0.28494124741923377, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.57} |
| | 0: {'loss': 0.2933, 'grad_norm': 0.3026563536305431, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.58} |
| | 0: 1 [17:06<10:13, 1.13it/s]
57%|ββββββ | 930/1621 [17:06<10:13, 1.13it/s]
57%|ββββββ | 931/1621 [17:07<10:04, 1.14it/s]
57%|ββββββ | 932/1621 [17:07<09:57, 1.15it/s]
58%|ββββββ | 933/1621 [17:08<10:04, 1.14it/s]
58%|ββββββ | 934/1621 [17:09<09:58, 1.15it/s]
58%|ββββββ | 935/1621 [17:10<09:54, 1.15it/s]
58%|ββββββ | 936/1621 [17:11<10:04, 1.13it/s]
58%|ββββββ | 937/1621 [17:12<09:57, 1.14it/s]
58%|ββββββ | 938/1621 [17:13<10:29, 1.08it/s]
58%|ββββββ | 939/1621 [17:14<10:16, 1.11it/s]
58%|ββββββ | 940/1621 [17:15<10:06, 1.12it/s]
58%|ββββββ | 940/1621 [17:15<10:06, 1.12it/s]
58%|ββββββ | 941/1621 [17:15<10:03, 1.13it/s]
58%|ββββββ | 942/1621 [17:16<09:55, 1.14it/s]
58%|ββ |
| | 0: {'loss': 0.2949, 'grad_norm': 0.2955013797109043, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.59} |
| | 0: ββββ | 943/1621 [17:17<09:50, 1.15it/s]
58%|ββββββ | 944/1621 [17:18<09:45, 1.16it/s]
58%|ββββββ | 945/1621 [17:19<09:41, 1.16it/s]
58%|ββββββ | 946/1621 [17:20<09:40, 1.16it/s]
58%|ββββββ | 947/1621 [17:21<09:38, 1.17it/s]
58%|ββββββ | 948/1621 [17:21<09:35, 1.17it/s]
59%|ββββββ | 949/1621 [17:22<09:31, 1.18it/s]
59%|ββββββ | 950/1621 [17:23<09:30, 1.18it/s]
59%|ββββββ | 950/1621 [17:23<09:30, 1.18it/s]
59%|ββββββ | 951/1621 [17:24<09:46, 1.14it/s]
59%|ββββββ | 952/1621 [17:25<09:44, 1.15it/s]
59%|ββββββ | 953/1621 [17:26<09:40, 1.15it/s]
59%|ββββββ | 954/1621 [17:27<09:44, 1.14it/s]
59%|ββββββ | 955/1621 [17:28<09:53, 1.12it/s]
59%|ββββββ | 956/1621 [17:28<09:45, 1.14it/s]
59%|ββββββ | 957/1621 |
| | 0: {'loss': 0.2868, 'grad_norm': 0.28031507522996657, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.59} |
| | 0: {'loss': 0.2978, 'grad_norm': 0.29965929088247534, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.6} |
| | 0: [17:29<09:45, 1.13it/s]
59%|ββββββ | 958/1621 [17:30<09:38, 1.15it/s]
59%|ββββββ | 959/1621 [17:31<09:37, 1.15it/s]
59%|ββββββ | 960/1621 [17:32<09:33, 1.15it/s]
59%|ββββββ | 960/1621 [17:32<09:33, 1.15it/s]
59%|ββββββ | 961/1621 [17:33<09:35, 1.15it/s]
59%|ββββββ | 962/1621 [17:34<09:30, 1.15it/s]
59%|ββββββ | 963/1621 [17:34<09:27, 1.16it/s]
59%|ββββββ | 964/1621 [17:35<09:23, 1.17it/s]
60%|ββββββ | 965/1621 [17:36<09:23, 1.16it/s]
60%|ββββββ | 966/1621 [17:37<09:19, 1.17it/s]
60%|ββββββ | 967/1621 [17:38<09:19, 1.17it/s]
60%|ββββββ | 968/1621 [17:39<09:20, 1.17it/s]
60%|ββββββ | 969/1621 [17:40<09:20, 1.16it/s]
60%|ββββββ | 970/1621 [17:40<09:20, 1.16it/s]
60%|ββοΏ½ |
| | 0: [2025-09-02 18:53:55,696] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0/checkpoint-975[39m |
| | 0: [2025-09-02 18:53:58,194] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: {'loss': 0.2943, 'grad_norm': 0.29624852844606697, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.6} |
| | 0: οΏ½βββ | 970/1621 [17:40<09:20, 1.16it/s]
60%|ββββββ | 971/1621 [17:41<09:18, 1.16it/s]
60%|ββββββ | 972/1621 [17:42<09:15, 1.17it/s]
60%|ββββββ | 973/1621 [17:43<09:13, 1.17it/s]
60%|ββββββ | 974/1621 [17:44<09:13, 1.17it/s]
60%|ββββββ | 975/1621 [17:45<09:21, 1.15it/s]
60%|ββββββ | 976/1621 [17:50<23:43, 2.21s/it]
60%|ββββββ | 977/1621 [17:51<19:19, 1.80s/it]
60%|ββββββ | 978/1621 [17:52<16:13, 1.51s/it]
60%|ββββββ | 979/1621 [17:53<14:13, 1.33s/it]
60%|ββββββ | 980/1621 [17:54<12:43, 1.19s/it]
60%|ββββββ | 980/1621 [17:54<12:43, 1.19s/it]
61%|ββββββ | 981/1621 [17:54<11:42, 1.10s/it]
61%|ββββββ | 982/1621 [17:55<10:54, 1.02s/it]
61%|ββββββ | 983/1621 [17:56<10:22, 1.02it/s]
61%|ββββββ | 984/1621 [1 |
| | 0: {'loss': 0.296, 'grad_norm': 0.30350730138638954, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.61} |
| | 0: 7:57<10:17, 1.03it/s]
61%|ββββββ | 985/1621 [17:58<09:54, 1.07it/s]
61%|ββββββ | 986/1621 [17:59<09:37, 1.10it/s]
61%|ββββββ | 987/1621 [18:00<09:34, 1.10it/s]
61%|ββββββ | 988/1621 [18:01<09:27, 1.12it/s]
61%|ββββββ | 989/1621 [18:01<09:16, 1.13it/s]
61%|ββββββ | 990/1621 [18:02<09:13, 1.14it/s]
61%|ββββββ | 990/1621 [18:02<09:13, 1.14it/s]
61%|ββββββ | 991/1621 [18:03<09:08, 1.15it/s]
61%|ββββββ | 992/1621 [18:04<09:04, 1.15it/s]
61%|βββββββ | 993/1621 [18:05<09:00, 1.16it/s]
61%|βββββββ | 994/1621 [18:06<08:57, 1.17it/s]
61%|βββββββ | 995/1621 [18:07<08:54, 1.17it/s]
61%|βββββββ | 996/1621 [18:08<09:29, 1.10it/s]
62%|βββββββ | 997/1621 [18:08<09:18, 1.12it/s]
62%|βββββββ | 998/1621 [18:09<09:08, 1. |
| | 0: {'loss': 0.2983, 'grad_norm': 0.3203748629162452, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.62} |
| | 0: {'loss': 0.2842, 'grad_norm': 0.2997338003534859, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.62} |
| | 0: 14it/s]
62%|βββββββ | 999/1621 [18:10<09:02, 1.15it/s]
62%|βββββββ | 1000/1621 [18:11<09:02, 1.14it/s]
62%|βββββββ | 1000/1621 [18:11<09:02, 1.14it/s]
62%|βββββββ | 1001/1621 [18:12<08:59, 1.15it/s]
62%|βββββββ | 1002/1621 [18:13<08:58, 1.15it/s]
62%|βββββββ | 1003/1621 [18:14<08:56, 1.15it/s]
62%|βββββββ | 1004/1621 [18:14<08:53, 1.16it/s]
62%|βββββββ | 1005/1621 [18:15<08:52, 1.16it/s]
62%|βββββββ | 1006/1621 [18:16<08:50, 1.16it/s]
62%|βββββββ | 1007/1621 [18:17<08:46, 1.17it/s]
62%|βββββββ | 1008/1621 [18:18<08:43, 1.17it/s]
62%|βββββββ | 1009/1621 [18:19<09:01, 1.13it/s]
62%|βββββββ | 1010/1621 [18:20<08:56, 1.14it/s]
62%|βββββββ | 1010/1621 [18:20<08:56 |
| | 0: {'loss': 0.2849, 'grad_norm': 0.31338661689855674, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.63} |
| | 0: , 1.14it/s]
62%|βββββββ | 1011/1621 [18:21<08:51, 1.15it/s]
62%|βββββββ | 1012/1621 [18:22<09:07, 1.11it/s]
62%|βββββββ | 1013/1621 [18:22<09:13, 1.10it/s]
63%|βββββββ | 1014/1621 [18:23<09:03, 1.12it/s]
63%|βββββββ | 1015/1621 [18:24<08:56, 1.13it/s]
63%|βββββββ | 1016/1621 [18:25<08:59, 1.12it/s]
63%|βββββββ | 1017/1621 [18:26<08:58, 1.12it/s]
63%|βββββββ | 1018/1621 [18:27<09:01, 1.11it/s]
63%|βββββββ | 1019/1621 [18:28<08:53, 1.13it/s]
63%|βββββββ | 1020/1621 [18:29<08:49, 1.14it/s]
63%|βββββββ | 1020/1621 [18:29<08:49, 1.14it/s]
63%|βββββββ | 1021/1621 [18:30<08:52, 1.13it/s]
63%|βββββββ | 1022/1621 [18:30<08:52, 1.13it/s]
63%|βββββββ | 1023/1621 [18:31<08:44, 1.14it/s]
63%|βββββββ | 102 |
| | 0: {'loss': 0.2896, 'grad_norm': 0.317479701582765, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.64} |
| | 0: 4/1621 [18:32<08:39, 1.15it/s]
63%|βββββββ | 1025/1621 [18:33<08:45, 1.13it/s]
63%|βββββββ | 1026/1621 [18:34<09:14, 1.07it/s]
63%|βββββββ | 1027/1621 [18:35<08:59, 1.10it/s]
63%|βββββββ | 1028/1621 [18:36<09:01, 1.09it/s]
63%|βββββββ | 1029/1621 [18:37<08:53, 1.11it/s]
64%|βββββββ | 1030/1621 [18:38<08:56, 1.10it/s]
64%|βββββββ | 1030/1621 [18:38<08:56, 1.10it/s]
64%|βββββββ | 1031/1621 [18:39<09:00, 1.09it/s]
64%|βββββββ | 1032/1621 [18:40<08:57, 1.10it/s]
64%|βββββββ | 1033/1621 [18:40<08:46, 1.12it/s]
64%|βββββββ | 1034/1621 [18:41<08:39, 1.13it/s]
64%|βββββββ | 1035/1621 [18:42<08:34, 1.14it/s]
64%|βββββββ | 1036/1621 [18:43<08:40, 1.12it/s]
64%|βββββββ | 1037/1621 [18:44<08:35, 1.13it/s]
64%|βββοΏ½ |
| | 0: {'loss': 0.2973, 'grad_norm': 0.2949775076536377, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.64} |
| | 0: {'loss': 0.2888, 'grad_norm': 0.3078098551918824, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.65} |
| | 0: οΏ½οΏ½βββ | 1038/1621 [18:45<08:30, 1.14it/s]
64%|βββββββ | 1039/1621 [18:46<08:44, 1.11it/s]
64%|βββββββ | 1040/1621 [18:47<08:40, 1.12it/s]
64%|βββββββ | 1040/1621 [18:47<08:40, 1.12it/s]
64%|βββββββ | 1041/1621 [18:48<09:07, 1.06it/s]
64%|βββββββ | 1042/1621 [18:48<08:51, 1.09it/s]
64%|βββββββ | 1043/1621 [18:49<08:37, 1.12it/s]
64%|βββββββ | 1044/1621 [18:50<08:31, 1.13it/s]
64%|βββββββ | 1045/1621 [18:51<08:24, 1.14it/s]
65%|βββββββ | 1046/1621 [18:52<08:20, 1.15it/s]
65%|βββββββ | 1047/1621 [18:53<08:16, 1.16it/s]
65%|βββββββ | 1048/1621 [18:54<08:14, 1.16it/s]
65%|βββββββ | 1049/1621 [18:54<08:15, 1.16it/s]
65%|βββββββ | 1050/1621 [18:55<08:12, 1.16it/s]
65%|βοΏ½ |
| | 0: {'loss': 0.2936, 'grad_norm': 0.3170740850922587, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.65} |
| | 0: οΏ½οΏ½βββββ | 1050/1621 [18:55<08:12, 1.16it/s]
65%|βββββββ | 1051/1621 [18:56<08:24, 1.13it/s]
65%|βββββββ | 1052/1621 [18:57<08:44, 1.08it/s]
65%|βββββββ | 1053/1621 [18:58<08:35, 1.10it/s]
65%|βββββββ | 1054/1621 [18:59<08:23, 1.13it/s]
65%|βββββββ | 1055/1621 [19:00<08:16, 1.14it/s]
65%|βββββββ | 1056/1621 [19:01<08:12, 1.15it/s]
65%|βββββββ | 1057/1621 [19:02<08:08, 1.15it/s]
65%|βββββββ | 1058/1621 [19:02<08:09, 1.15it/s]
65%|βββββββ | 1059/1621 [19:03<08:17, 1.13it/s]
65%|βββββββ | 1060/1621 [19:04<08:22, 1.12it/s]
65%|βββββββ | 1060/1621 [19:04<08:22, 1.12it/s]
65%|βββββββ | 1061/1621 [19:05<08:14, 1.13it/s]
66%|βββββββ | 1062/1621 [19:06<08:09, 1.14it/s]
66%|βββββββ | 1063/1621 [19:07<08:15, |
| | 0: {'loss': 0.2845, 'grad_norm': 0.3117559688954944, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.66} |
| | 0: 1.13it/s]
66%|βββββββ | 1064/1621 [19:08<08:11, 1.13it/s]
66%|βββββββ | 1065/1621 [19:09<08:04, 1.15it/s]
66%|βββββββ | 1066/1621 [19:10<08:32, 1.08it/s]
66%|βββββββ | 1067/1621 [19:11<08:20, 1.11it/s]
66%|βββββββ | 1068/1621 [19:11<08:15, 1.12it/s]
66%|βββββββ | 1069/1621 [19:12<08:15, 1.11it/s]
66%|βββββββ | 1070/1621 [19:13<08:11, 1.12it/s]
66%|βββββββ | 1070/1621 [19:13<08:11, 1.12it/s]
66%|βββββββ | 1071/1621 [19:14<08:05, 1.13it/s]
66%|βββββββ | 1072/1621 [19:15<08:06, 1.13it/s]
66%|βββββββ | 1073/1621 [19:16<08:41, 1.05it/s]
66%|βββββββ | 1074/1621 [19:17<08:25, 1.08it/s]
66%|βββββββ | 1075/1621 [19:18<08:18, 1.10it/s]
66%|βββββββ | 1076/1621 [19:19<08:11, 1.11it/s]
66%|βββββββ | 1077/1 |
| | 0: {'loss': 0.2835, 'grad_norm': 0.29256036199346086, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.67} |
| | 0: 621 [19:20<08:05, 1.12it/s]
67%|βββββββ | 1078/1621 [19:21<08:20, 1.09it/s]
67%|βββββββ | 1079/1621 [19:21<08:08, 1.11it/s]
67%|βββββββ | 1080/1621 [19:22<07:59, 1.13it/s]
67%|βββββββ | 1080/1621 [19:22<07:59, 1.13it/s]
67%|βββββββ | 1081/1621 [19:23<07:52, 1.14it/s]
67%|βββββββ | 1082/1621 [19:24<07:51, 1.14it/s]
67%|βββββββ | 1083/1621 [19:25<08:01, 1.12it/s]
67%|βββββββ | 1084/1621 [19:26<07:54, 1.13it/s]
67%|βββββββ | 1085/1621 [19:27<07:47, 1.15it/s]
67%|βββββββ | 1086/1621 [19:27<07:46, 1.15it/s]
67%|βββββββ | 1087/1621 [19:28<07:58, 1.12it/s]
67%|βββββββ | 1088/1621 [19:29<07:55, 1.12it/s]
67%|βββββββ | 1089/1621 [19:30<07:50, 1.13it/s]
67%|βββββββ | 1090/1621 [19:31<07:53, 1.12it/s]
|
| | 0: {'loss': 0.2824, 'grad_norm': 0.29072721767613235, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.67} |
| | 0: {'loss': 0.2923, 'grad_norm': 0.29738132555989805, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.68} |
| | 0:
67%|βββββββ | 1090/1621 [19:31<07:53, 1.12it/s]
67%|βββββββ | 1091/1621 [19:32<07:45, 1.14it/s]
67%|βββββββ | 1092/1621 [19:33<07:42, 1.14it/s]
67%|βββββββ | 1093/1621 [19:34<07:42, 1.14it/s]
67%|βββββββ | 1094/1621 [19:35<07:38, 1.15it/s]
68%|βββββββ | 1095/1621 [19:35<07:36, 1.15it/s]
68%|βββββββ | 1096/1621 [19:36<07:40, 1.14it/s]
68%|βββββββ | 1097/1621 [19:37<07:35, 1.15it/s]
68%|βββββββ | 1098/1621 [19:38<07:40, 1.14it/s]
68%|βββββββ | 1099/1621 [19:39<07:37, 1.14it/s]
68%|βββββββ | 1100/1621 [19:40<07:36, 1.14it/s]
68%|βββββββ | 1100/1621 [19:40<07:36, 1.14it/s]
68%|βββββββ | 1101/1621 [19:41<07:33, 1.15it/s]
68%|βββββββ | 1102/1621 [19:41<07:28, 1.16it/s]
68%|ββοΏ½ |
| | 0: {'loss': 0.2888, 'grad_norm': 0.39509367378956034, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.68} |
| | 0: οΏ½οΏ½ββββ | 1103/1621 [19:42<07:26, 1.16it/s]
68%|βββββββ | 1104/1621 [19:43<07:27, 1.16it/s]
68%|βββββββ | 1105/1621 [19:44<07:23, 1.16it/s]
68%|βββββββ | 1106/1621 [19:45<07:57, 1.08it/s]
68%|βββββββ | 1107/1621 [19:46<07:49, 1.10it/s]
68%|βββββββ | 1108/1621 [19:47<07:37, 1.12it/s]
68%|βββββββ | 1109/1621 [19:48<07:35, 1.12it/s]
68%|βββββββ | 1110/1621 [19:49<07:36, 1.12it/s]
68%|βββββββ | 1110/1621 [19:49<07:36, 1.12it/s]
69%|βββββββ | 1111/1621 [19:50<07:32, 1.13it/s]
69%|βββββββ | 1112/1621 [19:51<07:58, 1.06it/s]
69%|βββββββ | 1113/1621 [19:51<07:47, 1.09it/s]
69%|βββββββ | 1114/1621 [19:52<07:37, 1.11it/s]
69%|βββββββ | 1115/1621 [19:53<07:28, 1.13it/s]
69%|βββββββ | 1116/1621 [19:54<07:24, 1.1 |
| | 0: {'loss': 0.292, 'grad_norm': 0.30996483704428673, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.69} |
| | 0: 4it/s]
69%|βββββββ | 1117/1621 [19:55<07:45, 1.08it/s]
69%|βββββββ | 1118/1621 [19:56<07:33, 1.11it/s]
69%|βββββββ | 1119/1621 [19:57<07:23, 1.13it/s]
69%|βββββββ | 1120/1621 [19:58<07:17, 1.15it/s]
69%|βββββββ | 1120/1621 [19:58<07:17, 1.15it/s]
69%|βββββββ | 1121/1621 [19:59<07:19, 1.14it/s]
69%|βββββββ | 1122/1621 [19:59<07:26, 1.12it/s]
69%|βββββββ | 1123/1621 [20:00<07:18, 1.13it/s]
69%|βββββββ | 1124/1621 [20:01<07:15, 1.14it/s]
69%|βββββββ | 1125/1621 [20:02<07:44, 1.07it/s]
69%|βββββββ | 1126/1621 [20:03<07:31, 1.10it/s]
70%|βββββββ | 1127/1621 [20:04<07:22, 1.12it/s]
70%|βββββββ | 1128/1621 [20:05<07:26, 1.10it/s]
70%|βββββββ | 1129/1621 [20:06<07:31, 1.09it/s]
70%|βββββββ | 1130/1621 |
| | 0: {'loss': 0.2888, 'grad_norm': 0.3063733535961954, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.7} |
| | 0: {'loss': 0.2922, 'grad_norm': 0.3416852250077931, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.7} |
| | 0: [20:07<07:24, 1.11it/s]
70%|βββββββ | 1130/1621 [20:07<07:24, 1.11it/s]
70%|βββββββ | 1131/1621 [20:08<07:16, 1.12it/s]
70%|βββββββ | 1132/1621 [20:08<07:12, 1.13it/s]
70%|βββββββ | 1133/1621 [20:09<07:07, 1.14it/s]
70%|βββββββ | 1134/1621 [20:10<07:03, 1.15it/s]
70%|βββββββ | 1135/1621 [20:11<07:45, 1.04it/s]
70%|βββββββ | 1136/1621 [20:12<07:29, 1.08it/s]
70%|βββββββ | 1137/1621 [20:13<07:26, 1.08it/s]
70%|βββββββ | 1138/1621 [20:14<07:15, 1.11it/s]
70%|βββββββ | 1139/1621 [20:15<07:34, 1.06it/s]
70%|βββββββ | 1140/1621 [20:16<07:31, 1.06it/s]
70%|βββββββ | 1140/1621 [20:16<07:31, 1.06it/s]
70%|βββββββ | 1141/1621 [20:17<07:36, 1.05it/s]
70%|βββββββ | 114 |
| | 0: {'loss': 0.2925, 'grad_norm': 0.30884469264741793, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.71} |
| | 0: 2/1621 [20:18<07:27, 1.07it/s]
71%|βββββββ | 1143/1621 [20:19<07:15, 1.10it/s]
71%|βββββββ | 1144/1621 [20:19<07:07, 1.12it/s]
71%|βββββββ | 1145/1621 [20:20<07:00, 1.13it/s]
71%|βββββββ | 1146/1621 [20:21<06:54, 1.15it/s]
71%|βββββββ | 1147/1621 [20:22<06:51, 1.15it/s]
71%|βββββββ | 1148/1621 [20:23<06:54, 1.14it/s]
71%|βββββββ | 1149/1621 [20:24<06:49, 1.15it/s]
71%|βββββββ | 1150/1621 [20:25<06:47, 1.16it/s]
71%|βββββββ | 1150/1621 [20:25<06:47, 1.16it/s]
71%|βββββββ | 1151/1621 [20:25<06:45, 1.16it/s]
71%|βββββββ | 1152/1621 [20:26<06:53, 1.13it/s]
71%|βββββββ | 1153/1621 [20:27<06:50, 1.14it/s]
71%|βββββββ | 1154/1621 [20:28<06:45, 1.15it/s]
71%|ββββββββ | 1155/1621 [20:29<06:43, 1.15it/s]
71%|ββοΏ½ |
| | 0: {'loss': 0.2894, 'grad_norm': 0.3044894747315433, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.72} |
| | 0: οΏ½βββββ | 1156/1621 [20:30<06:43, 1.15it/s]
71%|ββββββββ | 1157/1621 [20:31<06:39, 1.16it/s]
71%|ββββββββ | 1158/1621 [20:32<06:37, 1.16it/s]
71%|ββββββββ | 1159/1621 [20:32<06:38, 1.16it/s]
72%|ββββββββ | 1160/1621 [20:33<06:51, 1.12it/s]
72%|ββββββββ | 1160/1621 [20:33<06:51, 1.12it/s]
72%|ββββββββ | 1161/1621 [20:34<06:46, 1.13it/s]
72%|ββββββββ | 1162/1621 [20:35<06:39, 1.15it/s]
72%|ββββββββ | 1163/1621 [20:36<06:41, 1.14it/s]
72%|ββββββββ | 1164/1621 [20:37<06:40, 1.14it/s]
72%|ββββββββ | 1165/1621 [20:38<06:38, 1.14it/s]
72%|ββββββββ | 1166/1621 [20:39<06:34, 1.15it/s]
72%|ββββββββ | 1167/1621 [20:39<06:31, 1.16it/s]
72%|ββββββββ | 1168/1621 [20:40<06:28, 1.17it/s]
72%|ββββββββ | |
| | 0: {'loss': 0.2951, 'grad_norm': 0.30433585143251524, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.72} |
| | 0: {'loss': 0.2873, 'grad_norm': 0.2914456090348807, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.73} |
| | 0: 1169/1621 [20:41<06:35, 1.14it/s]
72%|ββββββββ | 1170/1621 [20:42<06:31, 1.15it/s]
72%|ββββββββ | 1170/1621 [20:42<06:31, 1.15it/s]
72%|ββββββββ | 1171/1621 [20:43<06:32, 1.15it/s]
72%|ββββββββ | 1172/1621 [20:44<06:35, 1.14it/s]
72%|ββββββββ | 1173/1621 [20:45<06:30, 1.15it/s]
72%|ββββββββ | 1174/1621 [20:46<06:26, 1.16it/s]
72%|ββββββββ | 1175/1621 [20:46<06:22, 1.17it/s]
73%|ββββββββ | 1176/1621 [20:47<06:21, 1.17it/s]
73%|ββββββββ | 1177/1621 [20:48<06:23, 1.16it/s]
73%|ββββββββ | 1178/1621 [20:49<06:22, 1.16it/s]
73%|ββββββββ | 1179/1621 [20:50<06:25, 1.15it/s]
73%|ββββββββ | 1180/1621 [20:51<06:22, 1.15it/s]
73%|ββββββββ | 1180/1621 [20:51<06:22, 1.15it/s |
| | 0: {'loss': 0.2904, 'grad_norm': 0.29539333636292997, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.73} |
| | 0: ]
73%|ββββββββ | 1181/1621 [20:52<06:20, 1.16it/s]
73%|ββββββββ | 1182/1621 [20:52<06:20, 1.15it/s]
73%|ββββββββ | 1183/1621 [20:53<06:17, 1.16it/s]
73%|ββββββββ | 1184/1621 [20:54<06:15, 1.17it/s]
73%|ββββββββ | 1185/1621 [20:55<06:13, 1.17it/s]
73%|ββββββββ | 1186/1621 [20:56<06:12, 1.17it/s]
73%|ββββββββ | 1187/1621 [20:57<06:10, 1.17it/s]
73%|ββββββββ | 1188/1621 [20:58<06:10, 1.17it/s]
73%|ββββββββ | 1189/1621 [20:58<06:09, 1.17it/s]
73%|ββββββββ | 1190/1621 [20:59<06:08, 1.17it/s]
73%|ββββββββ | 1190/1621 [20:59<06:08, 1.17it/s]
73%|ββββββββ | 1191/1621 [21:00<06:09, 1.16it/s]
74%|ββββββββ | 1192/1621 [21:01<06:08, 1.17it/s]
74%|ββββββββ | 1193/1621 [21:02<06:08, 1.16it/s]
74%|ββββ |
| | 0: {'loss': 0.2846, 'grad_norm': 0.29449247766474873, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.74} |
| | 0: ββββ | 1194/1621 [21:03<06:07, 1.16it/s]
74%|ββββββββ | 1195/1621 [21:04<06:20, 1.12it/s]
74%|ββββββββ | 1196/1621 [21:05<06:15, 1.13it/s]
74%|ββββββββ | 1197/1621 [21:05<06:11, 1.14it/s]
74%|ββββββββ | 1198/1621 [21:06<06:21, 1.11it/s]
74%|ββββββββ | 1199/1621 [21:07<06:20, 1.11it/s]
74%|ββββββββ | 1200/1621 [21:08<06:13, 1.13it/s]
74%|ββββββββ | 1200/1621 [21:08<06:13, 1.13it/s]
74%|ββββββββ | 1201/1621 [21:09<06:21, 1.10it/s]
74%|ββββββββ | 1202/1621 [21:10<06:14, 1.12it/s]
74%|ββββββββ | 1203/1621 [21:11<06:11, 1.13it/s]
74%|ββββββββ | 1204/1621 [21:12<06:05, 1.14it/s]
74%|ββββββββ | 1205/1621 [21:13<06:02, 1.15it/s]
74%|ββββββββ | 1206/1621 [21:14<06:26, 1.07it/s]
74%|ββββββββ | 120 |
| | 0: {'loss': 0.2885, 'grad_norm': 0.2826708578899858, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.75} |
| | 0: 7/1621 [21:14<06:14, 1.11it/s]
75%|ββββββββ | 1208/1621 [21:15<06:09, 1.12it/s]
75%|ββββββββ | 1209/1621 [21:16<06:03, 1.13it/s]
75%|ββββββββ | 1210/1621 [21:17<06:00, 1.14it/s]
75%|ββββββββ | 1210/1621 [21:17<06:00, 1.14it/s]
75%|ββββββββ | 1211/1621 [21:18<06:03, 1.13it/s]
75%|ββββββββ | 1212/1621 [21:19<06:00, 1.13it/s]
75%|ββββββββ | 1213/1621 [21:20<05:56, 1.14it/s]
75%|ββββββββ | 1214/1621 [21:21<05:54, 1.15it/s]
75%|ββββββββ | 1215/1621 [21:21<06:00, 1.13it/s]
75%|ββββββββ | 1216/1621 [21:22<05:57, 1.13it/s]
75%|ββββββββ | 1217/1621 [21:23<05:54, 1.14it/s]
75%|ββββββββ | 1218/1621 [21:24<05:50, 1.15it/s]
75%|ββββββββ | 1219/1621 [21:25<05:47, 1.16it/s]
75%|ββββββββ | 1220/1621 [21:26<05:48 |
| | 0: {'loss': 0.2881, 'grad_norm': 0.29371514774816154, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.75} |
| | 0: {'loss': 0.2925, 'grad_norm': 0.3014384531364355, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.76} |
| | 0: , 1.15it/s]
75%|ββββββββ | 1220/1621 [21:26<05:48, 1.15it/s]
75%|ββββββββ | 1221/1621 [21:27<05:45, 1.16it/s]
75%|ββββββββ | 1222/1621 [21:27<05:42, 1.16it/s]
75%|ββββββββ | 1223/1621 [21:28<05:46, 1.15it/s]
76%|ββββββββ | 1224/1621 [21:29<05:44, 1.15it/s]
76%|ββββββββ | 1225/1621 [21:30<05:41, 1.16it/s]
76%|ββββββββ | 1226/1621 [21:31<05:38, 1.17it/s]
76%|ββββββββ | 1227/1621 [21:32<05:37, 1.17it/s]
76%|ββββββββ | 1228/1621 [21:33<05:38, 1.16it/s]
76%|ββββββββ | 1229/1621 [21:34<05:36, 1.16it/s]
76%|ββββββββ | 1230/1621 [21:34<05:33, 1.17it/s]
76%|ββββββββ | 1230/1621 [21:34<05:33, 1.17it/s]
76%|ββββββββ | 1231/1621 [21:35<05:32, 1.17it/s]
76%|βββββοΏ½ |
| | 0: {'loss': 0.2906, 'grad_norm': 0.3072217593018881, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.76} |
| | 0: οΏ½οΏ½ββ | 1232/1621 [21:36<05:37, 1.15it/s]
76%|ββββββββ | 1233/1621 [21:37<05:35, 1.16it/s]
76%|ββββββββ | 1234/1621 [21:38<05:34, 1.16it/s]
76%|ββββββββ | 1235/1621 [21:39<05:33, 1.16it/s]
76%|ββββββββ | 1236/1621 [21:40<05:32, 1.16it/s]
76%|ββββββββ | 1237/1621 [21:40<05:29, 1.16it/s]
76%|ββββββββ | 1238/1621 [21:41<05:55, 1.08it/s]
76%|ββββββββ | 1239/1621 [21:42<05:47, 1.10it/s]
76%|ββββββββ | 1240/1621 [21:43<05:43, 1.11it/s]
76%|ββββββββ | 1240/1621 [21:43<05:43, 1.11it/s]
77%|ββββββββ | 1241/1621 [21:44<05:39, 1.12it/s]
77%|ββββββββ | 1242/1621 [21:45<05:33, 1.14it/s]
77%|ββββββββ | 1243/1621 [21:46<05:31, 1.14it/s]
77%|ββββββββ | 1244/1621 [21:47<05:27, 1.15it/s]
77%|ββββββββ | 1245/16 |
| | 0: {'loss': 0.284, 'grad_norm': 0.3038905157918577, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.77} |
| | 0: 21 [21:48<05:34, 1.12it/s]
77%|ββββββββ | 1246/1621 [21:49<05:44, 1.09it/s]
77%|ββββββββ | 1247/1621 [21:49<05:35, 1.11it/s]
77%|ββββββββ | 1248/1621 [21:50<05:29, 1.13it/s]
77%|ββββββββ | 1249/1621 [21:51<05:29, 1.13it/s]
77%|ββββββββ | 1250/1621 [21:52<05:25, 1.14it/s]
77%|ββββββββ | 1250/1621 [21:52<05:25, 1.14it/s]
77%|ββββββββ | 1251/1621 [21:53<05:21, 1.15it/s]
77%|ββββββββ | 1252/1621 [21:54<05:18, 1.16it/s]
77%|ββββββββ | 1253/1621 [21:55<05:16, 1.16it/s]
77%|ββββββββ | 1254/1621 [21:55<05:16, 1.16it/s]
77%|ββββββββ | 1255/1621 [21:56<05:14, 1.17it/s]
77%|ββββββββ | 1256/1621 [21:57<05:14, 1.16it/s]
78%|ββββββββ | 1257/1621 [21:58<05:12, 1.17it/s]
78%|ββββββββ | 1258/1621 [21:59<05:09, 1 |
| | 0: {'loss': 0.29, 'grad_norm': 0.3050525690945563, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.78} |
| | 0: {'loss': 0.2953, 'grad_norm': 0.31129474618367203, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.78} |
| | 0: .17it/s]
78%|ββββββββ | 1259/1621 [22:00<05:10, 1.16it/s]
78%|ββββββββ | 1260/1621 [22:01<05:10, 1.16it/s]
78%|ββββββββ | 1260/1621 [22:01<05:10, 1.16it/s]
78%|ββββββββ | 1261/1621 [22:02<05:14, 1.14it/s]
78%|ββββββββ | 1262/1621 [22:02<05:11, 1.15it/s]
78%|ββββββββ | 1263/1621 [22:03<05:12, 1.14it/s]
78%|ββββββββ | 1264/1621 [22:04<05:09, 1.15it/s]
78%|ββββββββ | 1265/1621 [22:05<05:06, 1.16it/s]
78%|ββββββββ | 1266/1621 [22:06<05:04, 1.17it/s]
78%|ββββββββ | 1267/1621 [22:07<05:04, 1.16it/s]
78%|ββββββββ | 1268/1621 [22:08<05:05, 1.16it/s]
78%|ββββββββ | 1269/1621 [22:08<05:03, 1.16it/s]
78%|ββββββββ | 1270/1621 [22:09<05:04, 1.15it/s]
78%|ββββββοΏ½ |
| | 0: {'loss': 0.2891, 'grad_norm': 0.3171141586431303, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.79} |
| | 0: οΏ½β | 1270/1621 [22:09<05:04, 1.15it/s]
78%|ββββββββ | 1271/1621 [22:10<05:26, 1.07it/s]
78%|ββββββββ | 1272/1621 [22:11<05:16, 1.10it/s]
79%|ββββββββ | 1273/1621 [22:12<05:09, 1.12it/s]
79%|ββββββββ | 1274/1621 [22:13<05:04, 1.14it/s]
79%|ββββββββ | 1275/1621 [22:14<05:07, 1.13it/s]
79%|ββββββββ | 1276/1621 [22:15<05:05, 1.13it/s]
79%|ββββββββ | 1277/1621 [22:16<05:02, 1.14it/s]
79%|ββββββββ | 1278/1621 [22:16<04:58, 1.15it/s]
79%|ββββββββ | 1279/1621 [22:17<04:56, 1.15it/s]
79%|ββββββββ | 1280/1621 [22:18<04:54, 1.16it/s]
79%|ββββββββ | 1280/1621 [22:18<04:54, 1.16it/s]
79%|ββββββββ | 1281/1621 [22:19<04:53, 1.16it/s]
79%|ββββββββ | 1282/1621 [22:20<04:57, 1.14it/s]
79%|ββββββββ | 1283/1621 [ |
| | 0: {'loss': 0.2838, 'grad_norm': 0.2925938234274559, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.8} |
| | 0: 22:21<05:00, 1.13it/s]
79%|ββββββββ | 1284/1621 [22:22<04:55, 1.14it/s]
79%|ββββββββ | 1285/1621 [22:23<05:04, 1.10it/s]
79%|ββββββββ | 1286/1621 [22:24<05:00, 1.11it/s]
79%|ββββββββ | 1287/1621 [22:24<05:01, 1.11it/s]
79%|ββββββββ | 1288/1621 [22:25<04:57, 1.12it/s]
80%|ββββββββ | 1289/1621 [22:26<04:53, 1.13it/s]
80%|ββββββββ | 1290/1621 [22:27<04:50, 1.14it/s]
80%|ββββββββ | 1290/1621 [22:27<04:50, 1.14it/s]
80%|ββββββββ | 1291/1621 [22:28<04:59, 1.10it/s]
80%|ββββββββ | 1292/1621 [22:29<04:52, 1.12it/s]
80%|ββββββββ | 1293/1621 [22:30<04:48, 1.14it/s]
80%|ββββββββ | 1294/1621 [22:31<04:45, 1.15it/s]
80%|ββββββββ | 1295/1621 [22:31<04:43, 1.15it/s]
80%|ββββββββ | 1296/1621 [22:32<04:46, 1.14i |
| | 0: {'loss': 0.2905, 'grad_norm': 0.30092146988101065, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.8} |
| | 0: [2025-09-02 18:58:46,711] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0/checkpoint-1300[39m |
| | 0: [2025-09-02 18:58:49,199] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: t/s]
80%|ββββββββ | 1297/1621 [22:33<04:44, 1.14it/s]
80%|ββββββββ | 1298/1621 [22:34<04:41, 1.15it/s]
80%|ββββββββ | 1299/1621 [22:35<04:39, 1.15it/s]
80%|ββββββββ | 1300/1621 [22:36<04:38, 1.15it/s]
80%|ββββββββ | 1300/1621 [22:36<04:38, 1.15it/s]
80%|ββββββββ | 1301/1621 [22:41<11:51, 2.22s/it]
80%|ββββββββ | 1302/1621 [22:42<09:48, 1.85s/it]
80%|ββββββββ | 1303/1621 [22:43<08:13, 1.55s/it]
80%|ββββββββ | 1304/1621 [22:44<07:05, 1.34s/it]
81%|ββββββββ | 1305/1621 [22:45<06:37, 1.26s/it]
81%|ββββββββ | 1306/1621 [22:46<05:59, 1.14s/it]
81%|ββββββββ | 1307/1621 [22:47<05:32, 1.06s/it]
81%|ββββββββ | 1308/1621 [22:48<05:13, 1.00s/it]
81%|ββββββββ | 1309/1621 [22:48<04:58, 1.05it/s]
81%|βββ |
| | 0: {'loss': 0.2876, 'grad_norm': 0.30256147814489603, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.81} |
| | 0: {'loss': 0.2903, 'grad_norm': 0.29325662274638065, 'learning_rate': 7e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.81} |
| | 0: βββββ | 1310/1621 [22:49<04:47, 1.08it/s]
81%|ββββββββ | 1310/1621 [22:49<04:47, 1.08it/s]
81%|ββββββββ | 1311/1621 [22:50<04:40, 1.10it/s]
81%|ββββββββ | 1312/1621 [22:51<04:36, 1.12it/s]
81%|ββββββββ | 1313/1621 [22:52<04:31, 1.14it/s]
81%|ββββββββ | 1314/1621 [22:53<04:28, 1.14it/s]
81%|ββββββββ | 1315/1621 [22:54<04:26, 1.15it/s]
81%|ββββββββ | 1316/1621 [22:54<04:25, 1.15it/s]
81%|ββββββββ | 1317/1621 [22:55<04:24, 1.15it/s]
81%|βββββββββ | 1318/1621 [22:56<04:22, 1.15it/s]
81%|βββββββββ | 1319/1621 [22:57<04:21, 1.15it/s]
81%|βββββββββ | 1320/1621 [22:58<04:21, 1.15it/s]
81%|βββββββββ | 1320/1621 [22:58<04:21, 1.15it/s]
81%|βββββββββ | 1321/ |
| | 0: {'loss': 0.2853, 'grad_norm': 0.32226439329094486, 'learning_rate': 6.9889525066359386e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.82} |
| | 0: 1621 [22:59<04:22, 1.14it/s]
82%|βββββββββ | 1322/1621 [23:00<04:20, 1.15it/s]
82%|βββββββββ | 1323/1621 [23:00<04:18, 1.15it/s]
82%|βββββββββ | 1324/1621 [23:01<04:17, 1.15it/s]
82%|βββββββββ | 1325/1621 [23:02<04:15, 1.16it/s]
82%|βββββββββ | 1326/1621 [23:03<04:13, 1.16it/s]
82%|βββββββββ | 1327/1621 [23:04<04:13, 1.16it/s]
82%|βββββββββ | 1328/1621 [23:05<04:18, 1.13it/s]
82%|βββββββββ | 1329/1621 [23:06<04:15, 1.14it/s]
82%|βββββββββ | 1330/1621 [23:07<04:14, 1.14it/s]
82%|βββββββββ | 1330/1621 [23:07<04:14, 1.14it/s]
82%|βββββββββ | 1331/1621 [23:07<04:13, 1.15it/s]
82%|βββββββββ | 1332/1621 [23:08<04:14, 1.14it/s]
82%|βββββββββ | 1333/1621 [23:09<04:11, 1.15it/s]
82%|ββββββββοΏ½ |
| | 0: {'loss': 0.2837, 'grad_norm': 0.30545478926692277, 'learning_rate': 6.94420483979537e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.83} |
| | 0: οΏ½ | 1334/1621 [23:10<04:25, 1.08it/s]
82%|βββββββββ | 1335/1621 [23:11<04:23, 1.09it/s]
82%|βββββββββ | 1336/1621 [23:12<04:16, 1.11it/s]
82%|βββββββββ | 1337/1621 [23:13<04:11, 1.13it/s]
83%|βββββββββ | 1338/1621 [23:14<04:08, 1.14it/s]
83%|βββββββββ | 1339/1621 [23:15<04:12, 1.11it/s]
83%|βββββββββ | 1340/1621 [23:16<04:08, 1.13it/s]
83%|βββββββββ | 1340/1621 [23:16<04:08, 1.13it/s]
83%|βββββββββ | 1341/1621 [23:16<04:05, 1.14it/s]
83%|βββββββββ | 1342/1621 [23:17<04:05, 1.13it/s]
83%|βββββββββ | 1343/1621 [23:18<04:12, 1.10it/s]
83%|βββββββββ | 1344/1621 [23:19<04:06, 1.12it/s]
83%|βββββββββ | 1345/1621 [23:20<04:03, 1.13it/s]
83%|βββββββββ | 1346/1621 [23:21<04:00, 1.14it/s]
83%|βββββοΏ½ |
| | 0: {'loss': 0.2878, 'grad_norm': 0.3035523369868482, 'learning_rate': 6.865556417226012e-06, 'memory/max_mem_active(gib)': 32.71, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.83} |
| | 0: οΏ½βββ | 1347/1621 [23:22<03:58, 1.15it/s]
83%|βββββββββ | 1348/1621 [23:23<03:56, 1.16it/s]
83%|βββββββββ | 1349/1621 [23:23<03:53, 1.17it/s]
83%|βββββββββ | 1350/1621 [23:24<03:51, 1.17it/s]
83%|βββββββββ | 1350/1621 [23:24<03:51, 1.17it/s]
83%|βββββββββ | 1351/1621 [23:25<04:13, 1.06it/s]
83%|βββββββββ | 1352/1621 [23:26<04:05, 1.10it/s]
83%|βββββββββ | 1353/1621 [23:27<03:59, 1.12it/s]
84%|βββββββββ | 1354/1621 [23:28<03:57, 1.13it/s]
84%|βββββββββ | 1355/1621 [23:29<03:52, 1.14it/s]
84%|βββββββββ | 1356/1621 [23:30<03:51, 1.15it/s]
84%|βββββββββ | 1357/1621 [23:31<03:52, 1.13it/s]
84%|βββββββββ | 1358/1621 [23:31<03:49, 1.15it/s]
84%|βββββββββ | 1359/1621 [23:32<04:03, 1.08it/s]
84%|ββοΏ½ |
| | 0: {'loss': 0.285, 'grad_norm': 0.3004133597396245, 'learning_rate': 6.7538689275037765e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.84} |
| | 0: {'loss': 0.2822, 'grad_norm': 0.28270553303909407, 'learning_rate': 6.61036604213817e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.85} |
| | 0: οΏ½ββββββ | 1360/1621 [23:33<03:57, 1.10it/s]
84%|βββββββββ | 1360/1621 [23:33<03:57, 1.10it/s]
84%|βββββββββ | 1361/1621 [23:34<03:52, 1.12it/s]
84%|βββββββββ | 1362/1621 [23:35<03:47, 1.14it/s]
84%|βββββββββ | 1363/1621 [23:36<03:46, 1.14it/s]
84%|βββββββββ | 1364/1621 [23:37<03:54, 1.09it/s]
84%|βββββββββ | 1365/1621 [23:38<03:49, 1.12it/s]
84%|βββββββββ | 1366/1621 [23:39<03:48, 1.12it/s]
84%|βββββββββ | 1367/1621 [23:39<03:44, 1.13it/s]
84%|βββββββββ | 1368/1621 [23:40<03:45, 1.12it/s]
84%|βββββββββ | 1369/1621 [23:41<03:41, 1.14it/s]
85%|βββββββββ | 1370/1621 [23:42<03:38, 1.15it/s]
85%|βββββββββ | 1370/1621 [23:42<03:38, 1.15it/s]
85%|βββββοΏ½ |
| | 0: {'loss': 0.2879, 'grad_norm': 0.3047147743418751, 'learning_rate': 6.436620008771168e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.85} |
| | 0: οΏ½οΏ½βββ | 1371/1621 [23:43<03:47, 1.10it/s]
85%|βββββββββ | 1372/1621 [23:44<03:43, 1.11it/s]
85%|βββββββββ | 1373/1621 [23:45<03:40, 1.13it/s]
85%|βββββββββ | 1374/1621 [23:46<03:37, 1.13it/s]
85%|βββββββββ | 1375/1621 [23:47<03:35, 1.14it/s]
85%|βββββββββ | 1376/1621 [23:48<03:42, 1.10it/s]
85%|βββββββββ | 1377/1621 [23:48<03:42, 1.10it/s]
85%|βββββββββ | 1378/1621 [23:49<03:38, 1.11it/s]
85%|βββββββββ | 1379/1621 [23:50<03:34, 1.13it/s]
85%|βββββββββ | 1380/1621 [23:51<03:31, 1.14it/s]
85%|βββββββββ | 1380/1621 [23:51<03:31, 1.14it/s]
85%|βββββββββ | 1381/1621 [23:52<03:29, 1.15it/s]
85%|βββββββββ | 1382/1621 [23:53<03:27, 1.15it/s]
85%|βββββββββ | 1383/1621 [23:54<03:29, 1.13it/s]
85%|ββοΏ½ |
| | 0: {'loss': 0.2813, 'grad_norm': 0.29257326468017114, 'learning_rate': 6.234534425303033e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.86} |
| | 0: οΏ½οΏ½ββββββ | 1384/1621 [23:55<03:27, 1.14it/s]
85%|βββββββββ | 1385/1621 [23:55<03:26, 1.14it/s]
86%|βββββββββ | 1386/1621 [23:56<03:27, 1.13it/s]
86%|βββββββββ | 1387/1621 [23:57<03:24, 1.14it/s]
86%|βββββββββ | 1388/1621 [23:58<03:23, 1.14it/s]
86%|βββββββββ | 1389/1621 [23:59<03:21, 1.15it/s]
86%|βββββββββ | 1390/1621 [24:00<03:21, 1.15it/s]
86%|βββββββββ | 1390/1621 [24:00<03:21, 1.15it/s]
86%|βββββββββ | 1391/1621 [24:01<03:22, 1.13it/s]
86%|βββββββββ | 1392/1621 [24:02<03:20, 1.14it/s]
86%|βββββββββ | 1393/1621 [24:02<03:18, 1.15it/s]
86%|βββββββββ | 1394/1621 [24:03<03:17, 1.15it/s]
86%|βββββββββ | 1395/1621 [24:04<03:23, 1.11it/s]
86%|βββββββββ | 1396/1621 [24:05<03:19, 1.13it/s]
86 |
| | 0: {'loss': 0.2892, 'grad_norm': 0.29126232634431454, 'learning_rate': 6.006323383675369e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.86} |
| | 0: %|βββββββββ | 1397/1621 [24:06<03:15, 1.14it/s]
86%|βββββββββ | 1398/1621 [24:07<03:14, 1.15it/s]
86%|βββββββββ | 1399/1621 [24:08<03:12, 1.15it/s]
86%|βββββββββ | 1400/1621 [24:09<03:10, 1.16it/s]
86%|βββββββββ | 1400/1621 [24:09<03:10, 1.16it/s]
86%|βββββββββ | 1401/1621 [24:09<03:08, 1.16it/s]
86%|βββββββββ | 1402/1621 [24:10<03:07, 1.17it/s]
87%|βββββββββ | 1403/1621 [24:11<03:06, 1.17it/s]
87%|βββββββββ | 1404/1621 [24:12<03:05, 1.17it/s]
87%|βββββββββ | 1405/1621 [24:13<03:08, 1.15it/s]
87%|βββββββββ | 1406/1621 [24:14<03:08, 1.14it/s]
87%|βββββββββ | 1407/1621 [24:15<03:07, 1.14it/s]
87%|βββββββββ | 1408/1621 [24:16<03:19, 1.07it/s]
87%|βββββββββ | 1409/1621 [24:17<03:13, 1.09 |
| | 0: {'loss': 0.2822, 'grad_norm': 0.27692361391158343, 'learning_rate': 5.754487211816481e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.87} |
| | 0: {'loss': 0.2783, 'grad_norm': 0.2902902504363524, 'learning_rate': 5.48178507952536e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.88} |
| | 0: it/s]
87%|βββββββββ | 1410/1621 [24:17<03:10, 1.11it/s]
87%|βββββββββ | 1410/1621 [24:17<03:10, 1.11it/s]
87%|βββββββββ | 1411/1621 [24:18<03:05, 1.13it/s]
87%|βββββββββ | 1412/1621 [24:19<03:03, 1.14it/s]
87%|βββββββββ | 1413/1621 [24:20<03:03, 1.13it/s]
87%|βββββββββ | 1414/1621 [24:21<03:01, 1.14it/s]
87%|βββββββββ | 1415/1621 [24:22<02:59, 1.15it/s]
87%|βββββββββ | 1416/1621 [24:23<02:58, 1.15it/s]
87%|βββββββββ | 1417/1621 [24:24<02:59, 1.14it/s]
87%|βββββββββ | 1418/1621 [24:24<03:04, 1.10it/s]
88%|βββββββββ | 1419/1621 [24:25<03:00, 1.12it/s]
88%|βββββββββ | 1420/1621 [24:26<02:57, 1.13it/s]
88%|βββββββββ | 1420/1621 [24:26<02:57, 1.13it/s]
8 |
| | 0: {'loss': 0.2819, 'grad_norm': 0.301471028456177, 'learning_rate': 5.191204768429978e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.88} |
| | 0: 8%|βββββββββ | 1421/1621 [24:27<02:59, 1.12it/s]
88%|βββββββββ | 1422/1621 [24:28<02:55, 1.13it/s]
88%|βββββββββ | 1423/1621 [24:29<02:54, 1.14it/s]
88%|βββββββββ | 1424/1621 [24:30<02:52, 1.14it/s]
88%|βββββββββ | 1425/1621 [24:31<02:50, 1.15it/s]
88%|βββββββββ | 1426/1621 [24:31<02:48, 1.16it/s]
88%|βββββββββ | 1427/1621 [24:32<02:47, 1.16it/s]
88%|βββββββββ | 1428/1621 [24:33<02:51, 1.12it/s]
88%|βββββββββ | 1429/1621 [24:34<02:51, 1.12it/s]
88%|βββββββββ | 1430/1621 [24:35<02:48, 1.13it/s]
88%|βββββββββ | 1430/1621 [24:35<02:48, 1.13it/s]
88%|βββββββββ | 1431/1621 [24:36<02:45, 1.15it/s]
88%|βββββββββ | 1432/1621 [24:37<02:44, 1.15it/s]
88%|βββββββββ | 1433/1621 [24:38<02:53, 1.0 |
| | 0: {'loss': 0.2796, 'grad_norm': 0.30104829275719075, 'learning_rate': 4.885929937226537e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.89} |
| | 0: 8it/s]
88%|βββββββββ | 1434/1621 [24:39<02:48, 1.11it/s]
89%|βββββββββ | 1435/1621 [24:40<02:55, 1.06it/s]
89%|βββββββββ | 1436/1621 [24:41<02:51, 1.08it/s]
89%|βββββββββ | 1437/1621 [24:42<02:54, 1.05it/s]
89%|βββββββββ | 1438/1621 [24:42<02:49, 1.08it/s]
89%|βββββββββ | 1439/1621 [24:43<02:44, 1.11it/s]
89%|βββββββββ | 1440/1621 [24:44<02:41, 1.12it/s]
89%|βββββββββ | 1440/1621 [24:44<02:41, 1.12it/s]
89%|βββββββββ | 1441/1621 [24:45<02:38, 1.13it/s]
89%|βββββββββ | 1442/1621 [24:46<02:37, 1.14it/s]
89%|βββββββββ | 1443/1621 [24:47<02:35, 1.15it/s]
89%|βββββββββ | 1444/1621 [24:48<02:35, 1.14it/s]
89%|βββββββββ | 1445/1621 [24:48<02:35, 1.13it/s]
89%|βββββββββ | 1446/1621 [24:49<02 |
| | 0: {'loss': 0.289, 'grad_norm': 0.32313381078930165, 'learning_rate': 4.569305240848566e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.89} |
| | 0: :33, 1.14it/s]
89%|βββββββββ | 1447/1621 [24:50<02:30, 1.15it/s]
89%|βββββββββ | 1448/1621 [24:51<02:29, 1.16it/s]
89%|βββββββββ | 1449/1621 [24:52<02:30, 1.14it/s]
89%|βββββββββ | 1450/1621 [24:53<02:33, 1.12it/s]
89%|βββββββββ | 1450/1621 [24:53<02:33, 1.12it/s]
90%|βββββββββ | 1451/1621 [24:54<02:38, 1.08it/s]
90%|βββββββββ | 1452/1621 [24:55<02:33, 1.10it/s]
90%|βββββββββ | 1453/1621 [24:56<02:30, 1.12it/s]
90%|βββββββββ | 1454/1621 [24:56<02:27, 1.13it/s]
90%|βββββββββ | 1455/1621 [24:57<02:27, 1.13it/s]
90%|βββββββββ | 1456/1621 [24:58<02:25, 1.14it/s]
90%|βββββββββ | 1457/1621 [24:59<02:23, 1.14it/s]
90%|βββββββββ | 1458/1621 [25:00<02:22, 1.15it/s]
90%|βββββββββ | 1459/1621 |
| | 0: {'loss': 0.2819, 'grad_norm': 0.28369782040728675, 'learning_rate': 4.244799685727559e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.9} |
| | 0: {'loss': 0.285, 'grad_norm': 0.273992840302611, 'learning_rate': 3.9159686226325745e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.91} |
| | 0: [25:01<02:23, 1.13it/s]
90%|βββββββββ | 1460/1621 [25:02<02:22, 1.13it/s]
90%|βββββββββ | 1460/1621 [25:02<02:22, 1.13it/s]
90%|βββββββββ | 1461/1621 [25:03<02:19, 1.14it/s]
90%|βββββββββ | 1462/1621 [25:03<02:18, 1.15it/s]
90%|βββββββββ | 1463/1621 [25:04<02:16, 1.16it/s]
90%|βββββββββ | 1464/1621 [25:05<02:15, 1.16it/s]
90%|βββββββββ | 1465/1621 [25:06<02:14, 1.16it/s]
90%|βββββββββ | 1466/1621 [25:07<02:14, 1.15it/s]
90%|βββββββββ | 1467/1621 [25:08<02:14, 1.15it/s]
91%|βββββββββ | 1468/1621 [25:09<02:13, 1.15it/s]
91%|βββββββββ | 1469/1621 [25:10<02:13, 1.14it/s]
91%|βββββββββ | 1470/1621 [25:10<02:12, 1.14it/s]
91%|βββββββββ | 1470/1621 [25:10<0 |
| | 0: {'loss': 0.2798, 'grad_norm': 0.303660187115468, 'learning_rate': 3.586414793503207e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.91} |
| | 0: 2:12, 1.14it/s]
91%|βββββββββ | 1471/1621 [25:11<02:14, 1.12it/s]
91%|βββββββββ | 1472/1621 [25:12<02:11, 1.13it/s]
91%|βββββββββ | 1473/1621 [25:13<02:09, 1.15it/s]
91%|βββββββββ | 1474/1621 [25:14<02:07, 1.15it/s]
91%|βββββββββ | 1475/1621 [25:15<02:06, 1.15it/s]
91%|βββββββββ | 1476/1621 [25:16<02:05, 1.16it/s]
91%|βββββββββ | 1477/1621 [25:17<02:04, 1.16it/s]
91%|βββββββββ | 1478/1621 [25:17<02:04, 1.15it/s]
91%|βββββββββ | 1479/1621 [25:18<02:02, 1.15it/s]
91%|ββββββββββ| 1480/1621 [25:19<02:01, 1.16it/s]
91%|ββββββββββ| 1480/1621 [25:19<02:01, 1.16it/s]
91%|ββββββββββ| 1481/1621 [25:20<02:00, 1.16it/s]
91%|ββββββββββ| 1482/1621 [25:21<02:09, 1.07it/s]
91%|ββββββββββ| |
| | 0: {'loss': 0.2826, 'grad_norm': 0.2972873589703875, 'learning_rate': 3.2597488590549667e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.92} |
| | 0: 1483/1621 [25:22<02:05, 1.10it/s]
92%|ββββββββββ| 1484/1621 [25:23<02:02, 1.12it/s]
92%|ββββββββββ| 1485/1621 [25:24<02:05, 1.09it/s]
92%|ββββββββββ| 1486/1621 [25:25<02:07, 1.06it/s]
92%|ββββββββββ| 1487/1621 [25:26<02:02, 1.09it/s]
92%|ββββββββββ| 1488/1621 [25:26<01:58, 1.12it/s]
92%|ββββββββββ| 1489/1621 [25:27<01:57, 1.13it/s]
92%|ββββββββββ| 1490/1621 [25:28<01:55, 1.14it/s]
92%|ββββββββββ| 1490/1621 [25:28<01:55, 1.14it/s]
92%|ββββββββββ| 1491/1621 [25:29<01:53, 1.15it/s]
92%|ββββββββββ| 1492/1621 [25:30<01:51, 1.15it/s]
92%|ββββββββββ| 1493/1621 [25:31<01:50, 1.15it/s]
92%|ββββββββββ| 1494/1621 [25:32<01:50, 1.15it/s]
92%|ββββββββββ| 1495/1621 [25:33<01:49, 1.15it/s] |
| | 0: {'loss': 0.2789, 'grad_norm': 0.28996847808960213, 'learning_rate': 2.9395498396249143e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.93} |
| | 0:
92%|ββββββββββ| 1496/1621 [25:33<01:47, 1.16it/s]
92%|ββββββββββ| 1497/1621 [25:34<01:49, 1.13it/s]
92%|ββββββββββ| 1498/1621 [25:35<01:48, 1.13it/s]
92%|ββββββββββ| 1499/1621 [25:36<01:46, 1.14it/s]
93%|ββββββββββ| 1500/1621 [25:37<01:44, 1.15it/s]
93%|ββββββββββ| 1500/1621 [25:37<01:44, 1.15it/s]
93%|ββββββββββ| 1501/1621 [25:38<01:45, 1.14it/s]
93%|ββββββββββ| 1502/1621 [25:39<01:44, 1.14it/s]
93%|ββββββββββ| 1503/1621 [25:40<01:43, 1.13it/s]
93%|ββββββββββ| 1504/1621 [25:40<01:42, 1.14it/s]
93%|ββββββββββ| 1505/1621 [25:41<01:40, 1.15it/s]
93%|ββββββββββ| 1506/1621 [25:42<01:39, 1.16it/s]
93%|ββββββββββ| 1507/1621 [25:43<01:39, 1.15it/s]
93%|βββββββββοΏ½ |
| | 0: {'loss': 0.2848, 'grad_norm': 0.2966980326764614, 'learning_rate': 2.629325902675876e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.93} |
| | 0: οΏ½| 1508/1621 [25:44<01:37, 1.15it/s]
93%|ββββββββββ| 1509/1621 [25:45<01:36, 1.16it/s]
93%|ββββββββββ| 1510/1621 [25:46<01:35, 1.16it/s]
93%|ββββββββββ| 1510/1621 [25:46<01:35, 1.16it/s]
93%|ββββββββββ| 1511/1621 [25:46<01:34, 1.17it/s]
93%|ββββββββββ| 1512/1621 [25:47<01:33, 1.17it/s]
93%|ββββββββββ| 1513/1621 [25:48<01:38, 1.10it/s]
93%|ββββββββββ| 1514/1621 [25:49<01:43, 1.03it/s]
93%|ββββββββββ| 1515/1621 [25:50<01:39, 1.06it/s]
94%|ββββββββββ| 1516/1621 [25:51<01:36, 1.09it/s]
94%|ββββββββββ| 1517/1621 [25:52<01:34, 1.10it/s]
94%|ββββββββββ| 1518/1621 [25:53<01:33, 1.11it/s]
94%|ββββββββββ| 1519/1621 [25:54<01:30, 1.13it/s]
94%|ββββββββββ| 1520/1621 [25:55<01:28, 1.14it/ |
| | 0: {'loss': 0.2788, 'grad_norm': 0.2671505339196964, 'learning_rate': 2.3324759265795965e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.94} |
| | 0: {'loss': 0.2899, 'grad_norm': 0.28163545935283396, 'learning_rate': 2.0522522617940406e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.94} |
| | 0: s]
94%|ββββββββββ| 1520/1621 [25:55<01:28, 1.14it/s]
94%|ββββββββββ| 1521/1621 [25:56<01:27, 1.14it/s]
94%|ββββββββββ| 1522/1621 [25:56<01:25, 1.16it/s]
94%|ββββββββββ| 1523/1621 [25:57<01:24, 1.16it/s]
94%|ββββββββββ| 1524/1621 [25:58<01:25, 1.13it/s]
94%|ββββββββββ| 1525/1621 [25:59<01:23, 1.14it/s]
94%|ββββββββββ| 1526/1621 [26:00<01:23, 1.14it/s]
94%|ββββββββββ| 1527/1621 [26:01<01:22, 1.14it/s]
94%|ββββββββββ| 1528/1621 [26:02<01:22, 1.13it/s]
94%|ββββββββββ| 1529/1621 [26:03<01:24, 1.09it/s]
94%|ββββββββββ| 1530/1621 [26:04<01:22, 1.10it/s]
94%|ββββββββββ| 1530/1621 [26:04<01:22, 1.10it/s]
94%|ββββββββββ| 1531/1621 [26: |
| | 0: {'loss': 0.2877, 'grad_norm': 0.29425238378168045, 'learning_rate': 1.7917250974311677e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.95} |
| | 0: 04<01:20, 1.12it/s]
95%|ββββββββββ| 1532/1621 [26:05<01:18, 1.14it/s]
95%|ββββββββββ| 1533/1621 [26:06<01:16, 1.15it/s]
95%|ββββββββββ| 1534/1621 [26:07<01:15, 1.16it/s]
95%|ββββββββββ| 1535/1621 [26:08<01:18, 1.10it/s]
95%|ββββββββββ| 1536/1621 [26:09<01:15, 1.12it/s]
95%|ββββββββββ| 1537/1621 [26:10<01:15, 1.11it/s]
95%|ββββββββββ| 1538/1621 [26:11<01:14, 1.12it/s]
95%|ββββββββββ| 1539/1621 [26:11<01:12, 1.13it/s]
95%|ββββββββββ| 1540/1621 [26:12<01:10, 1.15it/s]
95%|ββββββββββ| 1540/1621 [26:12<01:10, 1.15it/s]
95%|ββββββββββ| 1541/1621 [26:13<01:12, 1.11it/s]
95%|ββββββββββ| 1542/1621 [26:14<01:10, 1.11it/s]
95%|ββββββββββ| 1543/1621 [26:15<01:09, 1.12it/s]
95%|βββ |
| | 0: {'loss': 0.2838, 'grad_norm': 0.2694113241241066, 'learning_rate': 1.5537488236225542e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.96} |
| | 0: βββββββ| 1544/1621 [26:16<01:07, 1.14it/s]
95%|ββββββββββ| 1545/1621 [26:17<01:06, 1.15it/s]
95%|ββββββββββ| 1546/1621 [26:18<01:05, 1.15it/s]
95%|ββββββββββ| 1547/1621 [26:19<01:05, 1.13it/s]
95%|ββββββββββ| 1548/1621 [26:19<01:04, 1.13it/s]
96%|ββββββββββ| 1549/1621 [26:20<01:02, 1.14it/s]
96%|ββββββββββ| 1550/1621 [26:21<01:01, 1.15it/s]
96%|ββββββββββ| 1550/1621 [26:21<01:01, 1.15it/s]
96%|ββββββββββ| 1551/1621 [26:22<01:00, 1.15it/s]
96%|ββββββββββ| 1552/1621 [26:23<01:01, 1.13it/s]
96%|ββββββββββ| 1553/1621 [26:24<00:59, 1.14it/s]
96%|ββββββββββ| 1554/1621 [26:25<00:59, 1.13it/s]
96%|ββββββββββ| 1555/1621 [26:26<00:57, 1.14it/s]
96%|ββββββββββ| 1556/1621 [2 |
| | 0: {'loss': 0.2834, 'grad_norm': 0.2739383897655746, 'learning_rate': 1.340930758223782e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.96} |
| | 0: 6:26<00:56, 1.15it/s]
96%|ββββββββββ| 1557/1621 [26:27<00:55, 1.16it/s]
96%|ββββββββββ| 1558/1621 [26:28<00:55, 1.15it/s]
96%|ββββββββββ| 1559/1621 [26:29<00:55, 1.12it/s]
96%|ββββββββββ| 1560/1621 [26:30<00:53, 1.14it/s]
96%|ββββββββββ| 1560/1621 [26:30<00:53, 1.14it/s]
96%|ββββββββββ| 1561/1621 [26:31<00:54, 1.10it/s]
96%|ββββββββββ| 1562/1621 [26:32<00:53, 1.11it/s]
96%|ββββββββββ| 1563/1621 [26:33<00:51, 1.12it/s]
96%|ββββββββββ| 1564/1621 [26:33<00:50, 1.14it/s]
97%|ββββββββββ| 1565/1621 [26:34<00:48, 1.15it/s]
97%|ββββββββββ| 1566/1621 [26:35<00:47, 1.15it/s]
97%|ββββββββββ| 1567/1621 [26:36<00:46, 1.16it/s]
97%|ββββββββββ| 1568/1621 [26:37<00:48, 1.09it/s]
97%|ββοΏ½ |
| | 0: {'loss': 0.2845, 'grad_norm': 0.2793598127461013, 'learning_rate': 1.1556025804944043e-06, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.97} |
| | 0: οΏ½οΏ½βββββββ| 1569/1621 [26:38<00:47, 1.10it/s]
97%|ββββββββββ| 1570/1621 [26:39<00:45, 1.13it/s]
97%|ββββββββββ| 1570/1621 [26:39<00:45, 1.13it/s]
97%|ββββββββββ| 1571/1621 [26:40<00:43, 1.14it/s]
97%|ββββββββββ| 1572/1621 [26:41<00:43, 1.13it/s]
97%|ββββββββββ| 1573/1621 [26:41<00:41, 1.15it/s]
97%|ββββββββββ| 1574/1621 [26:42<00:40, 1.16it/s]
97%|ββββββββββ| 1575/1621 [26:43<00:39, 1.16it/s]
97%|ββββββββββ| 1576/1621 [26:44<00:39, 1.15it/s]
97%|ββββββββββ| 1577/1621 [26:45<00:38, 1.15it/s]
97%|ββββββββββ| 1578/1621 [26:46<00:37, 1.15it/s]
97%|ββββββββββ| 1579/1621 [26:47<00:36, 1.16it/s]
97%|ββββββββββ| 1580/1621 [26:47<00:35, 1.16it/s]
|
| | 0: {'loss': 0.2878, 'grad_norm': 0.27156348472491937, 'learning_rate': 9.99794784732039e-07, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.97} |
| | 0: {'loss': 0.2781, 'grad_norm': 0.2638303305592663, 'learning_rate': 8.752144337519349e-07, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.98} |
| | 0:
97%|ββββββββββ| 1580/1621 [26:47<00:35, 1.16it/s]
98%|ββββββββββ| 1581/1621 [26:49<00:36, 1.08it/s]
98%|ββββββββββ| 1582/1621 [26:49<00:35, 1.11it/s]
98%|ββββββββββ| 1583/1621 [26:50<00:34, 1.10it/s]
98%|ββββββββββ| 1584/1621 [26:51<00:33, 1.12it/s]
98%|ββββββββββ| 1585/1621 [26:52<00:31, 1.14it/s]
98%|ββββββββββ| 1586/1621 [26:53<00:30, 1.15it/s]
98%|ββββββββββ| 1587/1621 [26:54<00:29, 1.15it/s]
98%|ββββββββββ| 1588/1621 [26:55<00:28, 1.16it/s]
98%|ββββββββββ| 1589/1621 [26:55<00:27, 1.16it/s]
98%|ββββββββββ| 1590/1621 [26:56<00:27, 1.14it/s]
98%|ββββββββββ| 1590/1621 [26:56<00:27, 1.14it/s]
98%|ββββββββββ| 1591/1621 [26:57<00:26, 1.14it/s]
98%|ββββββββ |
| | 0: {'loss': 0.2831, 'grad_norm': 0.27452303324042504, 'learning_rate': 7.832264559495451e-07, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.99} |
| | 0: ββ| 1592/1621 [26:58<00:25, 1.15it/s]
98%|ββββββββββ| 1593/1621 [26:59<00:24, 1.15it/s]
98%|ββββββββββ| 1594/1621 [27:00<00:23, 1.15it/s]
98%|ββββββββββ| 1595/1621 [27:01<00:22, 1.16it/s]
98%|ββββββββββ| 1596/1621 [27:02<00:22, 1.10it/s]
99%|ββββββββββ| 1597/1621 [27:03<00:21, 1.12it/s]
99%|ββββββββββ| 1598/1621 [27:03<00:20, 1.12it/s]
99%|ββββββββββ| 1599/1621 [27:04<00:19, 1.13it/s]
99%|ββββββββββ| 1600/1621 [27:05<00:18, 1.12it/s]
99%|ββββββββββ| 1600/1621 [27:05<00:18, 1.12it/s]
99%|ββββββββββ| 1601/1621 [27:06<00:17, 1.14it/s]
99%|ββββββββββ| 1602/1621 [27:07<00:16, 1.14it/s]
99%|ββββββββββ| 1603/1621 [27:08<00:15, 1.15it/s]
99%|ββββββββββ| 1604/1621 [27:09<00:14, 1. |
| | 0: {'loss': 0.2921, 'grad_norm': 0.2764208630314015, 'learning_rate': 7.248386908593951e-07, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 0.99} |
| | 0: 14it/s]
99%|ββββββββββ| 1605/1621 [27:10<00:13, 1.15it/s]
99%|ββββββββββ| 1606/1621 [27:10<00:13, 1.15it/s]
99%|ββββββββββ| 1607/1621 [27:11<00:12, 1.16it/s]
99%|ββββββββββ| 1608/1621 [27:12<00:11, 1.16it/s]
99%|ββββββββββ| 1609/1621 [27:13<00:10, 1.15it/s]
99%|ββββββββββ| 1610/1621 [27:14<00:09, 1.15it/s]
99%|ββββββββββ| 1610/1621 [27:14<00:09, 1.15it/s]
99%|ββββββββββ| 1611/1621 [27:15<00:08, 1.15it/s]
99%|ββββββββββ| 1612/1621 [27:16<00:07, 1.16it/s]
100%|ββββββββββ| 1613/1621 [27:16<00:06, 1.15it/s]
100%|ββββββββββ| 1614/1621 [27:17<00:06, 1.10it/s]
100%|ββββββββββ| 1615/1621 [27:18<00:05, 1.10it/s]
100%|ββββββββββ| 1616/1621 [27:19<00:04, 1.12it/s]
100%|βββββββοΏ½ |
| | 0: {'loss': 0.283, 'grad_norm': 0.26571883451523387, 'learning_rate': 7.006908470542366e-07, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 1.0} |
| | 0: [2025-09-02 19:03:37,520] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0/checkpoint-1621[39m |
| | 0: [2025-09-02 19:03:40,044] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: {'train_runtime': 1651.6754, 'train_samples_per_second': 15.703, 'train_steps_per_second': 0.981, 'train_loss': 0.2972346406744852, 'memory/max_mem_active(gib)': 32.97, 'memory/max_mem_allocated(gib)': 32.71, 'memory/device_mem_reserved(gib)': 38.82, 'epoch': 1.0} |
| | 0: οΏ½οΏ½ββ| 1617/1621 [27:20<00:03, 1.14it/s]
100%|ββββββββββ| 1618/1621 [27:21<00:02, 1.15it/s]
100%|ββββββββββ| 1619/1621 [27:22<00:01, 1.15it/s]
100%|ββββββββββ| 1620/1621 [27:23<00:00, 1.16it/s]
100%|ββββββββββ| 1620/1621 [27:23<00:00, 1.16it/s]
100%|ββββββββββ| 1621/1621 [27:27<00:00, 1.79s/it]
100%|ββββββββββ| 1621/1621 [27:31<00:00, 1.79s/it]
100%|ββββββββββ| 1621/1621 [27:31<00:00, 1.02s/it] |
| | 0: [2025-09-02 19:03:41,703] [INFO] [axolotl.train.save_trained_model:228] [PID:801836] [RANK:0] Training completed! Saving trained model to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0.[39m |
| | 0: [2025-09-02 19:03:42,150] [INFO] [axolotl.core.trainers.base._save:613] [PID:801836] [RANK:0] Saving model checkpoint to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0[39m |
| | 0: [2025-09-02 19:03:44,635] [INFO] [axolotl.core.trainers.base._save:662] [PID:801836] [RANK:0] Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`[39m |
| | 0: [2025-09-02 19:03:44,822] [INFO] [axolotl.train.save_trained_model:350] [PID:801836] [RANK:0] Model successfully saved to /lustre/fswork/projects/rech/dgo/udv55np/math/Qwen3-235B-A22B/Qwen2.5-1.5B_ift/0[39m |
| | |