# Patch metadata preserved from the original diff:
# diff --git a/log/2026-02-01_113215 - profiler traces.zip b/log/2026-02-01_113215 - profiler traces.zip
# new file mode 100644
# index 00000000..ee356d57
# Binary files /dev/null and b/log/2026-02-01_113215 - profiler traces.zip differ
# diff --git a/log/base_train-gc-fixes.py b/log/base_train-gc-fixes.py
# new file mode 100644
# index 00000000..9cab2215
# --- /dev/null
# +++ b/log/base_train-gc-fixes.py
# @@ -0,0 +1,507 @@
"""
Train model. From root directory of the project, run as:

python -m scripts.base_train

or distributed as:

torchrun --nproc_per_node=8 -m scripts.base_train

If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Example:
python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20
"""

import os
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"  # set before torch is imported further down
import argparse
import time
import gc
from contextlib import nullcontext

# GC timing callback to detect if garbage collection is causing training stalls
def _gc_callback(phase, info):
    """
    Time one garbage collection and print a line when it was slow.

    Intended to be appended to `gc.callbacks`, which invokes it with
    phase == "start" before a collection and phase == "stop" after it.

    Args:
        phase: "start" or "stop"; any other value is ignored.
        info: dict supplied by the gc module. 'generation' is read
            unconditionally on a slow "stop"; 'collected' is optional.

    State is kept on function attributes:
        start_time:   perf_counter() of the pending "start", or None.
        threshold_ms: only collections slower than this are printed.
        rank:         label used in the log line.
    """
    if phase == "start":
        _gc_callback.start_time = time.perf_counter()
        return
    if phase != "stop":
        return
    # Consume the start marker so a stray "stop" can never be measured
    # against a stale (or never-set) timestamp.
    started = _gc_callback.start_time
    _gc_callback.start_time = None
    if started is None:
        return
    duration_ms = (time.perf_counter() - started) * 1000
    if duration_ms > _gc_callback.threshold_ms:  # Only log slow collections
        rank = getattr(_gc_callback, 'rank', '?')
        print(f"[GC rank{rank}] gen{info['generation']}: {duration_ms:.1f}ms collected {info.get('collected', '?')} objects")
_gc_callback.start_time = None  # no collection in flight yet
_gc_callback.threshold_ms = 10.0  # report collections that take longer than 10ms
_gc_callback.rank = '?'  # placeholder until the process rank is known
# (_gc_callback.rank is a placeholder here; it will be set after compute_init)
# Register the timing hook so every collection in this process is measured.
gc.callbacks.append(_gc_callback)

import wandb
import torch

from nanochat.gpt import GPT, GPTConfig
from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_peak_flops
from nanochat.tokenizer import get_tokenizer, get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine
from nanochat.flash_attention import HAS_FA3
from scripts.base_eval import evaluate_core
print_banner()

# -----------------------------------------------------------------------------
# CLI arguments
parser = argparse.ArgumentParser(description="Pretrain base model")
# Logging
parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)")
# Runtime
parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)")
# Model architecture
parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model")
parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio")
parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
# Training horizon (only one used, in order of precedence)
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
parser.add_argument("--target-param-data-ratio", type=float, default=10.5, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)")
# Optimization
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)")
parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)")
parser.add_argument("--weight-decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)")
parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)")
parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)")
parser.add_argument("--adam-beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding")
parser.add_argument("--adam-beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding")
parser.add_argument("--warmup-ratio", type=float, default=0.0, help="ratio of iterations for LR warmup")
parser.add_argument("--warmdown-ratio", type=float, default=0.5, help="ratio of iterations for LR warmdown")
parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR as fraction of initial LR")
parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)")
# Evaluation
parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)")
parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on")
parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)")
parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric")
parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)")
parser.add_argument("--save-every", type=int, default=-1, help="save checkpoints every N steps (-1 = only at end)")
# Output
parser.add_argument("--model-tag", type=str, default=None, help="override model tag for checkpoint directory name")
args = parser.parse_args()
user_config = vars(args).copy() # for logging
# -----------------------------------------------------------------------------

# Compute init
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
_gc_callback.rank = ddp_rank # Store rank for GC log printouts
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
if device_type == "cuda":
    gpu_device_name = torch.cuda.get_device_name(0)
    gpu_peak_flops = get_peak_flops(gpu_device_name)
    print0(f"GPU: {gpu_device_name} | Peak FLOPS (BF16): {gpu_peak_flops:.2e}")
else:
    gpu_peak_flops = float('inf') # MFU not meaningful for CPU/MPS

# wandb logging init
use_dummy_wandb = args.run == "dummy" or not master_process
if use_dummy_wandb:
    wandb_run = DummyWandb()
else:
    try:
        wandb_run = wandb.init(project="nanochat", name=args.run, config=user_config)
    except wandb.errors.UsageError as e:
        print0(f"Warning: wandb initialization failed ({e}), logging disabled. Run 'wandb login' to enable.")
        wandb_run = DummyWandb()

# Flash Attention status
if HAS_FA3:
    print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.")
else:
    print0("!" * 80)
    print0("WARNING: Flash Attention 3 not available, using PyTorch SDPA fallback")
    print0("WARNING: Training will be less efficient without FA3")
    if args.window_pattern != "L":
        print0(f"WARNING: SDPA has no support for sliding window attention (window_pattern='{args.window_pattern}'). Your GPU utilization will be terrible.")
        print0("WARNING: Recommend using --window-pattern L for full context attention without alternating sliding window patterns.")
    print0("!" * 80)

# Tokenizer will be useful for evaluation, also we need the vocab size
tokenizer = get_tokenizer()
token_bytes = get_token_bytes(device=device)
vocab_size = tokenizer.get_vocab_size()
print0(f"Vocab size: {vocab_size:,}")

# Model kwargs are derived from the desired depth of the model
# We nudge model_dim up to the nearest multiple of head_dim to ensure clean division
# (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly)
# (For very small depths, this gives a slight "unfair" advantage to models with odd depths)
num_layers = args.depth
base_dim = args.depth * args.aspect_ratio
model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim
num_heads = model_dim // args.head_dim
num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. GQA is disabled)
head_dim = model_dim // num_heads
print0(f"num_layers: {num_layers}")
print0(f"model_dim: {model_dim} (base: {base_dim}, nudge: {model_dim - base_dim:+d})")
print0(f"num_heads: {num_heads}")
print0(f"head_dim: {head_dim}")
print0(f"num_kv_heads: {num_kv_heads}")

# Optimizer / data / training length related hyperparameters
# figure out the needed gradient accumulation to reach the desired total batch size
tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
assert args.total_batch_size % world_tokens_per_fwdbwd == 0
grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd
print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}")
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")

# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19)
batch_lr_scale = 1.0
reference_batch_size = 2**19
batch_ratio = args.total_batch_size / reference_batch_size
if batch_ratio != 1.0:
    # SGD: linear scaling with batch size is standard (not used in nanochat)
    # AdamW: sqrt scaling is standard
    # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer
    batch_lr_scale = batch_ratio ** 0.5
    print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})")

# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio)
weight_decay_scaled = args.weight_decay * (12 / args.depth)**2
if args.depth != 12:
    print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}")

# -----------------------------------------------------------------------------
# Initialize the Model

# Create a new model with random weights
model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim, window_pattern=args.window_pattern)
with torch.device("meta"):
    # All tensors are created as meta tensors (they have shape/dtype but no data)
    model_config = GPTConfig(**model_config_kwargs)
    model = GPT(model_config)
model.to_empty(device=device) # All tensors get storage on target device but with uninitialized (garbage) data
model.init_weights() # All tensors get initialized

# If we are resuming, overwrite the model parameters with those of the checkpoint
base_dir = get_base_dir()
output_dirname = args.model_tag if args.model_tag else f"d{args.depth}" # e.g. d12
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
resuming = args.resume_from_step != -1
if resuming:
    print0(f"Resuming optimization from step {args.resume_from_step}")
    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, args.resume_from_step, device, load_optimizer=True, rank=ddp_rank)
    model.load_state_dict(model_data, strict=True, assign=True)
    del model_data # free up this memory after the copy

orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape)
model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe

# Detailed parameter counts
param_counts = orig_model.num_scaling_params()
print0(f"Parameter counts:")
for key, value in param_counts.items():
    print0(f"{key:24s}: {value:,}")
num_params = param_counts['total']
num_scaling_params = param_counts['transformer_matrices'] + param_counts['lm_head'] # determined to give the cleanest scaling laws, see dev/LOG.md Jan 27, 2026
num_flops_per_token = model.estimate_flops()
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")

# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0
if args.num_iterations > 0:
    num_iterations = args.num_iterations
    print0(f"Using user-provided number of iterations: {num_iterations:,}")
elif args.target_flops > 0:
    # calculate the number of iterations from the target flops
    num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size))
    print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}")
elif args.target_param_data_ratio > 0:
    # calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.)
    target_tokens = int(args.target_param_data_ratio * num_scaling_params)
    num_iterations = target_tokens // args.total_batch_size
    print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}")
else:
    raise ValueError("No training horizon specified")
total_tokens = args.total_batch_size * num_iterations
print0(f"Total number of training tokens: {total_tokens:,}")
print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20
print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")

# -----------------------------------------------------------------------------
# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest)
adam_betas = (args.adam_beta1, args.adam_beta2)
optimizer = model.setup_optimizer(
    unembedding_lr=args.unembedding_lr * batch_lr_scale,
    embedding_lr=args.embedding_lr * batch_lr_scale,
    matrix_lr=args.matrix_lr * batch_lr_scale,
    weight_decay=weight_decay_scaled,
    adam_betas=adam_betas,
    scalar_lr=args.scalar_lr * batch_lr_scale,
)

if resuming:
    optimizer.load_state_dict(optimizer_data)
    del optimizer_data

# -----------------------------------------------------------------------------
# Initialize the DataLoaders for train/val
dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device)
x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data

# -----------------------------------------------------------------------------
# Set up hyperparameter schedulers

# Learning rate scheduler
def get_lr_multiplier(it):
    """Return the LR multiplier for step `it`: linear warmup, flat, then linear warmdown to final_lr_frac."""
    warmup_iters = round(args.warmup_ratio * num_iterations)
    warmdown_iters = round(args.warmdown_ratio * num_iterations)
    if it < warmup_iters:
        return (it + 1) / warmup_iters
    elif it <= num_iterations - warmdown_iters:
        return 1.0
    else:
        progress = (num_iterations - it) / warmdown_iters
        return progress * 1.0 + (1 - progress) * args.final_lr_frac

# Momentum scheduler for Muon optimizer
def get_muon_momentum(it):
    """Return the Muon momentum for step `it`: ramps linearly from 0.85 to 0.95 over the first 300 steps."""
    frac = min(it / 300, 1)
    momentum = (1 - frac) * 0.85 + frac * 0.95
    return momentum

# Weight decay scheduler for Muon optimizer (linear to zero over the course of training)
def get_weight_decay(it):
    """Return the Muon weight decay for step `it`, decaying linearly from weight_decay_scaled to zero."""
    return weight_decay_scaled * (1 - it / num_iterations)

# -----------------------------------------------------------------------------
# Loop state (variables updated by the training loop)

# mfu is normally assigned inside a training step, but the final report reads it
# unconditionally. Define it up front so a run that performs zero training steps
# (e.g. resuming from the final checkpoint) does not end in a NameError.
mfu = 0.0

if not resuming:
    step = 0
    val_bpb = None # will be set if eval_every > 0
    min_val_bpb = float("inf")
    smooth_train_loss = 0 # EMA of training loss
    total_training_time = 0 # total wall-clock time of training
else:
    step = meta_data["step"]
    loop_state = meta_data["loop_state"]
    val_bpb = meta_data["val_bpb"]
    min_val_bpb = loop_state["min_val_bpb"]
    smooth_train_loss = loop_state["smooth_train_loss"]
    total_training_time = loop_state["total_training_time"]

# -----------------------------------------------------------------------------
# Training loop
while True:
    last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
    flops_so_far = num_flops_per_token * args.total_batch_size * step

    # once in a while: evaluate the val bpb (all ranks participate)
    if args.eval_every > 0 and (last_step or step % args.eval_every == 0):
        model.eval()
        val_loader = build_val_loader()
        eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size)
        with autocast_ctx:
            val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
        print0(f"Step {step:05d} | Validation bpb: {val_bpb:.6f}")
        if val_bpb < min_val_bpb:
            min_val_bpb = val_bpb
        wandb_run.log({
            "step": step,
            "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
            "val/bpb": val_bpb,
        })
        model.train()

    # once in a while: estimate the CORE metric (all ranks participate)
    # use the original uncompiled model because the inputs keep changing shape
    results = {}
    if args.core_metric_every > 0 and (last_step or (step > 0 and step % args.core_metric_every == 0)):
        model.eval()
        with autocast_ctx:
            results = evaluate_core(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task)
        print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}")
        wandb_run.log({
            "step": step,
            "total_training_flops": flops_so_far,
            "core_metric": results["core_metric"],
            "centered_results": results["centered_results"],
        })
        model.train()

    # once in a while: sample from the model (only on master process)
    # use the original uncompiled model because the inputs keep changing shape
    if args.sample_every > 0 and master_process and (last_step or (step > 0 and step % args.sample_every == 0)):
        model.eval()
        prompts = [
            "The capital of France is",
            "The chemical symbol of gold is",
            "If yesterday was Friday, then tomorrow will be",
            "The opposite of hot is",
            "The planets of the solar system are:",
            "My favorite color is",
            "If 5*x + 3 = 13, then x is",
        ]
        engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
        for prompt in prompts:
            tokens = tokenizer(prompt, prepend="<|bos|>")
            with autocast_ctx:
                sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
            print0(tokenizer.decode(sample[0]))
        model.train()

    # save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
    if last_step or (step > 0 and step != args.resume_from_step and args.save_every > 0 and step % args.save_every == 0):
        save_checkpoint(
            checkpoint_dir,
            step,
            orig_model.state_dict(), # model parameters
            optimizer.state_dict(), # optimizer state
            { # metadata saved as json
                "step": step,
                "val_bpb": val_bpb, # loss at last step
                "model_config": model_config_kwargs,
                "user_config": user_config, # inputs to the training script
                "device_batch_size": args.device_batch_size,
                "max_seq_len": args.max_seq_len,
                "dataloader_state_dict": dataloader_state_dict,
                "loop_state": { # all loop state (other than step) so that we can resume training
                    "min_val_bpb": min_val_bpb,
                    "smooth_train_loss": smooth_train_loss,
                    "total_training_time": total_training_time,
                },
            },
            rank=ddp_rank,
        )

    # termination conditions (TODO: possibly also add loss explosions etc.)
    if last_step:
        break

    # -------------------------------------------------------------------------
    # single training step
    # evaluate the gradient
    synchronize()
    t0 = time.time()
    for micro_step in range(grad_accum_steps):
        with autocast_ctx:
            loss = model(x, y)
        train_loss = loss.detach() # for logging
        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
        loss.backward()
        x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
    # step the optimizer
    lrm = get_lr_multiplier(step)
    muon_momentum = get_muon_momentum(step)
    muon_weight_decay = get_weight_decay(step)
    for group in optimizer.param_groups:
        group["lr"] = group["initial_lr"] * lrm
        if group['kind'] == 'muon':
            group["momentum"] = muon_momentum
            group["weight_decay"] = muon_weight_decay
    optimizer.step()
    model.zero_grad(set_to_none=True)
    train_loss_f = train_loss.item() # .item() is a CPU-GPU sync point
    synchronize()
    t1 = time.time()
    dt = t1 - t0
    # -------------------------------------------------------------------------

    # logging (CPU action only)
    ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
    smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f # EMA the training loss
    debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
    pct_done = 100 * step / num_iterations
    tok_per_sec = int(args.total_batch_size / dt)
    flops_per_sec = num_flops_per_token * args.total_batch_size / dt
    mfu = 100 * flops_per_sec / (gpu_peak_flops * ddp_world_size)
    if step > 10:
        total_training_time += dt # only count the time after the first 10 steps
    # Calculate ETA based on average time per step (excluding first 10 steps)
    steps_done = step - 10
    if steps_done > 0:
        avg_time_per_step = total_training_time / steps_done
        remaining_steps = num_iterations - step
        eta_seconds = remaining_steps * avg_time_per_step
        eta_str = f" | eta: {eta_seconds/60:.1f}m"
    else:
        eta_str = ""
    epoch = dataloader_state_dict["epoch"]
    print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}")
    if step % 100 == 0:
        log_data = {
            "step": step,
            "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
            "train/loss": debiased_smooth_loss,
            "train/lrm": lrm,
            "train/dt": dt,
            "train/tok_per_sec": tok_per_sec,
            "train/mfu": mfu,
            "train/epoch": epoch,
        }
        wandb_run.log(log_data)

    # Set 'first_step_of_run' flag
    first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)

    # state update
    step += 1

    # # TEMP - Bail at 1000 steps for benchmarking.
    # if step == 1001:
    #     print0(f"Elapsed + ETA: {total_training_time + eta_seconds:.0f}s")
    #     break

    # Help out the garbage collector by flushing garbage and then freezing long-lived objects
    # This eliminates random ~500ms pauses during training steps as the GC scans ~millions of objects for cycles
    if first_step_of_run:
        gc.collect()
        gc.freeze()
        gc.disable() # nuclear option: disable GC for the run
    elif step % 2000 == 0:
        gc.collect() # manual GC to keep memory usage in check for very long runs

# print a few more stats
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")
if val_bpb is not None:
    print0(f"Minimum validation bpb: {min_val_bpb:.6f}")

# Log to report
from nanochat.report import get_report
get_report().log(section="Base model training", data=[
    user_config, # CLI args
    { # stats about the training setup
        "Number of parameters": num_params,
        "Number of FLOPs per token": f"{num_flops_per_token:e}",
        "Calculated number of iterations": num_iterations,
        "Number of training tokens": total_tokens,
        "Tokens : Scaling params ratio": args.total_batch_size * num_iterations / num_scaling_params,
        "DDP world size": ddp_world_size,
        "warmup_ratio": args.warmup_ratio,
        "warmdown_ratio": args.warmdown_ratio,
        "final_lr_frac": args.final_lr_frac,
    },
    { # stats about training outcomes
        "Minimum validation bpb": min_val_bpb if val_bpb is not None else None,
        "Final validation bpb": val_bpb,
        "CORE metric estimate": results.get("core_metric", None),
        "MFU %": f"{mfu:.2f}%",
        "Total training flops": f"{flops_so_far:e}",
        "Total training time": f"{total_training_time/60:.2f}m",
        "Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
    }
])

# cleanup
wandb_run.finish() # wandb run finish
compute_cleanup()

# Patch metadata preserved from the original diff (header of the next file, continues on the following line):
# diff --git a/log/dataloader-gc-fixes.py b/log/dataloader-gc-fixes.py
# new file mode 100644
# index 00000000..544d9901
# --- /dev/null
# Patch metadata preserved from the original diff:
# +++ b/log/dataloader-gc-fixes.py
# @@ -0,0 +1,239 @@
"""
Distributed dataloaders for pretraining.

Two implementations are provided:

1. Original (tokenizing_distributed_data_loader):
   - Streams tokens into a flat buffer, reshapes to (B, T)
   - Rows may start mid-document (no guaranteed BOS at position 0)
   - 100% token utilization, simple and efficient

2. BOS-aligned bestfit (tokenizing_distributed_data_loader_bos_bestfit):
   - Every row starts with BOS token
   - Documents packed using best-fit algorithm to minimize cropping
   - When no document fits remaining space, crops a document to fill exactly
   - 100% utilization (no padding), ~35% tokens cropped at T=2048

The tradeoff: BOS-aligned loses ~35% of tokens to cropping, but ensures that
there are fewer "confusing" tokens in the train/val batches as every token can
now attend back to the BOS token and sees the full context of the document.
(2) is the new default if you have enough data.
Fallback to (1) if you have very limited data AND long documents.
"""

import torch
import pyarrow.parquet as pq

from nanochat.common import get_dist_info
from nanochat.dataset import list_parquet_files

def _is_cuda_device(device):
    """
    Return True when `device` (a str such as "cuda" / "cuda:0", or a torch.device) is a CUDA device.

    A plain `device == "cuda"` is False for torch.device objects and for indexed
    strings, which would silently disable pinned memory and non-blocking copies.
    """
    return torch.device(device).type == "cuda"

def _document_batches(split, resume_state_dict, tokenizer_batch_size):
    """
    Infinite iterator over document batches (list of text strings) from parquet files.

    Handles DDP sharding and approximate resume. Each yield is (text_batch, (pq_idx, rg_idx, epoch))
    where text_batch is a list of document strings, indices track position for resumption,
    and epoch counts how many times we've cycled through the dataset (starts at 1).
    """
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()

    parquet_paths = list_parquet_files()
    assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
    # The last file is the val split, everything before it is train
    parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
    # Without this check an empty split makes the loop below spin forever without yielding
    assert len(parquet_paths) != 0, f"No parquet files available for split '{split}' (the last file is reserved for val)"

    resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
    resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
    resume_epoch = resume_state_dict.get("epoch", 1) if resume_state_dict is not None else 1
    first_pass = True
    epoch = resume_epoch

    while True: # iterate infinitely (multi-epoch)
        pq_idx = resume_pq_idx if first_pass else 0
        while pq_idx < len(parquet_paths):
            filepath = parquet_paths[pq_idx]
            pf = pq.ParquetFile(filepath)
            # Start from resume point if resuming on same file, otherwise from DDP rank
            if first_pass and (resume_rg_idx is not None) and (pq_idx == resume_pq_idx):
                base_idx = resume_rg_idx // ddp_world_size
                base_idx += 1 # advance by 1 so we don't repeat data after resuming
                rg_idx = base_idx * ddp_world_size + ddp_rank
                if rg_idx >= pf.num_row_groups:
                    pq_idx += 1
                    continue
                resume_rg_idx = None # only do this once
            else:
                rg_idx = ddp_rank
            while rg_idx < pf.num_row_groups:
                rg = pf.read_row_group(rg_idx)
                batch = rg.column('text').to_pylist()
                for i in range(0, len(batch), tokenizer_batch_size):
                    yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx, epoch)
                rg_idx += ddp_world_size # each rank strides over the row groups
            pq_idx += 1
        first_pass = False
        epoch += 1


def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
    """
    Stream pretraining text from parquet files, tokenize, yield training batches.

    This is the original dataloader that streams tokens into a flat buffer and reshapes.
    Rows may start mid-document (no guaranteed BOS at position 0).

    Supports approximate resume via state_dict.

    Yields (inputs, targets, state_dict) forever, where inputs/targets are (B, T)
    long tensors on `device` and state_dict has keys "pq_idx", "rg_idx", "epoch".
    `device` may be a str or a torch.device.
    """
    assert split in ["train", "val"], "split must be 'train' or 'val'"

    batches = _document_batches(split, resume_state_dict, tokenizer_batch_size)
    needed_tokens = B * T + 1 # +1 for target at last position
    bos_token = tokenizer.get_bos_token_id()
    token_buffer = []
    pq_idx, rg_idx, epoch = 0, 0, 1
    use_cuda = _is_cuda_device(device) # pinned host memory + async copies only make sense for CUDA

    while True:

        # Accumulate enough tokens
        while len(token_buffer) < needed_tokens:
            doc_batch, (pq_idx, rg_idx, epoch) = next(batches)
            token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
            for tokens in token_lists:
                token_buffer.extend(tokens)
        tokens = token_buffer[:needed_tokens] # Read B*T+1 tokens (+1 is only for the target for the last token)
        token_buffer = token_buffer[B*T:] # Advance by B*T tokens, so we move exactly one window of B*T tokens over

        # Package tokens into inputs and targets, yield
        scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda)
        inputs = scratch[:-1].view(B, T).to(device=device, non_blocking=use_cuda)
        targets = scratch[1:].view(B, T).to(device=device, non_blocking=use_cuda)
        yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch}


def tokenizing_distributed_data_loader(*args, **kwargs):
    """Helper that omits state_dict from yields."""
    for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs):
        yield inputs, targets


def tokenizing_distributed_data_loader_with_state_bos_bestfit(
    tokenizer, B, T, split,
    tokenizer_threads=4, tokenizer_batch_size=128,
    device="cuda", resume_state_dict=None,
    buffer_size=1000
):
    """
    BOS-aligned dataloader with Best-Fit Cropping.

    Reduces token waste compared to simple greedy cropping by searching a buffer
    for documents that fit well, while maintaining 100% utilization (no padding).

    Algorithm for each row:
    1. From buffered docs, pick the LARGEST doc that fits entirely
    2. Repeat until no doc fits
    3. When nothing fits, crop a doc to fill remaining space exactly

    Key properties:
    - Every row starts with BOS
    - 100% utilization (no padding, every token is trained on)
    - Approximately 35% of all tokens are discarded due to cropping

    Yields (inputs, targets, state_dict) forever. NOTE: the same two (B, T) device
    tensors are reused and overwritten in place on every yield, so a consumer that
    needs to keep a batch beyond the next `next()` call must clone it.
    `device` may be a str or a torch.device.
    """
    assert split in ["train", "val"], "split must be 'train' or 'val'"

    row_capacity = T + 1 # +1 so that inputs/targets can be taken as shifted views of one row
    batches = _document_batches(split, resume_state_dict, tokenizer_batch_size)
    bos_token = tokenizer.get_bos_token_id()
    pq_idx, rg_idx, epoch = 0, 0, 1

    # Token pool: single tensor holding all buffered tokens
    # Documents tracked as (start, length) tuples
    pool = torch.empty(buffer_size * 512, dtype=torch.long)
    pool_end = 0
    docs = [] # [(start, length), ...]

    def compact_pool():
        """Shift active documents to front of pool, reclaiming space."""
        nonlocal pool_end
        if not docs:
            pool_end = 0
            return
        write_pos = 0
        for i, (start, length) in enumerate(docs):
            if start != write_pos:
                # clone() because source and destination ranges may overlap
                pool[write_pos:write_pos + length] = pool[start:start + length].clone()
                docs[i] = (write_pos, length)
            write_pos += length
        pool_end = write_pos

    def refill_buffer():
        """Retrieve more docs and add them to the pool"""
        nonlocal pq_idx, rg_idx, epoch, pool, pool_end
        doc_batch, (pq_idx, rg_idx, epoch) = next(batches)
        token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
        # Number of new tokens to store
        total_new = sum(len(t) for t in token_lists)
        # If there's not enough space at the end,
        if pool_end + total_new > pool.size(0):
            compact_pool() # Try compacting first.
            # If still not enough,
            if pool_end + total_new > pool.size(0):
                # Allocate a new, larger pool.
                new_size = max(pool.size(0) * 2, pool_end + total_new)
                new_pool = torch.empty(new_size, dtype=torch.long)
                new_pool[:pool_end] = pool[:pool_end]
                pool = new_pool
        # Write tokens to pool
        for tokens in token_lists:
            n = len(tokens)
            pool[pool_end:pool_end + n] = torch.tensor(tokens, dtype=torch.long)
            docs.append((pool_end, n))
            pool_end += n

    # Pre-allocate buffers once
    use_cuda = _is_cuda_device(device)
    row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
    inputs = torch.empty((B, T), dtype=torch.long, device=device)
    targets = torch.empty((B, T), dtype=torch.long, device=device)

    while True:
        for row_idx in range(B):
            col = 0
            while col < row_capacity:
                # Ensure buffer has documents
                while len(docs) < buffer_size:
                    refill_buffer()

                remaining = row_capacity - col

                # Find largest doc that fits entirely
                best_idx = -1
                best_len = 0
                for i, (start, length) in enumerate(docs):
                    if length <= remaining and length > best_len:
                        best_idx = i
                        best_len = length

                if best_idx >= 0:
                    start, length = docs.pop(best_idx)
                    row_buffer[row_idx, col:col + length] = pool[start:start + length]
                    col += length
                else:
                    # No doc fits - crop shortest to fill remaining
                    shortest_idx = min(range(len(docs)), key=lambda i: docs[i][1])
                    start, length = docs.pop(shortest_idx)
                    row_buffer[row_idx, col:col + remaining] = pool[start:start + remaining]
                    col += remaining

        # Copy to GPU
        inputs.copy_(row_buffer[:, :-1], non_blocking=use_cuda)
        targets.copy_(row_buffer[:, 1:], non_blocking=use_cuda)

        state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch}
        yield inputs, targets, state_dict

def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs):
    """Helper that omits state_dict from yields."""
    for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state_bos_bestfit(*args, **kwargs):
        yield inputs, targets

# Patch metadata preserved from the original diff (header of the next file, continues on the following line):
# diff --git a/log/optim-mlp-lr-1x2x.py
"""
A nice and efficient mixed AdamW/Muon Combined Optimizer.
Usually the embeddings and scalars go into AdamW, and the matrix parameters go into Muon.
Two versions are provided (MuonAdamW, DistMuonAdamW), for single GPU and distributed.

Adapted from: https://github.com/KellerJordan/modded-nanogpt
Further contributions from @karpathy and @chrisjmccormick.
"""

import torch
import torch.distributed as dist
from torch import Tensor

# -----------------------------------------------------------------------------
"""
Good old AdamW optimizer, fused kernel.
https://arxiv.org/abs/1711.05101
"""

@torch.compile(dynamic=False, fullgraph=True)
def adamw_step_fused(
    p: Tensor,              # (32768, 768) - parameter tensor (example shape)
    grad: Tensor,           # (32768, 768) - gradient, same shape as p
    exp_avg: Tensor,        # (32768, 768) - first moment, same shape as p
    exp_avg_sq: Tensor,     # (32768, 768) - second moment, same shape as p
    step_t: Tensor,         # () - 0-D CPU tensor, step count
    lr_t: Tensor,           # () - 0-D CPU tensor, learning rate
    beta1_t: Tensor,        # () - 0-D CPU tensor, beta1
    beta2_t: Tensor,        # () - 0-D CPU tensor, beta2
    eps_t: Tensor,          # () - 0-D CPU tensor, epsilon
    wd_t: Tensor,           # () - 0-D CPU tensor, weight decay
) -> None:
    """
    Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update
    All in one compiled graph to eliminate Python overhead between ops.
    The 0-D CPU tensors avoid recompilation when hyperparameter values change.

    Mutates `p`, `exp_avg` and `exp_avg_sq` in place and returns nothing.
    `step_t` is the 1-based step count used in the bias-correction terms.
    """
    # Weight decay (decoupled, applied before the update): p <- p * (1 - lr * wd)
    p.mul_(1 - lr_t * wd_t)
    # Update running averages (lerp_ is cleaner and fuses well)
    # lerp_(end, w) computes self + w * (end - self), i.e. the usual EMA update
    exp_avg.lerp_(grad, 1 - beta1_t)
    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
    # Bias corrections
    bias1 = 1 - beta1_t ** step_t
    bias2 = 1 - beta2_t ** step_t
    # Compute update and apply: p <- p - (lr / bias1) * exp_avg / (sqrt(exp_avg_sq / bias2) + eps)
    denom = (exp_avg_sq / bias2).sqrt() + eps_t
    step_size = lr_t / bias1
    p.add_(exp_avg / denom, alpha=-step_size)
# -----------------------------------------------------------------------------
"""
Muon optimizer adapted and simplified from modded-nanogpt.
https://github.com/KellerJordan/modded-nanogpt

Background:
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
zero even beyond the point where the iteration no longer converges all the way to one everywhere
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
performance at all relative to UV^T, where USV^T = G is the SVD.

Here, an alternative to Newton-Schulz iteration with potentially better convergence properties:
Polar Express Sign Method for orthogonalization.
https://arxiv.org/pdf/2505.16932
by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.

Some of the changes in nanochat implementation:
- Uses a simpler, more general approach to parameter grouping and stacking
- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step
- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format)
"""

# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2)
# From https://arxiv.org/pdf/2505.16932
# One (a, b, c) triple per iteration; only the first `ns_steps` triples are used.
polar_express_coeffs = [
    (8.156554524902461, -22.48329292557795, 15.878769915207462),
    (4.042929935166739, -2.808917465908714, 0.5000178451051316),
    (3.8916678022926607, -2.772484153217685, 0.5060648178503393),
    (3.285753657755655, -2.3681294933425376, 0.46449024233003106),
    (2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
]

@torch.compile(dynamic=False, fullgraph=True)
def muon_step_fused(
    stacked_grads: Tensor,          # (12, 768, 3072) - stacked gradients (example shape)
    stacked_params: Tensor,         # (12, 768, 3072) - stacked parameters
    momentum_buffer: Tensor,        # (12, 768, 3072) - first moment buffer
    second_momentum_buffer: Tensor, # (12, 768, 1) or (12, 1, 3072) - factored second moment
    momentum_t: Tensor,             # () - 0-D CPU tensor, momentum coefficient
    lr_t: Tensor,                   # () - 0-D CPU tensor, learning rate
    wd_t: Tensor,                   # () - 0-D CPU tensor, weight decay
    beta2_t: Tensor,                # () - 0-D CPU tensor, beta2 for second moment
    ns_steps: int,                  # 5 - number of Newton-Schulz/Polar Express iterations
    red_dim: int,                   # -1 or -2 - reduction dimension for variance
) -> None:
    """
    Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update
    All in one compiled graph to eliminate Python overhead between ops.
    Some of the constants are 0-D CPU tensors to avoid recompilation when values change.

    Mutates `stacked_params`, `momentum_buffer`, `second_momentum_buffer` and also
    `stacked_grads` (used as scratch for the Nesterov blend) in place; returns nothing.
    Because `polar_express_coeffs` has five entries, at most five iterations run
    regardless of how large `ns_steps` is.
    """

    # Nesterov momentum
    momentum = momentum_t.to(stacked_grads.dtype)
    # buffer <- momentum * buffer + (1 - momentum) * grad
    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
    # g <- (1 - momentum) * grad + momentum * buffer (written into stacked_grads in place)
    g = stacked_grads.lerp_(momentum_buffer, momentum)

    # Polar express
    X = g.bfloat16()
    # Normalise each matrix by its Frobenius norm (with a small safety margin and epsilon)
    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
    # Both branches apply the same polynomial; they differ only in which Gram
    # matrix (X^T X or X X^T) is formed, choosing the smaller of the two.
    if g.size(-2) > g.size(-1): # Tall matrix
        for a, b, c in polar_express_coeffs[:ns_steps]:
            A = X.mT @ X
            B = b * A + c * (A @ A)
            X = a * X + X @ B
    else: # Wide matrix (original math)
        for a, b, c in polar_express_coeffs[:ns_steps]:
            A = X @ X.mT
            B = b * A + c * (A @ A)
            X = a * X + B @ X
    g = X

    # Variance reduction
    beta2 = beta2_t.to(g.dtype)
    # Mean of squared entries along red_dim (per row or per column)
    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
    red_dim_size = g.size(red_dim)
    # Sum of all squared entries of each matrix, i.e. its squared Frobenius norm
    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
    v_norm = v_norm_sq.sqrt()
    # EMA of the factored second moment
    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
    # Norm the update would have after scaling by step_size ...
    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
    # ... so rescale to keep each matrix's overall norm equal to what it was before
    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
    g = g * final_scale.to(g.dtype)

    # Cautious weight decay + parameter update
    lr = lr_t.to(g.dtype)
    wd = wd_t.to(g.dtype)
    # Decay is applied only where the update and the parameter have the same sign
    mask = (g * stacked_params) >= 0
    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
# -----------------------------------------------------------------------------
# Single GPU version of the MuonAdamW optimizer.
# Used mostly for reference, debugging and testing.

class MuonAdamW(torch.optim.Optimizer):
    """
    Combined optimizer: Muon for 2D matrix params, AdamW for others, single GPU version.

    AdamW - Fused AdamW optimizer step.

    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/

    Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
    processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
    matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
    the advantage that it can be stably run in bfloat16 on the GPU.

    Some warnings:
    - The Muon optimizer should not be used for the embedding layer, the final fully connected layer,
      or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
    - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.

    Arguments:
        param_groups: List of dicts, each containing:
            - 'params': List of parameters
            - 'kind': 'adamw' or 'muon'
            - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
            - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
    """
    def __init__(self, param_groups: list[dict]):
        super().__init__(param_groups, defaults={})
        # 0-D CPU tensors to avoid torch.compile recompilation when values change
        # AdamW tensors
        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        # Muon tensors
        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")

    def _step_adamw(self, group: dict) -> None:
        """
        AdamW update for each param in the group individually.
        Lazy init the state, fill in all 0-D tensors, call the fused kernel.
        Params whose .grad is None are skipped.
        """
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad
            state = self.state[p]

            # State init
            if not state:
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p)
                state['exp_avg_sq'] = torch.zeros_like(p)
            exp_avg = state['exp_avg']
            exp_avg_sq = state['exp_avg_sq']
            state['step'] += 1

            # Fill 0-D tensors with current values
            self._adamw_step_t.fill_(state['step'])
            self._adamw_lr_t.fill_(group['lr'])
            self._adamw_beta1_t.fill_(group['betas'][0])
            self._adamw_beta2_t.fill_(group['betas'][1])
            self._adamw_eps_t.fill_(group['eps'])
            self._adamw_wd_t.fill_(group['weight_decay'])

            # Fused update: weight_decay -> momentum -> bias_correction -> param_update
            adamw_step_fused(
                p, grad, exp_avg, exp_avg_sq,
                self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
                self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
            )

    def _step_muon(self, group: dict) -> None:
        """
        Muon update for all params in the group (stacked for efficiency).
        Lazy init the state, fill in all 0-D tensors, call the fused kernel.

        Every param in the group must have the same shape and a populated .grad,
        because grads and params are stacked into single tensors.
        """
        params: list[Tensor] = group['params']
        if not params:
            return

        # Get or create group-level buffers (stored in first param's state for convenience)
        p = params[0]
        state = self.state[p]
        num_params = len(params)
        shape, device, dtype = p.shape, p.device, p.dtype

        # Momentum for every individual parameter
        if "momentum_buffer" not in state:
            state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
        momentum_buffer = state["momentum_buffer"]

        # Second momentum buffer is factored, either per-row or per-column
        if "second_momentum_buffer" not in state:
            state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1])
            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
        second_momentum_buffer = state["second_momentum_buffer"]
        red_dim = -1 if shape[-2] >= shape[-1] else -2

        # Stack grads and params (NOTE: this assumes all params have the same shape)
        # torch.stack copies, so the kernel works on copies and results are written back below
        stacked_grads = torch.stack([p.grad for p in params])
        stacked_params = torch.stack(params)

        # Fill all the 0-D tensors with current values
        self._muon_momentum_t.fill_(group["momentum"])
        self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
        # Shape-based LR scaling (flipped from original):
        # - Tall matrices (input projections like c_fc): 1x
        # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
        ratio = shape[-2] / shape[-1]
        lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
        self._muon_lr_t.fill_(group["lr"] * lr_mult)
        self._muon_wd_t.fill_(group["weight_decay"])

        # Single fused kernel: momentum -> polar_express -> variance_reduction -> update
        muon_step_fused(
            stacked_grads,
            stacked_params,
            momentum_buffer,
            second_momentum_buffer,
            self._muon_momentum_t,
            self._muon_lr_t,
            self._muon_wd_t,
            self._muon_beta2_t,
            group["ns_steps"],
            red_dim,
        )

        # Copy back to original params
        torch._foreach_copy_(params, list(stacked_params.unbind(0)))

    @torch.no_grad()
    def step(self, closure=None):
        """
        Perform a single optimization step over every param group.

        Args:
            closure: optional callable that re-evaluates the model and returns the
                loss, as in the standard torch.optim.Optimizer.step contract. It is
                run with gradients enabled before any parameter is updated.

        Returns:
            The value returned by `closure`, or None when no closure is given.

        Raises:
            ValueError: if a group's 'kind' is neither 'adamw' nor 'muon'.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad; the closure needs autograd to compute grads
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            if group['kind'] == 'adamw':
                self._step_adamw(group)
            elif group['kind'] == 'muon':
                self._step_muon(group)
            else:
                raise ValueError(f"Unknown optimizer kind: {group['kind']}")
        return loss
# -----------------------------------------------------------------------------
# Distributed version of the MuonAdamW optimizer.
# Used for training on multiple GPUs.

class DistMuonAdamW(torch.optim.Optimizer):
    """
    Combined distributed optimizer: Muon for 2D matrix params, AdamW for others.

    See MuonAdamW for the algorithmic details of each optimizer. This class adds
    distributed communication to enable multi-GPU training without PyTorch DDP.

    Design Goals:
    - Overlap communication with computation (async ops)
    - Minimize memory by sharding optimizer states across ranks (ZeRO-2 style)
    - Batch small tensors into single comm ops where possible

    Communication Pattern (3-phase async):
    We use a 3-phase structure to maximize overlap between communication and compute:

        Phase 1: Launch all async reduce ops
            - Kick off all reduce_scatter/all_reduce operations
            - Don't wait - let them run in background while we continue

        Phase 2: Wait for reduces, compute updates, launch gathers
            - For each group: wait for its reduce, compute the update, launch gather
            - By processing groups in order, earlier gathers run while later computes happen

        Phase 3: Wait for gathers, copy back
            - Wait for all gathers to complete
            - Copy updated params back to original tensors (Muon only)

    AdamW Communication (ZeRO-2 style):
    - Small params (<1024 elements): all_reduce gradients, update full param on each rank.
      Optimizer state is replicated but these params are tiny (scalars, biases).
    - Large params: reduce_scatter gradients so each rank gets 1/N of the grad, update
      only that slice, then all_gather the updated slices. Optimizer state (exp_avg,
      exp_avg_sq) is sharded - each rank only stores state for its slice.
      Requires param.shape[0] divisible by world_size (checked, raises ValueError).

    Muon Communication (stacked + chunked):
    - All params in a Muon group must have the same shape (caller's responsibility).
    - Stack all K params into a single (K, *shape) tensor for efficient comm.
    - Divide K params across N ranks: each rank "owns" ceil(K/N) params.
    - reduce_scatter the stacked grads so each rank gets its chunk.
    - Each rank computes Muon update only for params it owns.
    - all_gather the updated params back to all ranks.
    - Optimizer state (momentum_buffer, second_momentum_buffer) is sharded by chunk.
    - Padding: if K doesn't divide evenly, we zero-pad to (ceil(K/N) * N) for comm,
      then ignore the padding when copying back.

    Buffer Reuse:
    - For Muon, we allocate stacked_grads for reduce_scatter input, then reuse the
      same buffer as the output for all_gather (stacked_params). This saves memory
      since we don't need both buffers simultaneously.

    Arguments:
        param_groups: List of dicts, each containing:
            - 'params': List of parameters
            - 'kind': 'adamw' or 'muon'
            - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
            - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
    """
    def __init__(self, param_groups: list[dict]):
        super().__init__(param_groups, defaults={})
        # 0-D CPU tensors to avoid torch.compile recompilation when values change
        self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
        self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")

    def _reduce_adamw(self, group: dict, world_size: int) -> dict:
        """Launch async reduce ops for AdamW group. Returns info dict with per-param infos.

        Raises:
            ValueError: if a large (>= 1024 element) param has a leading dimension
                that is not divisible by world_size. This is checked for the whole
                group before any collective is launched, so a bad configuration
                fails cleanly instead of inside a half-issued set of comm ops.
        """
        # Validate first: the sharding below silently floors shape[0] // world_size
        for p in group['params']:
            if p.numel() >= 1024 and p.shape[0] % world_size != 0:
                raise ValueError(
                    f"AdamW param with shape {tuple(p.shape)} cannot be sharded: "
                    f"shape[0]={p.shape[0]} is not divisible by world_size={world_size}"
                )

        param_infos = {}
        for p in group['params']:
            grad = p.grad
            if p.numel() < 1024:
                # Small params: all_reduce (no scatter/gather needed)
                future = dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
                param_infos[p] = dict(future=future, grad_slice=grad, is_small=True)
            else:
                # Large params: reduce_scatter
                rank_size = grad.shape[0] // world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
                param_infos[p] = dict(future=future, grad_slice=grad_slice, is_small=False)
        return dict(param_infos=param_infos)

    def _reduce_muon(self, group: dict, world_size: int) -> dict:
        """Launch async reduce op for Muon group. Returns info dict."""
        params = group['params']
        # ceil(len(params) / world_size): number of params each rank owns
        chunk_size = (len(params) + world_size - 1) // world_size
        padded_num_params = chunk_size * world_size
        p = params[0]
        shape, device, dtype = p.shape, p.device, p.dtype

        # Stack grads and zero-pad to padded_num_params
        grad_stack = torch.stack([p.grad for p in params])
        stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device)
        stacked_grads[:len(params)].copy_(grad_stack)
        if len(params) < padded_num_params:
            stacked_grads[len(params):].zero_()

        # Reduce_scatter to get this rank's chunk
        grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device)
        future = dist.reduce_scatter_tensor(grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True).get_future()

        return dict(future=future, grad_chunk=grad_chunk, stacked_grads=stacked_grads, chunk_size=chunk_size)

    def _compute_adamw(self, group: dict, info: dict, gather_list: list, rank: int, world_size: int) -> None:
        """Wait for reduce, compute AdamW updates, launch gathers for large params."""
        param_infos = info['param_infos']
        for p in group['params']:
            pinfo = param_infos[p]
            pinfo['future'].wait()
            grad_slice = pinfo['grad_slice']
            state = self.state[p]

            # For small params, operate on full param; for large, operate on slice
            if pinfo['is_small']:
                p_slice = p
            else:
                rank_size = p.shape[0] // world_size
                p_slice = p[rank * rank_size:(rank + 1) * rank_size]

            # State init (sized to the slice, so large-param state is sharded)
            if not state:
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p_slice)
                state['exp_avg_sq'] = torch.zeros_like(p_slice)
            state['step'] += 1

            # Fill 0-D tensors and run fused kernel
            self._adamw_step_t.fill_(state['step'])
            self._adamw_lr_t.fill_(group['lr'])
            self._adamw_beta1_t.fill_(group['betas'][0])
            self._adamw_beta2_t.fill_(group['betas'][1])
            self._adamw_eps_t.fill_(group['eps'])
            self._adamw_wd_t.fill_(group['weight_decay'])
            adamw_step_fused(
                p_slice, grad_slice, state['exp_avg'], state['exp_avg_sq'],
                self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
                self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
            )

            # Large params need all_gather (writes every rank's slice back into p)
            if not pinfo['is_small']:
                future = dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()
                gather_list.append(dict(future=future, params=None))

    def _compute_muon(self, group: dict, info: dict, gather_list: list, rank: int) -> None:
        """Wait for reduce, compute Muon updates, launch gather."""
        info['future'].wait()
        params = group['params']
        chunk_size = info['chunk_size']
        grad_chunk = info['grad_chunk']
        p = params[0]
        shape, device, dtype = p.shape, p.device, p.dtype

        # How many params does this rank own?
        start_idx = rank * chunk_size
        num_owned = min(chunk_size, max(0, len(params) - start_idx))

        # Get or create group-level state
        state = self.state[p]
        if "momentum_buffer" not in state:
            state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device)
        if "second_momentum_buffer" not in state:
            state_shape = (chunk_size, shape[-2], 1) if shape[-2] >= shape[-1] else (chunk_size, 1, shape[-1])
            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
        red_dim = -1 if shape[-2] >= shape[-1] else -2

        # Build output buffer for all_gather
        updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device)

        if num_owned > 0:
            owned_params = [params[start_idx + i] for i in range(num_owned)]
            stacked_owned = torch.stack(owned_params)

            # Fill 0-D tensors and run fused kernel
            self._muon_momentum_t.fill_(group["momentum"])
            # beta2=None maps to 0.0, matching the single-GPU MuonAdamW behaviour
            # (fill_(None) would raise)
            self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
            # Shape-based LR scaling (flipped from original):
            # - Tall matrices (input projections like c_fc): 1x
            # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
            ratio = shape[-2] / shape[-1]
            lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
            self._muon_lr_t.fill_(group["lr"] * lr_mult)
            self._muon_wd_t.fill_(group["weight_decay"])
            muon_step_fused(
                grad_chunk[:num_owned], stacked_owned,
                state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned],
                self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t,
                group["ns_steps"], red_dim,
            )
            updated_params[:num_owned].copy_(stacked_owned)

        # Padding slots carry zeros; they are dropped again in _finish_gathers
        if num_owned < chunk_size:
            updated_params[num_owned:].zero_()

        # Reuse stacked_grads buffer for all_gather output
        stacked_params = info["stacked_grads"]
        future = dist.all_gather_into_tensor(stacked_params, updated_params, async_op=True).get_future()
        gather_list.append(dict(future=future, stacked_params=stacked_params, params=params))

    def _finish_gathers(self, gather_list: list) -> None:
        """Wait for all gathers and copy Muon params back."""
        for info in gather_list:
            info["future"].wait()
            if info["params"] is not None:
                # Muon: copy from stacked buffer back to individual params
                torch._foreach_copy_(info["params"], list(info["stacked_params"][:len(info["params"])].unbind(0)))

    @torch.no_grad()
    def step(self, closure=None):
        """
        Perform a single distributed optimization step (reduce -> update -> gather).

        Args:
            closure: optional callable that re-evaluates the model and returns the
                loss, as in the standard torch.optim.Optimizer.step contract. It is
                run with gradients enabled before any communication is launched.

        Returns:
            The value returned by `closure`, or None when no closure is given.

        Raises:
            ValueError: if a group's 'kind' is neither 'adamw' nor 'muon', or if a
                large AdamW param cannot be evenly sharded across ranks.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad; the closure needs autograd to compute grads
            with torch.enable_grad():
                loss = closure()

        rank = dist.get_rank()
        world_size = dist.get_world_size()

        # Phase 1: launch all async reduce ops
        reduce_infos: list[dict] = []
        for group in self.param_groups:
            if group['kind'] == 'adamw':
                reduce_infos.append(self._reduce_adamw(group, world_size))
            elif group['kind'] == 'muon':
                reduce_infos.append(self._reduce_muon(group, world_size))
            else:
                raise ValueError(f"Unknown optimizer kind: {group['kind']}")

        # Phase 2: wait for reduces, compute updates, launch gathers
        gather_list: list[dict] = []
        for group, info in zip(self.param_groups, reduce_infos):
            if group['kind'] == 'adamw':
                self._compute_adamw(group, info, gather_list, rank, world_size)
            elif group['kind'] == 'muon':
                self._compute_muon(group, info, gather_list, rank)
            else:
                raise ValueError(f"Unknown optimizer kind: {group['kind']}")

        # Phase 3: wait for gathers, copy back
        self._finish_gathers(gather_list)
        return loss
warmup_ratio: 0.0000 +- warmdown_ratio: 0.5000 +- final_lr_frac: 0.0000 +- resume_from_step: -1 +- eval_every: 250 +- eval_tokens: 10,485,760 +- core_metric_every: 3000 +- core_metric_max_per_task: -1 +- sample_every: -1 +- save_every: -1 +- model_tag: d24_feb01 +- Number of parameters: 1,384,124,976 +- Number of FLOPs per token: 4.945112e+09 +- Calculated number of iterations: 16,704 +- Number of training tokens: 8,757,706,752 +- Tokens : Scaling params ratio: 11.9999 +- DDP world size: 8 +- warmup_ratio: 0.0000 +- warmdown_ratio: 0.5000 +- final_lr_frac: 0.0000 +- Minimum validation bpb: 0.7533 +- Final validation bpb: 0.7533 +- CORE metric estimate: 0.2633 +- MFU %: 50.77% +- Total training flops: 4.330784e+19 +- Total training time: 179.44m +- Peak memory usage: 62389.67MiB + diff --git a/log/report/chat-evaluation-sft.md b/log/report/chat-evaluation-sft.md new file mode 100644 index 00000000..b7746cad --- /dev/null +++ b/log/report/chat-evaluation-sft.md @@ -0,0 +1,23 @@ +## Chat evaluation sft +timestamp: 2026-02-02 01:24:46 + +- source: sft +- task_name: None +- dtype: bfloat16 +- temperature: 0.0000 +- max_new_tokens: 512 +- num_samples: 1 +- top_k: 50 +- batch_size: 8 +- model_tag: None +- step: None +- max_problems: None +- device_type: +- ARC-Easy: 0.4903 +- ARC-Challenge: 0.3848 +- MMLU: 0.3480 +- GSM8K: 0.0470 +- HumanEval: 0.1463 +- SpellingBee: 0.9883 +- ChatCORE metric: 0.3021 + diff --git a/log/report/sft.md b/log/report/sft.md new file mode 100644 index 00000000..becee465 --- /dev/null +++ b/log/report/sft.md @@ -0,0 +1,24 @@ +## SFT +timestamp: 2026-02-02 01:13:02 + +- run: dummy +- device_type: +- dtype: bfloat16 +- model_tag: None +- model_step: None +- num_iterations: -1 +- max_seq_len: 2048 +- device_batch_size: 16 +- total_batch_size: 524,288 +- embedding_lr: 0.2000 +- unembedding_lr: 0.0040 +- matrix_lr: 0.0200 +- weight_decay: 0.0000 +- init_lr_frac: 1.0000 +- eval_every: 150 +- eval_tokens: 10,485,760 +- dry_run: False +- Number of 
iterations: 849 +- DDP world size: 8 +- Minimum validation bpb: 0.3478 + diff --git a/log/report/tokenizer-training.md b/log/report/tokenizer-training.md new file mode 100644 index 00000000..a1b8f3be --- /dev/null +++ b/log/report/tokenizer-training.md @@ -0,0 +1,13 @@ +## Tokenizer training +timestamp: 2026-02-01 14:40:20 + +- max_chars: 2,000,000,000 +- doc_cap: 10,000 +- vocab_size: 32,768 +- train_time: 87.9820 +- num_special_tokens: 9 +- token_bytes_min: 1 +- token_bytes_max: 19 +- token_bytes_mean: 6.6029 +- token_bytes_std: 2.8250 + diff --git a/log/speedrun-feb01.sh b/log/speedrun-feb01.sh new file mode 100644 index 00000000..028d5f09 --- /dev/null +++ b/log/speedrun-feb01.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Default intermediate artifacts directory is in ~/.cache/nanochat +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +mkdir -p $NANOCHAT_BASE_DIR + +# ----------------------------------------------------------------------------- +# Python venv setup with uv + +# install uv (if not already installed) +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +# create a .venv local virtual environment (if it doesn't exist) +[ -d ".venv" ] || uv venv +# install the repo dependencies +uv sync --extra gpu +# activate venv so that `python` uses the project's venv instead of system python +source .venv/bin/activate + +( cat ./nanochat/gpt.py; cat ./nanochat/optim.py; cat ./nanochat/dataloader.py; cat ./scripts/base_train.py; echo -e "\n\n===== TRAINING OUTPUT =====\n\n"; OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=24 \ + --run=d24-feb01 \ + --model-tag=d24_feb01 \ + --device-batch-size=16 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=3000 \ + --target-param-data-ratio=12 ) \ + 2>&1 | tee ./logs/speedrun_d24_feb01-rope_chunk_mlp_lr_1x2x.log diff --git a/log/speedrun_d24_feb01-rope_chunk_mlp_lr_1x2x.log 
@dataclass
class GPTConfig:
    """Hyperparameters describing the GPT architecture."""
    sequence_len: int = 2048
    vocab_size: int = 32768
    n_layer: int = 12
    n_head: int = 6 # number of query heads
    n_kv_head: int = 6 # number of key/value heads (GQA)
    n_embd: int = 768
    # Sliding window attention pattern string, tiled across layers. Final layer always L.
    # Characters: L=long (full context), S=short (half context)
    # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long
    window_pattern: str = "SSSL"


def norm(x):
    """RMS-normalise x over its last dimension (functional, no learnable params)."""
    feature_dim = x.size(-1)
    return F.rms_norm(x, (feature_dim,))


def has_ve(layer_idx, n_layer):
    """Tell whether a layer carries a Value Embedding.

    Layers alternate, and the parity is anchored so that the last layer
    (index n_layer - 1) is always included.
    """
    last_layer_parity = (n_layer - 1) % 2
    return layer_idx % 2 == last_layer_parity


def apply_rotary_emb(x, cos, sin):
    """Rotate the two halves of the head dimension of x by (cos, sin).

    x must be 4-D in (B, T, H, D) layout; the last dimension is split into two
    halves with chunk, each pair is rotated, and the halves are re-joined.
    """
    assert x.ndim == 4 # (B, T, H, D) multihead attention layout
    lower, upper = torch.chunk(x, 2, dim=-1)
    rotated_lower = lower * cos + upper * sin
    rotated_upper = lower * (-sin) + upper * cos
    return torch.cat((rotated_lower, rotated_upper), dim=-1)
class MLP(nn.Module):
    """Feed-forward block: bias-free Linear -> relu(x)^2 -> bias-free Linear.

    The hidden layer is 4x as wide as config.n_embd; input and output widths
    are both config.n_embd.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        # Up-projection is created first, then the down-projection
        self.c_fc = nn.Linear(width, hidden, bias=False)
        self.c_proj = nn.Linear(hidden, width, bias=False)

    def forward(self, x):
        # Squared ReLU sits between the two projections
        activated = F.relu(self.c_fc(x)).square()
        return self.c_proj(activated)
+ # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.resize_token_embeddings + padded_vocab_size = ((config.vocab_size + pad_vocab_size_to - 1) // pad_vocab_size_to) * pad_vocab_size_to + if padded_vocab_size != config.vocab_size: + print0(f"Padding vocab_size from {config.vocab_size} to {padded_vocab_size} for efficiency") + self.transformer = nn.ModuleDict({ + "wte": nn.Embedding(padded_vocab_size, config.n_embd), + "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]), + }) + self.lm_head = nn.Linear(config.n_embd, padded_vocab_size, bias=False) + # Per-layer learnable scalars (inspired by modded-nanogpt) + # resid_lambdas: scales the residual stream at each layer (init 1.0 = neutral) + # x0_lambdas: blends initial embedding back in at each layer (init 0.0 = disabled) + # Separate parameters so they can have different optimizer treatment + self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() + self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + # Value embeddings (ResFormer-style): alternating layers, last layer always included + head_dim = config.n_embd // config.n_head + kv_dim = config.n_kv_head * head_dim + self.value_embeds = nn.ModuleDict({str(i): nn.Embedding(padded_vocab_size, kv_dim) for i in range(config.n_layer) if has_ve(i, config.n_layer)}) + # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. + # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory, + # so let's just over-compute them by 10X, but assert fail if we ever reach that amount. + # In the future we can dynamically grow the cache, for now it's fine. + self.rotary_seq_len = config.sequence_len * 10 # 10X over-compute should be enough, TODO make nicer? 
+ head_dim = config.n_embd // config.n_head + cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) + self.register_buffer("cos", cos, persistent=False) # persistent=False means it's not saved to the checkpoint + self.register_buffer("sin", sin, persistent=False) + + @torch.no_grad() + def init_weights(self): + """ + Initialize the full model in this one function for maximum clarity. + + wte (embedding): normal, std=1.0 + lm_head: normal, std=0.001 + for each block: + attn.c_q: uniform, std=1/sqrt(n_embd) + attn.c_k: uniform, std=1/sqrt(n_embd) + attn.c_v: uniform, std=1/sqrt(n_embd) + attn.c_proj: zeros + mlp.c_fc: uniform, std=1/sqrt(n_embd) + mlp.c_proj: zeros + """ + + # Embedding and unembedding + torch.nn.init.normal_(self.transformer.wte.weight, mean=0.0, std=1.0) + torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001) + + # Transformer blocks: uniform init with bound = sqrt(3) * std (same standard deviation as normal) + n_embd = self.config.n_embd + s = 3**0.5 * n_embd**-0.5 # sqrt(3) multiplier makes sure Uniform achieves the same std as Normal + for block in self.transformer.h: + torch.nn.init.uniform_(block.attn.c_q.weight, -s, s) # weights use Uniform to avoid outliers + torch.nn.init.uniform_(block.attn.c_k.weight, -s, s) + torch.nn.init.uniform_(block.attn.c_v.weight, -s, s) + torch.nn.init.zeros_(block.attn.c_proj.weight) # projections are zero + torch.nn.init.uniform_(block.mlp.c_fc.weight, -s, s) + torch.nn.init.zeros_(block.mlp.c_proj.weight) + + # Per-layer scalars + self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init + self.x0_lambdas.fill_(0.1) # 0.1 => small initial weight for skip connection to input embedding + + # Value embeddings (init like c_v: uniform with same std) + for ve in self.value_embeds.values(): + torch.nn.init.uniform_(ve.weight, -s, s) + + # Gate weights init to zero so gates start at sigmoid(0) = 0.5, scaled by 2 -> 1.0 (neutral) + for block in self.transformer.h: + if 
block.attn.ve_gate is not None: + torch.nn.init.zeros_(block.attn.ve_gate.weight) + + # Rotary embeddings + head_dim = self.config.n_embd // self.config.n_head + cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) + self.cos, self.sin = cos, sin + + # Cast embeddings to bf16: optimizer can tolerate it and it saves memory + if self.transformer.wte.weight.device.type == "cuda": + self.transformer.wte.to(dtype=torch.bfloat16) + for ve in self.value_embeds.values(): + ve.to(dtype=torch.bfloat16) + + def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): + # TODO: bump base theta more? e.g. 100K is more common more recently + # autodetect the device from model embeddings + if device is None: + device = self.transformer.wte.weight.device + # stride the channels + channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device) + inv_freq = 1.0 / (base ** (channel_range / head_dim)) + # stride the time steps + t = torch.arange(seq_len, dtype=torch.float32, device=device) + # calculate the rotation frequencies at each (time, channel) pair + freqs = torch.outer(t, inv_freq) + cos, sin = freqs.cos(), freqs.sin() + cos, sin = cos.bfloat16(), sin.bfloat16() # keep them in bfloat16 + cos, sin = cos[None, :, None, :], sin[None, :, None, :] # add batch and head dims for later broadcasting + return cos, sin + + def _compute_window_sizes(self, config): + """ + Compute per-layer window sizes for sliding window attention. + + Returns list of (left, right) tuples for FA3's window_size parameter: + - left: how many tokens before current position to attend to (-1 = unlimited) + - right: how many tokens after current position to attend to (0 for causal) + + Pattern string is tiled across layers. Final layer always gets L (full context). 
def estimate_flops(self):
    """
    Estimate the FLOPs spent per token (forward + backward).

    Matmul weights: each parameter costs 2 FLOPs in the forward pass (multiply + accumulate)
    and twice that in the backward pass, hence the factor 6.
    Explanation: https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4
    Attention: every layer adds 12 * n_head * head_dim * effective_seq_len for the key @ query
    matmuls, where effective_seq_len is the sequence length capped by that layer's left window
    (a negative left window means no cap). Ref: https://arxiv.org/abs/2204.02311 (PaLM paper).
    This is ~1% off from the exact Chinchilla formulas, which additionally count:
    - the embedding layer (it is just a lookup => ignored here)
    - exp/sum/divide in the attention softmax (very tiny => ignored here)
    """
    cfg = self.config
    n_head = cfg.n_head
    head_dim = cfg.n_embd // cfg.n_head
    seq_len = cfg.sequence_len

    total_params = sum(p.numel() for p in self.parameters())
    # Parameters that never take part in a matmul: embeddings and per-layer scalars
    non_matmul = self.transformer.wte.weight.numel()
    non_matmul += sum(ve.weight.numel() for ve in self.value_embeds.values())
    non_matmul += self.resid_lambdas.numel() + self.x0_lambdas.numel()

    def attended_length(window_size):
        left = window_size[0]  # (left, right) tuple, only the left side matters
        return seq_len if left < 0 else min(left, seq_len)

    attn_flops = sum(12 * n_head * head_dim * attended_length(ws) for ws in self.window_sizes)
    return 6 * (total_params - non_matmul) + attn_flops
def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5):
    """
    Build the combined optimizer for this model.

    AdamW groups (all with eps=1e-10 and no weight decay): lm_head, token embedding,
    value embeddings, resid_lambdas (at scalar_lr * 0.01) and x0_lambdas (with its own betas).
    The lm_head and embedding learning rates are scaled by (n_embd / 768) ** -0.5.
    Muon groups: the parameters of self.transformer.h, one group per distinct shape,
    at matrix_lr with the given weight_decay.
    Every resulting group also gets an "initial_lr" entry equal to its lr.
    Uses the distributed optimizer class when get_dist_info() reports ddp.
    """
    ddp, _rank, _local_rank, _world_size = get_dist_info()
    model_dim = self.config.n_embd

    # Partition the parameters; together the buckets must cover all of them
    matrix_params = list(self.transformer.h.parameters())
    value_embeds_params = list(self.value_embeds.parameters())
    embedding_params = list(self.transformer.wte.parameters())
    lm_head_params = list(self.lm_head.parameters())
    resid_params = [self.resid_lambdas]
    x0_params = [self.x0_lambdas]
    buckets = (matrix_params, embedding_params, lm_head_params, value_embeds_params, resid_params, x0_params)
    assert len(list(self.parameters())) == sum(len(bucket) for bucket in buckets)

    # Scale the LR for the AdamW parameters by ∝1/√dmodel (tuned for 768 dim model)
    dmodel_lr_scale = (model_dim / 768) ** -0.5
    print0(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}")

    def adamw_group(params, lr, betas=adam_betas):
        # Every field is spelled out explicitly; AdamW groups never use weight decay here
        return dict(kind='adamw', params=params, lr=lr, betas=betas, eps=1e-10, weight_decay=0.0)

    param_groups = [
        adamw_group(lm_head_params, unembedding_lr * dmodel_lr_scale),
        adamw_group(embedding_params, embedding_lr * dmodel_lr_scale),
        adamw_group(value_embeds_params, embedding_lr * dmodel_lr_scale),
        adamw_group(resid_params, scalar_lr * 0.01),
        adamw_group(x0_params, scalar_lr, betas=(0.96, 0.95)),  # higher beta1 for x0
    ]

    # Muon groups: bucket the matrix params by shape so each group can be stacked
    params_by_shape = {}
    for p in matrix_params:
        params_by_shape.setdefault(p.shape, []).append(p)
    for shape in sorted(params_by_shape):
        param_groups.append(dict(
            kind='muon', params=params_by_shape[shape], lr=matrix_lr,
            momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=weight_decay,
        ))

    optimizer_cls = DistMuonAdamW if ddp else MuonAdamW
    optimizer = optimizer_cls(param_groups)
    for group in optimizer.param_groups:
        group["initial_lr"] = group["lr"]
    return optimizer
@torch.inference_mode()
def generate(self, tokens, max_tokens, temperature=1.0, top_k=None, seed=42):
    """
    Naive autoregressive streaming inference: yields one new token id at a time.

    Simplifying assumptions:
    - batch size is 1
    - tokens is a plain Python list of ints and the yielded tokens are Python ints
    The full sequence is re-run through forward() at every step.
    temperature > 0 samples with a generator seeded by `seed`; otherwise decoding is greedy.
    A positive top_k masks every logit below the k-th largest before sampling/argmax.
    """
    assert isinstance(tokens, list)
    device = self.get_device()
    sampling = temperature > 0
    rng = None
    if sampling:
        rng = torch.Generator(device=device)
        rng.manual_seed(seed)
    use_top_k = top_k is not None and top_k > 0
    ids = torch.tensor([tokens], dtype=torch.long, device=device)  # add batch dim
    for _ in range(max_tokens):
        last_logits = self.forward(ids)[:, -1, :]  # (B, vocab_size) at the final position
        if use_top_k:
            k = min(top_k, last_logits.size(-1))
            kth_best = torch.topk(last_logits, k).values[:, [-1]]
            last_logits[last_logits < kth_best] = -float('Inf')
        if sampling:
            probs = F.softmax(last_logits / temperature, dim=-1)
            next_ids = torch.multinomial(probs, num_samples=1, generator=rng)
        else:
            next_ids = torch.argmax(last_logits, dim=-1, keepdim=True)
        ids = torch.cat((ids, next_ids), dim=1)
        yield next_ids.item()
@torch.compile(dynamic=False, fullgraph=True)
def adamw_step_fused(
    p: Tensor, # parameter tensor, updated in place (example shape: (32768, 768))
    grad: Tensor, # gradient, same shape as p
    exp_avg: Tensor, # first moment, same shape as p, updated in place
    exp_avg_sq: Tensor, # second moment, same shape as p, updated in place
    step_t: Tensor, # () - 0-D CPU tensor, step count
    lr_t: Tensor, # () - 0-D CPU tensor, learning rate
    beta1_t: Tensor, # () - 0-D CPU tensor, beta1
    beta2_t: Tensor, # () - 0-D CPU tensor, beta2
    eps_t: Tensor, # () - 0-D CPU tensor, epsilon
    wd_t: Tensor, # () - 0-D CPU tensor, weight decay
) -> None:
    """
    Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update

    Everything runs in one compiled graph to eliminate Python overhead between ops.
    Hyperparameters arrive as 0-D CPU tensors (rather than Python floats) so that a
    changed value does not trigger a recompilation.

    Mutates p, exp_avg and exp_avg_sq in place; grad is only read. Returns None.
    The statement order matters: the moments are updated before they are read
    for the parameter update.
    """
    # Weight decay (decoupled, applied before the update): p <- p * (1 - lr * wd)
    p.mul_(1 - lr_t * wd_t)
    # Update running averages (lerp_ is cleaner and fuses well):
    # m <- m + (1 - beta1) * (grad - m),  v <- v + (1 - beta2) * (grad^2 - v)
    exp_avg.lerp_(grad, 1 - beta1_t)
    exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
    # Bias corrections for the zero-initialized moments
    bias1 = 1 - beta1_t ** step_t
    bias2 = 1 - beta2_t ** step_t
    # Compute update and apply; eps is added after the bias-corrected square root
    denom = (exp_avg_sq / bias2).sqrt() + eps_t
    step_size = lr_t / bias1
    p.add_(exp_avg / denom, alpha=-step_size)
@torch.compile(dynamic=False, fullgraph=True)
def muon_step_fused(
    stacked_grads: Tensor, # stacked gradients, overwritten in place (example shape: (12, 768, 3072))
    stacked_params: Tensor, # stacked parameters, same shape, updated in place
    momentum_buffer: Tensor, # first moment buffer, same shape, updated in place
    second_momentum_buffer: Tensor, # factored second moment, updated in place (example: (12, 768, 1) or (12, 1, 3072))
    momentum_t: Tensor, # () - 0-D CPU tensor, momentum coefficient
    lr_t: Tensor, # () - 0-D CPU tensor, learning rate
    wd_t: Tensor, # () - 0-D CPU tensor, weight decay
    beta2_t: Tensor, # () - 0-D CPU tensor, beta2 for second moment
    ns_steps: int, # number of Polar Express iterations (how many entries of polar_express_coeffs are used)
    red_dim: int, # -1 or -2 - reduction dimension for variance
) -> None:
    """
    Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update

    Everything runs in one compiled graph to eliminate Python overhead between ops.
    Some of the constants are 0-D CPU tensors to avoid recompilation when values change;
    ns_steps and red_dim are plain ints.

    Mutates stacked_grads, stacked_params, momentum_buffer and second_momentum_buffer
    in place and returns None. The caller is responsible for copying stacked_params
    back into the individual parameters.
    """

    # Nesterov momentum: first fold the gradient into the buffer, then blend the
    # updated buffer back into the gradient. The second lerp_ overwrites stacked_grads.
    momentum = momentum_t.to(stacked_grads.dtype)
    momentum_buffer.lerp_(stacked_grads, 1 - momentum)
    g = stacked_grads.lerp_(momentum_buffer, momentum)

    # Polar express: iterate in bfloat16 on a copy normalized per matrix
    # (norm over the last two dims, with a small safety margin).
    X = g.bfloat16()
    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
    if g.size(-2) > g.size(-1): # Tall matrix: use X^T X, the smaller of the two Gram matrices
        for a, b, c in polar_express_coeffs[:ns_steps]:
            A = X.mT @ X
            B = b * A + c * (A @ A)
            X = a * X + X @ B
    else: # Wide matrix (original math): use X X^T
        for a, b, c in polar_express_coeffs[:ns_steps]:
            A = X @ X.mT
            B = b * A + c * (A @ A)
            X = a * X + B @ X
    g = X  # from here on g is the bfloat16 result of the iteration

    # Variance reduction
    # NOTE: g is bfloat16 at this point, so beta2 (and lr / wd below) are cast to bfloat16.
    beta2 = beta2_t.to(g.dtype)
    # Mean of squares along red_dim, computed in float32
    v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
    red_dim_size = g.size(red_dim)
    # mean * size summed over the remaining dim = total sum of squares per matrix
    v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
    v_norm = v_norm_sq.sqrt()
    # Running average of the factored second moment
    second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
    step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
    # Norm the update would have after applying step_size ...
    scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
    v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
    # ... so that the final scale can restore the original per-matrix norm
    final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
    g = g * final_scale.to(g.dtype)

    # Cautious weight decay + parameter update: the decay term is applied only to
    # entries where the update and the parameter have the same sign (product >= 0).
    lr = lr_t.to(g.dtype)
    wd = wd_t.to(g.dtype)
    mask = (g * stacked_params) >= 0
    stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
class MuonAdamW(torch.optim.Optimizer):
    """
    Single-GPU combined optimizer: Muon for the 2D matrix groups, fused AdamW for the rest.

    Muon - MomentUm Orthogonalized by Newton-schulz
    https://kellerjordan.github.io/posts/muon/

    Muon runs SGD-momentum and then post-processes each 2D parameter's update with an
    orthogonalization step, done by an iteration that can be run stably in bfloat16 on the GPU.

    Some warnings:
    - Do not use Muon for the embedding layer, the final fully connected layer, or any
      {0,1}-D parameters; optimize those with a standard method (e.g., AdamW).
    - For 4D convolutional filters, flattening their last 3 dimensions works well.

    Arguments:
        param_groups: List of dicts, each containing:
            - 'params': List of parameters
            - 'kind': 'adamw' or 'muon'
            - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
            - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
    """

    def __init__(self, param_groups: list[dict]):
        super().__init__(param_groups, defaults={})
        # Hyperparameters are handed to the compiled kernels through reusable 0-D CPU
        # tensors, so that changing a value does not cause torch.compile to recompile.
        scalar_slots = (
            # AdamW
            "_adamw_step_t", "_adamw_lr_t", "_adamw_beta1_t",
            "_adamw_beta2_t", "_adamw_eps_t", "_adamw_wd_t",
            # Muon
            "_muon_momentum_t", "_muon_lr_t", "_muon_wd_t", "_muon_beta2_t",
        )
        for slot in scalar_slots:
            setattr(self, slot, torch.tensor(0.0, dtype=torch.float32, device="cpu"))

    def _step_adamw(self, group: dict) -> None:
        """
        Run the fused AdamW kernel once per parameter of the group.
        Parameters without a gradient are skipped; state is created lazily.
        """
        for p in group['params']:
            grad = p.grad
            if grad is None:
                continue
            state = self.state[p]
            if not state:
                # First time this parameter is updated
                state.update(step=0, exp_avg=torch.zeros_like(p), exp_avg_sq=torch.zeros_like(p))
            state['step'] += 1

            # Load the current hyperparameter values into the shared 0-D tensors
            current_values = (
                (self._adamw_step_t, state['step']),
                (self._adamw_lr_t, group['lr']),
                (self._adamw_beta1_t, group['betas'][0]),
                (self._adamw_beta2_t, group['betas'][1]),
                (self._adamw_eps_t, group['eps']),
                (self._adamw_wd_t, group['weight_decay']),
            )
            for slot, value in current_values:
                slot.fill_(value)

            # Fused update: weight_decay -> momentum -> bias_correction -> param_update
            adamw_step_fused(
                p, grad, state['exp_avg'], state['exp_avg_sq'],
                self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
                self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
            )

    def _step_muon(self, group: dict) -> None:
        """
        Run the fused Muon kernel once for the whole group, with all params stacked.
        Group-level buffers are created lazily and kept in the first param's state.
        NOTE: stacking assumes every param in the group has the same shape.
        """
        params = group['params']
        if not params:
            return

        first = params[0]
        state = self.state[first]
        count = len(params)
        shape, device, dtype = first.shape, first.device, first.dtype

        # One momentum slice per individual parameter
        if "momentum_buffer" not in state:
            state["momentum_buffer"] = torch.zeros(count, *shape, dtype=dtype, device=device)

        # The second moment is factored: per-row for tall/square matrices, per-column for wide ones
        rows, cols = shape[-2], shape[-1]
        tall_or_square = rows >= cols
        if "second_momentum_buffer" not in state:
            factored_shape = (count, rows, 1) if tall_or_square else (count, 1, cols)
            state["second_momentum_buffer"] = torch.zeros(factored_shape, dtype=dtype, device=device)
        red_dim = -1 if tall_or_square else -2

        stacked_grads = torch.stack([q.grad for q in params])
        stacked_params = torch.stack(params)

        # Load the current hyperparameter values into the shared 0-D tensors
        self._muon_momentum_t.fill_(group["momentum"])
        beta2 = group["beta2"]
        self._muon_beta2_t.fill_(0.0 if beta2 is None else beta2)
        # Shape-based LR scaling (flipped from original):
        # - Tall matrices (input projections like c_fc): 1x
        # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
        ratio = rows / cols
        lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
        self._muon_lr_t.fill_(group["lr"] * lr_mult)
        self._muon_wd_t.fill_(group["weight_decay"])

        # Single fused kernel: momentum -> polar_express -> variance_reduction -> update
        muon_step_fused(
            stacked_grads,
            stacked_params,
            state["momentum_buffer"],
            state["second_momentum_buffer"],
            self._muon_momentum_t,
            self._muon_lr_t,
            self._muon_wd_t,
            self._muon_beta2_t,
            group["ns_steps"],
            red_dim,
        )

        # The kernel updated the stacked copy; write the slices back to the real params
        torch._foreach_copy_(params, list(stacked_params.unbind(0)))

    @torch.no_grad()
    def step(self):
        """Update every param group with the kernel selected by its 'kind'."""
        for group in self.param_groups:
            kind = group['kind']
            if kind == 'adamw':
                self._step_adamw(group)
                continue
            if kind == 'muon':
                self._step_muon(group)
                continue
            raise ValueError(f"Unknown optimizer kind: {group['kind']}")
Optimizer state (exp_avg, + exp_avg_sq) is sharded - each rank only stores state for its slice. + Requires param.shape[0] divisible by world_size. + + Muon Communication (stacked + chunked): + - All params in a Muon group must have the same shape (caller's responsibility). + - Stack all K params into a single (K, *shape) tensor for efficient comm. + - Divide K params across N ranks: each rank "owns" ceil(K/N) params. + - reduce_scatter the stacked grads so each rank gets its chunk. + - Each rank computes Muon update only for params it owns. + - all_gather the updated params back to all ranks. + - Optimizer state (momentum_buffer, second_momentum_buffer) is sharded by chunk. + - Padding: if K doesn't divide evenly, we zero-pad to (ceil(K/N) * N) for comm, + then ignore the padding when copying back. + + Buffer Reuse: + - For Muon, we allocate stacked_grads for reduce_scatter input, then reuse the + same buffer as the output for all_gather (stacked_params). This saves memory + since we don't need both buffers simultaneously. 
+ + Arguments: + param_groups: List of dicts, each containing: + - 'params': List of parameters + - 'kind': 'adamw' or 'muon' + - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay' + - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay' + """ + def __init__(self, param_groups: list[dict]): + super().__init__(param_groups, defaults={}) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + + def _reduce_adamw(self, group: dict, world_size: int) -> dict: + """Launch async reduce ops for AdamW group. 
Returns info dict with per-param infos.""" + param_infos = {} + for p in group['params']: + grad = p.grad + if p.numel() < 1024: + # Small params: all_reduce (no scatter/gather needed) + future = dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future() + param_infos[p] = dict(future=future, grad_slice=grad, is_small=True) + else: + # Large params: reduce_scatter + rank_size = grad.shape[0] // world_size + grad_slice = torch.empty_like(grad[:rank_size]) + future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future() + param_infos[p] = dict(future=future, grad_slice=grad_slice, is_small=False) + return dict(param_infos=param_infos) + + def _reduce_muon(self, group: dict, world_size: int) -> dict: + """Launch async reduce op for Muon group. Returns info dict.""" + params = group['params'] + chunk_size = (len(params) + world_size - 1) // world_size + padded_num_params = chunk_size * world_size + p = params[0] + shape, device, dtype = p.shape, p.device, p.dtype + + # Stack grads and zero-pad to padded_num_params + grad_stack = torch.stack([p.grad for p in params]) + stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device) + stacked_grads[:len(params)].copy_(grad_stack) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + # Reduce_scatter to get this rank's chunk + grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + future = dist.reduce_scatter_tensor(grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True).get_future() + + return dict(future=future, grad_chunk=grad_chunk, stacked_grads=stacked_grads, chunk_size=chunk_size) + + def _compute_adamw(self, group: dict, info: dict, gather_list: list, rank: int, world_size: int) -> None: + """Wait for reduce, compute AdamW updates, launch gathers for large params.""" + param_infos = info['param_infos'] + for p in group['params']: + pinfo = param_infos[p] + pinfo['future'].wait() + 
grad_slice = pinfo['grad_slice'] + state = self.state[p] + + # For small params, operate on full param; for large, operate on slice + if pinfo['is_small']: + p_slice = p + else: + rank_size = p.shape[0] // world_size + p_slice = p[rank * rank_size:(rank + 1) * rank_size] + + # State init + if not state: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_slice) + state['exp_avg_sq'] = torch.zeros_like(p_slice) + state['step'] += 1 + + # Fill 0-D tensors and run fused kernel + self._adamw_step_t.fill_(state['step']) + self._adamw_lr_t.fill_(group['lr']) + self._adamw_beta1_t.fill_(group['betas'][0]) + self._adamw_beta2_t.fill_(group['betas'][1]) + self._adamw_eps_t.fill_(group['eps']) + self._adamw_wd_t.fill_(group['weight_decay']) + adamw_step_fused( + p_slice, grad_slice, state['exp_avg'], state['exp_avg_sq'], + self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t, + self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t, + ) + + # Large params need all_gather + if not pinfo['is_small']: + future = dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future() + gather_list.append(dict(future=future, params=None)) + + def _compute_muon(self, group: dict, info: dict, gather_list: list, rank: int) -> None: + """Wait for reduce, compute Muon updates, launch gather.""" + info['future'].wait() + params = group['params'] + chunk_size = info['chunk_size'] + grad_chunk = info['grad_chunk'] + p = params[0] + shape, device, dtype = p.shape, p.device, p.dtype + + # How many params does this rank own? 
+ start_idx = rank * chunk_size + num_owned = min(chunk_size, max(0, len(params) - start_idx)) + + # Get or create group-level state + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device) + if "second_momentum_buffer" not in state: + state_shape = (chunk_size, shape[-2], 1) if shape[-2] >= shape[-1] else (chunk_size, 1, shape[-1]) + state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device) + red_dim = -1 if shape[-2] >= shape[-1] else -2 + + # Build output buffer for all_gather + updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + + if num_owned > 0: + owned_params = [params[start_idx + i] for i in range(num_owned)] + stacked_owned = torch.stack(owned_params) + + # Fill 0-D tensors and run fused kernel + self._muon_momentum_t.fill_(group["momentum"]) + self._muon_beta2_t.fill_(group["beta2"]) + # Shape-based LR scaling (flipped from original): + # - Tall matrices (input projections like c_fc): 1x + # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4 + ratio = shape[-2] / shape[-1] + lr_mult = 1.0 if ratio >= 1 else ratio**-0.5 + self._muon_lr_t.fill_(group["lr"] * lr_mult) + self._muon_wd_t.fill_(group["weight_decay"]) + muon_step_fused( + grad_chunk[:num_owned], stacked_owned, + state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned], + self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t, + group["ns_steps"], red_dim, + ) + updated_params[:num_owned].copy_(stacked_owned) + + if num_owned < chunk_size: + updated_params[num_owned:].zero_() + + # Reuse stacked_grads buffer for all_gather output + stacked_params = info["stacked_grads"] + future = dist.all_gather_into_tensor(stacked_params, updated_params, async_op=True).get_future() + gather_list.append(dict(future=future, stacked_params=stacked_params, params=params)) + + def 
_finish_gathers(self, gather_list: list) -> None: + """Wait for all gathers and copy Muon params back.""" + for info in gather_list: + info["future"].wait() + if info["params"] is not None: + # Muon: copy from stacked buffer back to individual params + torch._foreach_copy_(info["params"], list(info["stacked_params"][:len(info["params"])].unbind(0))) + + @torch.no_grad() + def step(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + # Phase 1: launch all async reduce ops + reduce_infos: list[dict] = [] + for group in self.param_groups: + if group['kind'] == 'adamw': + reduce_infos.append(self._reduce_adamw(group, world_size)) + elif group['kind'] == 'muon': + reduce_infos.append(self._reduce_muon(group, world_size)) + else: + raise ValueError(f"Unknown optimizer kind: {group['kind']}") + + # Phase 2: wait for reduces, compute updates, launch gathers + gather_list: list[dict] = [] + for group, info in zip(self.param_groups, reduce_infos): + if group['kind'] == 'adamw': + self._compute_adamw(group, info, gather_list, rank, world_size) + elif group['kind'] == 'muon': + self._compute_muon(group, info, gather_list, rank) + else: + raise ValueError(f"Unknown optimizer kind: {group['kind']}") + + # Phase 3: wait for gathers, copy back + self._finish_gathers(gather_list) +""" +Distributed dataloaders for pretraining. + +Two implementations are provided: + +1. Original (tokenizing_distributed_data_loader): + - Streams tokens into a flat buffer, reshapes to (B, T) + - Rows may start mid-document (no guaranteed BOS at position 0) + - 100% token utilization, simple and efficient + +2. 
def _document_batches(split, resume_state_dict, tokenizer_batch_size):
    """
    Infinite iterator over document batches (list of text strings) from parquet files.

    Handles DDP sharding and approximate resume. Each yield is (text_batch, (pq_idx, rg_idx, epoch))
    where text_batch is a list of document strings, indices track position for resumption,
    and epoch counts how many times we've cycled through the dataset (starts at 1).

    Split selection: the last parquet file is the "val" split, all earlier files are "train".
    Row groups are strided across ranks (rank r reads row groups r, r + world_size, ...).

    Args:
        split: "train" selects all files but the last; any other value selects the last file.
        resume_state_dict: None, or a dict with "pq_idx" and "rg_idx" (and optionally "epoch")
            as previously yielded by this iterator.
        tokenizer_batch_size: maximum number of documents per yielded batch.

    Raises:
        AssertionError: if no parquet files exist, or the requested split ends up empty
            (e.g. "train" when only a single parquet file is available).
    """
    _, ddp_rank, _, ddp_world_size = get_dist_info()

    parquet_paths = list_parquet_files()
    assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
    parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
    # Without this guard an empty split makes the loop below spin forever
    # without ever yielding (the inner while never runs).
    assert len(parquet_paths) != 0, f"No parquet files left for split '{split}' (the last file is reserved for val)"

    resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
    resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
    resume_epoch = resume_state_dict.get("epoch", 1) if resume_state_dict is not None else 1
    first_pass = True
    epoch = resume_epoch

    while True: # iterate infinitely (multi-epoch)
        # Only the first pass starts at the resume file; later epochs start from file 0
        pq_idx = resume_pq_idx if first_pass else 0
        while pq_idx < len(parquet_paths):
            filepath = parquet_paths[pq_idx]
            pf = pq.ParquetFile(filepath)
            # Start from resume point if resuming on same file, otherwise from DDP rank
            if first_pass and (resume_rg_idx is not None) and (pq_idx == resume_pq_idx):
                base_idx = resume_rg_idx // ddp_world_size
                base_idx += 1 # advance by 1 so we don't repeat data after resuming
                rg_idx = base_idx * ddp_world_size + ddp_rank
                if rg_idx >= pf.num_row_groups:
                    # Resume point was the end of this file: move on to the next one
                    pq_idx += 1
                    continue
                resume_rg_idx = None # only do this once
            else:
                rg_idx = ddp_rank
            while rg_idx < pf.num_row_groups:
                rg = pf.read_row_group(rg_idx)
                batch = rg.column('text').to_pylist()
                for i in range(0, len(batch), tokenizer_batch_size):
                    yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx, epoch)
                rg_idx += ddp_world_size
            pq_idx += 1
        first_pass = False
        epoch += 1
+ """ + assert split in ["train", "val"], "split must be 'train' or 'val'" + + batches = _document_batches(split, resume_state_dict, tokenizer_batch_size) + needed_tokens = B * T + 1 # +1 for target at last position + bos_token = tokenizer.get_bos_token_id() + token_buffer = [] + pq_idx, rg_idx, epoch = 0, 0, 1 + + while True: + + # Accumulate enough tokens + while len(token_buffer) < needed_tokens: + doc_batch, (pq_idx, rg_idx, epoch) = next(batches) + token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) + for tokens in token_lists: + token_buffer.extend(tokens) + tokens = token_buffer[:needed_tokens] # Read B*T+1 tokens (+1 is only for the target for the last token) + token_buffer = token_buffer[B*T:] # Advance by B*T tokens, so we move exactly one window of B*T tokens over + + # Package tokens into inputs and targets, yield + use_cuda = device == "cuda" + scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda) + inputs = scratch[:-1].view(B, T).to(device=device, non_blocking=use_cuda) + targets = scratch[1:].view(B, T).to(device=device, non_blocking=use_cuda) + yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + + +def tokenizing_distributed_data_loader(*args, **kwargs): + """Helper that omits state_dict from yields.""" + for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs): + yield inputs, targets + + +def tokenizing_distributed_data_loader_with_state_bos_bestfit( + tokenizer, B, T, split, + tokenizer_threads=4, tokenizer_batch_size=128, + device="cuda", resume_state_dict=None, + buffer_size=1000 +): + """ + BOS-aligned dataloader with Best-Fit Cropping. + + Reduces token waste compared to simple greedy cropping by searching a buffer + for documents that fit well, while maintaining 100% utilization (no padding). + + Algorithm for each row: + 1. From buffered docs, pick the LARGEST doc that fits entirely + 2. Repeat until no doc fits + 3. 
When nothing fits, crop a doc to fill remaining space exactly + + Key properties: + - Every row starts with BOS + - 100% utilization (no padding, every token is trained on) + - Approximately 35% of all tokens are discarded due to cropping + """ + assert split in ["train", "val"], "split must be 'train' or 'val'" + + row_capacity = T + 1 + batches = _document_batches(split, resume_state_dict, tokenizer_batch_size) + bos_token = tokenizer.get_bos_token_id() + pq_idx, rg_idx, epoch = 0, 0, 1 + + # Token pool: single tensor holding all buffered tokens + # Documents tracked as (start, length) tuples + pool = torch.empty(buffer_size * 512, dtype=torch.long) + pool_end = 0 + docs = [] # [(start, length), ...] + + def compact_pool(): + """Shift active documents to front of pool, reclaiming space.""" + nonlocal pool_end + if not docs: + pool_end = 0 + return + write_pos = 0 + for i, (start, length) in enumerate(docs): + if start != write_pos: + pool[write_pos:write_pos + length] = pool[start:start + length].clone() + docs[i] = (write_pos, length) + write_pos += length + pool_end = write_pos + + def refill_buffer(): + """Retrieve more docs and add them to the pool""" + nonlocal pq_idx, rg_idx, epoch, pool, pool_end + doc_batch, (pq_idx, rg_idx, epoch) = next(batches) + token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) + # Number of new tokens to store + total_new = sum(len(t) for t in token_lists) + # If there's not enough space at the end, + if pool_end + total_new > pool.size(0): + compact_pool() # Try compacting first. + # If still not enough, + if pool_end + total_new > pool.size(0): + # Allocate a new, larger pool. 
+ new_size = max(pool.size(0) * 2, pool_end + total_new) + new_pool = torch.empty(new_size, dtype=torch.long) + new_pool[:pool_end] = pool[:pool_end] + pool = new_pool + # Write tokens to pool + for tokens in token_lists: + n = len(tokens) + pool[pool_end:pool_end + n] = torch.tensor(tokens, dtype=torch.long) + docs.append((pool_end, n)) + pool_end += n + + # Pre-allocate buffers once + use_cuda = device == "cuda" + row_buffer = torch.empty((B, row_capacity), dtype=torch.long) + inputs = torch.empty((B, T), dtype=torch.long, device=device) + targets = torch.empty((B, T), dtype=torch.long, device=device) + + while True: + for row_idx in range(B): + col = 0 + while col < row_capacity: + # Ensure buffer has documents + while len(docs) < buffer_size: + refill_buffer() + + remaining = row_capacity - col + + # Find largest doc that fits entirely + best_idx = -1 + best_len = 0 + for i, (start, length) in enumerate(docs): + if length <= remaining and length > best_len: + best_idx = i + best_len = length + + if best_idx >= 0: + start, length = docs.pop(best_idx) + row_buffer[row_idx, col:col + length] = pool[start:start + length] + col += length + else: + # No doc fits - crop shortest to fill remaining + shortest_idx = min(range(len(docs)), key=lambda i: docs[i][1]) + start, length = docs.pop(shortest_idx) + row_buffer[row_idx, col:col + remaining] = pool[start:start + remaining] + col += remaining + + # Copy to GPU + inputs.copy_(row_buffer[:, :-1], non_blocking=use_cuda) + targets.copy_(row_buffer[:, 1:], non_blocking=use_cuda) + + state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + yield inputs, targets, state_dict + +def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs): + """Helper that omits state_dict from yields.""" + for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state_bos_bestfit(*args, **kwargs): + yield inputs, targets + """ +Train model. 
# GC timing callback to detect if garbage collection is causing training stalls
def _gc_callback(phase, info):
    """Time each garbage collection and report the slow ones.

    Has the (phase, info) signature used by entries of `gc.callbacks`. On "start"
    the current perf_counter value is stashed on the function object; on "stop"
    the elapsed time is computed and, if it exceeds 10ms, a line is printed with
    the rank stored on the function, the generation and the collected count.
    Any other phase is ignored.
    """
    if phase == "start":
        _gc_callback.start_time = time.perf_counter()
        return
    if phase != "stop":
        return
    duration_ms = (time.perf_counter() - _gc_callback.start_time) * 1000
    if duration_ms <= 10:
        # Fast collections are not worth a log line
        return
    rank = getattr(_gc_callback, 'rank', '?')
    print(f"[GC rank{rank}] gen{info['generation']}: {duration_ms:.1f}ms collected {info.get('collected', '?')} objects")


# State lives on the function object itself so no extra module globals are needed
_gc_callback.start_time = 0
# Placeholder rank used in log lines until a real rank is assigned
_gc_callback.rank = '?'
# NOTE: _gc_callback.rank keeps its '?' placeholder until compute_init has run (assigned below).
# The callback is registered before the heavy imports that follow, so collections
# that happen during those imports are timed as well.
gc.callbacks.append(_gc_callback)

import wandb
import torch

from nanochat.gpt import GPT, GPTConfig
from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_peak_flops
from nanochat.tokenizer import get_tokenizer, get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine
from nanochat.flash_attention import HAS_FA3
from scripts.base_eval import evaluate_core
print_banner()

# -----------------------------------------------------------------------------
# CLI arguments
parser = argparse.ArgumentParser(description="Pretrain base model")
# Logging
parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)")
# Runtime
parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)")
# Model architecture
parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model")
parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio")
parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
# Training horizon (only one used, in order of precedence)
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
parser.add_argument("--target-param-data-ratio", type=float, default=10.5, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)")
# Optimization
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)")
parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)")
parser.add_argument("--weight-decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)")
parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)")
parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)")
parser.add_argument("--adam-beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding")
parser.add_argument("--adam-beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding")
parser.add_argument("--warmup-ratio", type=float, default=0.0, help="ratio of iterations for LR warmup")
parser.add_argument("--warmdown-ratio", type=float, default=0.5, help="ratio of iterations for LR warmdown")
parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR as fraction of initial LR")
parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)")
# Evaluation
parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)")
parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on")
parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)")
parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric")
parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)")
parser.add_argument("--save-every", type=int, default=-1, help="save checkpoints every N steps (-1 = only at end)")
# Output
parser.add_argument("--model-tag", type=str, default=None, help="override model tag for checkpoint directory name")
args = parser.parse_args()
user_config = vars(args).copy() # for logging
# -----------------------------------------------------------------------------

# Compute init
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
_gc_callback.rank = ddp_rank # Store rank for GC log printouts
# bf16 autocast is only enabled on CUDA; the sync/memory helpers become no-ops elsewhere
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
if device_type == "cuda":
    gpu_device_name = torch.cuda.get_device_name(0)
    gpu_peak_flops = get_peak_flops(gpu_device_name)
    print0(f"GPU: {gpu_device_name} | Peak FLOPS (BF16): {gpu_peak_flops:.2e}")
else:
    gpu_peak_flops = float('inf') # MFU not meaningful for CPU/MPS

# wandb logging init
# Only the master process talks to wandb; every other rank gets the dummy logger
use_dummy_wandb = args.run == "dummy" or not master_process
if use_dummy_wandb:
    wandb_run = DummyWandb()
else:
    try:
        wandb_run = wandb.init(project="nanochat", name=args.run, config=user_config)
    except wandb.errors.UsageError as e:
        # Degrade to no logging instead of aborting the training run
        print0(f"Warning: wandb initialization failed ({e}), logging disabled. Run 'wandb login' to enable.")
        wandb_run = DummyWandb()

# Flash Attention status
if HAS_FA3:
    print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.")
else:
    print0("!" * 80)
    print0("WARNING: Flash Attention 3 not available, using PyTorch SDPA fallback")
    print0("WARNING: Training will be less efficient without FA3")
    if args.window_pattern != "L":
        print0(f"WARNING: SDPA has no support for sliding window attention (window_pattern='{args.window_pattern}'). Your GPU utilization will be terrible.")
        print0("WARNING: Recommend using --window-pattern L for full context attention without alternating sliding window patterns.")
    print0("!" * 80)

# Tokenizer will be useful for evaluation, also we need the vocab size
tokenizer = get_tokenizer()
token_bytes = get_token_bytes(device=device)
vocab_size = tokenizer.get_vocab_size()
print0(f"Vocab size: {vocab_size:,}")

# Model kwargs are derived from the desired depth of the model
# We nudge model_dim up to the nearest multiple of head_dim to ensure clean division
# (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly)
# (For very small depths, this gives a slight "unfair" advantage to models with odd depths)
num_layers = args.depth
base_dim = args.depth * args.aspect_ratio
model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim
num_heads = model_dim // args.head_dim
num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. GQA is disabled)
head_dim = model_dim // num_heads
print0(f"num_layers: {num_layers}")
print0(f"model_dim: {model_dim} (base: {base_dim}, nudge: {model_dim - base_dim:+d})")
print0(f"num_heads: {num_heads}")
print0(f"head_dim: {head_dim}")
print0(f"num_kv_heads: {num_kv_heads}")

# Optimizer / data / training length related hyperparameters
# figure out the needed gradient accumulation to reach the desired total batch size
tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
# total batch size must be a whole number of micro-batches across all ranks
assert args.total_batch_size % world_tokens_per_fwdbwd == 0
grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd
print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}")
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")

# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19)
batch_lr_scale = 1.0
reference_batch_size = 2**19
batch_ratio = args.total_batch_size / reference_batch_size
if batch_ratio != 1.0:
    # SGD: linear scaling with batch size is standard (not used in nanochat)
    # AdamW: sqrt scaling is standard
    # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer
    batch_lr_scale = batch_ratio ** 0.5
    print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})")

# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio)
weight_decay_scaled = args.weight_decay * (12 / args.depth)**2
if args.depth != 12:
    print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}")

# -----------------------------------------------------------------------------
# Initialize the Model

# Create a new model with random weights
model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim, window_pattern=args.window_pattern)
with torch.device("meta"):
    # All tensors are created as meta tensors (they have shape/dtype but no data)
    model_config = GPTConfig(**model_config_kwargs)
    model = GPT(model_config)
model.to_empty(device=device) # All tensors get storage on target device but with uninitialized (garbage) data
model.init_weights() # All tensors get initialized

# If we are resuming, overwrite the model parameters with those of the checkpoint
base_dir = get_base_dir()
output_dirname = args.model_tag if args.model_tag else f"d{args.depth}" # e.g. d12
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
resuming = args.resume_from_step != -1
if resuming:
    print0(f"Resuming optimization from step {args.resume_from_step}")
    # optimizer_data and meta_data are consumed further below (optimizer, dataloader)
    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, args.resume_from_step, device, load_optimizer=True, rank=ddp_rank)
    model.load_state_dict(model_data, strict=True, assign=True)
    del model_data # free up this memory after the copy

orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the input shapes may change there)
model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe

# Detailed parameter counts
param_counts = orig_model.num_scaling_params()
print0(f"Parameter counts:")
for key, value in param_counts.items():
    print0(f"{key:24s}: {value:,}")
num_params = param_counts['total']
num_scaling_params = param_counts['transformer_matrices'] + param_counts['lm_head'] # determined to give the cleanest scaling laws, see dev/LOG.md Jan 27, 2026
num_flops_per_token = model.estimate_flops()
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")

# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0
if args.num_iterations > 0:
    num_iterations = args.num_iterations
    print0(f"Using user-provided number of iterations: {num_iterations:,}")
elif args.target_flops > 0:
    # calculate the number of iterations from the target flops
    num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size))
    print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}")
elif args.target_param_data_ratio > 0:
    # calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.)
    target_tokens = int(args.target_param_data_ratio * num_scaling_params)
    num_iterations = target_tokens // args.total_batch_size
    print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}")
else:
    # Not reachable while the assert above is active; kept as an explicit error
    raise ValueError("No training horizon specified")
total_tokens = args.total_batch_size * num_iterations
print0(f"Total number of training tokens: {total_tokens:,}")
print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20
print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")

# -----------------------------------------------------------------------------
# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest)
adam_betas = (args.adam_beta1, args.adam_beta2)
# Every learning rate gets the batch-size scale; weight decay gets the depth scale instead
optimizer = model.setup_optimizer(
    unembedding_lr=args.unembedding_lr * batch_lr_scale,
    embedding_lr=args.embedding_lr * batch_lr_scale,
    matrix_lr=args.matrix_lr * batch_lr_scale,
    weight_decay=weight_decay_scaled,
    adam_betas=adam_betas,
    scalar_lr=args.scalar_lr * batch_lr_scale,
)

if resuming:
    optimizer.load_state_dict(optimizer_data)
    del optimizer_data

# -----------------------------------------------------------------------------
# Initialize the DataLoaders for train/val
dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
# A factory rather than a loader: each call builds a fresh val loader with no resume state
build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device)
x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data

# -----------------------------------------------------------------------------
# Set up hyperparameter schedulers
# schedulers

# Learning rate scheduler
def get_lr_multiplier(it):
    """Return the learning-rate multiplier for iteration `it`.

    Linear warmup over the first `warmup_ratio * num_iterations` steps, then constant
    1.0, then a linear warmdown over the last `warmdown_ratio * num_iterations` steps
    that ends at `args.final_lr_frac`.
    """
    warmup_iters = round(args.warmup_ratio * num_iterations)
    warmdown_iters = round(args.warmdown_ratio * num_iterations)
    if it < warmup_iters:
        return (it + 1) / warmup_iters
    elif it <= num_iterations - warmdown_iters:
        return 1.0
    else:
        # progress goes from ~1 (start of warmdown) to 0 (last iteration)
        progress = (num_iterations - it) / warmdown_iters
        return progress * 1.0 + (1 - progress) * args.final_lr_frac

# Momentum scheduler for Muon optimizer
def get_muon_momentum(it):
    """Return the Muon momentum for iteration `it`: linear ramp 0.85 -> 0.95 over the first 300 steps."""
    frac = min(it / 300, 1)
    momentum = (1 - frac) * 0.85 + frac * 0.95
    return momentum

# Weight decay scheduler for Muon optimizer (linear to zero over the course of training)
def get_weight_decay(it):
    """Return the Muon weight decay for iteration `it`, decayed linearly from `weight_decay_scaled` to 0."""
    return weight_decay_scaled * (1 - it / num_iterations)

# -----------------------------------------------------------------------------
# Loop state (variables updated by the training loop)

# mfu is only computed inside a training step; define it up front so the final report
# does not hit a NameError when the loop exits before running one (e.g. resuming at the last step).
mfu = 0.0

if not resuming:
    step = 0
    val_bpb = None # will be set if eval_every > 0
    min_val_bpb = float("inf")
    smooth_train_loss = 0 # EMA of training loss
    total_training_time = 0 # total wall-clock time of training
else:
    step = meta_data["step"]
    loop_state = meta_data["loop_state"]
    val_bpb = meta_data["val_bpb"]
    min_val_bpb = loop_state["min_val_bpb"]
    smooth_train_loss = loop_state["smooth_train_loss"]
    total_training_time = loop_state["total_training_time"]

# -----------------------------------------------------------------------------
# Training loop
while True:
    last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
    flops_so_far = num_flops_per_token * args.total_batch_size * step

    # once in a while: evaluate the val bpb (all ranks participate)
    if args.eval_every > 0 and (last_step or step % args.eval_every == 0):
        model.eval()
        val_loader = build_val_loader()
        eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size)
        with autocast_ctx:
            val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
        print0(f"Step {step:05d} | Validation bpb: {val_bpb:.6f}")
        if val_bpb < min_val_bpb:
            min_val_bpb = val_bpb
        wandb_run.log({
            "step": step,
            "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
            "val/bpb": val_bpb,
        })
        model.train()

    # once in a while: estimate the CORE metric (all ranks participate)
    # use the original uncompiled model because the inputs keep changing shape
    results = {}
    if args.core_metric_every > 0 and (last_step or (step > 0 and step % args.core_metric_every == 0)):
        model.eval()
        with autocast_ctx:
            results = evaluate_core(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task)
        print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}")
        wandb_run.log({
            "step": step,
            "total_training_flops": flops_so_far,
            "core_metric": results["core_metric"],
            "centered_results": results["centered_results"],
        })
        model.train()

    # once in a while: sample from the model (only on master process)
    # use the original uncompiled model because the inputs keep changing shape
    if args.sample_every > 0 and master_process and (last_step or (step > 0 and step % args.sample_every == 0)):
        model.eval()
        prompts = [
            "The capital of France is",
            "The chemical symbol of gold is",
            "If yesterday was Friday, then tomorrow will be",
            "The opposite of hot is",
            "The planets of the solar system are:",
            "My favorite color is",
            "If 5*x + 3 = 13, then x is",
        ]
        engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
        for prompt in prompts:
            tokens = tokenizer(prompt, prepend="<|bos|>")
            with autocast_ctx:
                sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
            print0(tokenizer.decode(sample[0]))
        model.train()

    # save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
    if last_step or (step > 0 and step != args.resume_from_step and args.save_every > 0 and step % args.save_every == 0):
        save_checkpoint(
            checkpoint_dir,
            step,
            orig_model.state_dict(), # model parameters
            optimizer.state_dict(), # optimizer state
            { # metadata saved as json
                "step": step,
                "val_bpb": val_bpb, # loss at last step
                "model_config": model_config_kwargs,
                "user_config": user_config, # inputs to the training script
                "device_batch_size": args.device_batch_size,
                "max_seq_len": args.max_seq_len,
                "dataloader_state_dict": dataloader_state_dict,
                "loop_state": { # all loop state (other than step) so that we can resume training
                    "min_val_bpb": min_val_bpb,
                    "smooth_train_loss": smooth_train_loss,
                    "total_training_time": total_training_time,
                },
            },
            rank=ddp_rank,
        )

    # termination conditions (TODO: possibly also add loss explosions etc.)
    if last_step:
        break

    # -------------------------------------------------------------------------
    # single training step
    # evaluate the gradient
    synchronize()
    # perf_counter is monotonic (unaffected by wall-clock adjustments) and matches the GC timing callback
    t0 = time.perf_counter()
    for micro_step in range(grad_accum_steps):
        with autocast_ctx:
            loss = model(x, y)
        train_loss = loss.detach() # for logging
        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
        loss.backward()
        x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
    # step the optimizer
    lrm = get_lr_multiplier(step)
    muon_momentum = get_muon_momentum(step)
    muon_weight_decay = get_weight_decay(step)
    for group in optimizer.param_groups:
        group["lr"] = group["initial_lr"] * lrm
        if group['kind'] == 'muon':
            group["momentum"] = muon_momentum
            group["weight_decay"] = muon_weight_decay
    optimizer.step()
    model.zero_grad(set_to_none=True)
    train_loss_f = train_loss.item() # .item() is a CPU-GPU sync point
    synchronize()
    t1 = time.perf_counter()
    dt = t1 - t0
    # -------------------------------------------------------------------------

    # logging (CPU action only)
    ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
    smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f # EMA the training loss
    debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
    pct_done = 100 * step / num_iterations
    tok_per_sec = int(args.total_batch_size / dt)
    flops_per_sec = num_flops_per_token * args.total_batch_size / dt
    mfu = 100 * flops_per_sec / (gpu_peak_flops * ddp_world_size)
    if step > 10:
        total_training_time += dt # only count the time after the first 10 steps
    # Calculate ETA based on average time per step (excluding first 10 steps)
    steps_done = step - 10
    if steps_done > 0:
        avg_time_per_step = total_training_time / steps_done
        remaining_steps = num_iterations - step
        eta_seconds = remaining_steps * avg_time_per_step
        eta_str = f" | eta: {eta_seconds/60:.1f}m"
    else:
        eta_str = ""
    epoch = dataloader_state_dict["epoch"]
    print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}")
    if step % 100 == 0:
        log_data = {
            "step": step,
            "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
            "train/loss": debiased_smooth_loss,
            "train/lrm": lrm,
            "train/dt": dt,
            "train/tok_per_sec": tok_per_sec,
            "train/mfu": mfu,
            "train/epoch": epoch,
        }
        wandb_run.log(log_data)

    # Set 'first_step_of_run' flag (must be computed before step is incremented)
    first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)

    # state update
    step += 1

    # # TEMP - Bail at 1000 steps for benchmarking.
    # if step == 1001:
    #     print0(f"Elapsed + ETA: {total_training_time + eta_seconds:.0f}s")
    #     break

    # Help out the garbage collector by flushing garbage and then freezing long-lived objects
    # This eliminates random ~500ms pauses during training steps as the GC scans ~millions of objects for cycles
    if first_step_of_run:
        gc.collect()
        gc.freeze()
        gc.disable() # nuclear option: disable GC for the run
    elif step % 2000 == 0:
        gc.collect() # manual GC to keep memory usage in check for very long runs

# print a few more stats
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")
if val_bpb is not None:
    print0(f"Minimum validation bpb: {min_val_bpb:.6f}")

# Log to report
from nanochat.report import get_report
get_report().log(section="Base model training", data=[
    user_config, # CLI args
    { # stats about the training setup
        "Number of parameters": num_params,
        "Number of FLOPs per token": f"{num_flops_per_token:e}",
        "Calculated number of iterations": num_iterations,
        "Number of training tokens": total_tokens,
        "Tokens : Scaling params ratio": args.total_batch_size * num_iterations / num_scaling_params,
        "DDP world size": ddp_world_size,
        "warmup_ratio": args.warmup_ratio,
        "warmdown_ratio": args.warmdown_ratio,
        "final_lr_frac": args.final_lr_frac,
    },
    { # stats about training outcomes
        "Minimum validation bpb": min_val_bpb if val_bpb is not None else None,
        "Final validation bpb": val_bpb,
        "CORE metric estimate": results.get("core_metric", None),
        "MFU %": f"{mfu:.2f}%", # MFU of the last training step (0.00 if no step ran)
        "Total training flops": f"{flops_so_far:e}",
        "Total training time": f"{total_training_time/60:.2f}m",
        "Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
    }
])

# cleanup
wandb_run.finish() # wandb run finish
compute_cleanup()


# ===== TRAINING OUTPUT =====
#
#
# [GC rank?] gen2: 11.9ms collected 28 objects
# [GC rank?]
gen2: 12.0ms collected 28 objects +[GC rank?] gen2: 10.3ms collected 28 objects +[GC rank?] gen2: 10.4ms collected 28 objects +[GC rank?] gen2: 14.6ms collected 28 objects +[GC rank?] gen2: 14.5ms collected 28 objects +[GC rank?] gen2: 15.1ms collected 28 objects +[GC rank?] gen2: 27.5ms collected 274 objects +[GC rank?] gen2: 31.0ms collected 274 objects +[GC rank?] gen2: 32.1ms collected 274 objects +[GC rank?] gen2: 33.4ms collected 274 objects +[GC rank?] gen2: 34.5ms collected 274 objects +[GC rank?] gen2: 36.4ms collected 274 objects +[GC rank?] gen2: 35.0ms collected 274 objects +[GC rank?] gen2: 35.6ms collected 274 objects +[GC rank?] gen2: 34.9ms collected 10 objects +[GC rank?] gen2: 32.6ms collected 10 objects +[GC rank?] gen2: 45.3ms collected 10 objects +[GC rank?] gen2: 47.3ms collected 10 objects +[GC rank?] gen2: 43.9ms collected 10 objects +[GC rank?] gen2: 53.0ms collected 10 objects +[GC rank?] gen2: 44.5ms collected 10 objects +[GC rank?] gen2: 50.0ms collected 10 objects +[GC rank?] gen2: 57.2ms collected 56 objects +[GC rank?] gen2: 54.6ms collected 56 objects +[GC rank?] gen2: 71.4ms collected 56 objects +[GC rank?] gen2: 77.6ms collected 56 objects +[GC rank?] gen2: 71.0ms collected 56 objects +[GC rank?] gen2: 68.2ms collected 56 objects +[GC rank?] gen2: 77.8ms collected 56 objects +[GC rank?] gen2: 75.4ms collected 56 objects +[GC rank?] gen2: 79.3ms collected 10 objects +[GC rank?] gen2: 79.7ms collected 10 objects +[GC rank?] gen2: 102.8ms collected 10 objects +[GC rank?] gen2: 98.6ms collected 10 objects +[GC rank?] gen2: 99.6ms collected 10 objects +[GC rank?] gen2: 99.0ms collected 10 objects +[GC rank?] gen2: 101.4ms collected 10 objects +[GC rank?] gen2: 105.0ms collected 10 objects +[GC rank?] gen2: 106.3ms collected 20 objects +[GC rank?] gen2: 107.0ms collected 20 objects +[GC rank?] gen2: 138.0ms collected 20 objects +[GC rank?] gen2: 138.4ms collected 20 objects +[GC rank?] gen2: 138.1ms collected 20 objects +[GC rank?] 
gen2: 133.7ms collected 20 objects +[GC rank?] gen2: 135.0ms collected 20 objects +[GC rank?] gen2: 134.5ms collected 20 objects + + █████ █████ + ░░███ ░░███ + ████████ ██████ ████████ ██████ ██████ ░███████ ██████ ███████ + ░░███░░███ ░░░░░███ ░░███░░███ ███░░███ ███░░███ ░███░░███ ░░░░░███░░░███░ + ░███ ░███ ███████ ░███ ░███ ░███ ░███░███ ░░░ ░███ ░███ ███████ ░███ + ░███ ░███ ███░░███ ░███ ░███ ░███ ░███░███ ███ ░███ ░███ ███░░███ ░███ ███ + ████ █████░░████████ ████ █████░░██████ ░░██████ ████ █████░░███████ ░░█████ + ░░░░ ░░░░░ ░░░░░░░░ ░░░░ ░░░░░ ░░░░░░ ░░░░░░ ░░░░ ░░░░░ ░░░░░░░░ ░░░░░ + +Autodetected device type: cuda +2026-02-01 20:37:26,202 - nanochat.common - INFO - Distributed world size: 8 +GPU: NVIDIA H100 80GB HBM3 | Peak FLOPS (BF16): 9.89e+14 +Warning: wandb initialization failed (api_key not configured (no-tty). call wandb.login(key=[your_api_key])), logging disabled. Run 'wandb login' to enable. +✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome. 
+Vocab size: 32,768 +num_layers: 24 +model_dim: 1536 (base: 1536, nudge: +0) +num_heads: 12 +head_dim: 128 +num_kv_heads: 12 +Tokens / micro-batch / rank: 16 x 2048 = 32,768 +Tokens / micro-batch: 262,144 +Total batch size 524,288 => gradient accumulation steps: 2 +Scaling weight decay from 0.200000 to 0.050000 for depth 24 +[GC rank2] gen2: 139.2ms collected 10 objects +[GC rank4] gen2: 154.6ms collected 10 objects +[GC rank6] gen2: 163.6ms collected 10 objects +[GC rank3] gen2: 164.4ms collected 10 objects +[GC rank1] gen2: 165.8ms collected 10 objects +[GC rank7] gen2: 165.8ms collected 10 objects +[GC rank5] gen2: 171.3ms collected 10 objects +[GC rank0] gen2: 170.2ms collected 10 objects +[GC rank2] gen2: 156.1ms collected 0 objects +[GC rank4] gen2: 175.5ms collected 0 objects +[GC rank6] gen2: 182.1ms collected 0 objects +[GC rank3] gen2: 183.4ms collected 0 objects +[GC rank7] gen2: 185.9ms collected 0 objects +[GC rank1] gen2: 191.4ms collected 0 objects +[GC rank5] gen2: 189.8ms collected 0 objects +[GC rank0] gen2: 192.6ms collected 0 objects +Parameter counts: +wte : 50,331,648 +value_embeds : 603,979,776 +lm_head : 50,331,648 +transformer_matrices : 679,481,856 +scalars : 48 +total : 1,384,124,976 +Estimated FLOPs per token: 4.945112e+09 +Calculated number of iterations from target data:param ratio: 16,704 +Total number of training tokens: 8,757,706,752 +Tokens : Scaling params ratio: 12.00 +Total training FLOPs estimate: 4.330784e+19 +Scaling the LR for the AdamW parameters ∝1/√(1536/768) = 0.707107 +[GC rank0] gen2: 219.7ms collected 0 objects +[GC rank2] gen2: 175.0ms collected 83 objects +[GC rank4] gen2: 198.6ms collected 83 objects +[GC rank1] gen2: 202.4ms collected 83 objects +[GC rank7] gen2: 198.4ms collected 83 objects +[GC rank6] gen2: 189.2ms collected 83 objects +[GC rank3] gen2: 199.9ms collected 83 objects +[GC rank5] gen2: 211.7ms collected 83 objects +Step 00000 | Validation bpb: 3.167584 +[GC rank0] gen2: 284.1ms collected 365 
objects +[GC rank2] gen2: 223.6ms collected 384 objects +[GC rank4] gen2: 248.1ms collected 384 objects +[GC rank7] gen2: 250.2ms collected 384 objects +[GC rank6] gen2: 249.5ms collected 267 objects +[GC rank3] gen2: 255.8ms collected 384 objects +[GC rank1] gen2: 256.7ms collected 384 objects +[GC rank5] gen2: 281.4ms collected 384 objects +[GC rank0] gen2: 384.0ms collected 758 objects +[GC rank7] gen2: 320.0ms collected 739 objects +[GC rank4] gen2: 322.2ms collected 739 objects +[GC rank6] gen2: 321.5ms collected 820 objects +[GC rank3] gen2: 327.0ms collected 739 objects +[GC rank2] gen2: 301.2ms collected 739 objects +[GC rank1] gen2: 324.7ms collected 739 objects +[GC rank5] gen2: 378.8ms collected 739 objects +[GC rank6] gen2: 390.6ms collected 2276 objects +[GC rank7] gen2: 391.4ms collected 2266 objects +[GC rank3] gen2: 397.6ms collected 2266 objects +[GC rank4] gen2: 396.6ms collected 2266 objects +[GC rank2] gen2: 383.5ms collected 2266 objects +[GC rank1] gen2: 398.6ms collected 2266 objects +[GC rank0] gen2: 473.9ms collected 2308 objects +[GC rank5] gen2: 475.5ms collected 2265 objects +[GC rank6] gen2: 433.5ms collected 96649 objects +[GC rank7] gen2: 428.9ms collected 17996 objects +[GC rank3] gen2: 431.2ms collected 17996 objects +[GC rank4] gen2: 435.0ms collected 17996 objects +[GC rank2] gen2: 413.7ms collected 17996 objects +[GC rank1] gen2: 433.6ms collected 17996 objects +[GC rank5] gen2: 504.9ms collected 17996 objects +[GC rank0] gen2: 513.0ms collected 17996 objects +[GC rank6] gen2: 532.2ms collected 258240 objects +step 00000/16704 (0.00%) | loss: 10.397165 | lrm: 1.00 | dt: 18077.85ms | tok/sec: 29,001 | mfu: 1.81 | epoch: 1 | total time: 0.00m +[GC rank6] gen2: 344.5ms collected 98 objects +[GC rank2] gen2: 588.5ms collected 336956 objects +[GC rank4] gen2: 612.1ms collected 336956 objects +[GC rank7] gen2: 708.9ms collected 336906 objects +[GC rank1] gen2: 712.1ms collected 336956 objects +[GC rank3] gen2: 723.0ms collected 336956 
objects +[GC rank5] gen2: 745.7ms collected 336956 objects +[GC rank0] gen2: 746.0ms collected 336936 objects +step 00001/16704 (0.01%) | loss: 10.757717 | lrm: 1.00 | dt: 629.00ms | tok/sec: 833,525 | mfu: 52.10 | epoch: 1 | total time: 0.00m +step 00002/16704 (0.01%) | loss: 10.090174 | lrm: 1.00 | dt: 660.18ms | tok/sec: 794,157 | mfu: 49.64 | epoch: 1 | total time: 0.00m +step 00003/16704 (0.02%) | loss: 9.809756 | lrm: 1.00 | dt: 650.37ms | tok/sec: 806,131 | mfu: 50.38 | epoch: 1 | total time: 0.00m +step 00004/16704 (0.02%) | loss: 9.384240 | lrm: 1.00 | dt: 651.85ms | tok/sec: 804,305 | mfu: 50.27 | epoch: 1 | total time: 0.00m +step 00005/16704 (0.03%) | loss: 8.987481 | lrm: 1.00 | dt: 655.89ms | tok/sec: 799,348 | mfu: 49.96 | epoch: 1 | total time: 0.00m +step 00006/16704 (0.04%) | loss: 8.653763 | lrm: 1.00 | dt: 650.04ms | tok/sec: 806,545 | mfu: 50.41 | epoch: 1 | total time: 0.00m +step 00007/16704 (0.04%) | loss: 8.378430 | lrm: 1.00 | dt: 650.73ms | tok/sec: 805,692 | mfu: 50.36 | epoch: 1 | total time: 0.00m +step 00008/16704 (0.05%) | loss: 8.116607 | lrm: 1.00 | dt: 656.24ms | tok/sec: 798,928 | mfu: 49.93 | epoch: 1 | total time: 0.00m +step 00009/16704 (0.05%) | loss: 7.888759 | lrm: 1.00 | dt: 651.90ms | tok/sec: 804,246 | mfu: 50.27 | epoch: 1 | total time: 0.00m +step 00010/16704 (0.06%) | loss: 7.693003 | lrm: 1.00 | dt: 651.19ms | tok/sec: 805,121 | mfu: 50.32 | epoch: 1 | total time: 0.00m +step 00011/16704 (0.07%) | loss: 7.507898 | lrm: 1.00 | dt: 658.60ms | tok/sec: 796,060 | mfu: 49.75 | epoch: 1 | total time: 0.01m | eta: 183.2m +step 00012/16704 (0.07%) | loss: 7.352264 | lrm: 1.00 | dt: 651.46ms | tok/sec: 804,789 | mfu: 50.30 | epoch: 1 | total time: 0.02m | eta: 182.2m +step 00013/16704 (0.08%) | loss: 7.201264 | lrm: 1.00 | dt: 655.50ms | tok/sec: 799,824 | mfu: 49.99 | epoch: 1 | total time: 0.03m | eta: 182.3m +step 00014/16704 (0.08%) | loss: 7.074662 | lrm: 1.00 | dt: 655.28ms | tok/sec: 800,095 | mfu: 50.01 | epoch: 1 | 
total time: 0.04m | eta: 182.3m +step 00015/16704 (0.09%) | loss: 6.967393 | lrm: 1.00 | dt: 653.01ms | tok/sec: 802,876 | mfu: 50.18 | epoch: 1 | total time: 0.05m | eta: 182.1m +step 00016/16704 (0.10%) | loss: 6.874048 | lrm: 1.00 | dt: 654.66ms | tok/sec: 800,849 | mfu: 50.05 | epoch: 1 | total time: 0.07m | eta: 182.1m +step 00017/16704 (0.10%) | loss: 6.783022 | lrm: 1.00 | dt: 652.36ms | tok/sec: 803,684 | mfu: 50.23 | epoch: 1 | total time: 0.08m | eta: 182.0m +step 00018/16704 (0.11%) | loss: 6.694305 | lrm: 1.00 | dt: 654.10ms | tok/sec: 801,538 | mfu: 50.10 | epoch: 1 | total time: 0.09m | eta: 182.0m +step 00019/16704 (0.11%) | loss: 6.621023 | lrm: 1.00 | dt: 653.02ms | tok/sec: 802,867 | mfu: 50.18 | epoch: 1 | total time: 0.10m | eta: 181.9m +step 00020/16704 (0.12%) | loss: 6.536498 | lrm: 1.00 | dt: 656.79ms | tok/sec: 798,252 | mfu: 49.89 | epoch: 1 | total time: 0.11m | eta: 182.0m +step 00021/16704 (0.13%) | loss: 6.469269 | lrm: 1.00 | dt: 655.79ms | tok/sec: 799,481 | mfu: 49.97 | epoch: 1 | total time: 0.12m | eta: 182.0m +step 00022/16704 (0.13%) | loss: 6.410279 | lrm: 1.00 | dt: 655.53ms | tok/sec: 799,789 | mfu: 49.99 | epoch: 1 | total time: 0.13m | eta: 182.0m +step 00023/16704 (0.14%) | loss: 6.355250 | lrm: 1.00 | dt: 655.20ms | tok/sec: 800,191 | mfu: 50.01 | epoch: 1 | total time: 0.14m | eta: 182.0m +step 00024/16704 (0.14%) | loss: 6.307764 | lrm: 1.00 | dt: 654.52ms | tok/sec: 801,027 | mfu: 50.07 | epoch: 1 | total time: 0.15m | eta: 182.0m +step 00025/16704 (0.15%) | loss: 6.268705 | lrm: 1.00 | dt: 656.74ms | tok/sec: 798,319 | mfu: 49.90 | epoch: 1 | total time: 0.16m | eta: 182.0m +step 00026/16704 (0.16%) | loss: 6.219539 | lrm: 1.00 | dt: 656.65ms | tok/sec: 798,431 | mfu: 49.90 | epoch: 1 | total time: 0.17m | eta: 182.1m +step 00027/16704 (0.16%) | loss: 6.180974 | lrm: 1.00 | dt: 655.31ms | tok/sec: 800,063 | mfu: 50.01 | epoch: 1 | total time: 0.19m | eta: 182.0m +step 00028/16704 (0.17%) | loss: 6.153837 | lrm: 1.00 | 
dt: 654.55ms | tok/sec: 800,993 | mfu: 50.06 | epoch: 1 | total time: 0.20m | eta: 182.0m +step 00029/16704 (0.17%) | loss: 6.113489 | lrm: 1.00 | dt: 655.48ms | tok/sec: 799,853 | mfu: 49.99 | epoch: 1 | total time: 0.21m | eta: 182.0m +step 00030/16704 (0.18%) | loss: 6.081127 | lrm: 1.00 | dt: 655.17ms | tok/sec: 800,233 | mfu: 50.02 | epoch: 1 | total time: 0.22m | eta: 182.0m +step 00031/16704 (0.19%) | loss: 6.050413 | lrm: 1.00 | dt: 655.14ms | tok/sec: 800,265 | mfu: 50.02 | epoch: 1 | total time: 0.23m | eta: 182.0m +step 00032/16704 (0.19%) | loss: 6.017242 | lrm: 1.00 | dt: 656.04ms | tok/sec: 799,173 | mfu: 49.95 | epoch: 1 | total time: 0.24m | eta: 182.0m +step 00033/16704 (0.20%) | loss: 5.986702 | lrm: 1.00 | dt: 656.85ms | tok/sec: 798,183 | mfu: 49.89 | epoch: 1 | total time: 0.25m | eta: 182.0m +step 00034/16704 (0.20%) | loss: 5.958956 | lrm: 1.00 | dt: 655.44ms | tok/sec: 799,898 | mfu: 49.99 | epoch: 1 | total time: 0.26m | eta: 182.0m +step 00035/16704 (0.21%) | loss: 5.933490 | lrm: 1.00 | dt: 657.85ms | tok/sec: 796,975 | mfu: 49.81 | epoch: 1 | total time: 0.27m | eta: 182.0m +step 00036/16704 (0.22%) | loss: 5.902999 | lrm: 1.00 | dt: 653.47ms | tok/sec: 802,307 | mfu: 50.15 | epoch: 1 | total time: 0.28m | eta: 182.0m +step 00037/16704 (0.22%) | loss: 5.881747 | lrm: 1.00 | dt: 659.12ms | tok/sec: 795,435 | mfu: 49.72 | epoch: 1 | total time: 0.29m | eta: 182.0m +step 00038/16704 (0.23%) | loss: 5.855020 | lrm: 1.00 | dt: 656.57ms | tok/sec: 798,523 | mfu: 49.91 | epoch: 1 | total time: 0.31m | eta: 182.0m +step 00039/16704 (0.23%) | loss: 5.829025 | lrm: 1.00 | dt: 655.95ms | tok/sec: 799,276 | mfu: 49.96 | epoch: 1 | total time: 0.32m | eta: 182.0m +step 00040/16704 (0.24%) | loss: 5.802283 | lrm: 1.00 | dt: 658.68ms | tok/sec: 795,966 | mfu: 49.75 | epoch: 1 | total time: 0.33m | eta: 182.1m +step 00041/16704 (0.25%) | loss: 5.784826 | lrm: 1.00 | dt: 654.91ms | tok/sec: 800,551 | mfu: 50.04 | epoch: 1 | total time: 0.34m | eta: 
182.0m +step 00042/16704 (0.25%) | loss: 5.747655 | lrm: 1.00 | dt: 656.71ms | tok/sec: 798,360 | mfu: 49.90 | epoch: 1 | total time: 0.35m | eta: 182.0m +step 00043/16704 (0.26%) | loss: 5.725543 | lrm: 1.00 | dt: 656.26ms | tok/sec: 798,897 | mfu: 49.93 | epoch: 1 | total time: 0.36m | eta: 182.0m +step 00044/16704 (0.26%) | loss: 5.707833 | lrm: 1.00 | dt: 656.79ms | tok/sec: 798,260 | mfu: 49.89 | epoch: 1 | total time: 0.37m | eta: 182.0m +step 00045/16704 (0.27%) | loss: 5.672526 | lrm: 1.00 | dt: 658.03ms | tok/sec: 796,754 | mfu: 49.80 | epoch: 1 | total time: 0.38m | eta: 182.0m +step 00046/16704 (0.28%) | loss: 5.656714 | lrm: 1.00 | dt: 655.08ms | tok/sec: 800,336 | mfu: 50.02 | epoch: 1 | total time: 0.39m | eta: 182.0m +step 00047/16704 (0.28%) | loss: 5.629670 | lrm: 1.00 | dt: 655.68ms | tok/sec: 799,610 | mfu: 49.98 | epoch: 1 | total time: 0.40m | eta: 182.0m +step 00048/16704 (0.29%) | loss: 5.601416 | lrm: 1.00 | dt: 655.73ms | tok/sec: 799,547 | mfu: 49.97 | epoch: 1 | total time: 0.42m | eta: 182.0m +step 00049/16704 (0.29%) | loss: 5.579140 | lrm: 1.00 | dt: 656.09ms | tok/sec: 799,104 | mfu: 49.95 | epoch: 1 | total time: 0.43m | eta: 182.0m +step 00050/16704 (0.30%) | loss: 5.556641 | lrm: 1.00 | dt: 653.80ms | tok/sec: 801,907 | mfu: 50.12 | epoch: 1 | total time: 0.44m | eta: 182.0m +step 00051/16704 (0.31%) | loss: 5.534975 | lrm: 1.00 | dt: 657.50ms | tok/sec: 797,390 | mfu: 49.84 | epoch: 1 | total time: 0.45m | eta: 182.0m +step 00052/16704 (0.31%) | loss: 5.508741 | lrm: 1.00 | dt: 655.81ms | tok/sec: 799,445 | mfu: 49.97 | epoch: 1 | total time: 0.46m | eta: 182.0m +step 00053/16704 (0.32%) | loss: 5.485194 | lrm: 1.00 | dt: 658.13ms | tok/sec: 796,630 | mfu: 49.79 | epoch: 1 | total time: 0.47m | eta: 182.0m +step 00054/16704 (0.32%) | loss: 5.453144 | lrm: 1.00 | dt: 659.30ms | tok/sec: 795,223 | mfu: 49.70 | epoch: 1 | total time: 0.48m | eta: 182.0m +step 00055/16704 (0.33%) | loss: 5.446050 | lrm: 1.00 | dt: 654.83ms | tok/sec: 
800,651 | mfu: 50.04 | epoch: 1 | total time: 0.49m | eta: 182.0m +step 00056/16704 (0.34%) | loss: 5.417690 | lrm: 1.00 | dt: 656.50ms | tok/sec: 798,616 | mfu: 49.91 | epoch: 1 | total time: 0.50m | eta: 182.0m +step 00057/16704 (0.34%) | loss: 5.392186 | lrm: 1.00 | dt: 654.79ms | tok/sec: 800,699 | mfu: 50.04 | epoch: 1 | total time: 0.51m | eta: 181.9m +step 00058/16704 (0.35%) | loss: 5.365032 | lrm: 1.00 | dt: 656.86ms | tok/sec: 798,174 | mfu: 49.89 | epoch: 1 | total time: 0.52m | eta: 181.9m +step 00059/16704 (0.35%) | loss: 5.344333 | lrm: 1.00 | dt: 656.35ms | tok/sec: 798,796 | mfu: 49.93 | epoch: 1 | total time: 0.54m | eta: 181.9m +step 00060/16704 (0.36%) | loss: 5.315046 | lrm: 1.00 | dt: 655.81ms | tok/sec: 799,453 | mfu: 49.97 | epoch: 1 | total time: 0.55m | eta: 181.9m +step 00061/16704 (0.37%) | loss: 5.285807 | lrm: 1.00 | dt: 654.62ms | tok/sec: 800,902 | mfu: 50.06 | epoch: 1 | total time: 0.56m | eta: 181.9m +step 00062/16704 (0.37%) | loss: 5.274210 | lrm: 1.00 | dt: 655.85ms | tok/sec: 799,397 | mfu: 49.96 | epoch: 1 | total time: 0.57m | eta: 181.9m +step 00063/16704 (0.38%) | loss: 5.245866 | lrm: 1.00 | dt: 653.01ms | tok/sec: 802,876 | mfu: 50.18 | epoch: 1 | total time: 0.58m | eta: 181.9m +step 00064/16704 (0.38%) | loss: 5.219385 | lrm: 1.00 | dt: 656.09ms | tok/sec: 799,105 | mfu: 49.95 | epoch: 1 | total time: 0.59m | eta: 181.9m +step 00065/16704 (0.39%) | loss: 5.194639 | lrm: 1.00 | dt: 654.54ms | tok/sec: 801,006 | mfu: 50.06 | epoch: 1 | total time: 0.60m | eta: 181.8m +step 00066/16704 (0.40%) | loss: 5.184130 | lrm: 1.00 | dt: 655.75ms | tok/sec: 799,518 | mfu: 49.97 | epoch: 1 | total time: 0.61m | eta: 181.8m +step 00067/16704 (0.40%) | loss: 5.167058 | lrm: 1.00 | dt: 659.91ms | tok/sec: 794,479 | mfu: 49.66 | epoch: 1 | total time: 0.62m | eta: 181.8m +step 00068/16704 (0.41%) | loss: 5.149390 | lrm: 1.00 | dt: 653.84ms | tok/sec: 801,861 | mfu: 50.12 | epoch: 1 | total time: 0.63m | eta: 181.8m +step 00069/16704 
(0.41%) | loss: 5.123382 | lrm: 1.00 | dt: 658.42ms | tok/sec: 796,278 | mfu: 49.77 | epoch: 1 | total time: 0.64m | eta: 181.8m +step 00070/16704 (0.42%) | loss: 5.100636 | lrm: 1.00 | dt: 656.65ms | tok/sec: 798,422 | mfu: 49.90 | epoch: 1 | total time: 0.66m | eta: 181.8m +step 00071/16704 (0.43%) | loss: 5.084162 | lrm: 1.00 | dt: 654.42ms | tok/sec: 801,143 | mfu: 50.07 | epoch: 1 | total time: 0.67m | eta: 181.8m +step 00072/16704 (0.43%) | loss: 5.059754 | lrm: 1.00 | dt: 657.78ms | tok/sec: 797,059 | mfu: 49.82 | epoch: 1 | total time: 0.68m | eta: 181.8m +step 00073/16704 (0.44%) | loss: 5.037835 | lrm: 1.00 | dt: 655.65ms | tok/sec: 799,647 | mfu: 49.98 | epoch: 1 | total time: 0.69m | eta: 181.8m +step 00074/16704 (0.44%) | loss: 5.016178 | lrm: 1.00 | dt: 654.61ms | tok/sec: 800,920 | mfu: 50.06 | epoch: 1 | total time: 0.70m | eta: 181.8m +step 00075/16704 (0.45%) | loss: 4.983141 | lrm: 1.00 | dt: 656.59ms | tok/sec: 798,499 | mfu: 49.91 | epoch: 1 | total time: 0.71m | eta: 181.8m +step 00076/16704 (0.45%) | loss: 4.959392 | lrm: 1.00 | dt: 654.43ms | tok/sec: 801,140 | mfu: 50.07 | epoch: 1 | total time: 0.72m | eta: 181.7m +step 00077/16704 (0.46%) | loss: 4.936908 | lrm: 1.00 | dt: 656.03ms | tok/sec: 799,182 | mfu: 49.95 | epoch: 1 | total time: 0.73m | eta: 181.7m +step 00078/16704 (0.47%) | loss: 4.927168 | lrm: 1.00 | dt: 654.83ms | tok/sec: 800,650 | mfu: 50.04 | epoch: 1 | total time: 0.74m | eta: 181.7m +step 00079/16704 (0.47%) | loss: 4.919510 | lrm: 1.00 | dt: 654.55ms | tok/sec: 800,995 | mfu: 50.06 | epoch: 1 | total time: 0.75m | eta: 181.7m +step 00080/16704 (0.48%) | loss: 4.914779 | lrm: 1.00 | dt: 654.18ms | tok/sec: 801,445 | mfu: 50.09 | epoch: 1 | total time: 0.77m | eta: 181.7m +step 00081/16704 (0.48%) | loss: 4.893783 | lrm: 1.00 | dt: 657.13ms | tok/sec: 797,848 | mfu: 49.87 | epoch: 1 | total time: 0.78m | eta: 181.7m +step 00082/16704 (0.49%) | loss: 4.871493 | lrm: 1.00 | dt: 658.23ms | tok/sec: 796,511 | mfu: 49.78 | 
epoch: 1 | total time: 0.79m | eta: 181.7m +step 00083/16704 (0.50%) | loss: 4.852376 | lrm: 1.00 | dt: 656.10ms | tok/sec: 799,096 | mfu: 49.94 | epoch: 1 | total time: 0.80m | eta: 181.7m +step 00084/16704 (0.50%) | loss: 4.833374 | lrm: 1.00 | dt: 654.58ms | tok/sec: 800,957 | mfu: 50.06 | epoch: 1 | total time: 0.81m | eta: 181.7m +step 00085/16704 (0.51%) | loss: 4.819323 | lrm: 1.00 | dt: 655.59ms | tok/sec: 799,724 | mfu: 49.98 | epoch: 1 | total time: 0.82m | eta: 181.6m +step 00086/16704 (0.51%) | loss: 4.794202 | lrm: 1.00 | dt: 657.87ms | tok/sec: 796,948 | mfu: 49.81 | epoch: 1 | total time: 0.83m | eta: 181.6m +step 00087/16704 (0.52%) | loss: 4.779545 | lrm: 1.00 | dt: 654.87ms | tok/sec: 800,604 | mfu: 50.04 | epoch: 1 | total time: 0.84m | eta: 181.6m +step 00088/16704 (0.53%) | loss: 4.769977 | lrm: 1.00 | dt: 654.78ms | tok/sec: 800,707 | mfu: 50.05 | epoch: 1 | total time: 0.85m | eta: 181.6m +step 00089/16704 (0.53%) | loss: 4.753274 | lrm: 1.00 | dt: 656.84ms | tok/sec: 798,200 | mfu: 49.89 | epoch: 1 | total time: 0.86m | eta: 181.6m +step 00090/16704 (0.54%) | loss: 4.728457 | lrm: 1.00 | dt: 653.08ms | tok/sec: 802,798 | mfu: 50.18 | epoch: 1 | total time: 0.87m | eta: 181.6m +step 00091/16704 (0.54%) | loss: 4.716371 | lrm: 1.00 | dt: 656.20ms | tok/sec: 798,981 | mfu: 49.94 | epoch: 1 | total time: 0.89m | eta: 181.6m +step 00092/16704 (0.55%) | loss: 4.696345 | lrm: 1.00 | dt: 655.41ms | tok/sec: 799,936 | mfu: 50.00 | epoch: 1 | total time: 0.90m | eta: 181.6m +step 00093/16704 (0.56%) | loss: 4.680723 | lrm: 1.00 | dt: 656.47ms | tok/sec: 798,652 | mfu: 49.92 | epoch: 1 | total time: 0.91m | eta: 181.5m +step 00094/16704 (0.56%) | loss: 4.667451 | lrm: 1.00 | dt: 653.50ms | tok/sec: 802,280 | mfu: 50.14 | epoch: 1 | total time: 0.92m | eta: 181.5m +step 00095/16704 (0.57%) | loss: 4.656081 | lrm: 1.00 | dt: 657.14ms | tok/sec: 797,835 | mfu: 49.87 | epoch: 1 | total time: 0.93m | eta: 181.5m +step 00096/16704 (0.57%) | loss: 4.646886 | 
lrm: 1.00 | dt: 654.67ms | tok/sec: 800,839 | mfu: 50.05 | epoch: 1 | total time: 0.94m | eta: 181.5m +step 00097/16704 (0.58%) | loss: 4.629264 | lrm: 1.00 | dt: 652.96ms | tok/sec: 802,940 | mfu: 50.18 | epoch: 1 | total time: 0.95m | eta: 181.5m +step 00098/16704 (0.59%) | loss: 4.610198 | lrm: 1.00 | dt: 659.49ms | tok/sec: 794,993 | mfu: 49.69 | epoch: 1 | total time: 0.96m | eta: 181.5m +step 00099/16704 (0.59%) | loss: 4.599328 | lrm: 1.00 | dt: 655.44ms | tok/sec: 799,901 | mfu: 49.99 | epoch: 1 | total time: 0.97m | eta: 181.5m +step 00100/16704 (0.60%) | loss: 4.586919 | lrm: 1.00 | dt: 654.80ms | tok/sec: 800,682 | mfu: 50.04 | epoch: 1 | total time: 0.98m | eta: 181.5m +step 00101/16704 (0.60%) | loss: 4.573125 | lrm: 1.00 | dt: 658.34ms | tok/sec: 796,383 | mfu: 49.78 | epoch: 1 | total time: 0.99m | eta: 181.5m +step 00102/16704 (0.61%) | loss: 4.558673 | lrm: 1.00 | dt: 653.61ms | tok/sec: 802,135 | mfu: 50.13 | epoch: 1 | total time: 1.01m | eta: 181.4m +step 00103/16704 (0.62%) | loss: 4.542304 | lrm: 1.00 | dt: 655.96ms | tok/sec: 799,269 | mfu: 49.96 | epoch: 1 | total time: 1.02m | eta: 181.4m +step 00104/16704 (0.62%) | loss: 4.532002 | lrm: 1.00 | dt: 656.54ms | tok/sec: 798,558 | mfu: 49.91 | epoch: 1 | total time: 1.03m | eta: 181.4m +step 00105/16704 (0.63%) | loss: 4.521828 | lrm: 1.00 | dt: 655.57ms | tok/sec: 799,739 | mfu: 49.98 | epoch: 1 | total time: 1.04m | eta: 181.4m +step 00106/16704 (0.63%) | loss: 4.511173 | lrm: 1.00 | dt: 657.39ms | tok/sec: 797,524 | mfu: 49.85 | epoch: 1 | total time: 1.05m | eta: 181.4m +step 00107/16704 (0.64%) | loss: 4.492525 | lrm: 1.00 | dt: 654.98ms | tok/sec: 800,465 | mfu: 50.03 | epoch: 1 | total time: 1.06m | eta: 181.4m +step 00108/16704 (0.65%) | loss: 4.478823 | lrm: 1.00 | dt: 657.23ms | tok/sec: 797,729 | mfu: 49.86 | epoch: 1 | total time: 1.07m | eta: 181.4m +step 00109/16704 (0.65%) | loss: 4.456509 | lrm: 1.00 | dt: 653.82ms | tok/sec: 801,883 | mfu: 50.12 | epoch: 1 | total time: 1.08m 
| eta: 181.4m +step 00110/16704 (0.66%) | loss: 4.447875 | lrm: 1.00 | dt: 656.08ms | tok/sec: 799,124 | mfu: 49.95 | epoch: 1 | total time: 1.09m | eta: 181.4m +step 00111/16704 (0.66%) | loss: 4.429141 | lrm: 1.00 | dt: 656.28ms | tok/sec: 798,878 | mfu: 49.93 | epoch: 1 | total time: 1.10m | eta: 181.4m +step 00112/16704 (0.67%) | loss: 4.405155 | lrm: 1.00 | dt: 654.20ms | tok/sec: 801,424 | mfu: 50.09 | epoch: 1 | total time: 1.11m | eta: 181.3m +step 00113/16704 (0.68%) | loss: 4.404705 | lrm: 1.00 | dt: 656.88ms | tok/sec: 798,151 | mfu: 49.89 | epoch: 1 | total time: 1.13m | eta: 181.3m +step 00114/16704 (0.68%) | loss: 4.386963 | lrm: 1.00 | dt: 656.72ms | tok/sec: 798,343 | mfu: 49.90 | epoch: 1 | total time: 1.14m | eta: 181.3m +step 00115/16704 (0.69%) | loss: 4.375032 | lrm: 1.00 | dt: 653.25ms | tok/sec: 802,579 | mfu: 50.16 | epoch: 1 | total time: 1.15m | eta: 181.3m +step 00116/16704 (0.69%) | loss: 4.368567 | lrm: 1.00 | dt: 657.30ms | tok/sec: 797,643 | mfu: 49.85 | epoch: 1 | total time: 1.16m | eta: 181.3m +step 00117/16704 (0.70%) | loss: 4.368899 | lrm: 1.00 | dt: 653.20ms | tok/sec: 802,650 | mfu: 50.17 | epoch: 1 | total time: 1.17m | eta: 181.3m +step 00118/16704 (0.71%) | loss: 4.349174 | lrm: 1.00 | dt: 655.71ms | tok/sec: 799,577 | mfu: 49.97 | epoch: 1 | total time: 1.18m | eta: 181.3m +step 00119/16704 (0.71%) | loss: 4.338053 | lrm: 1.00 | dt: 655.09ms | tok/sec: 800,331 | mfu: 50.02 | epoch: 1 | total time: 1.19m | eta: 181.3m +step 00120/16704 (0.72%) | loss: 4.318405 | lrm: 1.00 | dt: 653.71ms | tok/sec: 802,017 | mfu: 50.13 | epoch: 1 | total time: 1.20m | eta: 181.2m +step 00121/16704 (0.72%) | loss: 4.309106 | lrm: 1.00 | dt: 654.64ms | tok/sec: 800,877 | mfu: 50.06 | epoch: 1 | total time: 1.21m | eta: 181.2m +step 00122/16704 (0.73%) | loss: 4.289476 | lrm: 1.00 | dt: 659.36ms | tok/sec: 795,147 | mfu: 49.70 | epoch: 1 | total time: 1.22m | eta: 181.2m +step 00123/16704 (0.74%) | loss: 4.261147 | lrm: 1.00 | dt: 653.57ms | 
tok/sec: 802,185 | mfu: 50.14 | epoch: 1 | total time: 1.23m | eta: 181.2m +step 00124/16704 (0.74%) | loss: 4.258568 | lrm: 1.00 | dt: 653.79ms | tok/sec: 801,916 | mfu: 50.12 | epoch: 1 | total time: 1.25m | eta: 181.2m +step 00125/16704 (0.75%) | loss: 4.246927 | lrm: 1.00 | dt: 655.27ms | tok/sec: 800,104 | mfu: 50.01 | epoch: 1 | total time: 1.26m | eta: 181.2m +step 00126/16704 (0.75%) | loss: 4.230104 | lrm: 1.00 | dt: 654.16ms | tok/sec: 801,468 | mfu: 50.09 | epoch: 1 | total time: 1.27m | eta: 181.2m +step 00127/16704 (0.76%) | loss: 4.223904 | lrm: 1.00 | dt: 655.31ms | tok/sec: 800,065 | mfu: 50.01 | epoch: 1 | total time: 1.28m | eta: 181.2m +step 00128/16704 (0.77%) | loss: 4.233421 | lrm: 1.00 | dt: 652.58ms | tok/sec: 803,411 | mfu: 50.21 | epoch: 1 | total time: 1.29m | eta: 181.1m +step 00129/16704 (0.77%) | loss: 4.221309 | lrm: 1.00 | dt: 655.48ms | tok/sec: 799,852 | mfu: 49.99 | epoch: 1 | total time: 1.30m | eta: 181.1m +step 00130/16704 (0.78%) | loss: 4.205040 | lrm: 1.00 | dt: 654.89ms | tok/sec: 800,569 | mfu: 50.04 | epoch: 1 | total time: 1.31m | eta: 181.1m +step 00131/16704 (0.78%) | loss: 4.189385 | lrm: 1.00 | dt: 655.16ms | tok/sec: 800,240 | mfu: 50.02 | epoch: 1 | total time: 1.32m | eta: 181.1m +step 00132/16704 (0.79%) | loss: 4.168342 | lrm: 1.00 | dt: 656.16ms | tok/sec: 799,029 | mfu: 49.94 | epoch: 1 | total time: 1.33m | eta: 181.1m +step 00133/16704 (0.80%) | loss: 4.159879 | lrm: 1.00 | dt: 654.42ms | tok/sec: 801,151 | mfu: 50.07 | epoch: 1 | total time: 1.34m | eta: 181.1m +step 00134/16704 (0.80%) | loss: 4.141436 | lrm: 1.00 | dt: 655.45ms | tok/sec: 799,892 | mfu: 49.99 | epoch: 1 | total time: 1.35m | eta: 181.1m +step 00135/16704 (0.81%) | loss: 4.132336 | lrm: 1.00 | dt: 657.00ms | tok/sec: 797,997 | mfu: 49.88 | epoch: 1 | total time: 1.37m | eta: 181.1m +step 00136/16704 (0.81%) | loss: 4.128697 | lrm: 1.00 | dt: 658.37ms | tok/sec: 796,338 | mfu: 49.77 | epoch: 1 | total time: 1.38m | eta: 181.1m +step 
00137/16704 (0.82%) | loss: 4.114453 | lrm: 1.00 | dt: 653.32ms | tok/sec: 802,496 | mfu: 50.16 | epoch: 1 | total time: 1.39m | eta: 181.0m +step 00138/16704 (0.83%) | loss: 4.118414 | lrm: 1.00 | dt: 656.33ms | tok/sec: 798,814 | mfu: 49.93 | epoch: 1 | total time: 1.40m | eta: 181.0m +step 00139/16704 (0.83%) | loss: 4.108316 | lrm: 1.00 | dt: 655.91ms | tok/sec: 799,334 | mfu: 49.96 | epoch: 1 | total time: 1.41m | eta: 181.0m +step 00140/16704 (0.84%) | loss: 4.107939 | lrm: 1.00 | dt: 654.27ms | tok/sec: 801,326 | mfu: 50.08 | epoch: 1 | total time: 1.42m | eta: 181.0m +step 00141/16704 (0.84%) | loss: 4.102592 | lrm: 1.00 | dt: 653.96ms | tok/sec: 801,706 | mfu: 50.11 | epoch: 1 | total time: 1.43m | eta: 181.0m +step 00142/16704 (0.85%) | loss: 4.113482 | lrm: 1.00 | dt: 655.45ms | tok/sec: 799,886 | mfu: 49.99 | epoch: 1 | total time: 1.44m | eta: 181.0m +step 00143/16704 (0.86%) | loss: 4.107142 | lrm: 1.00 | dt: 655.44ms | tok/sec: 799,900 | mfu: 49.99 | epoch: 1 | total time: 1.45m | eta: 181.0m +step 00144/16704 (0.86%) | loss: 4.097182 | lrm: 1.00 | dt: 656.47ms | tok/sec: 798,643 | mfu: 49.92 | epoch: 1 | total time: 1.46m | eta: 181.0m +step 00145/16704 (0.87%) | loss: 4.082116 | lrm: 1.00 | dt: 653.23ms | tok/sec: 802,604 | mfu: 50.16 | epoch: 1 | total time: 1.48m | eta: 180.9m +step 00146/16704 (0.87%) | loss: 4.087711 | lrm: 1.00 | dt: 655.08ms | tok/sec: 800,340 | mfu: 50.02 | epoch: 1 | total time: 1.49m | eta: 180.9m +step 00147/16704 (0.88%) | loss: 4.080210 | lrm: 1.00 | dt: 657.33ms | tok/sec: 797,596 | mfu: 49.85 | epoch: 1 | total time: 1.50m | eta: 180.9m +step 00148/16704 (0.89%) | loss: 4.058941 | lrm: 1.00 | dt: 653.79ms | tok/sec: 801,925 | mfu: 50.12 | epoch: 1 | total time: 1.51m | eta: 180.9m +step 00149/16704 (0.89%) | loss: 4.063621 | lrm: 1.00 | dt: 656.93ms | tok/sec: 798,086 | mfu: 49.88 | epoch: 1 | total time: 1.52m | eta: 180.9m +step 00150/16704 (0.90%) | loss: 4.064712 | lrm: 1.00 | dt: 655.52ms | tok/sec: 799,803 | 
mfu: 49.99 | epoch: 1 | total time: 1.53m | eta: 180.9m +step 00151/16704 (0.90%) | loss: 4.043783 | lrm: 1.00 | dt: 652.94ms | tok/sec: 802,970 | mfu: 50.19 | epoch: 1 | total time: 1.54m | eta: 180.9m +step 00152/16704 (0.91%) | loss: 4.030807 | lrm: 1.00 | dt: 655.40ms | tok/sec: 799,955 | mfu: 50.00 | epoch: 1 | total time: 1.55m | eta: 180.9m +step 00153/16704 (0.92%) | loss: 4.013478 | lrm: 1.00 | dt: 654.44ms | tok/sec: 801,122 | mfu: 50.07 | epoch: 1 | total time: 1.56m | eta: 180.8m +step 00154/16704 (0.92%) | loss: 4.019917 | lrm: 1.00 | dt: 653.95ms | tok/sec: 801,723 | mfu: 50.11 | epoch: 1 | total time: 1.57m | eta: 180.8m +step 00155/16704 (0.93%) | loss: 4.017176 | lrm: 1.00 | dt: 657.06ms | tok/sec: 797,926 | mfu: 49.87 | epoch: 1 | total time: 1.58m | eta: 180.8m +step 00156/16704 (0.93%) | loss: 4.002030 | lrm: 1.00 | dt: 655.00ms | tok/sec: 800,439 | mfu: 50.03 | epoch: 1 | total time: 1.60m | eta: 180.8m +step 00157/16704 (0.94%) | loss: 3.992957 | lrm: 1.00 | dt: 655.89ms | tok/sec: 799,357 | mfu: 49.96 | epoch: 1 | total time: 1.61m | eta: 180.8m +step 00158/16704 (0.95%) | loss: 3.985273 | lrm: 1.00 | dt: 655.99ms | tok/sec: 799,226 | mfu: 49.95 | epoch: 1 | total time: 1.62m | eta: 180.8m +step 00159/16704 (0.95%) | loss: 3.984910 | lrm: 1.00 | dt: 655.75ms | tok/sec: 799,529 | mfu: 49.97 | epoch: 1 | total time: 1.63m | eta: 180.8m +step 00160/16704 (0.96%) | loss: 3.964257 | lrm: 1.00 | dt: 655.03ms | tok/sec: 800,398 | mfu: 50.03 | epoch: 1 | total time: 1.64m | eta: 180.8m +step 00161/16704 (0.96%) | loss: 3.948886 | lrm: 1.00 | dt: 655.08ms | tok/sec: 800,338 | mfu: 50.02 | epoch: 1 | total time: 1.65m | eta: 180.8m +step 00162/16704 (0.97%) | loss: 3.942521 | lrm: 1.00 | dt: 656.35ms | tok/sec: 798,794 | mfu: 49.93 | epoch: 1 | total time: 1.66m | eta: 180.7m +step 00163/16704 (0.98%) | loss: 3.935633 | lrm: 1.00 | dt: 656.11ms | tok/sec: 799,090 | mfu: 49.94 | epoch: 1 | total time: 1.67m | eta: 180.7m +step 00164/16704 (0.98%) | 
loss: 3.930693 | lrm: 1.00 | dt: 653.01ms | tok/sec: 802,884 | mfu: 50.18 | epoch: 1 | total time: 1.68m | eta: 180.7m +step 00165/16704 (0.99%) | loss: 3.918543 | lrm: 1.00 | dt: 654.13ms | tok/sec: 801,499 | mfu: 50.09 | epoch: 1 | total time: 1.69m | eta: 180.7m +step 00166/16704 (0.99%) | loss: 3.912656 | lrm: 1.00 | dt: 655.85ms | tok/sec: 799,403 | mfu: 49.96 | epoch: 1 | total time: 1.70m | eta: 180.7m +step 00167/16704 (1.00%) | loss: 3.905903 | lrm: 1.00 | dt: 656.07ms | tok/sec: 799,138 | mfu: 49.95 | epoch: 1 | total time: 1.72m | eta: 180.7m +step 00168/16704 (1.01%) | loss: 3.894113 | lrm: 1.00 | dt: 655.88ms | tok/sec: 799,363 | mfu: 49.96 | epoch: 1 | total time: 1.73m | eta: 180.7m +step 00169/16704 (1.01%) | loss: 3.889027 | lrm: 1.00 | dt: 656.16ms | tok/sec: 799,025 | mfu: 49.94 | epoch: 1 | total time: 1.74m | eta: 180.7m +step 00170/16704 (1.02%) | loss: 3.883548 | lrm: 1.00 | dt: 652.07ms | tok/sec: 804,042 | mfu: 50.25 | epoch: 1 | total time: 1.75m | eta: 180.7m +step 00171/16704 (1.02%) | loss: 3.870545 | lrm: 1.00 | dt: 657.34ms | tok/sec: 797,589 | mfu: 49.85 | epoch: 1 | total time: 1.76m | eta: 180.6m +step 00172/16704 (1.03%) | loss: 3.871527 | lrm: 1.00 | dt: 655.67ms | tok/sec: 799,627 | mfu: 49.98 | epoch: 1 | total time: 1.77m | eta: 180.6m +step 00173/16704 (1.04%) | loss: 3.878878 | lrm: 1.00 | dt: 654.23ms | tok/sec: 801,378 | mfu: 50.09 | epoch: 1 | total time: 1.78m | eta: 180.6m +step 00174/16704 (1.04%) | loss: 3.864161 | lrm: 1.00 | dt: 653.45ms | tok/sec: 802,335 | mfu: 50.15 | epoch: 1 | total time: 1.79m | eta: 180.6m +step 00175/16704 (1.05%) | loss: 3.869574 | lrm: 1.00 | dt: 655.85ms | tok/sec: 799,396 | mfu: 49.96 | epoch: 1 | total time: 1.80m | eta: 180.6m +step 00176/16704 (1.05%) | loss: 3.861155 | lrm: 1.00 | dt: 654.84ms | tok/sec: 800,635 | mfu: 50.04 | epoch: 1 | total time: 1.81m | eta: 180.6m +step 00177/16704 (1.06%) | loss: 3.849311 | lrm: 1.00 | dt: 655.30ms | tok/sec: 800,073 | mfu: 50.01 | epoch: 1 | 
total time: 1.82m | eta: 180.6m +step 00178/16704 (1.07%) | loss: 3.842159 | lrm: 1.00 | dt: 655.78ms | tok/sec: 799,492 | mfu: 49.97 | epoch: 1 | total time: 1.84m | eta: 180.6m +step 00179/16704 (1.07%) | loss: 3.843941 | lrm: 1.00 | dt: 653.59ms | tok/sec: 802,169 | mfu: 50.14 | epoch: 1 | total time: 1.85m | eta: 180.5m +step 00180/16704 (1.08%) | loss: 3.844999 | lrm: 1.00 | dt: 653.74ms | tok/sec: 801,979 | mfu: 50.12 | epoch: 1 | total time: 1.86m | eta: 180.5m +step 00181/16704 (1.08%) | loss: 3.842259 | lrm: 1.00 | dt: 655.89ms | tok/sec: 799,355 | mfu: 49.96 | epoch: 1 | total time: 1.87m | eta: 180.5m +step 00182/16704 (1.09%) | loss: 3.842439 | lrm: 1.00 | dt: 654.30ms | tok/sec: 801,298 | mfu: 50.08 | epoch: 1 | total time: 1.88m | eta: 180.5m +step 00183/16704 (1.10%) | loss: 3.837479 | lrm: 1.00 | dt: 658.04ms | tok/sec: 796,738 | mfu: 49.80 | epoch: 1 | total time: 1.89m | eta: 180.5m +step 00184/16704 (1.10%) | loss: 3.834496 | lrm: 1.00 | dt: 653.39ms | tok/sec: 802,414 | mfu: 50.15 | epoch: 1 | total time: 1.90m | eta: 180.5m +step 00185/16704 (1.11%) | loss: 3.832354 | lrm: 1.00 | dt: 653.84ms | tok/sec: 801,855 | mfu: 50.12 | epoch: 1 | total time: 1.91m | eta: 180.5m +step 00186/16704 (1.11%) | loss: 3.825363 | lrm: 1.00 | dt: 656.19ms | tok/sec: 798,984 | mfu: 49.94 | epoch: 1 | total time: 1.92m | eta: 180.5m +step 00187/16704 (1.12%) | loss: 3.812997 | lrm: 1.00 | dt: 652.19ms | tok/sec: 803,888 | mfu: 50.24 | epoch: 1 | total time: 1.93m | eta: 180.4m +step 00188/16704 (1.13%) | loss: 3.800968 | lrm: 1.00 | dt: 653.65ms | tok/sec: 802,091 | mfu: 50.13 | epoch: 1 | total time: 1.94m | eta: 180.4m +step 00189/16704 (1.13%) | loss: 3.797200 | lrm: 1.00 | dt: 657.04ms | tok/sec: 797,951 | mfu: 49.87 | epoch: 1 | total time: 1.96m | eta: 180.4m +step 00190/16704 (1.14%) | loss: 3.800149 | lrm: 1.00 | dt: 650.73ms | tok/sec: 805,687 | mfu: 50.36 | epoch: 1 | total time: 1.97m | eta: 180.4m +step 00191/16704 (1.14%) | loss: 3.785488 | lrm: 1.00 | 
dt: 656.25ms | tok/sec: 798,909 | mfu: 49.93 | epoch: 1 | total time: 1.98m | eta: 180.4m +step 00192/16704 (1.15%) | loss: 3.775921 | lrm: 1.00 | dt: 652.61ms | tok/sec: 803,368 | mfu: 50.21 | epoch: 1 | total time: 1.99m | eta: 180.4m +step 00193/16704 (1.16%) | loss: 3.761634 | lrm: 1.00 | dt: 655.59ms | tok/sec: 799,716 | mfu: 49.98 | epoch: 1 | total time: 2.00m | eta: 180.4m +step 00194/16704 (1.16%) | loss: 3.768554 | lrm: 1.00 | dt: 655.91ms | tok/sec: 799,334 | mfu: 49.96 | epoch: 1 | total time: 2.01m | eta: 180.4m +step 00195/16704 (1.17%) | loss: 3.770066 | lrm: 1.00 | dt: 652.42ms | tok/sec: 803,609 | mfu: 50.23 | epoch: 1 | total time: 2.02m | eta: 180.3m +step 00196/16704 (1.17%) | loss: 3.769804 | lrm: 1.00 | dt: 651.73ms | tok/sec: 804,453 | mfu: 50.28 | epoch: 1 | total time: 2.03m | eta: 180.3m +step 00197/16704 (1.18%) | loss: 3.771690 | lrm: 1.00 | dt: 657.50ms | tok/sec: 797,397 | mfu: 49.84 | epoch: 1 | total time: 2.04m | eta: 180.3m +step 00198/16704 (1.19%) | loss: 3.778739 | lrm: 1.00 | dt: 653.93ms | tok/sec: 801,753 | mfu: 50.11 | epoch: 1 | total time: 2.05m | eta: 180.3m +step 00199/16704 (1.19%) | loss: 3.787775 | lrm: 1.00 | dt: 654.41ms | tok/sec: 801,160 | mfu: 50.07 | epoch: 1 | total time: 2.06m | eta: 180.3m +step 00200/16704 (1.20%) | loss: 3.767985 | lrm: 1.00 | dt: 651.48ms | tok/sec: 804,761 | mfu: 50.30 | epoch: 1 | total time: 2.08m | eta: 180.3m +step 00201/16704 (1.20%) | loss: 3.774918 | lrm: 1.00 | dt: 654.90ms | tok/sec: 800,566 | mfu: 50.04 | epoch: 1 | total time: 2.09m | eta: 180.3m +step 00202/16704 (1.21%) | loss: 3.775537 | lrm: 1.00 | dt: 653.17ms | tok/sec: 802,678 | mfu: 50.17 | epoch: 1 | total time: 2.10m | eta: 180.3m +step 00203/16704 (1.22%) | loss: 3.754212 | lrm: 1.00 | dt: 653.00ms | tok/sec: 802,887 | mfu: 50.18 | epoch: 1 | total time: 2.11m | eta: 180.2m +step 00204/16704 (1.22%) | loss: 3.732288 | lrm: 1.00 | dt: 654.31ms | tok/sec: 801,287 | mfu: 50.08 | epoch: 1 | total time: 2.12m | eta: 
180.2m +step 00205/16704 (1.23%) | loss: 3.726233 | lrm: 1.00 | dt: 653.81ms | tok/sec: 801,902 | mfu: 50.12 | epoch: 1 | total time: 2.13m | eta: 180.2m +step 00206/16704 (1.23%) | loss: 3.720483 | lrm: 1.00 | dt: 652.00ms | tok/sec: 804,125 | mfu: 50.26 | epoch: 1 | total time: 2.14m | eta: 180.2m +step 00207/16704 (1.24%) | loss: 3.727871 | lrm: 1.00 | dt: 656.27ms | tok/sec: 798,896 | mfu: 49.93 | epoch: 1 | total time: 2.15m | eta: 180.2m +step 00208/16704 (1.25%) | loss: 3.707649 | lrm: 1.00 | dt: 654.28ms | tok/sec: 801,317 | mfu: 50.08 | epoch: 1 | total time: 2.16m | eta: 180.2m +step 00209/16704 (1.25%) | loss: 3.696831 | lrm: 1.00 | dt: 654.19ms | tok/sec: 801,433 | mfu: 50.09 | epoch: 1 | total time: 2.17m | eta: 180.2m +step 00210/16704 (1.26%) | loss: 3.697056 | lrm: 1.00 | dt: 651.40ms | tok/sec: 804,865 | mfu: 50.31 | epoch: 1 | total time: 2.18m | eta: 180.1m +step 00211/16704 (1.26%) | loss: 3.684050 | lrm: 1.00 | dt: 657.35ms | tok/sec: 797,576 | mfu: 49.85 | epoch: 1 | total time: 2.20m | eta: 180.1m +step 00212/16704 (1.27%) | loss: 3.678020 | lrm: 1.00 | dt: 655.61ms | tok/sec: 799,689 | mfu: 49.98 | epoch: 1 | total time: 2.21m | eta: 180.1m +step 00213/16704 (1.28%) | loss: 3.677048 | lrm: 1.00 | dt: 650.41ms | tok/sec: 806,084 | mfu: 50.38 | epoch: 1 | total time: 2.22m | eta: 180.1m +step 00214/16704 (1.28%) | loss: 3.675600 | lrm: 1.00 | dt: 656.44ms | tok/sec: 798,682 | mfu: 49.92 | epoch: 1 | total time: 2.23m | eta: 180.1m +step 00215/16704 (1.29%) | loss: 3.686273 | lrm: 1.00 | dt: 654.27ms | tok/sec: 801,335 | mfu: 50.08 | epoch: 1 | total time: 2.24m | eta: 180.1m +step 00216/16704 (1.29%) | loss: 3.674498 | lrm: 1.00 | dt: 652.04ms | tok/sec: 804,074 | mfu: 50.26 | epoch: 1 | total time: 2.25m | eta: 180.1m +step 00217/16704 (1.30%) | loss: 3.666577 | lrm: 1.00 | dt: 656.93ms | tok/sec: 798,084 | mfu: 49.88 | epoch: 1 | total time: 2.26m | eta: 180.1m +step 00218/16704 (1.31%) | loss: 3.672996 | lrm: 1.00 | dt: 653.23ms | tok/sec: 
802,607 | mfu: 50.16 | epoch: 1 | total time: 2.27m | eta: 180.1m +step 00219/16704 (1.31%) | loss: 3.653284 | lrm: 1.00 | dt: 651.83ms | tok/sec: 804,329 | mfu: 50.27 | epoch: 1 | total time: 2.28m | eta: 180.0m +step 00220/16704 (1.32%) | loss: 3.661873 | lrm: 1.00 | dt: 655.08ms | tok/sec: 800,345 | mfu: 50.02 | epoch: 1 | total time: 2.29m | eta: 180.0m +step 00221/16704 (1.32%) | loss: 3.659809 | lrm: 1.00 | dt: 653.21ms | tok/sec: 802,634 | mfu: 50.17 | epoch: 1 | total time: 2.30m | eta: 180.0m +step 00222/16704 (1.33%) | loss: 3.647123 | lrm: 1.00 | dt: 652.68ms | tok/sec: 803,286 | mfu: 50.21 | epoch: 1 | total time: 2.32m | eta: 180.0m +step 00223/16704 (1.34%) | loss: 3.644057 | lrm: 1.00 | dt: 653.56ms | tok/sec: 802,201 | mfu: 50.14 | epoch: 1 | total time: 2.33m | eta: 180.0m +step 00224/16704 (1.34%) | loss: 3.645499 | lrm: 1.00 | dt: 652.95ms | tok/sec: 802,955 | mfu: 50.19 | epoch: 1 | total time: 2.34m | eta: 180.0m +step 00225/16704 (1.35%) | loss: 3.648018 | lrm: 1.00 | dt: 653.38ms | tok/sec: 802,421 | mfu: 50.15 | epoch: 1 | total time: 2.35m | eta: 180.0m +step 00226/16704 (1.35%) | loss: 3.655343 | lrm: 1.00 | dt: 652.35ms | tok/sec: 803,693 | mfu: 50.23 | epoch: 1 | total time: 2.36m | eta: 179.9m +step 00227/16704 (1.36%) | loss: 3.656045 | lrm: 1.00 | dt: 651.28ms | tok/sec: 805,008 | mfu: 50.31 | epoch: 1 | total time: 2.37m | eta: 179.9m +step 00228/16704 (1.36%) | loss: 3.648752 | lrm: 1.00 | dt: 653.41ms | tok/sec: 802,387 | mfu: 50.15 | epoch: 1 | total time: 2.38m | eta: 179.9m +step 00229/16704 (1.37%) | loss: 3.636460 | lrm: 1.00 | dt: 650.72ms | tok/sec: 805,704 | mfu: 50.36 | epoch: 1 | total time: 2.39m | eta: 179.9m +step 00230/16704 (1.38%) | loss: 3.637640 | lrm: 1.00 | dt: 655.75ms | tok/sec: 799,523 | mfu: 49.97 | epoch: 1 | total time: 2.40m | eta: 179.9m +step 00231/16704 (1.38%) | loss: 3.632991 | lrm: 1.00 | dt: 654.01ms | tok/sec: 801,655 | mfu: 50.10 | epoch: 1 | total time: 2.41m | eta: 179.9m +step 00232/16704 
(1.39%) | loss: 3.617179 | lrm: 1.00 | dt: 653.14ms | tok/sec: 802,722 | mfu: 50.17 | epoch: 1 | total time: 2.42m | eta: 179.9m +step 00233/16704 (1.39%) | loss: 3.607807 | lrm: 1.00 | dt: 653.03ms | tok/sec: 802,851 | mfu: 50.18 | epoch: 1 | total time: 2.43m | eta: 179.8m +step 00234/16704 (1.40%) | loss: 3.617604 | lrm: 1.00 | dt: 653.33ms | tok/sec: 802,488 | mfu: 50.16 | epoch: 1 | total time: 2.45m | eta: 179.8m +step 00235/16704 (1.41%) | loss: 3.610034 | lrm: 1.00 | dt: 653.70ms | tok/sec: 802,035 | mfu: 50.13 | epoch: 1 | total time: 2.46m | eta: 179.8m +step 00236/16704 (1.41%) | loss: 3.609677 | lrm: 1.00 | dt: 650.89ms | tok/sec: 805,489 | mfu: 50.34 | epoch: 1 | total time: 2.47m | eta: 179.8m +step 00237/16704 (1.42%) | loss: 3.611420 | lrm: 1.00 | dt: 653.77ms | tok/sec: 801,946 | mfu: 50.12 | epoch: 1 | total time: 2.48m | eta: 179.8m +step 00238/16704 (1.42%) | loss: 3.606787 | lrm: 1.00 | dt: 652.57ms | tok/sec: 803,424 | mfu: 50.22 | epoch: 1 | total time: 2.49m | eta: 179.8m +step 00239/16704 (1.43%) | loss: 3.601073 | lrm: 1.00 | dt: 651.63ms | tok/sec: 804,576 | mfu: 50.29 | epoch: 1 | total time: 2.50m | eta: 179.8m +step 00240/16704 (1.44%) | loss: 3.592283 | lrm: 1.00 | dt: 656.15ms | tok/sec: 799,042 | mfu: 49.94 | epoch: 1 | total time: 2.51m | eta: 179.8m +step 00241/16704 (1.44%) | loss: 3.599309 | lrm: 1.00 | dt: 651.78ms | tok/sec: 804,390 | mfu: 50.28 | epoch: 1 | total time: 2.52m | eta: 179.7m +step 00242/16704 (1.45%) | loss: 3.597153 | lrm: 1.00 | dt: 653.49ms | tok/sec: 802,284 | mfu: 50.14 | epoch: 1 | total time: 2.53m | eta: 179.7m +step 00243/16704 (1.45%) | loss: 3.598413 | lrm: 1.00 | dt: 651.67ms | tok/sec: 804,529 | mfu: 50.28 | epoch: 1 | total time: 2.54m | eta: 179.7m +step 00244/16704 (1.46%) | loss: 3.593188 | lrm: 1.00 | dt: 650.44ms | tok/sec: 806,046 | mfu: 50.38 | epoch: 1 | total time: 2.55m | eta: 179.7m +step 00245/16704 (1.47%) | loss: 3.597655 | lrm: 1.00 | dt: 654.12ms | tok/sec: 801,519 | mfu: 50.10 | 
epoch: 1 | total time: 2.57m | eta: 179.7m +step 00246/16704 (1.47%) | loss: 3.592952 | lrm: 1.00 | dt: 650.66ms | tok/sec: 805,779 | mfu: 50.36 | epoch: 1 | total time: 2.58m | eta: 179.7m +step 00247/16704 (1.48%) | loss: 3.591142 | lrm: 1.00 | dt: 652.53ms | tok/sec: 803,466 | mfu: 50.22 | epoch: 1 | total time: 2.59m | eta: 179.7m +step 00248/16704 (1.48%) | loss: 3.581021 | lrm: 1.00 | dt: 653.09ms | tok/sec: 802,780 | mfu: 50.17 | epoch: 1 | total time: 2.60m | eta: 179.6m +step 00249/16704 (1.49%) | loss: 3.593002 | lrm: 1.00 | dt: 649.30ms | tok/sec: 807,461 | mfu: 50.47 | epoch: 1 | total time: 2.61m | eta: 179.6m +Step 00250 | Validation bpb: 1.086329 +step 00250/16704 (1.50%) | loss: 3.602910 | lrm: 1.00 | dt: 659.50ms | tok/sec: 794,972 | mfu: 49.69 | epoch: 1 | total time: 2.62m | eta: 179.6m +step 00251/16704 (1.50%) | loss: 3.612431 | lrm: 1.00 | dt: 651.52ms | tok/sec: 804,718 | mfu: 50.30 | epoch: 1 | total time: 2.63m | eta: 179.6m +step 00252/16704 (1.51%) | loss: 3.608562 | lrm: 1.00 | dt: 654.60ms | tok/sec: 800,931 | mfu: 50.06 | epoch: 1 | total time: 2.64m | eta: 179.6m +step 00253/16704 (1.51%) | loss: 3.603806 | lrm: 1.00 | dt: 651.79ms | tok/sec: 804,383 | mfu: 50.28 | epoch: 1 | total time: 2.65m | eta: 179.6m +step 00254/16704 (1.52%) | loss: 3.598828 | lrm: 1.00 | dt: 652.96ms | tok/sec: 802,938 | mfu: 50.18 | epoch: 1 | total time: 2.66m | eta: 179.6m +step 00255/16704 (1.53%) | loss: 3.592115 | lrm: 1.00 | dt: 651.08ms | tok/sec: 805,254 | mfu: 50.33 | epoch: 1 | total time: 2.67m | eta: 179.5m +step 00256/16704 (1.53%) | loss: 3.585303 | lrm: 1.00 | dt: 652.37ms | tok/sec: 803,667 | mfu: 50.23 | epoch: 1 | total time: 2.69m | eta: 179.5m +step 00257/16704 (1.54%) | loss: 3.570201 | lrm: 1.00 | dt: 652.00ms | tok/sec: 804,128 | mfu: 50.26 | epoch: 1 | total time: 2.70m | eta: 179.5m +step 00258/16704 (1.54%) | loss: 3.568783 | lrm: 1.00 | dt: 654.15ms | tok/sec: 801,480 | mfu: 50.09 | epoch: 1 | total time: 2.71m | eta: 179.5m +step 
00259/16704 (1.55%) | loss: 3.565855 | lrm: 1.00 | dt: 651.93ms | tok/sec: 804,204 | mfu: 50.26 | epoch: 1 | total time: 2.72m | eta: 179.5m +step 00260/16704 (1.56%) | loss: 3.570491 | lrm: 1.00 | dt: 652.52ms | tok/sec: 803,475 | mfu: 50.22 | epoch: 1 | total time: 2.73m | eta: 179.5m +step 00261/16704 (1.56%) | loss: 3.579777 | lrm: 1.00 | dt: 654.04ms | tok/sec: 801,609 | mfu: 50.10 | epoch: 1 | total time: 2.74m | eta: 179.5m +step 00262/16704 (1.57%) | loss: 3.567314 | lrm: 1.00 | dt: 652.91ms | tok/sec: 803,003 | mfu: 50.19 | epoch: 1 | total time: 2.75m | eta: 179.5m +step 00263/16704 (1.57%) | loss: 3.552447 | lrm: 1.00 | dt: 651.77ms | tok/sec: 804,408 | mfu: 50.28 | epoch: 1 | total time: 2.76m | eta: 179.4m +step 00264/16704 (1.58%) | loss: 3.553986 | lrm: 1.00 | dt: 651.76ms | tok/sec: 804,424 | mfu: 50.28 | epoch: 1 | total time: 2.77m | eta: 179.4m +step 00265/16704 (1.59%) | loss: 3.543746 | lrm: 1.00 | dt: 654.17ms | tok/sec: 801,461 | mfu: 50.09 | epoch: 1 | total time: 2.78m | eta: 179.4m +step 00266/16704 (1.59%) | loss: 3.546530 | lrm: 1.00 | dt: 653.58ms | tok/sec: 802,184 | mfu: 50.14 | epoch: 1 | total time: 2.79m | eta: 179.4m +step 00267/16704 (1.60%) | loss: 3.539271 | lrm: 1.00 | dt: 652.62ms | tok/sec: 803,364 | mfu: 50.21 | epoch: 1 | total time: 2.80m | eta: 179.4m +step 00268/16704 (1.60%) | loss: 3.545364 | lrm: 1.00 | dt: 650.68ms | tok/sec: 805,757 | mfu: 50.36 | epoch: 1 | total time: 2.82m | eta: 179.4m +step 00269/16704 (1.61%) | loss: 3.538350 | lrm: 1.00 | dt: 652.57ms | tok/sec: 803,424 | mfu: 50.22 | epoch: 1 | total time: 2.83m | eta: 179.4m +step 00270/16704 (1.62%) | loss: 3.530582 | lrm: 1.00 | dt: 651.62ms | tok/sec: 804,596 | mfu: 50.29 | epoch: 1 | total time: 2.84m | eta: 179.3m +step 00271/16704 (1.62%) | loss: 3.529643 | lrm: 1.00 | dt: 653.70ms | tok/sec: 802,026 | mfu: 50.13 | epoch: 1 | total time: 2.85m | eta: 179.3m +step 00272/16704 (1.63%) | loss: 3.540949 | lrm: 1.00 | dt: 652.17ms | tok/sec: 803,914 | 
mfu: 50.25 | epoch: 1 | total time: 2.86m | eta: 179.3m +step 00273/16704 (1.63%) | loss: 3.546302 | lrm: 1.00 | dt: 652.90ms | tok/sec: 803,016 | mfu: 50.19 | epoch: 1 | total time: 2.87m | eta: 179.3m +step 00274/16704 (1.64%) | loss: 3.536855 | lrm: 1.00 | dt: 652.27ms | tok/sec: 803,787 | mfu: 50.24 | epoch: 1 | total time: 2.88m | eta: 179.3m +step 00275/16704 (1.65%) | loss: 3.521665 | lrm: 1.00 | dt: 652.12ms | tok/sec: 803,977 | mfu: 50.25 | epoch: 1 | total time: 2.89m | eta: 179.3m +step 00276/16704 (1.65%) | loss: 3.520722 | lrm: 1.00 | dt: 651.92ms | tok/sec: 804,217 | mfu: 50.26 | epoch: 1 | total time: 2.90m | eta: 179.3m +step 00277/16704 (1.66%) | loss: 3.521334 | lrm: 1.00 | dt: 651.30ms | tok/sec: 804,984 | mfu: 50.31 | epoch: 1 | total time: 2.91m | eta: 179.3m +step 00278/16704 (1.66%) | loss: 3.519375 | lrm: 1.00 | dt: 652.50ms | tok/sec: 803,505 | mfu: 50.22 | epoch: 1 | total time: 2.92m | eta: 179.2m +step 00279/16704 (1.67%) | loss: 3.514896 | lrm: 1.00 | dt: 653.54ms | tok/sec: 802,221 | mfu: 50.14 | epoch: 1 | total time: 2.94m | eta: 179.2m +step 00280/16704 (1.68%) | loss: 3.511858 | lrm: 1.00 | dt: 650.76ms | tok/sec: 805,653 | mfu: 50.35 | epoch: 1 | total time: 2.95m | eta: 179.2m +step 00281/16704 (1.68%) | loss: 3.517632 | lrm: 1.00 | dt: 651.86ms | tok/sec: 804,292 | mfu: 50.27 | epoch: 1 | total time: 2.96m | eta: 179.2m +step 00282/16704 (1.69%) | loss: 3.502986 | lrm: 1.00 | dt: 651.83ms | tok/sec: 804,333 | mfu: 50.27 | epoch: 1 | total time: 2.97m | eta: 179.2m +step 00283/16704 (1.69%) | loss: 3.498614 | lrm: 1.00 | dt: 651.06ms | tok/sec: 805,277 | mfu: 50.33 | epoch: 1 | total time: 2.98m | eta: 179.2m +step 00284/16704 (1.70%) | loss: 3.507263 | lrm: 1.00 | dt: 651.11ms | tok/sec: 805,217 | mfu: 50.33 | epoch: 1 | total time: 2.99m | eta: 179.2m +step 00285/16704 (1.71%) | loss: 3.511238 | lrm: 1.00 | dt: 653.12ms | tok/sec: 802,740 | mfu: 50.17 | epoch: 1 | total time: 3.00m | eta: 179.1m +step 00286/16704 (1.71%) | 
loss: 3.508409 | lrm: 1.00 | dt: 651.06ms | tok/sec: 805,287 | mfu: 50.33 | epoch: 1 | total time: 3.01m | eta: 179.1m +step 00287/16704 (1.72%) | loss: 3.515718 | lrm: 1.00 | dt: 652.00ms | tok/sec: 804,119 | mfu: 50.26 | epoch: 1 | total time: 3.02m | eta: 179.1m +step 00288/16704 (1.72%) | loss: 3.530188 | lrm: 1.00 | dt: 650.91ms | tok/sec: 805,464 | mfu: 50.34 | epoch: 1 | total time: 3.03m | eta: 179.1m +step 00289/16704 (1.73%) | loss: 3.519160 | lrm: 1.00 | dt: 649.45ms | tok/sec: 807,283 | mfu: 50.46 | epoch: 1 | total time: 3.04m | eta: 179.1m +step 00290/16704 (1.74%) | loss: 3.512037 | lrm: 1.00 | dt: 650.60ms | tok/sec: 805,849 | mfu: 50.37 | epoch: 1 | total time: 3.05m | eta: 179.1m +step 00291/16704 (1.74%) | loss: 3.515443 | lrm: 1.00 | dt: 651.04ms | tok/sec: 805,310 | mfu: 50.33 | epoch: 1 | total time: 3.07m | eta: 179.1m +step 00292/16704 (1.75%) | loss: 3.500319 | lrm: 1.00 | dt: 653.78ms | tok/sec: 801,931 | mfu: 50.12 | epoch: 1 | total time: 3.08m | eta: 179.0m +step 00293/16704 (1.75%) | loss: 3.483265 | lrm: 1.00 | dt: 650.09ms | tok/sec: 806,488 | mfu: 50.41 | epoch: 1 | total time: 3.09m | eta: 179.0m +step 00294/16704 (1.76%) | loss: 3.487119 | lrm: 1.00 | dt: 652.49ms | tok/sec: 803,514 | mfu: 50.22 | epoch: 1 | total time: 3.10m | eta: 179.0m +step 00295/16704 (1.77%) | loss: 3.481937 | lrm: 1.00 | dt: 651.01ms | tok/sec: 805,342 | mfu: 50.34 | epoch: 1 | total time: 3.11m | eta: 179.0m +step 00296/16704 (1.77%) | loss: 3.465830 | lrm: 1.00 | dt: 649.99ms | tok/sec: 806,609 | mfu: 50.41 | epoch: 1 | total time: 3.12m | eta: 179.0m +step 00297/16704 (1.78%) | loss: 3.480337 | lrm: 1.00 | dt: 650.85ms | tok/sec: 805,547 | mfu: 50.35 | epoch: 1 | total time: 3.13m | eta: 179.0m +step 00298/16704 (1.78%) | loss: 3.475031 | lrm: 1.00 | dt: 655.28ms | tok/sec: 800,096 | mfu: 50.01 | epoch: 1 | total time: 3.14m | eta: 179.0m +step 00299/16704 (1.79%) | loss: 3.460019 | lrm: 1.00 | dt: 648.23ms | tok/sec: 808,798 | mfu: 50.55 | epoch: 1 | 
total time: 3.15m | eta: 178.9m +step 00300/16704 (1.80%) | loss: 3.469169 | lrm: 1.00 | dt: 652.24ms | tok/sec: 803,831 | mfu: 50.24 | epoch: 1 | total time: 3.16m | eta: 178.9m +step 00301/16704 (1.80%) | loss: 3.466561 | lrm: 1.00 | dt: 652.20ms | tok/sec: 803,882 | mfu: 50.24 | epoch: 1 | total time: 3.17m | eta: 178.9m +step 00302/16704 (1.81%) | loss: 3.467738 | lrm: 1.00 | dt: 650.69ms | tok/sec: 805,747 | mfu: 50.36 | epoch: 1 | total time: 3.18m | eta: 178.9m +step 00303/16704 (1.81%) | loss: 3.469483 | lrm: 1.00 | dt: 650.10ms | tok/sec: 806,476 | mfu: 50.41 | epoch: 1 | total time: 3.20m | eta: 178.9m +step 00304/16704 (1.82%) | loss: 3.466210 | lrm: 1.00 | dt: 651.61ms | tok/sec: 804,599 | mfu: 50.29 | epoch: 1 | total time: 3.21m | eta: 178.9m +step 00305/16704 (1.83%) | loss: 3.461487 | lrm: 1.00 | dt: 649.94ms | tok/sec: 806,669 | mfu: 50.42 | epoch: 1 | total time: 3.22m | eta: 178.9m +step 00306/16704 (1.83%) | loss: 3.465563 | lrm: 1.00 | dt: 648.31ms | tok/sec: 808,704 | mfu: 50.55 | epoch: 1 | total time: 3.23m | eta: 178.8m +step 00307/16704 (1.84%) | loss: 3.444914 | lrm: 1.00 | dt: 652.50ms | tok/sec: 803,507 | mfu: 50.22 | epoch: 1 | total time: 3.24m | eta: 178.8m +step 00308/16704 (1.84%) | loss: 3.449692 | lrm: 1.00 | dt: 649.78ms | tok/sec: 806,871 | mfu: 50.43 | epoch: 1 | total time: 3.25m | eta: 178.8m +step 00309/16704 (1.85%) | loss: 3.455586 | lrm: 1.00 | dt: 650.90ms | tok/sec: 805,479 | mfu: 50.34 | epoch: 1 | total time: 3.26m | eta: 178.8m +step 00310/16704 (1.86%) | loss: 3.444646 | lrm: 1.00 | dt: 648.80ms | tok/sec: 808,091 | mfu: 50.51 | epoch: 1 | total time: 3.27m | eta: 178.8m +step 00311/16704 (1.86%) | loss: 3.444658 | lrm: 1.00 | dt: 652.12ms | tok/sec: 803,979 | mfu: 50.25 | epoch: 1 | total time: 3.28m | eta: 178.8m +step 00312/16704 (1.87%) | loss: 3.437107 | lrm: 1.00 | dt: 651.67ms | tok/sec: 804,529 | mfu: 50.28 | epoch: 1 | total time: 3.29m | eta: 178.8m +step 00313/16704 (1.87%) | loss: 3.435305 | lrm: 1.00 | 
dt: 650.12ms | tok/sec: 806,446 | mfu: 50.40 | epoch: 1 | total time: 3.30m | eta: 178.7m +step 00314/16704 (1.88%) | loss: 3.428285 | lrm: 1.00 | dt: 652.63ms | tok/sec: 803,344 | mfu: 50.21 | epoch: 1 | total time: 3.32m | eta: 178.7m +step 00315/16704 (1.89%) | loss: 3.436642 | lrm: 1.00 | dt: 648.68ms | tok/sec: 808,233 | mfu: 50.52 | epoch: 1 | total time: 3.33m | eta: 178.7m +step 00316/16704 (1.89%) | loss: 3.455480 | lrm: 1.00 | dt: 649.74ms | tok/sec: 806,914 | mfu: 50.43 | epoch: 1 | total time: 3.34m | eta: 178.7m +step 00317/16704 (1.90%) | loss: 3.457530 | lrm: 1.00 | dt: 650.01ms | tok/sec: 806,590 | mfu: 50.41 | epoch: 1 | total time: 3.35m | eta: 178.7m +step 00318/16704 (1.90%) | loss: 3.458769 | lrm: 1.00 | dt: 650.73ms | tok/sec: 805,696 | mfu: 50.36 | epoch: 1 | total time: 3.36m | eta: 178.7m +step 00319/16704 (1.91%) | loss: 3.457126 | lrm: 1.00 | dt: 648.46ms | tok/sec: 808,508 | mfu: 50.53 | epoch: 1 | total time: 3.37m | eta: 178.7m +step 00320/16704 (1.92%) | loss: 3.463135 | lrm: 1.00 | dt: 650.58ms | tok/sec: 805,872 | mfu: 50.37 | epoch: 1 | total time: 3.38m | eta: 178.6m +step 00321/16704 (1.92%) | loss: 3.476782 | lrm: 1.00 | dt: 648.76ms | tok/sec: 808,132 | mfu: 50.51 | epoch: 1 | total time: 3.39m | eta: 178.6m +step 00322/16704 (1.93%) | loss: 3.466993 | lrm: 1.00 | dt: 649.33ms | tok/sec: 807,425 | mfu: 50.47 | epoch: 1 | total time: 3.40m | eta: 178.6m +step 00323/16704 (1.93%) | loss: 3.469515 | lrm: 1.00 | dt: 648.47ms | tok/sec: 808,496 | mfu: 50.53 | epoch: 1 | total time: 3.41m | eta: 178.6m +step 00324/16704 (1.94%) | loss: 3.474063 | lrm: 1.00 | dt: 649.67ms | tok/sec: 807,007 | mfu: 50.44 | epoch: 1 | total time: 3.42m | eta: 178.6m +step 00325/16704 (1.95%) | loss: 3.467876 | lrm: 1.00 | dt: 650.19ms | tok/sec: 806,366 | mfu: 50.40 | epoch: 1 | total time: 3.43m | eta: 178.6m +step 00326/16704 (1.95%) | loss: 3.474159 | lrm: 1.00 | dt: 654.24ms | tok/sec: 801,370 | mfu: 50.09 | epoch: 1 | total time: 3.45m | eta: 
178.6m +step 00327/16704 (1.96%) | loss: 3.475221 | lrm: 1.00 | dt: 649.64ms | tok/sec: 807,042 | mfu: 50.44 | epoch: 1 | total time: 3.46m | eta: 178.5m +step 00328/16704 (1.96%) | loss: 3.476255 | lrm: 1.00 | dt: 649.34ms | tok/sec: 807,421 | mfu: 50.46 | epoch: 1 | total time: 3.47m | eta: 178.5m +step 00329/16704 (1.97%) | loss: 3.457538 | lrm: 1.00 | dt: 651.86ms | tok/sec: 804,298 | mfu: 50.27 | epoch: 1 | total time: 3.48m | eta: 178.5m +step 00330/16704 (1.98%) | loss: 3.445961 | lrm: 1.00 | dt: 648.04ms | tok/sec: 809,034 | mfu: 50.57 | epoch: 1 | total time: 3.49m | eta: 178.5m +step 00331/16704 (1.98%) | loss: 3.447391 | lrm: 1.00 | dt: 650.62ms | tok/sec: 805,823 | mfu: 50.37 | epoch: 1 | total time: 3.50m | eta: 178.5m +step 00332/16704 (1.99%) | loss: 3.451452 | lrm: 1.00 | dt: 650.60ms | tok/sec: 805,850 | mfu: 50.37 | epoch: 1 | total time: 3.51m | eta: 178.5m +step 00333/16704 (1.99%) | loss: 3.454588 | lrm: 1.00 | dt: 652.68ms | tok/sec: 803,284 | mfu: 50.21 | epoch: 1 | total time: 3.52m | eta: 178.5m +step 00334/16704 (2.00%) | loss: 3.450534 | lrm: 1.00 | dt: 648.43ms | tok/sec: 808,548 | mfu: 50.54 | epoch: 1 | total time: 3.53m | eta: 178.4m +step 00335/16704 (2.01%) | loss: 3.441202 | lrm: 1.00 | dt: 651.42ms | tok/sec: 804,840 | mfu: 50.30 | epoch: 1 | total time: 3.54m | eta: 178.4m +step 00336/16704 (2.01%) | loss: 3.436888 | lrm: 1.00 | dt: 650.21ms | tok/sec: 806,341 | mfu: 50.40 | epoch: 1 | total time: 3.55m | eta: 178.4m +step 00337/16704 (2.02%) | loss: 3.438831 | lrm: 1.00 | dt: 651.68ms | tok/sec: 804,515 | mfu: 50.28 | epoch: 1 | total time: 3.56m | eta: 178.4m +step 00338/16704 (2.02%) | loss: 3.432587 | lrm: 1.00 | dt: 650.81ms | tok/sec: 805,595 | mfu: 50.35 | epoch: 1 | total time: 3.58m | eta: 178.4m +step 00339/16704 (2.03%) | loss: 3.428157 | lrm: 1.00 | dt: 652.67ms | tok/sec: 803,298 | mfu: 50.21 | epoch: 1 | total time: 3.59m | eta: 178.4m +step 00340/16704 (2.04%) | loss: 3.415194 | lrm: 1.00 | dt: 649.80ms | tok/sec: 
806,851 | mfu: 50.43 | epoch: 1 | total time: 3.60m | eta: 178.4m +step 00341/16704 (2.04%) | loss: 3.403792 | lrm: 1.00 | dt: 652.29ms | tok/sec: 803,770 | mfu: 50.24 | epoch: 1 | total time: 3.61m | eta: 178.4m +step 00342/16704 (2.05%) | loss: 3.401769 | lrm: 1.00 | dt: 650.15ms | tok/sec: 806,415 | mfu: 50.40 | epoch: 1 | total time: 3.62m | eta: 178.3m +step 00343/16704 (2.05%) | loss: 3.405568 | lrm: 1.00 | dt: 651.00ms | tok/sec: 805,354 | mfu: 50.34 | epoch: 1 | total time: 3.63m | eta: 178.3m +step 00344/16704 (2.06%) | loss: 3.403685 | lrm: 1.00 | dt: 650.81ms | tok/sec: 805,594 | mfu: 50.35 | epoch: 1 | total time: 3.64m | eta: 178.3m +step 00345/16704 (2.07%) | loss: 3.411188 | lrm: 1.00 | dt: 649.79ms | tok/sec: 806,856 | mfu: 50.43 | epoch: 1 | total time: 3.65m | eta: 178.3m +step 00346/16704 (2.07%) | loss: 3.411852 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,174 | mfu: 50.57 | epoch: 1 | total time: 3.66m | eta: 178.3m +step 00347/16704 (2.08%) | loss: 3.415616 | lrm: 1.00 | dt: 650.61ms | tok/sec: 805,843 | mfu: 50.37 | epoch: 1 | total time: 3.67m | eta: 178.3m +step 00348/16704 (2.08%) | loss: 3.417462 | lrm: 1.00 | dt: 650.90ms | tok/sec: 805,478 | mfu: 50.34 | epoch: 1 | total time: 3.68m | eta: 178.3m +step 00349/16704 (2.09%) | loss: 3.413814 | lrm: 1.00 | dt: 651.16ms | tok/sec: 805,157 | mfu: 50.32 | epoch: 1 | total time: 3.69m | eta: 178.2m +step 00350/16704 (2.10%) | loss: 3.424810 | lrm: 1.00 | dt: 649.02ms | tok/sec: 807,814 | mfu: 50.49 | epoch: 1 | total time: 3.71m | eta: 178.2m +step 00351/16704 (2.10%) | loss: 3.423718 | lrm: 1.00 | dt: 649.76ms | tok/sec: 806,896 | mfu: 50.43 | epoch: 1 | total time: 3.72m | eta: 178.2m +step 00352/16704 (2.11%) | loss: 3.426563 | lrm: 1.00 | dt: 651.01ms | tok/sec: 805,342 | mfu: 50.34 | epoch: 1 | total time: 3.73m | eta: 178.2m +step 00353/16704 (2.11%) | loss: 3.418412 | lrm: 1.00 | dt: 648.53ms | tok/sec: 808,419 | mfu: 50.53 | epoch: 1 | total time: 3.74m | eta: 178.2m +step 00354/16704 
(2.12%) | loss: 3.411893 | lrm: 1.00 | dt: 649.66ms | tok/sec: 807,018 | mfu: 50.44 | epoch: 1 | total time: 3.75m | eta: 178.2m +step 00355/16704 (2.13%) | loss: 3.412460 | lrm: 1.00 | dt: 651.37ms | tok/sec: 804,904 | mfu: 50.31 | epoch: 1 | total time: 3.76m | eta: 178.2m +step 00356/16704 (2.13%) | loss: 3.423776 | lrm: 1.00 | dt: 648.11ms | tok/sec: 808,952 | mfu: 50.56 | epoch: 1 | total time: 3.77m | eta: 178.1m +step 00357/16704 (2.14%) | loss: 3.420631 | lrm: 1.00 | dt: 650.23ms | tok/sec: 806,311 | mfu: 50.40 | epoch: 1 | total time: 3.78m | eta: 178.1m +step 00358/16704 (2.14%) | loss: 3.419202 | lrm: 1.00 | dt: 650.94ms | tok/sec: 805,430 | mfu: 50.34 | epoch: 1 | total time: 3.79m | eta: 178.1m +step 00359/16704 (2.15%) | loss: 3.422041 | lrm: 1.00 | dt: 650.75ms | tok/sec: 805,667 | mfu: 50.36 | epoch: 1 | total time: 3.80m | eta: 178.1m +step 00360/16704 (2.16%) | loss: 3.429467 | lrm: 1.00 | dt: 651.20ms | tok/sec: 805,113 | mfu: 50.32 | epoch: 1 | total time: 3.81m | eta: 178.1m +step 00361/16704 (2.16%) | loss: 3.428141 | lrm: 1.00 | dt: 648.07ms | tok/sec: 808,998 | mfu: 50.56 | epoch: 1 | total time: 3.82m | eta: 178.1m +step 00362/16704 (2.17%) | loss: 3.416694 | lrm: 1.00 | dt: 652.93ms | tok/sec: 802,972 | mfu: 50.19 | epoch: 1 | total time: 3.84m | eta: 178.1m +step 00363/16704 (2.17%) | loss: 3.430670 | lrm: 1.00 | dt: 648.99ms | tok/sec: 807,857 | mfu: 50.49 | epoch: 1 | total time: 3.85m | eta: 178.0m +step 00364/16704 (2.18%) | loss: 3.421828 | lrm: 1.00 | dt: 651.43ms | tok/sec: 804,825 | mfu: 50.30 | epoch: 1 | total time: 3.86m | eta: 178.0m +step 00365/16704 (2.19%) | loss: 3.424142 | lrm: 1.00 | dt: 649.81ms | tok/sec: 806,835 | mfu: 50.43 | epoch: 1 | total time: 3.87m | eta: 178.0m +step 00366/16704 (2.19%) | loss: 3.417448 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,419 | mfu: 50.65 | epoch: 1 | total time: 3.88m | eta: 178.0m +step 00367/16704 (2.20%) | loss: 3.428800 | lrm: 1.00 | dt: 652.18ms | tok/sec: 803,904 | mfu: 50.25 | 
epoch: 1 | total time: 3.89m | eta: 178.0m +step 00368/16704 (2.20%) | loss: 3.414020 | lrm: 1.00 | dt: 650.15ms | tok/sec: 806,416 | mfu: 50.40 | epoch: 1 | total time: 3.90m | eta: 178.0m +step 00369/16704 (2.21%) | loss: 3.408455 | lrm: 1.00 | dt: 652.22ms | tok/sec: 803,855 | mfu: 50.24 | epoch: 1 | total time: 3.91m | eta: 178.0m +step 00370/16704 (2.22%) | loss: 3.410871 | lrm: 1.00 | dt: 648.23ms | tok/sec: 808,801 | mfu: 50.55 | epoch: 1 | total time: 3.92m | eta: 178.0m +step 00371/16704 (2.22%) | loss: 3.402104 | lrm: 1.00 | dt: 652.42ms | tok/sec: 803,611 | mfu: 50.23 | epoch: 1 | total time: 3.93m | eta: 177.9m +step 00372/16704 (2.23%) | loss: 3.401290 | lrm: 1.00 | dt: 649.00ms | tok/sec: 807,839 | mfu: 50.49 | epoch: 1 | total time: 3.94m | eta: 177.9m +step 00373/16704 (2.23%) | loss: 3.410313 | lrm: 1.00 | dt: 651.87ms | tok/sec: 804,288 | mfu: 50.27 | epoch: 1 | total time: 3.95m | eta: 177.9m +step 00374/16704 (2.24%) | loss: 3.402565 | lrm: 1.00 | dt: 649.36ms | tok/sec: 807,391 | mfu: 50.46 | epoch: 1 | total time: 3.97m | eta: 177.9m +step 00375/16704 (2.24%) | loss: 3.398014 | lrm: 1.00 | dt: 648.31ms | tok/sec: 808,699 | mfu: 50.54 | epoch: 1 | total time: 3.98m | eta: 177.9m +step 00376/16704 (2.25%) | loss: 3.392255 | lrm: 1.00 | dt: 648.18ms | tok/sec: 808,860 | mfu: 50.55 | epoch: 1 | total time: 3.99m | eta: 177.9m +step 00377/16704 (2.26%) | loss: 3.412325 | lrm: 1.00 | dt: 652.43ms | tok/sec: 803,586 | mfu: 50.23 | epoch: 1 | total time: 4.00m | eta: 177.9m +step 00378/16704 (2.26%) | loss: 3.413568 | lrm: 1.00 | dt: 648.91ms | tok/sec: 807,952 | mfu: 50.50 | epoch: 1 | total time: 4.01m | eta: 177.8m +step 00379/16704 (2.27%) | loss: 3.419356 | lrm: 1.00 | dt: 649.89ms | tok/sec: 806,730 | mfu: 50.42 | epoch: 1 | total time: 4.02m | eta: 177.8m +step 00380/16704 (2.27%) | loss: 3.418872 | lrm: 1.00 | dt: 649.83ms | tok/sec: 806,807 | mfu: 50.43 | epoch: 1 | total time: 4.03m | eta: 177.8m +step 00381/16704 (2.28%) | loss: 3.409120 | 
lrm: 1.00 | dt: 649.46ms | tok/sec: 807,262 | mfu: 50.46 | epoch: 1 | total time: 4.04m | eta: 177.8m +step 00382/16704 (2.29%) | loss: 3.402174 | lrm: 1.00 | dt: 649.42ms | tok/sec: 807,323 | mfu: 50.46 | epoch: 1 | total time: 4.05m | eta: 177.8m +step 00383/16704 (2.29%) | loss: 3.391850 | lrm: 1.00 | dt: 652.70ms | tok/sec: 803,257 | mfu: 50.20 | epoch: 1 | total time: 4.06m | eta: 177.8m +step 00384/16704 (2.30%) | loss: 3.375979 | lrm: 1.00 | dt: 650.86ms | tok/sec: 805,529 | mfu: 50.35 | epoch: 1 | total time: 4.07m | eta: 177.8m +step 00385/16704 (2.30%) | loss: 3.375847 | lrm: 1.00 | dt: 650.51ms | tok/sec: 805,960 | mfu: 50.37 | epoch: 1 | total time: 4.08m | eta: 177.7m +step 00386/16704 (2.31%) | loss: 3.377087 | lrm: 1.00 | dt: 650.15ms | tok/sec: 806,411 | mfu: 50.40 | epoch: 1 | total time: 4.10m | eta: 177.7m +step 00387/16704 (2.32%) | loss: 3.364977 | lrm: 1.00 | dt: 651.46ms | tok/sec: 804,792 | mfu: 50.30 | epoch: 1 | total time: 4.11m | eta: 177.7m +step 00388/16704 (2.32%) | loss: 3.357661 | lrm: 1.00 | dt: 649.14ms | tok/sec: 807,662 | mfu: 50.48 | epoch: 1 | total time: 4.12m | eta: 177.7m +step 00389/16704 (2.33%) | loss: 3.357793 | lrm: 1.00 | dt: 650.36ms | tok/sec: 806,154 | mfu: 50.39 | epoch: 1 | total time: 4.13m | eta: 177.7m +step 00390/16704 (2.33%) | loss: 3.350647 | lrm: 1.00 | dt: 650.92ms | tok/sec: 805,461 | mfu: 50.34 | epoch: 1 | total time: 4.14m | eta: 177.7m +step 00391/16704 (2.34%) | loss: 3.340040 | lrm: 1.00 | dt: 647.72ms | tok/sec: 809,433 | mfu: 50.59 | epoch: 1 | total time: 4.15m | eta: 177.7m +step 00392/16704 (2.35%) | loss: 3.343687 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,944 | mfu: 50.50 | epoch: 1 | total time: 4.16m | eta: 177.7m +step 00393/16704 (2.35%) | loss: 3.339268 | lrm: 1.00 | dt: 650.22ms | tok/sec: 806,326 | mfu: 50.40 | epoch: 1 | total time: 4.17m | eta: 177.6m +step 00394/16704 (2.36%) | loss: 3.336673 | lrm: 1.00 | dt: 650.04ms | tok/sec: 806,547 | mfu: 50.41 | epoch: 1 | total time: 4.18m 
| eta: 177.6m +step 00395/16704 (2.36%) | loss: 3.335773 | lrm: 1.00 | dt: 649.60ms | tok/sec: 807,088 | mfu: 50.44 | epoch: 1 | total time: 4.19m | eta: 177.6m +step 00396/16704 (2.37%) | loss: 3.346971 | lrm: 1.00 | dt: 650.25ms | tok/sec: 806,289 | mfu: 50.39 | epoch: 1 | total time: 4.20m | eta: 177.6m +step 00397/16704 (2.38%) | loss: 3.342892 | lrm: 1.00 | dt: 650.12ms | tok/sec: 806,453 | mfu: 50.40 | epoch: 1 | total time: 4.21m | eta: 177.6m +step 00398/16704 (2.38%) | loss: 3.328409 | lrm: 1.00 | dt: 650.13ms | tok/sec: 806,437 | mfu: 50.40 | epoch: 1 | total time: 4.23m | eta: 177.6m +step 00399/16704 (2.39%) | loss: 3.332015 | lrm: 1.00 | dt: 652.15ms | tok/sec: 803,939 | mfu: 50.25 | epoch: 1 | total time: 4.24m | eta: 177.6m +step 00400/16704 (2.39%) | loss: 3.333878 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,777 | mfu: 50.55 | epoch: 1 | total time: 4.25m | eta: 177.5m +step 00401/16704 (2.40%) | loss: 3.342061 | lrm: 1.00 | dt: 648.85ms | tok/sec: 808,021 | mfu: 50.50 | epoch: 1 | total time: 4.26m | eta: 177.5m +step 00402/16704 (2.41%) | loss: 3.346544 | lrm: 1.00 | dt: 650.35ms | tok/sec: 806,158 | mfu: 50.39 | epoch: 1 | total time: 4.27m | eta: 177.5m +step 00403/16704 (2.41%) | loss: 3.348549 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,663 | mfu: 50.61 | epoch: 1 | total time: 4.28m | eta: 177.5m +step 00404/16704 (2.42%) | loss: 3.350877 | lrm: 1.00 | dt: 649.82ms | tok/sec: 806,820 | mfu: 50.43 | epoch: 1 | total time: 4.29m | eta: 177.5m +step 00405/16704 (2.42%) | loss: 3.349442 | lrm: 1.00 | dt: 650.26ms | tok/sec: 806,277 | mfu: 50.39 | epoch: 1 | total time: 4.30m | eta: 177.5m +step 00406/16704 (2.43%) | loss: 3.345497 | lrm: 1.00 | dt: 647.30ms | tok/sec: 809,967 | mfu: 50.62 | epoch: 1 | total time: 4.31m | eta: 177.5m +step 00407/16704 (2.44%) | loss: 3.342411 | lrm: 1.00 | dt: 651.35ms | tok/sec: 804,930 | mfu: 50.31 | epoch: 1 | total time: 4.32m | eta: 177.5m +step 00408/16704 (2.44%) | loss: 3.343235 | lrm: 1.00 | dt: 647.98ms | 
tok/sec: 809,106 | mfu: 50.57 | epoch: 1 | total time: 4.33m | eta: 177.4m +step 00409/16704 (2.45%) | loss: 3.334454 | lrm: 1.00 | dt: 651.83ms | tok/sec: 804,335 | mfu: 50.27 | epoch: 1 | total time: 4.34m | eta: 177.4m +step 00410/16704 (2.45%) | loss: 3.330579 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,476 | mfu: 50.66 | epoch: 1 | total time: 4.36m | eta: 177.4m +step 00411/16704 (2.46%) | loss: 3.330285 | lrm: 1.00 | dt: 650.08ms | tok/sec: 806,496 | mfu: 50.41 | epoch: 1 | total time: 4.37m | eta: 177.4m +step 00412/16704 (2.47%) | loss: 3.333800 | lrm: 1.00 | dt: 649.94ms | tok/sec: 806,668 | mfu: 50.42 | epoch: 1 | total time: 4.38m | eta: 177.4m +step 00413/16704 (2.47%) | loss: 3.332391 | lrm: 1.00 | dt: 650.42ms | tok/sec: 806,078 | mfu: 50.38 | epoch: 1 | total time: 4.39m | eta: 177.4m +step 00414/16704 (2.48%) | loss: 3.320922 | lrm: 1.00 | dt: 649.46ms | tok/sec: 807,265 | mfu: 50.46 | epoch: 1 | total time: 4.40m | eta: 177.4m +step 00415/16704 (2.48%) | loss: 3.338928 | lrm: 1.00 | dt: 648.73ms | tok/sec: 808,179 | mfu: 50.51 | epoch: 1 | total time: 4.41m | eta: 177.3m +step 00416/16704 (2.49%) | loss: 3.339197 | lrm: 1.00 | dt: 649.14ms | tok/sec: 807,667 | mfu: 50.48 | epoch: 1 | total time: 4.42m | eta: 177.3m +step 00417/16704 (2.50%) | loss: 3.346500 | lrm: 1.00 | dt: 649.58ms | tok/sec: 807,118 | mfu: 50.45 | epoch: 1 | total time: 4.43m | eta: 177.3m +step 00418/16704 (2.50%) | loss: 3.357857 | lrm: 1.00 | dt: 649.86ms | tok/sec: 806,768 | mfu: 50.42 | epoch: 1 | total time: 4.44m | eta: 177.3m +step 00419/16704 (2.51%) | loss: 3.369544 | lrm: 1.00 | dt: 651.94ms | tok/sec: 804,199 | mfu: 50.26 | epoch: 1 | total time: 4.45m | eta: 177.3m +step 00420/16704 (2.51%) | loss: 3.363343 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,429 | mfu: 50.65 | epoch: 1 | total time: 4.46m | eta: 177.3m +step 00421/16704 (2.52%) | loss: 3.370377 | lrm: 1.00 | dt: 649.33ms | tok/sec: 807,429 | mfu: 50.47 | epoch: 1 | total time: 4.47m | eta: 177.3m +step 
00422/16704 (2.53%) | loss: 3.357217 | lrm: 1.00 | dt: 649.56ms | tok/sec: 807,145 | mfu: 50.45 | epoch: 1 | total time: 4.49m | eta: 177.3m +step 00423/16704 (2.53%) | loss: 3.350846 | lrm: 1.00 | dt: 650.53ms | tok/sec: 805,939 | mfu: 50.37 | epoch: 1 | total time: 4.50m | eta: 177.2m +step 00424/16704 (2.54%) | loss: 3.347925 | lrm: 1.00 | dt: 649.95ms | tok/sec: 806,662 | mfu: 50.42 | epoch: 1 | total time: 4.51m | eta: 177.2m +step 00425/16704 (2.54%) | loss: 3.356448 | lrm: 1.00 | dt: 647.34ms | tok/sec: 809,911 | mfu: 50.62 | epoch: 1 | total time: 4.52m | eta: 177.2m +step 00426/16704 (2.55%) | loss: 3.355578 | lrm: 1.00 | dt: 648.95ms | tok/sec: 807,900 | mfu: 50.49 | epoch: 1 | total time: 4.53m | eta: 177.2m +step 00427/16704 (2.56%) | loss: 3.356461 | lrm: 1.00 | dt: 650.59ms | tok/sec: 805,865 | mfu: 50.37 | epoch: 1 | total time: 4.54m | eta: 177.2m +step 00428/16704 (2.56%) | loss: 3.345883 | lrm: 1.00 | dt: 647.27ms | tok/sec: 809,994 | mfu: 50.63 | epoch: 1 | total time: 4.55m | eta: 177.2m +step 00429/16704 (2.57%) | loss: 3.337500 | lrm: 1.00 | dt: 648.90ms | tok/sec: 807,970 | mfu: 50.50 | epoch: 1 | total time: 4.56m | eta: 177.2m +step 00430/16704 (2.57%) | loss: 3.333408 | lrm: 1.00 | dt: 648.16ms | tok/sec: 808,890 | mfu: 50.56 | epoch: 1 | total time: 4.57m | eta: 177.1m +step 00431/16704 (2.58%) | loss: 3.325963 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,417 | mfu: 50.59 | epoch: 1 | total time: 4.58m | eta: 177.1m +step 00432/16704 (2.59%) | loss: 3.336745 | lrm: 1.00 | dt: 651.64ms | tok/sec: 804,569 | mfu: 50.29 | epoch: 1 | total time: 4.59m | eta: 177.1m +step 00433/16704 (2.59%) | loss: 3.335763 | lrm: 1.00 | dt: 649.49ms | tok/sec: 807,231 | mfu: 50.45 | epoch: 1 | total time: 4.60m | eta: 177.1m +step 00434/16704 (2.60%) | loss: 3.334670 | lrm: 1.00 | dt: 649.60ms | tok/sec: 807,092 | mfu: 50.44 | epoch: 1 | total time: 4.62m | eta: 177.1m +step 00435/16704 (2.60%) | loss: 3.331948 | lrm: 1.00 | dt: 650.30ms | tok/sec: 806,220 | 
mfu: 50.39 | epoch: 1 | total time: 4.63m | eta: 177.1m +step 00436/16704 (2.61%) | loss: 3.339412 | lrm: 1.00 | dt: 648.51ms | tok/sec: 808,445 | mfu: 50.53 | epoch: 1 | total time: 4.64m | eta: 177.1m +step 00437/16704 (2.62%) | loss: 3.337895 | lrm: 1.00 | dt: 648.65ms | tok/sec: 808,269 | mfu: 50.52 | epoch: 1 | total time: 4.65m | eta: 177.1m +step 00438/16704 (2.62%) | loss: 3.322977 | lrm: 1.00 | dt: 649.43ms | tok/sec: 807,310 | mfu: 50.46 | epoch: 1 | total time: 4.66m | eta: 177.0m +step 00439/16704 (2.63%) | loss: 3.337956 | lrm: 1.00 | dt: 648.66ms | tok/sec: 808,268 | mfu: 50.52 | epoch: 1 | total time: 4.67m | eta: 177.0m +step 00440/16704 (2.63%) | loss: 3.323699 | lrm: 1.00 | dt: 649.15ms | tok/sec: 807,658 | mfu: 50.48 | epoch: 1 | total time: 4.68m | eta: 177.0m +step 00441/16704 (2.64%) | loss: 3.332149 | lrm: 1.00 | dt: 648.63ms | tok/sec: 808,303 | mfu: 50.52 | epoch: 1 | total time: 4.69m | eta: 177.0m +step 00442/16704 (2.65%) | loss: 3.323764 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,487 | mfu: 50.59 | epoch: 1 | total time: 4.70m | eta: 177.0m +step 00443/16704 (2.65%) | loss: 3.315618 | lrm: 1.00 | dt: 647.88ms | tok/sec: 809,231 | mfu: 50.58 | epoch: 1 | total time: 4.71m | eta: 177.0m +step 00444/16704 (2.66%) | loss: 3.312939 | lrm: 1.00 | dt: 649.17ms | tok/sec: 807,623 | mfu: 50.48 | epoch: 1 | total time: 4.72m | eta: 177.0m +step 00445/16704 (2.66%) | loss: 3.319900 | lrm: 1.00 | dt: 651.38ms | tok/sec: 804,893 | mfu: 50.31 | epoch: 1 | total time: 4.73m | eta: 176.9m +step 00446/16704 (2.67%) | loss: 3.310330 | lrm: 1.00 | dt: 651.55ms | tok/sec: 804,678 | mfu: 50.29 | epoch: 1 | total time: 4.74m | eta: 176.9m +step 00447/16704 (2.68%) | loss: 3.312164 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,408 | mfu: 50.65 | epoch: 1 | total time: 4.76m | eta: 176.9m +step 00448/16704 (2.68%) | loss: 3.307347 | lrm: 1.00 | dt: 648.88ms | tok/sec: 807,992 | mfu: 50.50 | epoch: 1 | total time: 4.77m | eta: 176.9m +step 00449/16704 (2.69%) | 
loss: 3.305614 | lrm: 1.00 | dt: 650.96ms | tok/sec: 805,404 | mfu: 50.34 | epoch: 1 | total time: 4.78m | eta: 176.9m +step 00450/16704 (2.69%) | loss: 3.294366 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,115 | mfu: 50.63 | epoch: 1 | total time: 4.79m | eta: 176.9m +step 00451/16704 (2.70%) | loss: 3.313470 | lrm: 1.00 | dt: 647.88ms | tok/sec: 809,233 | mfu: 50.58 | epoch: 1 | total time: 4.80m | eta: 176.9m +step 00452/16704 (2.71%) | loss: 3.308116 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,496 | mfu: 50.66 | epoch: 1 | total time: 4.81m | eta: 176.8m +step 00453/16704 (2.71%) | loss: 3.309671 | lrm: 1.00 | dt: 650.42ms | tok/sec: 806,073 | mfu: 50.38 | epoch: 1 | total time: 4.82m | eta: 176.8m +step 00454/16704 (2.72%) | loss: 3.321595 | lrm: 1.00 | dt: 648.80ms | tok/sec: 808,083 | mfu: 50.51 | epoch: 1 | total time: 4.83m | eta: 176.8m +step 00455/16704 (2.72%) | loss: 3.311817 | lrm: 1.00 | dt: 649.16ms | tok/sec: 807,641 | mfu: 50.48 | epoch: 1 | total time: 4.84m | eta: 176.8m +step 00456/16704 (2.73%) | loss: 3.314591 | lrm: 1.00 | dt: 651.31ms | tok/sec: 804,971 | mfu: 50.31 | epoch: 1 | total time: 4.85m | eta: 176.8m +step 00457/16704 (2.74%) | loss: 3.301153 | lrm: 1.00 | dt: 649.48ms | tok/sec: 807,244 | mfu: 50.45 | epoch: 1 | total time: 4.86m | eta: 176.8m +step 00458/16704 (2.74%) | loss: 3.310241 | lrm: 1.00 | dt: 646.88ms | tok/sec: 810,483 | mfu: 50.66 | epoch: 1 | total time: 4.87m | eta: 176.8m +step 00459/16704 (2.75%) | loss: 3.305669 | lrm: 1.00 | dt: 651.35ms | tok/sec: 804,919 | mfu: 50.31 | epoch: 1 | total time: 4.89m | eta: 176.8m +step 00460/16704 (2.75%) | loss: 3.290614 | lrm: 1.00 | dt: 647.59ms | tok/sec: 809,602 | mfu: 50.60 | epoch: 1 | total time: 4.90m | eta: 176.7m +step 00461/16704 (2.76%) | loss: 3.294127 | lrm: 1.00 | dt: 648.42ms | tok/sec: 808,564 | mfu: 50.54 | epoch: 1 | total time: 4.91m | eta: 176.7m +step 00462/16704 (2.77%) | loss: 3.293266 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,509 | mfu: 50.72 | epoch: 1 | 
total time: 4.92m | eta: 176.7m +step 00463/16704 (2.77%) | loss: 3.294796 | lrm: 1.00 | dt: 648.64ms | tok/sec: 808,291 | mfu: 50.52 | epoch: 1 | total time: 4.93m | eta: 176.7m +step 00464/16704 (2.78%) | loss: 3.298067 | lrm: 1.00 | dt: 648.03ms | tok/sec: 809,045 | mfu: 50.57 | epoch: 1 | total time: 4.94m | eta: 176.7m +step 00465/16704 (2.78%) | loss: 3.301918 | lrm: 1.00 | dt: 648.11ms | tok/sec: 808,952 | mfu: 50.56 | epoch: 1 | total time: 4.95m | eta: 176.7m +step 00466/16704 (2.79%) | loss: 3.300707 | lrm: 1.00 | dt: 648.65ms | tok/sec: 808,270 | mfu: 50.52 | epoch: 1 | total time: 4.96m | eta: 176.7m +step 00467/16704 (2.80%) | loss: 3.285786 | lrm: 1.00 | dt: 651.62ms | tok/sec: 804,594 | mfu: 50.29 | epoch: 1 | total time: 4.97m | eta: 176.6m +step 00468/16704 (2.80%) | loss: 3.283564 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,055 | mfu: 50.82 | epoch: 1 | total time: 4.98m | eta: 176.6m +step 00469/16704 (2.81%) | loss: 3.269573 | lrm: 1.00 | dt: 647.87ms | tok/sec: 809,246 | mfu: 50.58 | epoch: 1 | total time: 4.99m | eta: 176.6m +step 00470/16704 (2.81%) | loss: 3.257266 | lrm: 1.00 | dt: 648.49ms | tok/sec: 808,475 | mfu: 50.53 | epoch: 1 | total time: 5.00m | eta: 176.6m +step 00471/16704 (2.82%) | loss: 3.246174 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,908 | mfu: 50.56 | epoch: 1 | total time: 5.02m | eta: 176.6m +step 00472/16704 (2.83%) | loss: 3.250117 | lrm: 1.00 | dt: 648.40ms | tok/sec: 808,583 | mfu: 50.54 | epoch: 1 | total time: 5.03m | eta: 176.6m +step 00473/16704 (2.83%) | loss: 3.257217 | lrm: 1.00 | dt: 648.42ms | tok/sec: 808,556 | mfu: 50.54 | epoch: 1 | total time: 5.04m | eta: 176.6m +step 00474/16704 (2.84%) | loss: 3.260698 | lrm: 1.00 | dt: 648.99ms | tok/sec: 807,847 | mfu: 50.49 | epoch: 1 | total time: 5.05m | eta: 176.6m +step 00475/16704 (2.84%) | loss: 3.281345 | lrm: 1.00 | dt: 647.60ms | tok/sec: 809,591 | mfu: 50.60 | epoch: 1 | total time: 5.06m | eta: 176.5m +step 00476/16704 (2.85%) | loss: 3.279091 | lrm: 1.00 | 
dt: 648.22ms | tok/sec: 808,812 | mfu: 50.55 | epoch: 1 | total time: 5.07m | eta: 176.5m +step 00477/16704 (2.86%) | loss: 3.287172 | lrm: 1.00 | dt: 647.74ms | tok/sec: 809,408 | mfu: 50.59 | epoch: 1 | total time: 5.08m | eta: 176.5m +step 00478/16704 (2.86%) | loss: 3.286468 | lrm: 1.00 | dt: 649.15ms | tok/sec: 807,649 | mfu: 50.48 | epoch: 1 | total time: 5.09m | eta: 176.5m +step 00479/16704 (2.87%) | loss: 3.298085 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,069 | mfu: 50.69 | epoch: 1 | total time: 5.10m | eta: 176.5m +step 00480/16704 (2.87%) | loss: 3.287808 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,563 | mfu: 50.66 | epoch: 1 | total time: 5.11m | eta: 176.5m +step 00481/16704 (2.88%) | loss: 3.284762 | lrm: 1.00 | dt: 648.68ms | tok/sec: 808,239 | mfu: 50.52 | epoch: 1 | total time: 5.12m | eta: 176.5m +step 00482/16704 (2.89%) | loss: 3.271937 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,794 | mfu: 50.74 | epoch: 1 | total time: 5.13m | eta: 176.4m +step 00483/16704 (2.89%) | loss: 3.269818 | lrm: 1.00 | dt: 648.79ms | tok/sec: 808,104 | mfu: 50.51 | epoch: 1 | total time: 5.14m | eta: 176.4m +step 00484/16704 (2.90%) | loss: 3.275994 | lrm: 1.00 | dt: 649.06ms | tok/sec: 807,769 | mfu: 50.49 | epoch: 1 | total time: 5.16m | eta: 176.4m +step 00485/16704 (2.90%) | loss: 3.287369 | lrm: 1.00 | dt: 647.98ms | tok/sec: 809,112 | mfu: 50.57 | epoch: 1 | total time: 5.17m | eta: 176.4m +step 00486/16704 (2.91%) | loss: 3.288462 | lrm: 1.00 | dt: 649.32ms | tok/sec: 807,442 | mfu: 50.47 | epoch: 1 | total time: 5.18m | eta: 176.4m +step 00487/16704 (2.92%) | loss: 3.293180 | lrm: 1.00 | dt: 648.05ms | tok/sec: 809,030 | mfu: 50.57 | epoch: 1 | total time: 5.19m | eta: 176.4m +step 00488/16704 (2.92%) | loss: 3.288734 | lrm: 1.00 | dt: 650.53ms | tok/sec: 805,935 | mfu: 50.37 | epoch: 1 | total time: 5.20m | eta: 176.4m +step 00489/16704 (2.93%) | loss: 3.276529 | lrm: 1.00 | dt: 649.03ms | tok/sec: 807,801 | mfu: 50.49 | epoch: 1 | total time: 5.21m | eta: 
176.4m +step 00490/16704 (2.93%) | loss: 3.268070 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,095 | mfu: 50.57 | epoch: 1 | total time: 5.22m | eta: 176.3m +step 00491/16704 (2.94%) | loss: 3.269914 | lrm: 1.00 | dt: 648.42ms | tok/sec: 808,560 | mfu: 50.54 | epoch: 1 | total time: 5.23m | eta: 176.3m +step 00492/16704 (2.95%) | loss: 3.283953 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,083 | mfu: 50.69 | epoch: 1 | total time: 5.24m | eta: 176.3m +step 00493/16704 (2.95%) | loss: 3.299407 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,205 | mfu: 50.70 | epoch: 1 | total time: 5.25m | eta: 176.3m +step 00494/16704 (2.96%) | loss: 3.292149 | lrm: 1.00 | dt: 650.70ms | tok/sec: 805,733 | mfu: 50.36 | epoch: 1 | total time: 5.26m | eta: 176.3m +step 00495/16704 (2.96%) | loss: 3.301636 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,589 | mfu: 50.66 | epoch: 1 | total time: 5.27m | eta: 176.3m +step 00496/16704 (2.97%) | loss: 3.312888 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,724 | mfu: 50.67 | epoch: 1 | total time: 5.29m | eta: 176.3m +step 00497/16704 (2.98%) | loss: 3.316739 | lrm: 1.00 | dt: 649.04ms | tok/sec: 807,795 | mfu: 50.49 | epoch: 1 | total time: 5.30m | eta: 176.2m +step 00498/16704 (2.98%) | loss: 3.303186 | lrm: 1.00 | dt: 648.96ms | tok/sec: 807,884 | mfu: 50.49 | epoch: 1 | total time: 5.31m | eta: 176.2m +step 00499/16704 (2.99%) | loss: 3.306880 | lrm: 1.00 | dt: 648.30ms | tok/sec: 808,710 | mfu: 50.55 | epoch: 1 | total time: 5.32m | eta: 176.2m +Step 00500 | Validation bpb: 0.992190 +step 00500/16704 (2.99%) | loss: 3.300649 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,742 | mfu: 50.61 | epoch: 1 | total time: 5.33m | eta: 176.2m +step 00501/16704 (3.00%) | loss: 3.292358 | lrm: 1.00 | dt: 652.36ms | tok/sec: 803,683 | mfu: 50.23 | epoch: 1 | total time: 5.34m | eta: 176.2m +step 00502/16704 (3.01%) | loss: 3.286093 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,792 | mfu: 50.80 | epoch: 1 | total time: 5.35m | eta: 176.2m +step 00503/16704 (3.01%) | loss: 3.292651 
| lrm: 1.00 | dt: 648.32ms | tok/sec: 808,692 | mfu: 50.54 | epoch: 1 | total time: 5.36m | eta: 176.2m +step 00504/16704 (3.02%) | loss: 3.287693 | lrm: 1.00 | dt: 649.39ms | tok/sec: 807,359 | mfu: 50.46 | epoch: 1 | total time: 5.37m | eta: 176.2m +step 00505/16704 (3.02%) | loss: 3.288782 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,964 | mfu: 50.69 | epoch: 1 | total time: 5.38m | eta: 176.1m +step 00506/16704 (3.03%) | loss: 3.293307 | lrm: 1.00 | dt: 650.29ms | tok/sec: 806,241 | mfu: 50.39 | epoch: 1 | total time: 5.39m | eta: 176.1m +step 00507/16704 (3.04%) | loss: 3.284787 | lrm: 1.00 | dt: 648.38ms | tok/sec: 808,610 | mfu: 50.54 | epoch: 1 | total time: 5.40m | eta: 176.1m +step 00508/16704 (3.04%) | loss: 3.285970 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,156 | mfu: 50.70 | epoch: 1 | total time: 5.41m | eta: 176.1m +step 00509/16704 (3.05%) | loss: 3.305572 | lrm: 1.00 | dt: 649.36ms | tok/sec: 807,389 | mfu: 50.46 | epoch: 1 | total time: 5.43m | eta: 176.1m +step 00510/16704 (3.05%) | loss: 3.307239 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,293 | mfu: 50.77 | epoch: 1 | total time: 5.44m | eta: 176.1m +step 00511/16704 (3.06%) | loss: 3.303586 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,999 | mfu: 50.75 | epoch: 1 | total time: 5.45m | eta: 176.1m +step 00512/16704 (3.07%) | loss: 3.293505 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,563 | mfu: 50.72 | epoch: 1 | total time: 5.46m | eta: 176.0m +step 00513/16704 (3.07%) | loss: 3.292774 | lrm: 1.00 | dt: 648.02ms | tok/sec: 809,059 | mfu: 50.57 | epoch: 1 | total time: 5.47m | eta: 176.0m +step 00514/16704 (3.08%) | loss: 3.286059 | lrm: 1.00 | dt: 647.52ms | tok/sec: 809,688 | mfu: 50.61 | epoch: 1 | total time: 5.48m | eta: 176.0m +step 00515/16704 (3.08%) | loss: 3.284454 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,273 | mfu: 50.58 | epoch: 1 | total time: 5.49m | eta: 176.0m +step 00516/16704 (3.09%) | loss: 3.300177 | lrm: 1.00 | dt: 648.58ms | tok/sec: 808,365 | mfu: 50.52 | epoch: 1 | total time: 
5.50m | eta: 176.0m +step 00517/16704 (3.10%) | loss: 3.296663 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,075 | mfu: 50.57 | epoch: 1 | total time: 5.51m | eta: 176.0m +step 00518/16704 (3.10%) | loss: 3.298255 | lrm: 1.00 | dt: 650.28ms | tok/sec: 806,244 | mfu: 50.39 | epoch: 1 | total time: 5.52m | eta: 176.0m +step 00519/16704 (3.11%) | loss: 3.296664 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,650 | mfu: 50.60 | epoch: 1 | total time: 5.53m | eta: 176.0m +step 00520/16704 (3.11%) | loss: 3.275081 | lrm: 1.00 | dt: 649.34ms | tok/sec: 807,422 | mfu: 50.47 | epoch: 1 | total time: 5.54m | eta: 175.9m +step 00521/16704 (3.12%) | loss: 3.264917 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,418 | mfu: 50.59 | epoch: 1 | total time: 5.56m | eta: 175.9m +step 00522/16704 (3.12%) | loss: 3.269432 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,936 | mfu: 50.56 | epoch: 1 | total time: 5.57m | eta: 175.9m +step 00523/16704 (3.13%) | loss: 3.278932 | lrm: 1.00 | dt: 650.24ms | tok/sec: 806,298 | mfu: 50.39 | epoch: 1 | total time: 5.58m | eta: 175.9m +step 00524/16704 (3.14%) | loss: 3.270355 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,204 | mfu: 50.58 | epoch: 1 | total time: 5.59m | eta: 175.9m +step 00525/16704 (3.14%) | loss: 3.265331 | lrm: 1.00 | dt: 649.00ms | tok/sec: 807,843 | mfu: 50.49 | epoch: 1 | total time: 5.60m | eta: 175.9m +step 00526/16704 (3.15%) | loss: 3.274481 | lrm: 1.00 | dt: 647.62ms | tok/sec: 809,559 | mfu: 50.60 | epoch: 1 | total time: 5.61m | eta: 175.9m +step 00527/16704 (3.15%) | loss: 3.295203 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,564 | mfu: 50.91 | epoch: 1 | total time: 5.62m | eta: 175.8m +step 00528/16704 (3.16%) | loss: 3.293453 | lrm: 1.00 | dt: 649.91ms | tok/sec: 806,709 | mfu: 50.42 | epoch: 1 | total time: 5.63m | eta: 175.8m +step 00529/16704 (3.17%) | loss: 3.290988 | lrm: 1.00 | dt: 648.57ms | tok/sec: 808,373 | mfu: 50.52 | epoch: 1 | total time: 5.64m | eta: 175.8m +step 00530/16704 (3.17%) | loss: 3.290017 | lrm: 1.00 | dt: 
647.08ms | tok/sec: 810,230 | mfu: 50.64 | epoch: 1 | total time: 5.65m | eta: 175.8m +step 00531/16704 (3.18%) | loss: 3.287613 | lrm: 1.00 | dt: 649.79ms | tok/sec: 806,852 | mfu: 50.43 | epoch: 1 | total time: 5.66m | eta: 175.8m +step 00532/16704 (3.18%) | loss: 3.289782 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,486 | mfu: 50.59 | epoch: 1 | total time: 5.67m | eta: 175.8m +step 00533/16704 (3.19%) | loss: 3.296327 | lrm: 1.00 | dt: 652.17ms | tok/sec: 803,910 | mfu: 50.25 | epoch: 1 | total time: 5.68m | eta: 175.8m +step 00534/16704 (3.20%) | loss: 3.286450 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,840 | mfu: 50.68 | epoch: 1 | total time: 5.70m | eta: 175.8m +step 00535/16704 (3.20%) | loss: 3.284654 | lrm: 1.00 | dt: 649.69ms | tok/sec: 806,985 | mfu: 50.44 | epoch: 1 | total time: 5.71m | eta: 175.7m +step 00536/16704 (3.21%) | loss: 3.291175 | lrm: 1.00 | dt: 648.38ms | tok/sec: 808,610 | mfu: 50.54 | epoch: 1 | total time: 5.72m | eta: 175.7m +step 00537/16704 (3.21%) | loss: 3.286529 | lrm: 1.00 | dt: 648.37ms | tok/sec: 808,621 | mfu: 50.54 | epoch: 1 | total time: 5.73m | eta: 175.7m +step 00538/16704 (3.22%) | loss: 3.286116 | lrm: 1.00 | dt: 650.95ms | tok/sec: 805,422 | mfu: 50.34 | epoch: 1 | total time: 5.74m | eta: 175.7m +step 00539/16704 (3.23%) | loss: 3.273239 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,582 | mfu: 50.79 | epoch: 1 | total time: 5.75m | eta: 175.7m +step 00540/16704 (3.23%) | loss: 3.266320 | lrm: 1.00 | dt: 648.27ms | tok/sec: 808,747 | mfu: 50.55 | epoch: 1 | total time: 5.76m | eta: 175.7m +step 00541/16704 (3.24%) | loss: 3.269014 | lrm: 1.00 | dt: 648.10ms | tok/sec: 808,963 | mfu: 50.56 | epoch: 1 | total time: 5.77m | eta: 175.7m +step 00542/16704 (3.24%) | loss: 3.261806 | lrm: 1.00 | dt: 648.85ms | tok/sec: 808,022 | mfu: 50.50 | epoch: 1 | total time: 5.78m | eta: 175.7m +step 00543/16704 (3.25%) | loss: 3.263631 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,324 | mfu: 50.71 | epoch: 1 | total time: 5.79m | eta: 175.6m 
+step 00544/16704 (3.26%) | loss: 3.262102 | lrm: 1.00 | dt: 649.50ms | tok/sec: 807,211 | mfu: 50.45 | epoch: 1 | total time: 5.80m | eta: 175.6m +step 00545/16704 (3.26%) | loss: 3.259256 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,079 | mfu: 50.69 | epoch: 1 | total time: 5.81m | eta: 175.6m +step 00546/16704 (3.27%) | loss: 3.247540 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,451 | mfu: 50.65 | epoch: 1 | total time: 5.83m | eta: 175.6m +step 00547/16704 (3.27%) | loss: 3.244228 | lrm: 1.00 | dt: 650.21ms | tok/sec: 806,335 | mfu: 50.40 | epoch: 1 | total time: 5.84m | eta: 175.6m +step 00548/16704 (3.28%) | loss: 3.246824 | lrm: 1.00 | dt: 646.83ms | tok/sec: 810,545 | mfu: 50.66 | epoch: 1 | total time: 5.85m | eta: 175.6m +step 00549/16704 (3.29%) | loss: 3.245038 | lrm: 1.00 | dt: 650.43ms | tok/sec: 806,060 | mfu: 50.38 | epoch: 1 | total time: 5.86m | eta: 175.6m +step 00550/16704 (3.29%) | loss: 3.240502 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,613 | mfu: 50.60 | epoch: 1 | total time: 5.87m | eta: 175.6m +step 00551/16704 (3.30%) | loss: 3.235174 | lrm: 1.00 | dt: 648.96ms | tok/sec: 807,886 | mfu: 50.49 | epoch: 1 | total time: 5.88m | eta: 175.5m +step 00552/16704 (3.30%) | loss: 3.234277 | lrm: 1.00 | dt: 650.20ms | tok/sec: 806,348 | mfu: 50.40 | epoch: 1 | total time: 5.89m | eta: 175.5m +step 00553/16704 (3.31%) | loss: 3.229444 | lrm: 1.00 | dt: 648.63ms | tok/sec: 808,305 | mfu: 50.52 | epoch: 1 | total time: 5.90m | eta: 175.5m +step 00554/16704 (3.32%) | loss: 3.231253 | lrm: 1.00 | dt: 649.45ms | tok/sec: 807,277 | mfu: 50.46 | epoch: 1 | total time: 5.91m | eta: 175.5m +step 00555/16704 (3.32%) | loss: 3.234266 | lrm: 1.00 | dt: 648.22ms | tok/sec: 808,814 | mfu: 50.55 | epoch: 1 | total time: 5.92m | eta: 175.5m +step 00556/16704 (3.33%) | loss: 3.248395 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,944 | mfu: 50.50 | epoch: 1 | total time: 5.93m | eta: 175.5m +step 00557/16704 (3.33%) | loss: 3.237734 | lrm: 1.00 | dt: 649.29ms | tok/sec: 807,480 
| mfu: 50.47 | epoch: 1 | total time: 5.94m | eta: 175.5m +step 00558/16704 (3.34%) | loss: 3.241835 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,160 | mfu: 50.64 | epoch: 1 | total time: 5.95m | eta: 175.5m +step 00559/16704 (3.35%) | loss: 3.232793 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,339 | mfu: 50.71 | epoch: 1 | total time: 5.97m | eta: 175.4m +step 00560/16704 (3.35%) | loss: 3.230594 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,359 | mfu: 50.71 | epoch: 1 | total time: 5.98m | eta: 175.4m +step 00561/16704 (3.36%) | loss: 3.221631 | lrm: 1.00 | dt: 650.08ms | tok/sec: 806,498 | mfu: 50.41 | epoch: 1 | total time: 5.99m | eta: 175.4m +step 00562/16704 (3.36%) | loss: 3.233548 | lrm: 1.00 | dt: 650.18ms | tok/sec: 806,379 | mfu: 50.40 | epoch: 1 | total time: 6.00m | eta: 175.4m +step 00563/16704 (3.37%) | loss: 3.215439 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,104 | mfu: 50.63 | epoch: 1 | total time: 6.01m | eta: 175.4m +step 00564/16704 (3.38%) | loss: 3.217936 | lrm: 1.00 | dt: 648.36ms | tok/sec: 808,639 | mfu: 50.54 | epoch: 1 | total time: 6.02m | eta: 175.4m +step 00565/16704 (3.38%) | loss: 3.218684 | lrm: 1.00 | dt: 647.25ms | tok/sec: 810,026 | mfu: 50.63 | epoch: 1 | total time: 6.03m | eta: 175.4m +step 00566/16704 (3.39%) | loss: 3.222796 | lrm: 1.00 | dt: 647.66ms | tok/sec: 809,517 | mfu: 50.60 | epoch: 1 | total time: 6.04m | eta: 175.4m +step 00567/16704 (3.39%) | loss: 3.228705 | lrm: 1.00 | dt: 648.47ms | tok/sec: 808,502 | mfu: 50.53 | epoch: 1 | total time: 6.05m | eta: 175.3m +step 00568/16704 (3.40%) | loss: 3.227100 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,651 | mfu: 50.60 | epoch: 1 | total time: 6.06m | eta: 175.3m +step 00569/16704 (3.41%) | loss: 3.215658 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,914 | mfu: 50.56 | epoch: 1 | total time: 6.07m | eta: 175.3m +step 00570/16704 (3.41%) | loss: 3.214847 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,684 | mfu: 50.54 | epoch: 1 | total time: 6.08m | eta: 175.3m +step 00571/16704 (3.42%) | 
loss: 3.214278 | lrm: 1.00 | dt: 649.49ms | tok/sec: 807,232 | mfu: 50.45 | epoch: 1 | total time: 6.10m | eta: 175.3m +step 00572/16704 (3.42%) | loss: 3.222267 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,046 | mfu: 50.69 | epoch: 1 | total time: 6.11m | eta: 175.3m +step 00573/16704 (3.43%) | loss: 3.228626 | lrm: 1.00 | dt: 647.92ms | tok/sec: 809,187 | mfu: 50.58 | epoch: 1 | total time: 6.12m | eta: 175.3m +step 00574/16704 (3.44%) | loss: 3.243903 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,408 | mfu: 50.65 | epoch: 1 | total time: 6.13m | eta: 175.2m +step 00575/16704 (3.44%) | loss: 3.248295 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,099 | mfu: 50.69 | epoch: 1 | total time: 6.14m | eta: 175.2m +step 00576/16704 (3.45%) | loss: 3.260127 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,058 | mfu: 50.63 | epoch: 1 | total time: 6.15m | eta: 175.2m +step 00577/16704 (3.45%) | loss: 3.254963 | lrm: 1.00 | dt: 647.96ms | tok/sec: 809,141 | mfu: 50.57 | epoch: 1 | total time: 6.16m | eta: 175.2m +step 00578/16704 (3.46%) | loss: 3.252078 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,244 | mfu: 50.64 | epoch: 1 | total time: 6.17m | eta: 175.2m +step 00579/16704 (3.47%) | loss: 3.247691 | lrm: 1.00 | dt: 647.79ms | tok/sec: 809,343 | mfu: 50.59 | epoch: 1 | total time: 6.18m | eta: 175.2m +step 00580/16704 (3.47%) | loss: 3.241136 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,861 | mfu: 50.62 | epoch: 1 | total time: 6.19m | eta: 175.2m +step 00581/16704 (3.48%) | loss: 3.238929 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,898 | mfu: 50.62 | epoch: 1 | total time: 6.20m | eta: 175.2m +step 00582/16704 (3.48%) | loss: 3.240145 | lrm: 1.00 | dt: 647.53ms | tok/sec: 809,679 | mfu: 50.61 | epoch: 1 | total time: 6.21m | eta: 175.1m +step 00583/16704 (3.49%) | loss: 3.230872 | lrm: 1.00 | dt: 650.11ms | tok/sec: 806,458 | mfu: 50.40 | epoch: 1 | total time: 6.22m | eta: 175.1m +step 00584/16704 (3.50%) | loss: 3.233322 | lrm: 1.00 | dt: 648.98ms | tok/sec: 807,868 | mfu: 50.49 | epoch: 1 | 
total time: 6.24m | eta: 175.1m +step 00585/16704 (3.50%) | loss: 3.228367 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,102 | mfu: 50.70 | epoch: 1 | total time: 6.25m | eta: 175.1m +step 00586/16704 (3.51%) | loss: 3.224584 | lrm: 1.00 | dt: 648.78ms | tok/sec: 808,109 | mfu: 50.51 | epoch: 1 | total time: 6.26m | eta: 175.1m +step 00587/16704 (3.51%) | loss: 3.211843 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,856 | mfu: 50.68 | epoch: 1 | total time: 6.27m | eta: 175.1m +step 00588/16704 (3.52%) | loss: 3.209011 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,075 | mfu: 50.57 | epoch: 1 | total time: 6.28m | eta: 175.1m +step 00589/16704 (3.53%) | loss: 3.218279 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,488 | mfu: 50.59 | epoch: 1 | total time: 6.29m | eta: 175.1m +step 00590/16704 (3.53%) | loss: 3.215962 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,467 | mfu: 50.72 | epoch: 1 | total time: 6.30m | eta: 175.0m +step 00591/16704 (3.54%) | loss: 3.212970 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,749 | mfu: 50.61 | epoch: 1 | total time: 6.31m | eta: 175.0m +step 00592/16704 (3.54%) | loss: 3.211603 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,864 | mfu: 50.62 | epoch: 1 | total time: 6.32m | eta: 175.0m +step 00593/16704 (3.55%) | loss: 3.204739 | lrm: 1.00 | dt: 647.80ms | tok/sec: 809,331 | mfu: 50.58 | epoch: 1 | total time: 6.33m | eta: 175.0m +step 00594/16704 (3.56%) | loss: 3.189811 | lrm: 1.00 | dt: 648.53ms | tok/sec: 808,425 | mfu: 50.53 | epoch: 1 | total time: 6.34m | eta: 175.0m +step 00595/16704 (3.56%) | loss: 3.196621 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,081 | mfu: 50.63 | epoch: 1 | total time: 6.35m | eta: 175.0m +step 00596/16704 (3.57%) | loss: 3.182369 | lrm: 1.00 | dt: 648.36ms | tok/sec: 808,638 | mfu: 50.54 | epoch: 1 | total time: 6.37m | eta: 175.0m +step 00597/16704 (3.57%) | loss: 3.185698 | lrm: 1.00 | dt: 649.69ms | tok/sec: 806,983 | mfu: 50.44 | epoch: 1 | total time: 6.38m | eta: 175.0m +step 00598/16704 (3.58%) | loss: 3.195348 | lrm: 1.00 | 
dt: 646.06ms | tok/sec: 811,520 | mfu: 50.72 | epoch: 1 | total time: 6.39m | eta: 174.9m +step 00599/16704 (3.59%) | loss: 3.187959 | lrm: 1.00 | dt: 647.88ms | tok/sec: 809,240 | mfu: 50.58 | epoch: 1 | total time: 6.40m | eta: 174.9m +step 00600/16704 (3.59%) | loss: 3.180067 | lrm: 1.00 | dt: 648.65ms | tok/sec: 808,276 | mfu: 50.52 | epoch: 1 | total time: 6.41m | eta: 174.9m +step 00601/16704 (3.60%) | loss: 3.199819 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,663 | mfu: 50.67 | epoch: 1 | total time: 6.42m | eta: 174.9m +step 00602/16704 (3.60%) | loss: 3.195609 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,221 | mfu: 50.64 | epoch: 1 | total time: 6.43m | eta: 174.9m +step 00603/16704 (3.61%) | loss: 3.190243 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,826 | mfu: 50.74 | epoch: 1 | total time: 6.44m | eta: 174.9m +step 00604/16704 (3.62%) | loss: 3.195160 | lrm: 1.00 | dt: 648.40ms | tok/sec: 808,585 | mfu: 50.54 | epoch: 1 | total time: 6.45m | eta: 174.9m +step 00605/16704 (3.62%) | loss: 3.188531 | lrm: 1.00 | dt: 649.81ms | tok/sec: 806,829 | mfu: 50.43 | epoch: 1 | total time: 6.46m | eta: 174.9m +step 00606/16704 (3.63%) | loss: 3.190575 | lrm: 1.00 | dt: 647.77ms | tok/sec: 809,369 | mfu: 50.59 | epoch: 1 | total time: 6.47m | eta: 174.8m +step 00607/16704 (3.63%) | loss: 3.194877 | lrm: 1.00 | dt: 648.46ms | tok/sec: 808,509 | mfu: 50.53 | epoch: 1 | total time: 6.48m | eta: 174.8m +step 00608/16704 (3.64%) | loss: 3.190812 | lrm: 1.00 | dt: 647.56ms | tok/sec: 809,633 | mfu: 50.60 | epoch: 1 | total time: 6.49m | eta: 174.8m +step 00609/16704 (3.65%) | loss: 3.182731 | lrm: 1.00 | dt: 648.71ms | tok/sec: 808,205 | mfu: 50.51 | epoch: 1 | total time: 6.51m | eta: 174.8m +step 00610/16704 (3.65%) | loss: 3.188514 | lrm: 1.00 | dt: 647.23ms | tok/sec: 810,047 | mfu: 50.63 | epoch: 1 | total time: 6.52m | eta: 174.8m +step 00611/16704 (3.66%) | loss: 3.183394 | lrm: 1.00 | dt: 649.32ms | tok/sec: 807,436 | mfu: 50.47 | epoch: 1 | total time: 6.53m | eta: 
174.8m +step 00612/16704 (3.66%) | loss: 3.194653 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,213 | mfu: 50.64 | epoch: 1 | total time: 6.54m | eta: 174.8m +step 00613/16704 (3.67%) | loss: 3.190479 | lrm: 1.00 | dt: 648.08ms | tok/sec: 808,980 | mfu: 50.56 | epoch: 1 | total time: 6.55m | eta: 174.8m +step 00614/16704 (3.68%) | loss: 3.202250 | lrm: 1.00 | dt: 650.15ms | tok/sec: 806,406 | mfu: 50.40 | epoch: 1 | total time: 6.56m | eta: 174.7m +step 00615/16704 (3.68%) | loss: 3.196456 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,580 | mfu: 50.66 | epoch: 1 | total time: 6.57m | eta: 174.7m +step 00616/16704 (3.69%) | loss: 3.190080 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,526 | mfu: 50.72 | epoch: 1 | total time: 6.58m | eta: 174.7m +step 00617/16704 (3.69%) | loss: 3.193947 | lrm: 1.00 | dt: 650.49ms | tok/sec: 805,988 | mfu: 50.38 | epoch: 1 | total time: 6.59m | eta: 174.7m +step 00618/16704 (3.70%) | loss: 3.207541 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,029 | mfu: 50.75 | epoch: 1 | total time: 6.60m | eta: 174.7m +step 00619/16704 (3.71%) | loss: 3.203772 | lrm: 1.00 | dt: 650.74ms | tok/sec: 805,685 | mfu: 50.36 | epoch: 1 | total time: 6.61m | eta: 174.7m +step 00620/16704 (3.71%) | loss: 3.201644 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,798 | mfu: 50.74 | epoch: 1 | total time: 6.62m | eta: 174.7m +step 00621/16704 (3.72%) | loss: 3.210214 | lrm: 1.00 | dt: 649.96ms | tok/sec: 806,643 | mfu: 50.42 | epoch: 1 | total time: 6.64m | eta: 174.7m +step 00622/16704 (3.72%) | loss: 3.201194 | lrm: 1.00 | dt: 648.53ms | tok/sec: 808,428 | mfu: 50.53 | epoch: 1 | total time: 6.65m | eta: 174.6m +step 00623/16704 (3.73%) | loss: 3.206693 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,130 | mfu: 50.70 | epoch: 1 | total time: 6.66m | eta: 174.6m +step 00624/16704 (3.74%) | loss: 3.195790 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,020 | mfu: 50.69 | epoch: 1 | total time: 6.67m | eta: 174.6m +step 00625/16704 (3.74%) | loss: 3.203835 | lrm: 1.00 | dt: 646.71ms | tok/sec: 
810,698 | mfu: 50.67 | epoch: 1 | total time: 6.68m | eta: 174.6m +step 00626/16704 (3.75%) | loss: 3.203079 | lrm: 1.00 | dt: 650.85ms | tok/sec: 805,538 | mfu: 50.35 | epoch: 1 | total time: 6.69m | eta: 174.6m +step 00627/16704 (3.75%) | loss: 3.201887 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,947 | mfu: 50.62 | epoch: 1 | total time: 6.70m | eta: 174.6m +step 00628/16704 (3.76%) | loss: 3.213550 | lrm: 1.00 | dt: 651.50ms | tok/sec: 804,742 | mfu: 50.30 | epoch: 1 | total time: 6.71m | eta: 174.6m +step 00629/16704 (3.77%) | loss: 3.213405 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,859 | mfu: 50.74 | epoch: 1 | total time: 6.72m | eta: 174.6m +step 00630/16704 (3.77%) | loss: 3.219790 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,219 | mfu: 50.64 | epoch: 1 | total time: 6.73m | eta: 174.5m +step 00631/16704 (3.78%) | loss: 3.235478 | lrm: 1.00 | dt: 648.85ms | tok/sec: 808,025 | mfu: 50.50 | epoch: 1 | total time: 6.74m | eta: 174.5m +step 00632/16704 (3.78%) | loss: 3.232812 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,095 | mfu: 50.76 | epoch: 1 | total time: 6.75m | eta: 174.5m +step 00633/16704 (3.79%) | loss: 3.228479 | lrm: 1.00 | dt: 650.00ms | tok/sec: 806,600 | mfu: 50.41 | epoch: 1 | total time: 6.76m | eta: 174.5m +step 00634/16704 (3.80%) | loss: 3.217470 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,435 | mfu: 50.65 | epoch: 1 | total time: 6.78m | eta: 174.5m +step 00635/16704 (3.80%) | loss: 3.215663 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,315 | mfu: 50.71 | epoch: 1 | total time: 6.79m | eta: 174.5m +step 00636/16704 (3.81%) | loss: 3.205882 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,821 | mfu: 50.55 | epoch: 1 | total time: 6.80m | eta: 174.5m +step 00637/16704 (3.81%) | loss: 3.201202 | lrm: 1.00 | dt: 647.97ms | tok/sec: 809,124 | mfu: 50.57 | epoch: 1 | total time: 6.81m | eta: 174.5m +step 00638/16704 (3.82%) | loss: 3.186205 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,462 | mfu: 50.59 | epoch: 1 | total time: 6.82m | eta: 174.4m +step 00639/16704 
(3.83%) | loss: 3.197440 | lrm: 1.00 | dt: 651.10ms | tok/sec: 805,235 | mfu: 50.33 | epoch: 1 | total time: 6.83m | eta: 174.4m +step 00640/16704 (3.83%) | loss: 3.194001 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,558 | mfu: 50.72 | epoch: 1 | total time: 6.84m | eta: 174.4m +step 00641/16704 (3.84%) | loss: 3.187269 | lrm: 1.00 | dt: 648.66ms | tok/sec: 808,261 | mfu: 50.52 | epoch: 1 | total time: 6.85m | eta: 174.4m +step 00642/16704 (3.84%) | loss: 3.194212 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,565 | mfu: 50.72 | epoch: 1 | total time: 6.86m | eta: 174.4m +step 00643/16704 (3.85%) | loss: 3.193852 | lrm: 1.00 | dt: 647.96ms | tok/sec: 809,132 | mfu: 50.57 | epoch: 1 | total time: 6.87m | eta: 174.4m +step 00644/16704 (3.86%) | loss: 3.194953 | lrm: 1.00 | dt: 646.95ms | tok/sec: 810,401 | mfu: 50.65 | epoch: 1 | total time: 6.88m | eta: 174.4m +step 00645/16704 (3.86%) | loss: 3.218102 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,415 | mfu: 50.71 | epoch: 1 | total time: 6.89m | eta: 174.4m +step 00646/16704 (3.87%) | loss: 3.233113 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,951 | mfu: 50.62 | epoch: 1 | total time: 6.91m | eta: 174.3m +step 00647/16704 (3.87%) | loss: 3.235612 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,575 | mfu: 50.60 | epoch: 1 | total time: 6.92m | eta: 174.3m +step 00648/16704 (3.88%) | loss: 3.229070 | lrm: 1.00 | dt: 649.40ms | tok/sec: 807,344 | mfu: 50.46 | epoch: 1 | total time: 6.93m | eta: 174.3m +step 00649/16704 (3.89%) | loss: 3.227032 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 6.94m | eta: 174.3m +step 00650/16704 (3.89%) | loss: 3.221599 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,285 | mfu: 50.64 | epoch: 1 | total time: 6.95m | eta: 174.3m +step 00651/16704 (3.90%) | loss: 3.220910 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,692 | mfu: 50.73 | epoch: 1 | total time: 6.96m | eta: 174.3m +step 00652/16704 (3.90%) | loss: 3.222762 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,542 | mfu: 50.66 | 
epoch: 1 | total time: 6.97m | eta: 174.3m +step 00653/16704 (3.91%) | loss: 3.219239 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,397 | mfu: 50.78 | epoch: 1 | total time: 6.98m | eta: 174.3m +step 00654/16704 (3.92%) | loss: 3.218235 | lrm: 1.00 | dt: 649.27ms | tok/sec: 807,509 | mfu: 50.47 | epoch: 1 | total time: 6.99m | eta: 174.2m +step 00655/16704 (3.92%) | loss: 3.209890 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,383 | mfu: 50.71 | epoch: 1 | total time: 7.00m | eta: 174.2m +step 00656/16704 (3.93%) | loss: 3.214345 | lrm: 1.00 | dt: 647.77ms | tok/sec: 809,379 | mfu: 50.59 | epoch: 1 | total time: 7.01m | eta: 174.2m +step 00657/16704 (3.93%) | loss: 3.211312 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,927 | mfu: 50.68 | epoch: 1 | total time: 7.02m | eta: 174.2m +step 00658/16704 (3.94%) | loss: 3.204143 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,486 | mfu: 50.78 | epoch: 1 | total time: 7.03m | eta: 174.2m +step 00659/16704 (3.95%) | loss: 3.202233 | lrm: 1.00 | dt: 647.44ms | tok/sec: 809,781 | mfu: 50.61 | epoch: 1 | total time: 7.05m | eta: 174.2m +step 00660/16704 (3.95%) | loss: 3.211810 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,663 | mfu: 50.61 | epoch: 1 | total time: 7.06m | eta: 174.2m +step 00661/16704 (3.96%) | loss: 3.201414 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,091 | mfu: 50.69 | epoch: 1 | total time: 7.07m | eta: 174.2m +step 00662/16704 (3.96%) | loss: 3.199981 | lrm: 1.00 | dt: 648.73ms | tok/sec: 808,179 | mfu: 50.51 | epoch: 1 | total time: 7.08m | eta: 174.1m +step 00663/16704 (3.97%) | loss: 3.198613 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,084 | mfu: 50.69 | epoch: 1 | total time: 7.09m | eta: 174.1m +step 00664/16704 (3.98%) | loss: 3.196192 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,346 | mfu: 50.77 | epoch: 1 | total time: 7.10m | eta: 174.1m +step 00665/16704 (3.98%) | loss: 3.197671 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,924 | mfu: 50.62 | epoch: 1 | total time: 7.11m | eta: 174.1m +step 00666/16704 (3.99%) | loss: 3.196813 | 
lrm: 1.00 | dt: 646.71ms | tok/sec: 810,701 | mfu: 50.67 | epoch: 1 | total time: 7.12m | eta: 174.1m +step 00667/16704 (3.99%) | loss: 3.181594 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,318 | mfu: 50.71 | epoch: 1 | total time: 7.13m | eta: 174.1m +step 00668/16704 (4.00%) | loss: 3.177729 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 1 | total time: 7.14m | eta: 174.1m +step 00669/16704 (4.01%) | loss: 3.185510 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,279 | mfu: 50.64 | epoch: 1 | total time: 7.15m | eta: 174.1m +step 00670/16704 (4.01%) | loss: 3.182716 | lrm: 1.00 | dt: 648.98ms | tok/sec: 807,866 | mfu: 50.49 | epoch: 1 | total time: 7.16m | eta: 174.0m +step 00671/16704 (4.02%) | loss: 3.176738 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,627 | mfu: 50.85 | epoch: 1 | total time: 7.17m | eta: 174.0m +step 00672/16704 (4.02%) | loss: 3.160661 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,811 | mfu: 50.61 | epoch: 1 | total time: 7.19m | eta: 174.0m +step 00673/16704 (4.03%) | loss: 3.152090 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,754 | mfu: 50.61 | epoch: 1 | total time: 7.20m | eta: 174.0m +step 00674/16704 (4.03%) | loss: 3.167319 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,953 | mfu: 50.81 | epoch: 1 | total time: 7.21m | eta: 174.0m +step 00675/16704 (4.04%) | loss: 3.164704 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,676 | mfu: 50.67 | epoch: 1 | total time: 7.22m | eta: 174.0m +step 00676/16704 (4.05%) | loss: 3.155893 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 1 | total time: 7.23m | eta: 174.0m +step 00677/16704 (4.05%) | loss: 3.146666 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,787 | mfu: 50.68 | epoch: 1 | total time: 7.24m | eta: 173.9m +step 00678/16704 (4.06%) | loss: 3.154961 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,699 | mfu: 50.67 | epoch: 1 | total time: 7.25m | eta: 173.9m +step 00679/16704 (4.06%) | loss: 3.149253 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,103 | mfu: 50.88 | epoch: 1 | total time: 7.26m 
| eta: 173.9m +step 00680/16704 (4.07%) | loss: 3.155288 | lrm: 1.00 | dt: 647.88ms | tok/sec: 809,235 | mfu: 50.58 | epoch: 1 | total time: 7.27m | eta: 173.9m +step 00681/16704 (4.08%) | loss: 3.142732 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,163 | mfu: 50.70 | epoch: 1 | total time: 7.28m | eta: 173.9m +step 00682/16704 (4.08%) | loss: 3.129332 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,010 | mfu: 50.69 | epoch: 1 | total time: 7.29m | eta: 173.9m +step 00683/16704 (4.09%) | loss: 3.126952 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,981 | mfu: 50.81 | epoch: 1 | total time: 7.30m | eta: 173.9m +step 00684/16704 (4.09%) | loss: 3.121220 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,521 | mfu: 50.72 | epoch: 1 | total time: 7.31m | eta: 173.9m +step 00685/16704 (4.10%) | loss: 3.134124 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,135 | mfu: 50.76 | epoch: 1 | total time: 7.33m | eta: 173.8m +step 00686/16704 (4.11%) | loss: 3.148702 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,448 | mfu: 50.78 | epoch: 1 | total time: 7.34m | eta: 173.8m +step 00687/16704 (4.11%) | loss: 3.139153 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,651 | mfu: 50.67 | epoch: 1 | total time: 7.35m | eta: 173.8m +step 00688/16704 (4.12%) | loss: 3.132016 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,978 | mfu: 50.75 | epoch: 1 | total time: 7.36m | eta: 173.8m +step 00689/16704 (4.12%) | loss: 3.142928 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,053 | mfu: 50.75 | epoch: 1 | total time: 7.37m | eta: 173.8m +step 00690/16704 (4.13%) | loss: 3.144460 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,147 | mfu: 50.70 | epoch: 1 | total time: 7.38m | eta: 173.8m +step 00691/16704 (4.14%) | loss: 3.148158 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,321 | mfu: 50.77 | epoch: 1 | total time: 7.39m | eta: 173.8m +step 00692/16704 (4.14%) | loss: 3.141031 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,911 | mfu: 50.56 | epoch: 1 | total time: 7.40m | eta: 173.8m +step 00693/16704 (4.15%) | loss: 3.143708 | lrm: 1.00 | dt: 646.99ms | 
tok/sec: 810,349 | mfu: 50.65 | epoch: 1 | total time: 7.41m | eta: 173.7m +step 00694/16704 (4.15%) | loss: 3.142154 | lrm: 1.00 | dt: 647.32ms | tok/sec: 809,935 | mfu: 50.62 | epoch: 1 | total time: 7.42m | eta: 173.7m +step 00695/16704 (4.16%) | loss: 3.142751 | lrm: 1.00 | dt: 648.83ms | tok/sec: 808,056 | mfu: 50.50 | epoch: 1 | total time: 7.43m | eta: 173.7m +step 00696/16704 (4.17%) | loss: 3.151682 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,497 | mfu: 50.72 | epoch: 1 | total time: 7.44m | eta: 173.7m +step 00697/16704 (4.17%) | loss: 3.137228 | lrm: 1.00 | dt: 648.54ms | tok/sec: 808,412 | mfu: 50.53 | epoch: 1 | total time: 7.45m | eta: 173.7m +step 00698/16704 (4.18%) | loss: 3.124802 | lrm: 1.00 | dt: 647.72ms | tok/sec: 809,442 | mfu: 50.59 | epoch: 1 | total time: 7.47m | eta: 173.7m +step 00699/16704 (4.18%) | loss: 3.131623 | lrm: 1.00 | dt: 647.71ms | tok/sec: 809,451 | mfu: 50.59 | epoch: 1 | total time: 7.48m | eta: 173.7m +step 00700/16704 (4.19%) | loss: 3.129778 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,349 | mfu: 50.71 | epoch: 1 | total time: 7.49m | eta: 173.7m +step 00701/16704 (4.20%) | loss: 3.129936 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,331 | mfu: 50.65 | epoch: 1 | total time: 7.50m | eta: 173.6m +step 00702/16704 (4.20%) | loss: 3.148658 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,057 | mfu: 50.63 | epoch: 1 | total time: 7.51m | eta: 173.6m +step 00703/16704 (4.21%) | loss: 3.162405 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,090 | mfu: 50.69 | epoch: 1 | total time: 7.52m | eta: 173.6m +step 00704/16704 (4.21%) | loss: 3.157443 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,098 | mfu: 50.63 | epoch: 1 | total time: 7.53m | eta: 173.6m +step 00705/16704 (4.22%) | loss: 3.150374 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,429 | mfu: 50.65 | epoch: 1 | total time: 7.54m | eta: 173.6m +step 00706/16704 (4.23%) | loss: 3.139335 | lrm: 1.00 | dt: 649.01ms | tok/sec: 807,829 | mfu: 50.49 | epoch: 1 | total time: 7.55m | eta: 173.6m +step 
00707/16704 (4.23%) | loss: 3.133841 | lrm: 1.00 | dt: 647.77ms | tok/sec: 809,376 | mfu: 50.59 | epoch: 1 | total time: 7.56m | eta: 173.6m +step 00708/16704 (4.24%) | loss: 3.140450 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,755 | mfu: 50.74 | epoch: 1 | total time: 7.57m | eta: 173.6m +step 00709/16704 (4.24%) | loss: 3.144311 | lrm: 1.00 | dt: 648.35ms | tok/sec: 808,646 | mfu: 50.54 | epoch: 1 | total time: 7.58m | eta: 173.5m +step 00710/16704 (4.25%) | loss: 3.153190 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,197 | mfu: 50.58 | epoch: 1 | total time: 7.59m | eta: 173.5m +step 00711/16704 (4.26%) | loss: 3.160141 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,622 | mfu: 50.67 | epoch: 1 | total time: 7.61m | eta: 173.5m +step 00712/16704 (4.26%) | loss: 3.167589 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,076 | mfu: 50.69 | epoch: 1 | total time: 7.62m | eta: 173.5m +step 00713/16704 (4.27%) | loss: 3.160621 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,952 | mfu: 50.69 | epoch: 1 | total time: 7.63m | eta: 173.5m +step 00714/16704 (4.27%) | loss: 3.143116 | lrm: 1.00 | dt: 647.37ms | tok/sec: 809,871 | mfu: 50.62 | epoch: 1 | total time: 7.64m | eta: 173.5m +step 00715/16704 (4.28%) | loss: 3.139921 | lrm: 1.00 | dt: 647.23ms | tok/sec: 810,049 | mfu: 50.63 | epoch: 1 | total time: 7.65m | eta: 173.5m +step 00716/16704 (4.29%) | loss: 3.146029 | lrm: 1.00 | dt: 648.18ms | tok/sec: 808,861 | mfu: 50.56 | epoch: 1 | total time: 7.66m | eta: 173.5m +step 00717/16704 (4.29%) | loss: 3.137419 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,195 | mfu: 50.70 | epoch: 1 | total time: 7.67m | eta: 173.4m +step 00718/16704 (4.30%) | loss: 3.125747 | lrm: 1.00 | dt: 649.63ms | tok/sec: 807,053 | mfu: 50.44 | epoch: 1 | total time: 7.68m | eta: 173.4m +step 00719/16704 (4.30%) | loss: 3.127761 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,131 | mfu: 50.76 | epoch: 1 | total time: 7.69m | eta: 173.4m +step 00720/16704 (4.31%) | loss: 3.118073 | lrm: 1.00 | dt: 648.74ms | tok/sec: 808,162 | 
mfu: 50.51 | epoch: 1 | total time: 7.70m | eta: 173.4m +step 00721/16704 (4.32%) | loss: 3.108423 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,626 | mfu: 50.67 | epoch: 1 | total time: 7.71m | eta: 173.4m +step 00722/16704 (4.32%) | loss: 3.109000 | lrm: 1.00 | dt: 648.20ms | tok/sec: 808,840 | mfu: 50.55 | epoch: 1 | total time: 7.72m | eta: 173.4m +step 00723/16704 (4.33%) | loss: 3.106631 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,330 | mfu: 50.65 | epoch: 1 | total time: 7.74m | eta: 173.4m +step 00724/16704 (4.33%) | loss: 3.113365 | lrm: 1.00 | dt: 647.95ms | tok/sec: 809,154 | mfu: 50.57 | epoch: 1 | total time: 7.75m | eta: 173.4m +step 00725/16704 (4.34%) | loss: 3.124273 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,760 | mfu: 50.61 | epoch: 1 | total time: 7.76m | eta: 173.4m +step 00726/16704 (4.35%) | loss: 3.127658 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,325 | mfu: 50.65 | epoch: 1 | total time: 7.77m | eta: 173.3m +step 00727/16704 (4.35%) | loss: 3.121125 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,778 | mfu: 50.67 | epoch: 1 | total time: 7.78m | eta: 173.3m +step 00728/16704 (4.36%) | loss: 3.139133 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,525 | mfu: 50.78 | epoch: 1 | total time: 7.79m | eta: 173.3m +step 00729/16704 (4.36%) | loss: 3.141294 | lrm: 1.00 | dt: 649.22ms | tok/sec: 807,567 | mfu: 50.47 | epoch: 1 | total time: 7.80m | eta: 173.3m +step 00730/16704 (4.37%) | loss: 3.138773 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,943 | mfu: 50.75 | epoch: 1 | total time: 7.81m | eta: 173.3m +step 00731/16704 (4.38%) | loss: 3.143451 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,774 | mfu: 50.55 | epoch: 1 | total time: 7.82m | eta: 173.3m +step 00732/16704 (4.38%) | loss: 3.152030 | lrm: 1.00 | dt: 649.17ms | tok/sec: 807,625 | mfu: 50.48 | epoch: 1 | total time: 7.83m | eta: 173.3m +step 00733/16704 (4.39%) | loss: 3.150714 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,874 | mfu: 50.68 | epoch: 1 | total time: 7.84m | eta: 173.3m +step 00734/16704 (4.39%) | 
loss: 3.147400 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,967 | mfu: 50.81 | epoch: 1 | total time: 7.85m | eta: 173.2m +step 00735/16704 (4.40%) | loss: 3.140365 | lrm: 1.00 | dt: 648.70ms | tok/sec: 808,218 | mfu: 50.51 | epoch: 1 | total time: 7.86m | eta: 173.2m +step 00736/16704 (4.41%) | loss: 3.147104 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,632 | mfu: 50.73 | epoch: 1 | total time: 7.88m | eta: 173.2m +step 00737/16704 (4.41%) | loss: 3.146719 | lrm: 1.00 | dt: 647.52ms | tok/sec: 809,681 | mfu: 50.61 | epoch: 1 | total time: 7.89m | eta: 173.2m +step 00738/16704 (4.42%) | loss: 3.142103 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,485 | mfu: 50.59 | epoch: 1 | total time: 7.90m | eta: 173.2m +step 00739/16704 (4.42%) | loss: 3.139209 | lrm: 1.00 | dt: 646.63ms | tok/sec: 810,805 | mfu: 50.68 | epoch: 1 | total time: 7.91m | eta: 173.2m +step 00740/16704 (4.43%) | loss: 3.132768 | lrm: 1.00 | dt: 648.39ms | tok/sec: 808,594 | mfu: 50.54 | epoch: 1 | total time: 7.92m | eta: 173.2m +step 00741/16704 (4.44%) | loss: 3.135733 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,581 | mfu: 50.79 | epoch: 1 | total time: 7.93m | eta: 173.2m +step 00742/16704 (4.44%) | loss: 3.145469 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,928 | mfu: 50.62 | epoch: 1 | total time: 7.94m | eta: 173.1m +step 00743/16704 (4.45%) | loss: 3.136552 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,923 | mfu: 50.62 | epoch: 1 | total time: 7.95m | eta: 173.1m +step 00744/16704 (4.45%) | loss: 3.141003 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,926 | mfu: 50.68 | epoch: 1 | total time: 7.96m | eta: 173.1m +step 00745/16704 (4.46%) | loss: 3.148636 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,081 | mfu: 50.76 | epoch: 1 | total time: 7.97m | eta: 173.1m +step 00746/16704 (4.47%) | loss: 3.137879 | lrm: 1.00 | dt: 648.60ms | tok/sec: 808,335 | mfu: 50.52 | epoch: 1 | total time: 7.98m | eta: 173.1m +step 00747/16704 (4.47%) | loss: 3.131004 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,579 | mfu: 50.85 | epoch: 1 | 
total time: 7.99m | eta: 173.1m +step 00748/16704 (4.48%) | loss: 3.132282 | lrm: 1.00 | dt: 650.12ms | tok/sec: 806,448 | mfu: 50.40 | epoch: 1 | total time: 8.00m | eta: 173.1m +step 00749/16704 (4.48%) | loss: 3.135425 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 1 | total time: 8.02m | eta: 173.1m +Step 00750 | Validation bpb: 0.952577 +step 00750/16704 (4.49%) | loss: 3.128532 | lrm: 1.00 | dt: 650.32ms | tok/sec: 806,200 | mfu: 50.39 | epoch: 1 | total time: 8.03m | eta: 173.0m +step 00751/16704 (4.50%) | loss: 3.123613 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,819 | mfu: 50.68 | epoch: 1 | total time: 8.04m | eta: 173.0m +step 00752/16704 (4.50%) | loss: 3.139840 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,607 | mfu: 50.60 | epoch: 1 | total time: 8.05m | eta: 173.0m +step 00753/16704 (4.51%) | loss: 3.140548 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 1 | total time: 8.06m | eta: 173.0m +step 00754/16704 (4.51%) | loss: 3.146631 | lrm: 1.00 | dt: 649.83ms | tok/sec: 806,811 | mfu: 50.43 | epoch: 1 | total time: 8.07m | eta: 173.0m +step 00755/16704 (4.52%) | loss: 3.145682 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,404 | mfu: 50.71 | epoch: 1 | total time: 8.08m | eta: 173.0m +step 00756/16704 (4.53%) | loss: 3.139465 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,810 | mfu: 50.61 | epoch: 1 | total time: 8.09m | eta: 173.0m +step 00757/16704 (4.53%) | loss: 3.127534 | lrm: 1.00 | dt: 647.37ms | tok/sec: 809,870 | mfu: 50.62 | epoch: 1 | total time: 8.10m | eta: 173.0m +step 00758/16704 (4.54%) | loss: 3.143287 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 1 | total time: 8.11m | eta: 172.9m +step 00759/16704 (4.54%) | loss: 3.146083 | lrm: 1.00 | dt: 649.64ms | tok/sec: 807,050 | mfu: 50.44 | epoch: 1 | total time: 8.12m | eta: 172.9m +step 00760/16704 (4.55%) | loss: 3.153311 | lrm: 1.00 | dt: 647.59ms | tok/sec: 809,592 | mfu: 50.60 | epoch: 1 | total time: 8.13m | eta: 172.9m +step 00761/16704 
(4.56%) | loss: 3.143095 | lrm: 1.00 | dt: 650.42ms | tok/sec: 806,070 | mfu: 50.38 | epoch: 1 | total time: 8.15m | eta: 172.9m +step 00762/16704 (4.56%) | loss: 3.135843 | lrm: 1.00 | dt: 649.57ms | tok/sec: 807,134 | mfu: 50.45 | epoch: 1 | total time: 8.16m | eta: 172.9m +step 00763/16704 (4.57%) | loss: 3.125364 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,089 | mfu: 50.63 | epoch: 1 | total time: 8.17m | eta: 172.9m +step 00764/16704 (4.57%) | loss: 3.122786 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,875 | mfu: 50.68 | epoch: 1 | total time: 8.18m | eta: 172.9m +step 00765/16704 (4.58%) | loss: 3.110634 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,771 | mfu: 50.67 | epoch: 1 | total time: 8.19m | eta: 172.9m +step 00766/16704 (4.59%) | loss: 3.102615 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,586 | mfu: 50.85 | epoch: 1 | total time: 8.20m | eta: 172.9m +step 00767/16704 (4.59%) | loss: 3.095866 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,675 | mfu: 50.73 | epoch: 1 | total time: 8.21m | eta: 172.8m +step 00768/16704 (4.60%) | loss: 3.101754 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,686 | mfu: 50.67 | epoch: 1 | total time: 8.22m | eta: 172.8m +step 00769/16704 (4.60%) | loss: 3.115546 | lrm: 1.00 | dt: 648.86ms | tok/sec: 808,012 | mfu: 50.50 | epoch: 1 | total time: 8.23m | eta: 172.8m +step 00770/16704 (4.61%) | loss: 3.123077 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,177 | mfu: 50.89 | epoch: 1 | total time: 8.24m | eta: 172.8m +step 00771/16704 (4.62%) | loss: 3.115885 | lrm: 1.00 | dt: 649.52ms | tok/sec: 807,186 | mfu: 50.45 | epoch: 1 | total time: 8.25m | eta: 172.8m +step 00772/16704 (4.62%) | loss: 3.117575 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,966 | mfu: 50.75 | epoch: 1 | total time: 8.26m | eta: 172.8m +step 00773/16704 (4.63%) | loss: 3.118849 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,651 | mfu: 50.73 | epoch: 1 | total time: 8.27m | eta: 172.8m +step 00774/16704 (4.63%) | loss: 3.132344 | lrm: 1.00 | dt: 648.18ms | tok/sec: 808,859 | mfu: 50.55 | 
epoch: 1 | total time: 8.29m | eta: 172.8m +step 00775/16704 (4.64%) | loss: 3.138810 | lrm: 1.00 | dt: 648.40ms | tok/sec: 808,585 | mfu: 50.54 | epoch: 1 | total time: 8.30m | eta: 172.7m +step 00776/16704 (4.65%) | loss: 3.128614 | lrm: 1.00 | dt: 648.24ms | tok/sec: 808,781 | mfu: 50.55 | epoch: 1 | total time: 8.31m | eta: 172.7m +step 00777/16704 (4.65%) | loss: 3.142385 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,489 | mfu: 50.72 | epoch: 1 | total time: 8.32m | eta: 172.7m +step 00778/16704 (4.66%) | loss: 3.141584 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,154 | mfu: 50.70 | epoch: 1 | total time: 8.33m | eta: 172.7m +step 00779/16704 (4.66%) | loss: 3.144516 | lrm: 1.00 | dt: 646.76ms | tok/sec: 810,642 | mfu: 50.67 | epoch: 1 | total time: 8.34m | eta: 172.7m +step 00780/16704 (4.67%) | loss: 3.156329 | lrm: 1.00 | dt: 647.49ms | tok/sec: 809,729 | mfu: 50.61 | epoch: 1 | total time: 8.35m | eta: 172.7m +step 00781/16704 (4.68%) | loss: 3.145901 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,712 | mfu: 50.73 | epoch: 1 | total time: 8.36m | eta: 172.7m +step 00782/16704 (4.68%) | loss: 3.137895 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,171 | mfu: 50.64 | epoch: 1 | total time: 8.37m | eta: 172.7m +step 00783/16704 (4.69%) | loss: 3.137655 | lrm: 1.00 | dt: 647.12ms | tok/sec: 810,182 | mfu: 50.64 | epoch: 1 | total time: 8.38m | eta: 172.6m +step 00784/16704 (4.69%) | loss: 3.131117 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,000 | mfu: 50.75 | epoch: 1 | total time: 8.39m | eta: 172.6m +step 00785/16704 (4.70%) | loss: 3.125630 | lrm: 1.00 | dt: 648.78ms | tok/sec: 808,109 | mfu: 50.51 | epoch: 1 | total time: 8.40m | eta: 172.6m +step 00786/16704 (4.71%) | loss: 3.111886 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 1 | total time: 8.41m | eta: 172.6m +step 00787/16704 (4.71%) | loss: 3.101855 | lrm: 1.00 | dt: 647.96ms | tok/sec: 809,137 | mfu: 50.57 | epoch: 1 | total time: 8.43m | eta: 172.6m +step 00788/16704 (4.72%) | loss: 3.102329 | 
lrm: 1.00 | dt: 646.38ms | tok/sec: 811,113 | mfu: 50.70 | epoch: 1 | total time: 8.44m | eta: 172.6m +step 00789/16704 (4.72%) | loss: 3.118721 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,437 | mfu: 50.90 | epoch: 1 | total time: 8.45m | eta: 172.6m +step 00790/16704 (4.73%) | loss: 3.128019 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,597 | mfu: 50.66 | epoch: 1 | total time: 8.46m | eta: 172.6m +step 00791/16704 (4.74%) | loss: 3.121228 | lrm: 1.00 | dt: 648.38ms | tok/sec: 808,612 | mfu: 50.54 | epoch: 1 | total time: 8.47m | eta: 172.5m +step 00792/16704 (4.74%) | loss: 3.129482 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,936 | mfu: 50.81 | epoch: 1 | total time: 8.48m | eta: 172.5m +step 00793/16704 (4.75%) | loss: 3.130811 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,148 | mfu: 50.70 | epoch: 1 | total time: 8.49m | eta: 172.5m +step 00794/16704 (4.75%) | loss: 3.127800 | lrm: 1.00 | dt: 648.66ms | tok/sec: 808,268 | mfu: 50.52 | epoch: 1 | total time: 8.50m | eta: 172.5m +step 00795/16704 (4.76%) | loss: 3.128164 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,177 | mfu: 50.89 | epoch: 1 | total time: 8.51m | eta: 172.5m +step 00796/16704 (4.77%) | loss: 3.130658 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,091 | mfu: 50.63 | epoch: 1 | total time: 8.52m | eta: 172.5m +step 00797/16704 (4.77%) | loss: 3.144124 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,088 | mfu: 50.63 | epoch: 1 | total time: 8.53m | eta: 172.5m +step 00798/16704 (4.78%) | loss: 3.143861 | lrm: 1.00 | dt: 654.12ms | tok/sec: 801,515 | mfu: 50.10 | epoch: 1 | total time: 8.54m | eta: 172.5m +step 00799/16704 (4.78%) | loss: 3.136405 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,701 | mfu: 50.98 | epoch: 1 | total time: 8.55m | eta: 172.5m +step 00800/16704 (4.79%) | loss: 3.157201 | lrm: 1.00 | dt: 648.63ms | tok/sec: 808,296 | mfu: 50.52 | epoch: 1 | total time: 8.57m | eta: 172.4m +step 00801/16704 (4.80%) | loss: 3.141877 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,331 | mfu: 50.65 | epoch: 1 | total time: 8.58m 
| eta: 172.4m +step 00802/16704 (4.80%) | loss: 3.135504 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,200 | mfu: 50.83 | epoch: 1 | total time: 8.59m | eta: 172.4m +step 00803/16704 (4.81%) | loss: 3.137432 | lrm: 1.00 | dt: 649.52ms | tok/sec: 807,187 | mfu: 50.45 | epoch: 1 | total time: 8.60m | eta: 172.4m +step 00804/16704 (4.81%) | loss: 3.141728 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,522 | mfu: 50.85 | epoch: 1 | total time: 8.61m | eta: 172.4m +step 00805/16704 (4.82%) | loss: 3.148678 | lrm: 1.00 | dt: 648.61ms | tok/sec: 808,330 | mfu: 50.52 | epoch: 1 | total time: 8.62m | eta: 172.4m +step 00806/16704 (4.83%) | loss: 3.152720 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,603 | mfu: 50.66 | epoch: 1 | total time: 8.63m | eta: 172.4m +step 00807/16704 (4.83%) | loss: 3.159143 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 1 | total time: 8.64m | eta: 172.4m +step 00808/16704 (4.84%) | loss: 3.148649 | lrm: 1.00 | dt: 648.34ms | tok/sec: 808,657 | mfu: 50.54 | epoch: 1 | total time: 8.65m | eta: 172.3m +step 00809/16704 (4.84%) | loss: 3.155267 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,327 | mfu: 50.58 | epoch: 1 | total time: 8.66m | eta: 172.3m +step 00810/16704 (4.85%) | loss: 3.148032 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,204 | mfu: 50.70 | epoch: 1 | total time: 8.67m | eta: 172.3m +step 00811/16704 (4.86%) | loss: 3.137398 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,197 | mfu: 50.70 | epoch: 1 | total time: 8.68m | eta: 172.3m +step 00812/16704 (4.86%) | loss: 3.135539 | lrm: 1.00 | dt: 649.29ms | tok/sec: 807,478 | mfu: 50.47 | epoch: 1 | total time: 8.70m | eta: 172.3m +step 00813/16704 (4.87%) | loss: 3.126927 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,249 | mfu: 50.70 | epoch: 1 | total time: 8.71m | eta: 172.3m +step 00814/16704 (4.87%) | loss: 3.122359 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,563 | mfu: 50.66 | epoch: 1 | total time: 8.72m | eta: 172.3m +step 00815/16704 (4.88%) | loss: 3.111527 | lrm: 1.00 | dt: 646.26ms | 
tok/sec: 811,259 | mfu: 50.70 | epoch: 1 | total time: 8.73m | eta: 172.3m +step 00816/16704 (4.89%) | loss: 3.124027 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,055 | mfu: 50.63 | epoch: 1 | total time: 8.74m | eta: 172.2m +step 00817/16704 (4.89%) | loss: 3.123978 | lrm: 1.00 | dt: 648.90ms | tok/sec: 807,960 | mfu: 50.50 | epoch: 1 | total time: 8.75m | eta: 172.2m +step 00818/16704 (4.90%) | loss: 3.123750 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,262 | mfu: 50.71 | epoch: 1 | total time: 8.76m | eta: 172.2m +step 00819/16704 (4.90%) | loss: 3.118148 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,221 | mfu: 50.70 | epoch: 1 | total time: 8.77m | eta: 172.2m +step 00820/16704 (4.91%) | loss: 3.123624 | lrm: 1.00 | dt: 648.28ms | tok/sec: 808,730 | mfu: 50.55 | epoch: 1 | total time: 8.78m | eta: 172.2m +step 00821/16704 (4.91%) | loss: 3.130503 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,619 | mfu: 50.85 | epoch: 1 | total time: 8.79m | eta: 172.2m +step 00822/16704 (4.92%) | loss: 3.129223 | lrm: 1.00 | dt: 649.32ms | tok/sec: 807,437 | mfu: 50.47 | epoch: 1 | total time: 8.80m | eta: 172.2m +step 00823/16704 (4.93%) | loss: 3.131028 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,534 | mfu: 50.85 | epoch: 1 | total time: 8.81m | eta: 172.2m +step 00824/16704 (4.93%) | loss: 3.123264 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,703 | mfu: 50.67 | epoch: 1 | total time: 8.82m | eta: 172.2m +step 00825/16704 (4.94%) | loss: 3.120374 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,915 | mfu: 50.75 | epoch: 1 | total time: 8.84m | eta: 172.1m +step 00826/16704 (4.94%) | loss: 3.110383 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,387 | mfu: 50.84 | epoch: 1 | total time: 8.85m | eta: 172.1m +step 00827/16704 (4.95%) | loss: 3.105604 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,275 | mfu: 50.71 | epoch: 1 | total time: 8.86m | eta: 172.1m +step 00828/16704 (4.96%) | loss: 3.116593 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,241 | mfu: 50.83 | epoch: 1 | total time: 8.87m | eta: 172.1m +step 
00829/16704 (4.96%) | loss: 3.108299 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,524 | mfu: 50.72 | epoch: 1 | total time: 8.88m | eta: 172.1m +step 00830/16704 (4.97%) | loss: 3.104887 | lrm: 1.00 | dt: 647.39ms | tok/sec: 809,850 | mfu: 50.62 | epoch: 1 | total time: 8.89m | eta: 172.1m +step 00831/16704 (4.97%) | loss: 3.104189 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,427 | mfu: 50.78 | epoch: 1 | total time: 8.90m | eta: 172.1m +step 00832/16704 (4.98%) | loss: 3.108169 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,766 | mfu: 50.67 | epoch: 1 | total time: 8.91m | eta: 172.1m +step 00833/16704 (4.99%) | loss: 3.108916 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,290 | mfu: 50.77 | epoch: 1 | total time: 8.92m | eta: 172.0m +step 00834/16704 (4.99%) | loss: 3.102397 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,740 | mfu: 50.73 | epoch: 1 | total time: 8.93m | eta: 172.0m +step 00835/16704 (5.00%) | loss: 3.105880 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,343 | mfu: 50.71 | epoch: 1 | total time: 8.94m | eta: 172.0m +step 00836/16704 (5.00%) | loss: 3.103362 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,805 | mfu: 50.61 | epoch: 1 | total time: 8.95m | eta: 172.0m +step 00837/16704 (5.01%) | loss: 3.105536 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,005 | mfu: 50.81 | epoch: 1 | total time: 8.96m | eta: 172.0m +step 00838/16704 (5.02%) | loss: 3.104382 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,288 | mfu: 50.58 | epoch: 1 | total time: 8.98m | eta: 172.0m +step 00839/16704 (5.02%) | loss: 3.101601 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,664 | mfu: 50.79 | epoch: 1 | total time: 8.99m | eta: 172.0m +step 00840/16704 (5.03%) | loss: 3.118120 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,067 | mfu: 50.69 | epoch: 1 | total time: 9.00m | eta: 172.0m +step 00841/16704 (5.03%) | loss: 3.118835 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,666 | mfu: 50.67 | epoch: 1 | total time: 9.01m | eta: 171.9m +step 00842/16704 (5.04%) | loss: 3.116117 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,130 | 
mfu: 50.70 | epoch: 1 | total time: 9.02m | eta: 171.9m +step 00843/16704 (5.05%) | loss: 3.130341 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,840 | mfu: 50.74 | epoch: 1 | total time: 9.03m | eta: 171.9m +step 00844/16704 (5.05%) | loss: 3.123881 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,289 | mfu: 50.71 | epoch: 1 | total time: 9.04m | eta: 171.9m +step 00845/16704 (5.06%) | loss: 3.118111 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 9.05m | eta: 171.9m +step 00846/16704 (5.06%) | loss: 3.103503 | lrm: 1.00 | dt: 646.99ms | tok/sec: 810,354 | mfu: 50.65 | epoch: 1 | total time: 9.06m | eta: 171.9m +step 00847/16704 (5.07%) | loss: 3.102149 | lrm: 1.00 | dt: 648.50ms | tok/sec: 808,467 | mfu: 50.53 | epoch: 1 | total time: 9.07m | eta: 171.9m +step 00848/16704 (5.08%) | loss: 3.096460 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,876 | mfu: 50.81 | epoch: 1 | total time: 9.08m | eta: 171.9m +step 00849/16704 (5.08%) | loss: 3.087452 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 9.09m | eta: 171.8m +step 00850/16704 (5.09%) | loss: 3.072214 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,388 | mfu: 50.71 | epoch: 1 | total time: 9.10m | eta: 171.8m +step 00851/16704 (5.09%) | loss: 3.084540 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,156 | mfu: 50.76 | epoch: 1 | total time: 9.12m | eta: 171.8m +step 00852/16704 (5.10%) | loss: 3.094660 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,010 | mfu: 50.75 | epoch: 1 | total time: 9.13m | eta: 171.8m +step 00853/16704 (5.11%) | loss: 3.114067 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,032 | mfu: 50.63 | epoch: 1 | total time: 9.14m | eta: 171.8m +step 00854/16704 (5.11%) | loss: 3.107756 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 9.15m | eta: 171.8m +step 00855/16704 (5.12%) | loss: 3.103899 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,146 | mfu: 50.70 | epoch: 1 | total time: 9.16m | eta: 171.8m +step 00856/16704 (5.12%) | 
loss: 3.094437 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,647 | mfu: 50.92 | epoch: 1 | total time: 9.17m | eta: 171.8m +step 00857/16704 (5.13%) | loss: 3.083242 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,683 | mfu: 50.54 | epoch: 1 | total time: 9.18m | eta: 171.7m +step 00858/16704 (5.14%) | loss: 3.085661 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,780 | mfu: 50.67 | epoch: 1 | total time: 9.19m | eta: 171.7m +step 00859/16704 (5.14%) | loss: 3.094529 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,307 | mfu: 50.65 | epoch: 1 | total time: 9.20m | eta: 171.7m +step 00860/16704 (5.15%) | loss: 3.098565 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 1 | total time: 9.21m | eta: 171.7m +step 00861/16704 (5.15%) | loss: 3.101660 | lrm: 1.00 | dt: 648.33ms | tok/sec: 808,677 | mfu: 50.54 | epoch: 1 | total time: 9.22m | eta: 171.7m +step 00862/16704 (5.16%) | loss: 3.099395 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 1 | total time: 9.23m | eta: 171.7m +step 00863/16704 (5.17%) | loss: 3.105622 | lrm: 1.00 | dt: 649.91ms | tok/sec: 806,711 | mfu: 50.42 | epoch: 1 | total time: 9.24m | eta: 171.7m +step 00864/16704 (5.17%) | loss: 3.102600 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,392 | mfu: 50.78 | epoch: 1 | total time: 9.26m | eta: 171.7m +step 00865/16704 (5.18%) | loss: 3.102019 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,305 | mfu: 50.71 | epoch: 1 | total time: 9.27m | eta: 171.7m +step 00866/16704 (5.18%) | loss: 3.099530 | lrm: 1.00 | dt: 649.63ms | tok/sec: 807,056 | mfu: 50.44 | epoch: 1 | total time: 9.28m | eta: 171.6m +step 00867/16704 (5.19%) | loss: 3.085879 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,984 | mfu: 50.75 | epoch: 1 | total time: 9.29m | eta: 171.6m +step 00868/16704 (5.20%) | loss: 3.089055 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,249 | mfu: 50.77 | epoch: 1 | total time: 9.30m | eta: 171.6m +step 00869/16704 (5.20%) | loss: 3.079915 | lrm: 1.00 | dt: 647.40ms | tok/sec: 809,833 | mfu: 50.62 | epoch: 1 | 
total time: 9.31m | eta: 171.6m +step 00870/16704 (5.21%) | loss: 3.076238 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,516 | mfu: 50.97 | epoch: 1 | total time: 9.32m | eta: 171.6m +step 00871/16704 (5.21%) | loss: 3.072075 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,888 | mfu: 50.74 | epoch: 1 | total time: 9.33m | eta: 171.6m +step 00872/16704 (5.22%) | loss: 3.078343 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,188 | mfu: 50.89 | epoch: 1 | total time: 9.34m | eta: 171.6m +step 00873/16704 (5.23%) | loss: 3.081347 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,614 | mfu: 50.73 | epoch: 1 | total time: 9.35m | eta: 171.6m +step 00874/16704 (5.23%) | loss: 3.086771 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,944 | mfu: 50.50 | epoch: 1 | total time: 9.36m | eta: 171.5m +step 00875/16704 (5.24%) | loss: 3.087360 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,096 | mfu: 50.82 | epoch: 1 | total time: 9.37m | eta: 171.5m +step 00876/16704 (5.24%) | loss: 3.092625 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,740 | mfu: 50.61 | epoch: 1 | total time: 9.38m | eta: 171.5m +step 00877/16704 (5.25%) | loss: 3.084828 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,973 | mfu: 50.69 | epoch: 1 | total time: 9.40m | eta: 171.5m +step 00878/16704 (5.26%) | loss: 3.081386 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,309 | mfu: 50.65 | epoch: 1 | total time: 9.41m | eta: 171.5m +step 00879/16704 (5.26%) | loss: 3.080029 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,747 | mfu: 50.67 | epoch: 1 | total time: 9.42m | eta: 171.5m +step 00880/16704 (5.27%) | loss: 3.092788 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,497 | mfu: 50.84 | epoch: 1 | total time: 9.43m | eta: 171.5m +step 00881/16704 (5.27%) | loss: 3.092037 | lrm: 1.00 | dt: 649.36ms | tok/sec: 807,392 | mfu: 50.46 | epoch: 1 | total time: 9.44m | eta: 171.5m +step 00882/16704 (5.28%) | loss: 3.082397 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,507 | mfu: 50.66 | epoch: 1 | total time: 9.45m | eta: 171.5m +step 00883/16704 (5.29%) | loss: 3.083743 | lrm: 1.00 | 
dt: 645.92ms | tok/sec: 811,693 | mfu: 50.73 | epoch: 1 | total time: 9.46m | eta: 171.4m +step 00884/16704 (5.29%) | loss: 3.083760 | lrm: 1.00 | dt: 649.46ms | tok/sec: 807,271 | mfu: 50.46 | epoch: 1 | total time: 9.47m | eta: 171.4m +step 00885/16704 (5.30%) | loss: 3.083747 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 1 | total time: 9.48m | eta: 171.4m +step 00886/16704 (5.30%) | loss: 3.090620 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,495 | mfu: 50.78 | epoch: 1 | total time: 9.49m | eta: 171.4m +step 00887/16704 (5.31%) | loss: 3.079508 | lrm: 1.00 | dt: 647.30ms | tok/sec: 809,962 | mfu: 50.62 | epoch: 1 | total time: 9.50m | eta: 171.4m +step 00888/16704 (5.32%) | loss: 3.082150 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,876 | mfu: 50.81 | epoch: 1 | total time: 9.51m | eta: 171.4m +step 00889/16704 (5.32%) | loss: 3.075061 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,762 | mfu: 50.74 | epoch: 1 | total time: 9.52m | eta: 171.4m +step 00890/16704 (5.33%) | loss: 3.067735 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,448 | mfu: 50.72 | epoch: 1 | total time: 9.54m | eta: 171.4m +step 00891/16704 (5.33%) | loss: 3.057471 | lrm: 1.00 | dt: 648.22ms | tok/sec: 808,807 | mfu: 50.55 | epoch: 1 | total time: 9.55m | eta: 171.3m +step 00892/16704 (5.34%) | loss: 3.063834 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,663 | mfu: 50.73 | epoch: 1 | total time: 9.56m | eta: 171.3m +step 00893/16704 (5.35%) | loss: 3.067997 | lrm: 1.00 | dt: 648.03ms | tok/sec: 809,049 | mfu: 50.57 | epoch: 1 | total time: 9.57m | eta: 171.3m +step 00894/16704 (5.35%) | loss: 3.069980 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,246 | mfu: 50.64 | epoch: 1 | total time: 9.58m | eta: 171.3m +step 00895/16704 (5.36%) | loss: 3.077640 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,877 | mfu: 50.87 | epoch: 1 | total time: 9.59m | eta: 171.3m +step 00896/16704 (5.36%) | loss: 3.077755 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,712 | mfu: 50.80 | epoch: 1 | total time: 9.60m | eta: 
171.3m +step 00897/16704 (5.37%) | loss: 3.070659 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,387 | mfu: 50.78 | epoch: 1 | total time: 9.61m | eta: 171.3m +step 00898/16704 (5.38%) | loss: 3.066916 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,384 | mfu: 50.84 | epoch: 1 | total time: 9.62m | eta: 171.3m +step 00899/16704 (5.38%) | loss: 3.062715 | lrm: 1.00 | dt: 646.70ms | tok/sec: 810,717 | mfu: 50.67 | epoch: 1 | total time: 9.63m | eta: 171.2m +step 00900/16704 (5.39%) | loss: 3.068935 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,499 | mfu: 50.72 | epoch: 1 | total time: 9.64m | eta: 171.2m +step 00901/16704 (5.39%) | loss: 3.073406 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,922 | mfu: 50.75 | epoch: 1 | total time: 9.65m | eta: 171.2m +step 00902/16704 (5.40%) | loss: 3.072529 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,612 | mfu: 50.73 | epoch: 1 | total time: 9.66m | eta: 171.2m +step 00903/16704 (5.41%) | loss: 3.075850 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,988 | mfu: 50.88 | epoch: 1 | total time: 9.68m | eta: 171.2m +step 00904/16704 (5.41%) | loss: 3.075127 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,099 | mfu: 50.69 | epoch: 1 | total time: 9.69m | eta: 171.2m +step 00905/16704 (5.42%) | loss: 3.073128 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,165 | mfu: 50.76 | epoch: 1 | total time: 9.70m | eta: 171.2m +step 00906/16704 (5.42%) | loss: 3.076659 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,334 | mfu: 50.77 | epoch: 1 | total time: 9.71m | eta: 171.2m +step 00907/16704 (5.43%) | loss: 3.082126 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,083 | mfu: 50.76 | epoch: 1 | total time: 9.72m | eta: 171.1m +step 00908/16704 (5.44%) | loss: 3.084094 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,724 | mfu: 50.73 | epoch: 1 | total time: 9.73m | eta: 171.1m +step 00909/16704 (5.44%) | loss: 3.091050 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,015 | mfu: 50.75 | epoch: 1 | total time: 9.74m | eta: 171.1m +step 00910/16704 (5.45%) | loss: 3.085845 | lrm: 1.00 | dt: 645.82ms | tok/sec: 
811,822 | mfu: 50.74 | epoch: 1 | total time: 9.75m | eta: 171.1m +step 00911/16704 (5.45%) | loss: 3.089379 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,702 | mfu: 50.73 | epoch: 1 | total time: 9.76m | eta: 171.1m +step 00912/16704 (5.46%) | loss: 3.080790 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,829 | mfu: 50.74 | epoch: 1 | total time: 9.77m | eta: 171.1m +step 00913/16704 (5.47%) | loss: 3.078845 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,710 | mfu: 50.73 | epoch: 1 | total time: 9.78m | eta: 171.1m +step 00914/16704 (5.47%) | loss: 3.064951 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,439 | mfu: 50.72 | epoch: 1 | total time: 9.79m | eta: 171.1m +step 00915/16704 (5.48%) | loss: 3.081746 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,742 | mfu: 50.61 | epoch: 1 | total time: 9.80m | eta: 171.1m +step 00916/16704 (5.48%) | loss: 3.072954 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,434 | mfu: 50.72 | epoch: 1 | total time: 9.82m | eta: 171.0m +step 00917/16704 (5.49%) | loss: 3.077267 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,236 | mfu: 50.64 | epoch: 1 | total time: 9.83m | eta: 171.0m +step 00918/16704 (5.50%) | loss: 3.071721 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,765 | mfu: 50.67 | epoch: 1 | total time: 9.84m | eta: 171.0m +step 00919/16704 (5.50%) | loss: 3.063805 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,760 | mfu: 50.61 | epoch: 1 | total time: 9.85m | eta: 171.0m +step 00920/16704 (5.51%) | loss: 3.064799 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,323 | mfu: 50.58 | epoch: 1 | total time: 9.86m | eta: 171.0m +step 00921/16704 (5.51%) | loss: 3.063736 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,164 | mfu: 50.76 | epoch: 1 | total time: 9.87m | eta: 171.0m +step 00922/16704 (5.52%) | loss: 3.069195 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,421 | mfu: 50.65 | epoch: 1 | total time: 9.88m | eta: 171.0m +step 00923/16704 (5.53%) | loss: 3.062810 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,899 | mfu: 50.74 | epoch: 1 | total time: 9.89m | eta: 171.0m +step 00924/16704 
(5.53%) | loss: 3.059520 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,045 | mfu: 50.82 | epoch: 1 | total time: 9.90m | eta: 170.9m +step 00925/16704 (5.54%) | loss: 3.056317 | lrm: 1.00 | dt: 646.38ms | tok/sec: 811,113 | mfu: 50.70 | epoch: 1 | total time: 9.91m | eta: 170.9m +step 00926/16704 (5.54%) | loss: 3.059418 | lrm: 1.00 | dt: 648.06ms | tok/sec: 809,013 | mfu: 50.56 | epoch: 1 | total time: 9.92m | eta: 170.9m +step 00927/16704 (5.55%) | loss: 3.071182 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,880 | mfu: 50.74 | epoch: 1 | total time: 9.93m | eta: 170.9m +step 00928/16704 (5.56%) | loss: 3.080736 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,447 | mfu: 50.65 | epoch: 1 | total time: 9.94m | eta: 170.9m +step 00929/16704 (5.56%) | loss: 3.083396 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,241 | mfu: 50.77 | epoch: 1 | total time: 9.96m | eta: 170.9m +step 00930/16704 (5.57%) | loss: 3.086783 | lrm: 1.00 | dt: 646.97ms | tok/sec: 810,380 | mfu: 50.65 | epoch: 1 | total time: 9.97m | eta: 170.9m +step 00931/16704 (5.57%) | loss: 3.082359 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,903 | mfu: 50.81 | epoch: 1 | total time: 9.98m | eta: 170.9m +step 00932/16704 (5.58%) | loss: 3.093708 | lrm: 1.00 | dt: 649.75ms | tok/sec: 806,906 | mfu: 50.43 | epoch: 1 | total time: 9.99m | eta: 170.9m +step 00933/16704 (5.59%) | loss: 3.093042 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,197 | mfu: 50.76 | epoch: 1 | total time: 10.00m | eta: 170.8m +step 00934/16704 (5.59%) | loss: 3.093387 | lrm: 1.00 | dt: 647.67ms | tok/sec: 809,494 | mfu: 50.59 | epoch: 1 | total time: 10.01m | eta: 170.8m +step 00935/16704 (5.60%) | loss: 3.102504 | lrm: 1.00 | dt: 647.87ms | tok/sec: 809,254 | mfu: 50.58 | epoch: 1 | total time: 10.02m | eta: 170.8m +step 00936/16704 (5.60%) | loss: 3.102463 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,549 | mfu: 50.72 | epoch: 1 | total time: 10.03m | eta: 170.8m +step 00937/16704 (5.61%) | loss: 3.098501 | lrm: 1.00 | dt: 648.13ms | tok/sec: 808,928 | mfu: 50.56 
| epoch: 1 | total time: 10.04m | eta: 170.8m +step 00938/16704 (5.62%) | loss: 3.093553 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,919 | mfu: 50.62 | epoch: 1 | total time: 10.05m | eta: 170.8m +step 00939/16704 (5.62%) | loss: 3.076090 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,245 | mfu: 50.64 | epoch: 1 | total time: 10.06m | eta: 170.8m +step 00940/16704 (5.63%) | loss: 3.066849 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,166 | mfu: 50.70 | epoch: 1 | total time: 10.07m | eta: 170.8m +step 00941/16704 (5.63%) | loss: 3.076045 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,026 | mfu: 50.75 | epoch: 1 | total time: 10.08m | eta: 170.7m +step 00942/16704 (5.64%) | loss: 3.080338 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,668 | mfu: 50.67 | epoch: 1 | total time: 10.10m | eta: 170.7m +step 00943/16704 (5.65%) | loss: 3.086125 | lrm: 1.00 | dt: 649.04ms | tok/sec: 807,784 | mfu: 50.49 | epoch: 1 | total time: 10.11m | eta: 170.7m +step 00944/16704 (5.65%) | loss: 3.098282 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,228 | mfu: 50.77 | epoch: 1 | total time: 10.12m | eta: 170.7m +step 00945/16704 (5.66%) | loss: 3.089826 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,172 | mfu: 50.64 | epoch: 1 | total time: 10.13m | eta: 170.7m +step 00946/16704 (5.66%) | loss: 3.085945 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,669 | mfu: 50.73 | epoch: 1 | total time: 10.14m | eta: 170.7m +step 00947/16704 (5.67%) | loss: 3.096239 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,887 | mfu: 50.62 | epoch: 1 | total time: 10.15m | eta: 170.7m +step 00948/16704 (5.68%) | loss: 3.090089 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,872 | mfu: 50.81 | epoch: 1 | total time: 10.16m | eta: 170.7m +step 00949/16704 (5.68%) | loss: 3.091218 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,317 | mfu: 50.58 | epoch: 1 | total time: 10.17m | eta: 170.7m +step 00950/16704 (5.69%) | loss: 3.097659 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,227 | mfu: 50.70 | epoch: 1 | total time: 10.18m | eta: 170.6m +step 00951/16704 (5.69%) | 
loss: 3.097899 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,494 | mfu: 50.78 | epoch: 1 | total time: 10.19m | eta: 170.6m +step 00952/16704 (5.70%) | loss: 3.085681 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,440 | mfu: 50.78 | epoch: 1 | total time: 10.20m | eta: 170.6m +step 00953/16704 (5.71%) | loss: 3.092812 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,600 | mfu: 50.73 | epoch: 1 | total time: 10.21m | eta: 170.6m +step 00954/16704 (5.71%) | loss: 3.091470 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,097 | mfu: 50.63 | epoch: 1 | total time: 10.22m | eta: 170.6m +step 00955/16704 (5.72%) | loss: 3.075222 | lrm: 1.00 | dt: 649.13ms | tok/sec: 807,682 | mfu: 50.48 | epoch: 1 | total time: 10.24m | eta: 170.6m +step 00956/16704 (5.72%) | loss: 3.085460 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,980 | mfu: 50.75 | epoch: 1 | total time: 10.25m | eta: 170.6m +step 00957/16704 (5.73%) | loss: 3.080362 | lrm: 1.00 | dt: 648.29ms | tok/sec: 808,721 | mfu: 50.55 | epoch: 1 | total time: 10.26m | eta: 170.6m +step 00958/16704 (5.74%) | loss: 3.079991 | lrm: 1.00 | dt: 649.77ms | tok/sec: 806,886 | mfu: 50.43 | epoch: 1 | total time: 10.27m | eta: 170.5m +step 00959/16704 (5.74%) | loss: 3.091508 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,117 | mfu: 50.76 | epoch: 1 | total time: 10.28m | eta: 170.5m +step 00960/16704 (5.75%) | loss: 3.088621 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,970 | mfu: 50.75 | epoch: 1 | total time: 10.29m | eta: 170.5m +step 00961/16704 (5.75%) | loss: 3.077280 | lrm: 1.00 | dt: 647.28ms | tok/sec: 809,984 | mfu: 50.63 | epoch: 1 | total time: 10.30m | eta: 170.5m +step 00962/16704 (5.76%) | loss: 3.075860 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,241 | mfu: 50.89 | epoch: 1 | total time: 10.31m | eta: 170.5m +step 00963/16704 (5.77%) | loss: 3.070488 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,932 | mfu: 50.56 | epoch: 1 | total time: 10.32m | eta: 170.5m +step 00964/16704 (5.77%) | loss: 3.074776 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,701 | mfu: 50.73 | 
epoch: 1 | total time: 10.33m | eta: 170.5m +step 00965/16704 (5.78%) | loss: 3.062972 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,753 | mfu: 50.67 | epoch: 1 | total time: 10.34m | eta: 170.5m +step 00966/16704 (5.78%) | loss: 3.083734 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,063 | mfu: 50.69 | epoch: 1 | total time: 10.35m | eta: 170.5m +step 00967/16704 (5.79%) | loss: 3.083072 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,226 | mfu: 50.70 | epoch: 1 | total time: 10.36m | eta: 170.4m +step 00968/16704 (5.80%) | loss: 3.067863 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,749 | mfu: 50.74 | epoch: 1 | total time: 10.38m | eta: 170.4m +step 00969/16704 (5.80%) | loss: 3.066516 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,450 | mfu: 50.78 | epoch: 1 | total time: 10.39m | eta: 170.4m +step 00970/16704 (5.81%) | loss: 3.090922 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,880 | mfu: 50.62 | epoch: 1 | total time: 10.40m | eta: 170.4m +step 00971/16704 (5.81%) | loss: 3.093249 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,623 | mfu: 50.60 | epoch: 1 | total time: 10.41m | eta: 170.4m +step 00972/16704 (5.82%) | loss: 3.084511 | lrm: 1.00 | dt: 647.75ms | tok/sec: 809,402 | mfu: 50.59 | epoch: 1 | total time: 10.42m | eta: 170.4m +step 00973/16704 (5.82%) | loss: 3.079790 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 1 | total time: 10.43m | eta: 170.4m +step 00974/16704 (5.83%) | loss: 3.079595 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,350 | mfu: 50.71 | epoch: 1 | total time: 10.44m | eta: 170.4m +step 00975/16704 (5.84%) | loss: 3.097659 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,691 | mfu: 50.73 | epoch: 1 | total time: 10.45m | eta: 170.3m +step 00976/16704 (5.84%) | loss: 3.088337 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,509 | mfu: 50.72 | epoch: 1 | total time: 10.46m | eta: 170.3m +step 00977/16704 (5.85%) | loss: 3.093239 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,909 | mfu: 50.75 | epoch: 1 | total time: 10.47m | eta: 170.3m +step 00978/16704 (5.85%) | 
loss: 3.092195 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,974 | mfu: 50.69 | epoch: 1 | total time: 10.48m | eta: 170.3m +step 00979/16704 (5.86%) | loss: 3.085626 | lrm: 1.00 | dt: 648.16ms | tok/sec: 808,889 | mfu: 50.56 | epoch: 1 | total time: 10.49m | eta: 170.3m +step 00980/16704 (5.87%) | loss: 3.085894 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,887 | mfu: 50.81 | epoch: 1 | total time: 10.51m | eta: 170.3m +step 00981/16704 (5.87%) | loss: 3.077897 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,370 | mfu: 50.77 | epoch: 1 | total time: 10.52m | eta: 170.3m +step 00982/16704 (5.88%) | loss: 3.081660 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,441 | mfu: 50.53 | epoch: 1 | total time: 10.53m | eta: 170.3m +step 00983/16704 (5.88%) | loss: 3.080680 | lrm: 1.00 | dt: 648.48ms | tok/sec: 808,490 | mfu: 50.53 | epoch: 1 | total time: 10.54m | eta: 170.3m +step 00984/16704 (5.89%) | loss: 3.072113 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,406 | mfu: 50.71 | epoch: 1 | total time: 10.55m | eta: 170.2m +step 00985/16704 (5.90%) | loss: 3.066169 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,374 | mfu: 50.84 | epoch: 1 | total time: 10.56m | eta: 170.2m +step 00986/16704 (5.90%) | loss: 3.075903 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,590 | mfu: 50.79 | epoch: 1 | total time: 10.57m | eta: 170.2m +step 00987/16704 (5.91%) | loss: 3.063893 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,979 | mfu: 50.62 | epoch: 1 | total time: 10.58m | eta: 170.2m +step 00988/16704 (5.91%) | loss: 3.065479 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,262 | mfu: 50.77 | epoch: 1 | total time: 10.59m | eta: 170.2m +step 00989/16704 (5.92%) | loss: 3.065287 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,196 | mfu: 50.89 | epoch: 1 | total time: 10.60m | eta: 170.2m +step 00990/16704 (5.93%) | loss: 3.052997 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,910 | mfu: 50.75 | epoch: 1 | total time: 10.61m | eta: 170.2m +step 00991/16704 (5.93%) | loss: 3.046213 | lrm: 1.00 | dt: 648.68ms | tok/sec: 808,237 | mfu: 50.52 | 
epoch: 1 | total time: 10.62m | eta: 170.2m +step 00992/16704 (5.94%) | loss: 3.031285 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,981 | mfu: 50.75 | epoch: 1 | total time: 10.63m | eta: 170.1m +step 00993/16704 (5.94%) | loss: 3.032636 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,816 | mfu: 50.74 | epoch: 1 | total time: 10.65m | eta: 170.1m +step 00994/16704 (5.95%) | loss: 3.036524 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,317 | mfu: 50.65 | epoch: 1 | total time: 10.66m | eta: 170.1m +step 00995/16704 (5.96%) | loss: 3.040388 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,274 | mfu: 50.77 | epoch: 1 | total time: 10.67m | eta: 170.1m +step 00996/16704 (5.96%) | loss: 3.050047 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,625 | mfu: 50.73 | epoch: 1 | total time: 10.68m | eta: 170.1m +step 00997/16704 (5.97%) | loss: 3.052475 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,795 | mfu: 50.74 | epoch: 1 | total time: 10.69m | eta: 170.1m +step 00998/16704 (5.97%) | loss: 3.060507 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,719 | mfu: 50.67 | epoch: 1 | total time: 10.70m | eta: 170.1m +step 00999/16704 (5.98%) | loss: 3.055869 | lrm: 1.00 | dt: 647.69ms | tok/sec: 809,472 | mfu: 50.59 | epoch: 1 | total time: 10.71m | eta: 170.1m +Step 01000 | Validation bpb: 0.929247 +step 01000/16704 (5.99%) | loss: 3.058072 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,692 | mfu: 50.54 | epoch: 1 | total time: 10.72m | eta: 170.1m +step 01001/16704 (5.99%) | loss: 3.053116 | lrm: 1.00 | dt: 650.44ms | tok/sec: 806,048 | mfu: 50.38 | epoch: 1 | total time: 10.73m | eta: 170.0m +step 01002/16704 (6.00%) | loss: 3.038046 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,312 | mfu: 50.71 | epoch: 1 | total time: 10.74m | eta: 170.0m +step 01003/16704 (6.00%) | loss: 3.036975 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,486 | mfu: 50.97 | epoch: 1 | total time: 10.75m | eta: 170.0m +step 01004/16704 (6.01%) | loss: 3.041436 | lrm: 1.00 | dt: 650.64ms | tok/sec: 805,808 | mfu: 50.36 | epoch: 1 | total time: 10.76m | eta: 
170.0m +step 01005/16704 (6.02%) | loss: 3.046443 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,932 | mfu: 50.81 | epoch: 1 | total time: 10.77m | eta: 170.0m +step 01006/16704 (6.02%) | loss: 3.055732 | lrm: 1.00 | dt: 648.28ms | tok/sec: 808,739 | mfu: 50.55 | epoch: 1 | total time: 10.79m | eta: 170.0m +step 01007/16704 (6.03%) | loss: 3.056040 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,807 | mfu: 50.80 | epoch: 1 | total time: 10.80m | eta: 170.0m +step 01008/16704 (6.03%) | loss: 3.069916 | lrm: 1.00 | dt: 647.21ms | tok/sec: 810,069 | mfu: 50.63 | epoch: 1 | total time: 10.81m | eta: 170.0m +step 01009/16704 (6.04%) | loss: 3.060130 | lrm: 1.00 | dt: 649.17ms | tok/sec: 807,631 | mfu: 50.48 | epoch: 1 | total time: 10.82m | eta: 170.0m +step 01010/16704 (6.05%) | loss: 3.048348 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,577 | mfu: 50.72 | epoch: 1 | total time: 10.83m | eta: 169.9m +step 01011/16704 (6.05%) | loss: 3.040022 | lrm: 1.00 | dt: 647.43ms | tok/sec: 809,804 | mfu: 50.61 | epoch: 1 | total time: 10.84m | eta: 169.9m +step 01012/16704 (6.06%) | loss: 3.044710 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,304 | mfu: 50.71 | epoch: 1 | total time: 10.85m | eta: 169.9m +step 01013/16704 (6.06%) | loss: 3.048466 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,111 | mfu: 50.88 | epoch: 1 | total time: 10.86m | eta: 169.9m +step 01014/16704 (6.07%) | loss: 3.043034 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,427 | mfu: 50.65 | epoch: 1 | total time: 10.87m | eta: 169.9m +step 01015/16704 (6.08%) | loss: 3.055017 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,029 | mfu: 50.82 | epoch: 1 | total time: 10.88m | eta: 169.9m +step 01016/16704 (6.08%) | loss: 3.057402 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,439 | mfu: 50.84 | epoch: 1 | total time: 10.89m | eta: 169.9m +step 01017/16704 (6.09%) | loss: 3.054278 | lrm: 1.00 | dt: 647.52ms | tok/sec: 809,681 | mfu: 50.61 | epoch: 1 | total time: 10.90m | eta: 169.9m +step 01018/16704 (6.09%) | loss: 3.043425 | lrm: 1.00 | dt: 
644.72ms | tok/sec: 813,203 | mfu: 50.83 | epoch: 1 | total time: 10.91m | eta: 169.8m +step 01019/16704 (6.10%) | loss: 3.049081 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,126 | mfu: 50.76 | epoch: 1 | total time: 10.93m | eta: 169.8m +step 01020/16704 (6.11%) | loss: 3.054765 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,591 | mfu: 50.66 | epoch: 1 | total time: 10.94m | eta: 169.8m +step 01021/16704 (6.11%) | loss: 3.063059 | lrm: 1.00 | dt: 648.15ms | tok/sec: 808,897 | mfu: 50.56 | epoch: 1 | total time: 10.95m | eta: 169.8m +step 01022/16704 (6.12%) | loss: 3.075253 | lrm: 1.00 | dt: 648.82ms | tok/sec: 808,069 | mfu: 50.51 | epoch: 1 | total time: 10.96m | eta: 169.8m +step 01023/16704 (6.12%) | loss: 3.061557 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,274 | mfu: 50.77 | epoch: 1 | total time: 10.97m | eta: 169.8m +step 01024/16704 (6.13%) | loss: 3.046235 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,097 | mfu: 50.63 | epoch: 1 | total time: 10.98m | eta: 169.8m +step 01025/16704 (6.14%) | loss: 3.056241 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,195 | mfu: 50.76 | epoch: 1 | total time: 10.99m | eta: 169.8m +step 01026/16704 (6.14%) | loss: 3.056787 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,696 | mfu: 50.61 | epoch: 1 | total time: 11.00m | eta: 169.8m +step 01027/16704 (6.15%) | loss: 3.066928 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,826 | mfu: 50.74 | epoch: 1 | total time: 11.01m | eta: 169.7m +step 01028/16704 (6.15%) | loss: 3.063386 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,525 | mfu: 50.78 | epoch: 1 | total time: 11.02m | eta: 169.7m +step 01029/16704 (6.16%) | loss: 3.049357 | lrm: 1.00 | dt: 647.23ms | tok/sec: 810,051 | mfu: 50.63 | epoch: 1 | total time: 11.03m | eta: 169.7m +step 01030/16704 (6.17%) | loss: 3.048269 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,872 | mfu: 50.68 | epoch: 1 | total time: 11.04m | eta: 169.7m +step 01031/16704 (6.17%) | loss: 3.044301 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,747 | mfu: 50.80 | epoch: 1 | total time: 11.05m | 
eta: 169.7m +step 01032/16704 (6.18%) | loss: 3.043251 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,356 | mfu: 50.71 | epoch: 1 | total time: 11.07m | eta: 169.7m +step 01033/16704 (6.18%) | loss: 3.047320 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,624 | mfu: 50.73 | epoch: 1 | total time: 11.08m | eta: 169.7m +step 01034/16704 (6.19%) | loss: 3.053558 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,480 | mfu: 50.72 | epoch: 1 | total time: 11.09m | eta: 169.7m +step 01035/16704 (6.20%) | loss: 3.053786 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,652 | mfu: 50.60 | epoch: 1 | total time: 11.10m | eta: 169.6m +step 01036/16704 (6.20%) | loss: 3.053397 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,675 | mfu: 50.73 | epoch: 1 | total time: 11.11m | eta: 169.6m +step 01037/16704 (6.21%) | loss: 3.045893 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,352 | mfu: 50.77 | epoch: 1 | total time: 11.12m | eta: 169.6m +step 01038/16704 (6.21%) | loss: 3.043082 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,141 | mfu: 50.70 | epoch: 1 | total time: 11.13m | eta: 169.6m +step 01039/16704 (6.22%) | loss: 3.042467 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,511 | mfu: 50.66 | epoch: 1 | total time: 11.14m | eta: 169.6m +step 01040/16704 (6.23%) | loss: 3.038584 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,088 | mfu: 50.82 | epoch: 1 | total time: 11.15m | eta: 169.6m +step 01041/16704 (6.23%) | loss: 3.026477 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,105 | mfu: 50.63 | epoch: 1 | total time: 11.16m | eta: 169.6m +step 01042/16704 (6.24%) | loss: 3.028398 | lrm: 1.00 | dt: 647.40ms | tok/sec: 809,831 | mfu: 50.62 | epoch: 1 | total time: 11.17m | eta: 169.6m +step 01043/16704 (6.24%) | loss: 3.024362 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,083 | mfu: 50.82 | epoch: 1 | total time: 11.18m | eta: 169.6m +step 01044/16704 (6.25%) | loss: 3.015518 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,020 | mfu: 50.75 | epoch: 1 | total time: 11.19m | eta: 169.5m +step 01045/16704 (6.26%) | loss: 3.012209 | lrm: 1.00 | dt: 
646.86ms | tok/sec: 810,518 | mfu: 50.66 | epoch: 1 | total time: 11.21m | eta: 169.5m +step 01046/16704 (6.26%) | loss: 3.012055 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,475 | mfu: 50.78 | epoch: 1 | total time: 11.22m | eta: 169.5m +step 01047/16704 (6.27%) | loss: 3.006821 | lrm: 1.00 | dt: 647.23ms | tok/sec: 810,050 | mfu: 50.63 | epoch: 1 | total time: 11.23m | eta: 169.5m +step 01048/16704 (6.27%) | loss: 3.014923 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,397 | mfu: 50.78 | epoch: 1 | total time: 11.24m | eta: 169.5m +step 01049/16704 (6.28%) | loss: 3.026078 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,642 | mfu: 50.60 | epoch: 1 | total time: 11.25m | eta: 169.5m +step 01050/16704 (6.29%) | loss: 3.029540 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,303 | mfu: 50.90 | epoch: 1 | total time: 11.26m | eta: 169.5m +step 01051/16704 (6.29%) | loss: 3.033083 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 1 | total time: 11.27m | eta: 169.5m +step 01052/16704 (6.30%) | loss: 3.028117 | lrm: 1.00 | dt: 646.99ms | tok/sec: 810,351 | mfu: 50.65 | epoch: 1 | total time: 11.28m | eta: 169.4m +step 01053/16704 (6.30%) | loss: 3.037429 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,496 | mfu: 50.72 | epoch: 1 | total time: 11.29m | eta: 169.4m +step 01054/16704 (6.31%) | loss: 3.047619 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,708 | mfu: 50.92 | epoch: 1 | total time: 11.30m | eta: 169.4m +step 01055/16704 (6.32%) | loss: 3.050531 | lrm: 1.00 | dt: 648.30ms | tok/sec: 808,714 | mfu: 50.55 | epoch: 1 | total time: 11.31m | eta: 169.4m +step 01056/16704 (6.32%) | loss: 3.036193 | lrm: 1.00 | dt: 647.69ms | tok/sec: 809,477 | mfu: 50.59 | epoch: 1 | total time: 11.32m | eta: 169.4m +step 01057/16704 (6.33%) | loss: 3.047725 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,496 | mfu: 50.84 | epoch: 1 | total time: 11.33m | eta: 169.4m +step 01058/16704 (6.33%) | loss: 3.033329 | lrm: 1.00 | dt: 648.63ms | tok/sec: 808,304 | mfu: 50.52 | epoch: 1 | total time: 11.35m | 
eta: 169.4m +step 01059/16704 (6.34%) | loss: 3.029429 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,533 | mfu: 50.85 | epoch: 1 | total time: 11.36m | eta: 169.4m +step 01060/16704 (6.35%) | loss: 3.023847 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,767 | mfu: 50.74 | epoch: 1 | total time: 11.37m | eta: 169.4m +step 01061/16704 (6.35%) | loss: 3.018454 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,575 | mfu: 50.66 | epoch: 1 | total time: 11.38m | eta: 169.3m +step 01062/16704 (6.36%) | loss: 3.019789 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,607 | mfu: 50.79 | epoch: 1 | total time: 11.39m | eta: 169.3m +step 01063/16704 (6.36%) | loss: 3.018514 | lrm: 1.00 | dt: 648.85ms | tok/sec: 808,027 | mfu: 50.50 | epoch: 1 | total time: 11.40m | eta: 169.3m +step 01064/16704 (6.37%) | loss: 3.022566 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,418 | mfu: 50.59 | epoch: 1 | total time: 11.41m | eta: 169.3m +step 01065/16704 (6.38%) | loss: 3.036205 | lrm: 1.00 | dt: 648.20ms | tok/sec: 808,835 | mfu: 50.55 | epoch: 1 | total time: 11.42m | eta: 169.3m +step 01066/16704 (6.38%) | loss: 3.031646 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 1 | total time: 11.43m | eta: 169.3m +step 01067/16704 (6.39%) | loss: 3.038484 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,366 | mfu: 50.90 | epoch: 1 | total time: 11.44m | eta: 169.3m +step 01068/16704 (6.39%) | loss: 3.016472 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,959 | mfu: 50.81 | epoch: 1 | total time: 11.45m | eta: 169.3m +step 01069/16704 (6.40%) | loss: 3.016688 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,688 | mfu: 50.73 | epoch: 1 | total time: 11.46m | eta: 169.2m +step 01070/16704 (6.41%) | loss: 3.007059 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 1 | total time: 11.47m | eta: 169.2m +step 01071/16704 (6.41%) | loss: 3.007464 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,083 | mfu: 50.82 | epoch: 1 | total time: 11.49m | eta: 169.2m +step 01072/16704 (6.42%) | loss: 3.003603 | lrm: 1.00 | dt: 
643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 1 | total time: 11.50m | eta: 169.2m +step 01073/16704 (6.42%) | loss: 3.022632 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,477 | mfu: 50.66 | epoch: 1 | total time: 11.51m | eta: 169.2m +step 01074/16704 (6.43%) | loss: 3.024502 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,270 | mfu: 50.96 | epoch: 1 | total time: 11.52m | eta: 169.2m +step 01075/16704 (6.44%) | loss: 3.034397 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,107 | mfu: 50.76 | epoch: 1 | total time: 11.53m | eta: 169.2m +step 01076/16704 (6.44%) | loss: 3.030452 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 1 | total time: 11.54m | eta: 169.2m +step 01077/16704 (6.45%) | loss: 3.026768 | lrm: 1.00 | dt: 648.59ms | tok/sec: 808,348 | mfu: 50.52 | epoch: 1 | total time: 11.55m | eta: 169.2m +step 01078/16704 (6.45%) | loss: 3.028408 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,146 | mfu: 50.82 | epoch: 1 | total time: 11.56m | eta: 169.1m +step 01079/16704 (6.46%) | loss: 3.026677 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,120 | mfu: 50.82 | epoch: 1 | total time: 11.57m | eta: 169.1m +step 01080/16704 (6.47%) | loss: 3.017258 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,911 | mfu: 50.56 | epoch: 1 | total time: 11.58m | eta: 169.1m +step 01081/16704 (6.47%) | loss: 3.013183 | lrm: 1.00 | dt: 649.61ms | tok/sec: 807,086 | mfu: 50.44 | epoch: 1 | total time: 11.59m | eta: 169.1m +step 01082/16704 (6.48%) | loss: 3.022860 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,691 | mfu: 50.79 | epoch: 1 | total time: 11.60m | eta: 169.1m +step 01083/16704 (6.48%) | loss: 3.001324 | lrm: 1.00 | dt: 648.50ms | tok/sec: 808,467 | mfu: 50.53 | epoch: 1 | total time: 11.61m | eta: 169.1m +step 01084/16704 (6.49%) | loss: 2.985681 | lrm: 1.00 | dt: 647.17ms | tok/sec: 810,119 | mfu: 50.63 | epoch: 1 | total time: 11.63m | eta: 169.1m +step 01085/16704 (6.50%) | loss: 2.983131 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 1 | total time: 11.64m | 
eta: 169.1m +step 01086/16704 (6.50%) | loss: 2.967636 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,866 | mfu: 50.68 | epoch: 1 | total time: 11.65m | eta: 169.1m +step 01087/16704 (6.51%) | loss: 2.989818 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,500 | mfu: 50.84 | epoch: 1 | total time: 11.66m | eta: 169.0m +step 01088/16704 (6.51%) | loss: 3.005013 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,478 | mfu: 50.66 | epoch: 1 | total time: 11.67m | eta: 169.0m +step 01089/16704 (6.52%) | loss: 3.004854 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,152 | mfu: 50.76 | epoch: 1 | total time: 11.68m | eta: 169.0m +step 01090/16704 (6.53%) | loss: 3.015717 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,104 | mfu: 50.70 | epoch: 1 | total time: 11.69m | eta: 169.0m +step 01091/16704 (6.53%) | loss: 3.017423 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,978 | mfu: 50.75 | epoch: 1 | total time: 11.70m | eta: 169.0m +step 01092/16704 (6.54%) | loss: 3.013422 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,023 | mfu: 50.69 | epoch: 1 | total time: 11.71m | eta: 169.0m +step 01093/16704 (6.54%) | loss: 3.014538 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,074 | mfu: 50.88 | epoch: 1 | total time: 11.72m | eta: 169.0m +step 01094/16704 (6.55%) | loss: 3.000049 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,692 | mfu: 50.67 | epoch: 1 | total time: 11.73m | eta: 169.0m +step 01095/16704 (6.56%) | loss: 2.988005 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,577 | mfu: 50.85 | epoch: 1 | total time: 11.74m | eta: 168.9m +step 01096/16704 (6.56%) | loss: 2.984067 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,853 | mfu: 50.74 | epoch: 1 | total time: 11.75m | eta: 168.9m +step 01097/16704 (6.57%) | loss: 3.006332 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,772 | mfu: 50.67 | epoch: 1 | total time: 11.77m | eta: 168.9m +step 01098/16704 (6.57%) | loss: 3.015296 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,403 | mfu: 50.96 | epoch: 1 | total time: 11.78m | eta: 168.9m +step 01099/16704 (6.58%) | loss: 3.017317 | lrm: 1.00 | dt: 
647.67ms | tok/sec: 809,501 | mfu: 50.59 | epoch: 1 | total time: 11.79m | eta: 168.9m +step 01100/16704 (6.59%) | loss: 3.025885 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,078 | mfu: 50.69 | epoch: 1 | total time: 11.80m | eta: 168.9m +step 01101/16704 (6.59%) | loss: 3.035248 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,050 | mfu: 50.75 | epoch: 1 | total time: 11.81m | eta: 168.9m +step 01102/16704 (6.60%) | loss: 3.042088 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 1 | total time: 11.82m | eta: 168.9m +step 01103/16704 (6.60%) | loss: 3.048952 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,613 | mfu: 50.60 | epoch: 1 | total time: 11.83m | eta: 168.9m +step 01104/16704 (6.61%) | loss: 3.045829 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 1 | total time: 11.84m | eta: 168.8m +step 01105/16704 (6.62%) | loss: 3.039376 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,767 | mfu: 50.67 | epoch: 1 | total time: 11.85m | eta: 168.8m +step 01106/16704 (6.62%) | loss: 3.047410 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,296 | mfu: 50.58 | epoch: 1 | total time: 11.86m | eta: 168.8m +step 01107/16704 (6.63%) | loss: 3.039834 | lrm: 1.00 | dt: 648.72ms | tok/sec: 808,183 | mfu: 50.51 | epoch: 1 | total time: 11.87m | eta: 168.8m +step 01108/16704 (6.63%) | loss: 3.054195 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 1 | total time: 11.88m | eta: 168.8m +step 01109/16704 (6.64%) | loss: 3.064139 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,359 | mfu: 50.71 | epoch: 1 | total time: 11.89m | eta: 168.8m +step 01110/16704 (6.65%) | loss: 3.070390 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,317 | mfu: 50.58 | epoch: 1 | total time: 11.91m | eta: 168.8m +step 01111/16704 (6.65%) | loss: 3.067950 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,752 | mfu: 50.67 | epoch: 1 | total time: 11.92m | eta: 168.8m +step 01112/16704 (6.66%) | loss: 3.059089 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,830 | mfu: 50.74 | epoch: 1 | total time: 11.93m | 
eta: 168.7m +step 01113/16704 (6.66%) | loss: 3.059692 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,737 | mfu: 50.61 | epoch: 1 | total time: 11.94m | eta: 168.7m +step 01114/16704 (6.67%) | loss: 3.076460 | lrm: 1.00 | dt: 648.26ms | tok/sec: 808,763 | mfu: 50.55 | epoch: 1 | total time: 11.95m | eta: 168.7m +step 01115/16704 (6.68%) | loss: 3.071049 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,292 | mfu: 50.64 | epoch: 1 | total time: 11.96m | eta: 168.7m +step 01116/16704 (6.68%) | loss: 3.055787 | lrm: 1.00 | dt: 647.03ms | tok/sec: 810,301 | mfu: 50.64 | epoch: 1 | total time: 11.97m | eta: 168.7m +step 01117/16704 (6.69%) | loss: 3.055574 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 1 | total time: 11.98m | eta: 168.7m +step 01118/16704 (6.69%) | loss: 3.048496 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,330 | mfu: 50.71 | epoch: 1 | total time: 11.99m | eta: 168.7m +step 01119/16704 (6.70%) | loss: 3.042594 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,454 | mfu: 50.72 | epoch: 1 | total time: 12.00m | eta: 168.7m +step 01120/16704 (6.70%) | loss: 3.047840 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 1 | total time: 12.01m | eta: 168.7m +step 01121/16704 (6.71%) | loss: 3.045189 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,611 | mfu: 50.66 | epoch: 1 | total time: 12.02m | eta: 168.6m +step 01122/16704 (6.72%) | loss: 3.044268 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,411 | mfu: 50.71 | epoch: 1 | total time: 12.03m | eta: 168.6m +step 01123/16704 (6.72%) | loss: 3.038846 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,422 | mfu: 50.65 | epoch: 1 | total time: 12.05m | eta: 168.6m +step 01124/16704 (6.73%) | loss: 3.044338 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 1 | total time: 12.06m | eta: 168.6m +step 01125/16704 (6.73%) | loss: 3.048712 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,221 | mfu: 50.64 | epoch: 1 | total time: 12.07m | eta: 168.6m +step 01126/16704 (6.74%) | loss: 3.037309 | lrm: 1.00 | dt: 
643.63ms | tok/sec: 814,579 | mfu: 50.91 | epoch: 1 | total time: 12.08m | eta: 168.6m +step 01127/16704 (6.75%) | loss: 3.037555 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,164 | mfu: 50.64 | epoch: 1 | total time: 12.09m | eta: 168.6m +step 01128/16704 (6.75%) | loss: 3.035767 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,330 | mfu: 50.71 | epoch: 1 | total time: 12.10m | eta: 168.6m +step 01129/16704 (6.76%) | loss: 3.018133 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,173 | mfu: 50.70 | epoch: 1 | total time: 12.11m | eta: 168.6m +step 01130/16704 (6.76%) | loss: 3.011060 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,046 | mfu: 50.69 | epoch: 1 | total time: 12.12m | eta: 168.5m +step 01131/16704 (6.77%) | loss: 3.015019 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,919 | mfu: 50.75 | epoch: 1 | total time: 12.13m | eta: 168.5m +step 01132/16704 (6.78%) | loss: 3.028379 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,313 | mfu: 50.77 | epoch: 1 | total time: 12.14m | eta: 168.5m +step 01133/16704 (6.78%) | loss: 3.029682 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,442 | mfu: 50.72 | epoch: 1 | total time: 12.15m | eta: 168.5m +step 01134/16704 (6.79%) | loss: 3.021221 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,850 | mfu: 50.80 | epoch: 1 | total time: 12.16m | eta: 168.5m +step 01135/16704 (6.79%) | loss: 3.011335 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,279 | mfu: 50.83 | epoch: 1 | total time: 12.17m | eta: 168.5m +step 01136/16704 (6.80%) | loss: 3.018376 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,644 | mfu: 50.60 | epoch: 1 | total time: 12.19m | eta: 168.5m +step 01137/16704 (6.81%) | loss: 3.023568 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,263 | mfu: 50.77 | epoch: 1 | total time: 12.20m | eta: 168.5m +step 01138/16704 (6.81%) | loss: 3.034893 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 1 | total time: 12.21m | eta: 168.4m +step 01139/16704 (6.82%) | loss: 3.026259 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,034 | mfu: 50.63 | epoch: 1 | total time: 12.22m | 
eta: 168.4m +step 01140/16704 (6.82%) | loss: 3.032758 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 1 | total time: 12.23m | eta: 168.4m +step 01141/16704 (6.83%) | loss: 3.043979 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,190 | mfu: 50.70 | epoch: 1 | total time: 12.24m | eta: 168.4m +step 01142/16704 (6.84%) | loss: 3.048611 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 1 | total time: 12.25m | eta: 168.4m +step 01143/16704 (6.84%) | loss: 3.040904 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,286 | mfu: 50.71 | epoch: 1 | total time: 12.26m | eta: 168.4m +step 01144/16704 (6.85%) | loss: 3.030526 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,471 | mfu: 50.78 | epoch: 1 | total time: 12.27m | eta: 168.4m +step 01145/16704 (6.85%) | loss: 3.029843 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,408 | mfu: 50.71 | epoch: 1 | total time: 12.28m | eta: 168.4m +step 01146/16704 (6.86%) | loss: 3.030995 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,536 | mfu: 50.72 | epoch: 1 | total time: 12.29m | eta: 168.4m +step 01147/16704 (6.87%) | loss: 3.015119 | lrm: 1.00 | dt: 648.48ms | tok/sec: 808,487 | mfu: 50.53 | epoch: 1 | total time: 12.30m | eta: 168.3m +step 01148/16704 (6.87%) | loss: 3.028982 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,228 | mfu: 50.95 | epoch: 1 | total time: 12.31m | eta: 168.3m +step 01149/16704 (6.88%) | loss: 3.025051 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,846 | mfu: 50.74 | epoch: 1 | total time: 12.33m | eta: 168.3m +step 01150/16704 (6.88%) | loss: 3.020117 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,813 | mfu: 50.68 | epoch: 1 | total time: 12.34m | eta: 168.3m +step 01151/16704 (6.89%) | loss: 3.027266 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,720 | mfu: 50.67 | epoch: 1 | total time: 12.35m | eta: 168.3m +step 01152/16704 (6.90%) | loss: 3.023434 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,449 | mfu: 50.72 | epoch: 1 | total time: 12.36m | eta: 168.3m +step 01153/16704 (6.90%) | loss: 3.030930 | lrm: 1.00 | dt: 
644.79ms | tok/sec: 813,116 | mfu: 50.82 | epoch: 1 | total time: 12.37m | eta: 168.3m +step 01154/16704 (6.91%) | loss: 3.025328 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,016 | mfu: 50.81 | epoch: 1 | total time: 12.38m | eta: 168.3m +step 01155/16704 (6.91%) | loss: 3.026673 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,848 | mfu: 50.74 | epoch: 1 | total time: 12.39m | eta: 168.3m +step 01156/16704 (6.92%) | loss: 3.024955 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,450 | mfu: 50.65 | epoch: 1 | total time: 12.40m | eta: 168.2m +step 01157/16704 (6.93%) | loss: 3.030096 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,889 | mfu: 50.62 | epoch: 1 | total time: 12.41m | eta: 168.2m +step 01158/16704 (6.93%) | loss: 3.036511 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,017 | mfu: 50.69 | epoch: 1 | total time: 12.42m | eta: 168.2m +step 01159/16704 (6.94%) | loss: 3.029568 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,244 | mfu: 50.70 | epoch: 1 | total time: 12.43m | eta: 168.2m +step 01160/16704 (6.94%) | loss: 3.038931 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 1 | total time: 12.44m | eta: 168.2m +step 01161/16704 (6.95%) | loss: 3.036031 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,427 | mfu: 50.72 | epoch: 1 | total time: 12.45m | eta: 168.2m +step 01162/16704 (6.96%) | loss: 3.023577 | lrm: 1.00 | dt: 649.25ms | tok/sec: 807,532 | mfu: 50.47 | epoch: 1 | total time: 12.47m | eta: 168.2m +step 01163/16704 (6.96%) | loss: 3.033941 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,232 | mfu: 50.77 | epoch: 1 | total time: 12.48m | eta: 168.2m +step 01164/16704 (6.97%) | loss: 3.030721 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,919 | mfu: 51.00 | epoch: 1 | total time: 12.49m | eta: 168.1m +step 01165/16704 (6.97%) | loss: 3.033659 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,919 | mfu: 50.68 | epoch: 1 | total time: 12.50m | eta: 168.1m +step 01166/16704 (6.98%) | loss: 3.025760 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,910 | mfu: 50.75 | epoch: 1 | total time: 12.51m | 
eta: 168.1m +step 01167/16704 (6.99%) | loss: 3.034598 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,278 | mfu: 50.64 | epoch: 1 | total time: 12.52m | eta: 168.1m +step 01168/16704 (6.99%) | loss: 3.013754 | lrm: 1.00 | dt: 648.31ms | tok/sec: 808,705 | mfu: 50.55 | epoch: 1 | total time: 12.53m | eta: 168.1m +step 01169/16704 (7.00%) | loss: 3.018920 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,898 | mfu: 50.74 | epoch: 1 | total time: 12.54m | eta: 168.1m +step 01170/16704 (7.00%) | loss: 3.034162 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,496 | mfu: 50.66 | epoch: 1 | total time: 12.55m | eta: 168.1m +step 01171/16704 (7.01%) | loss: 3.030340 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,161 | mfu: 50.70 | epoch: 1 | total time: 12.56m | eta: 168.1m +step 01172/16704 (7.02%) | loss: 3.021099 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,528 | mfu: 50.60 | epoch: 1 | total time: 12.57m | eta: 168.1m +step 01173/16704 (7.02%) | loss: 3.005093 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,906 | mfu: 50.56 | epoch: 1 | total time: 12.58m | eta: 168.0m +step 01174/16704 (7.03%) | loss: 3.011335 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,906 | mfu: 50.75 | epoch: 1 | total time: 12.59m | eta: 168.0m +step 01175/16704 (7.03%) | loss: 3.018618 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,062 | mfu: 50.63 | epoch: 1 | total time: 12.61m | eta: 168.0m +step 01176/16704 (7.04%) | loss: 3.027255 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,687 | mfu: 50.73 | epoch: 1 | total time: 12.62m | eta: 168.0m +step 01177/16704 (7.05%) | loss: 3.026042 | lrm: 1.00 | dt: 652.41ms | tok/sec: 803,612 | mfu: 50.23 | epoch: 1 | total time: 12.63m | eta: 168.0m +step 01178/16704 (7.05%) | loss: 3.030432 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,765 | mfu: 50.74 | epoch: 1 | total time: 12.64m | eta: 168.0m +step 01179/16704 (7.06%) | loss: 3.025666 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,570 | mfu: 50.60 | epoch: 1 | total time: 12.65m | eta: 168.0m +step 01180/16704 (7.06%) | loss: 3.029868 | lrm: 1.00 | dt: 
647.76ms | tok/sec: 809,388 | mfu: 50.59 | epoch: 1 | total time: 12.66m | eta: 168.0m +step 01181/16704 (7.07%) | loss: 3.019829 | lrm: 1.00 | dt: 648.55ms | tok/sec: 808,396 | mfu: 50.53 | epoch: 1 | total time: 12.67m | eta: 168.0m +step 01182/16704 (7.08%) | loss: 3.029040 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 1 | total time: 12.68m | eta: 167.9m +step 01183/16704 (7.08%) | loss: 3.019476 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,434 | mfu: 50.72 | epoch: 1 | total time: 12.69m | eta: 167.9m +step 01184/16704 (7.09%) | loss: 3.028591 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,200 | mfu: 50.64 | epoch: 1 | total time: 12.70m | eta: 167.9m +step 01185/16704 (7.09%) | loss: 3.019836 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,762 | mfu: 50.67 | epoch: 1 | total time: 12.71m | eta: 167.9m +step 01186/16704 (7.10%) | loss: 3.016321 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,593 | mfu: 50.73 | epoch: 1 | total time: 12.72m | eta: 167.9m +step 01187/16704 (7.11%) | loss: 3.019861 | lrm: 1.00 | dt: 650.82ms | tok/sec: 805,579 | mfu: 50.35 | epoch: 1 | total time: 12.73m | eta: 167.9m +step 01188/16704 (7.11%) | loss: 3.025709 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,106 | mfu: 50.70 | epoch: 1 | total time: 12.75m | eta: 167.9m +step 01189/16704 (7.12%) | loss: 3.033038 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,099 | mfu: 50.69 | epoch: 1 | total time: 12.76m | eta: 167.9m +step 01190/16704 (7.12%) | loss: 3.031310 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,148 | mfu: 50.76 | epoch: 1 | total time: 12.77m | eta: 167.9m +step 01191/16704 (7.13%) | loss: 3.033268 | lrm: 1.00 | dt: 646.63ms | tok/sec: 810,801 | mfu: 50.68 | epoch: 1 | total time: 12.78m | eta: 167.8m +step 01192/16704 (7.14%) | loss: 3.031223 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,862 | mfu: 50.74 | epoch: 1 | total time: 12.79m | eta: 167.8m +step 01193/16704 (7.14%) | loss: 3.027770 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,174 | mfu: 50.57 | epoch: 1 | total time: 12.80m | 
eta: 167.8m +step 01194/16704 (7.15%) | loss: 3.025135 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,977 | mfu: 50.75 | epoch: 1 | total time: 12.81m | eta: 167.8m +step 01195/16704 (7.15%) | loss: 3.025654 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,698 | mfu: 50.73 | epoch: 1 | total time: 12.82m | eta: 167.8m +step 01196/16704 (7.16%) | loss: 3.016642 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,478 | mfu: 50.78 | epoch: 1 | total time: 12.83m | eta: 167.8m +step 01197/16704 (7.17%) | loss: 3.011608 | lrm: 1.00 | dt: 647.60ms | tok/sec: 809,585 | mfu: 50.60 | epoch: 1 | total time: 12.84m | eta: 167.8m +step 01198/16704 (7.17%) | loss: 3.017455 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,896 | mfu: 50.81 | epoch: 1 | total time: 12.85m | eta: 167.8m +step 01199/16704 (7.18%) | loss: 3.020622 | lrm: 1.00 | dt: 651.04ms | tok/sec: 805,314 | mfu: 50.33 | epoch: 1 | total time: 12.86m | eta: 167.8m +step 01200/16704 (7.18%) | loss: 3.022189 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,215 | mfu: 50.70 | epoch: 1 | total time: 12.87m | eta: 167.7m +step 01201/16704 (7.19%) | loss: 3.018394 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 1 | total time: 12.89m | eta: 167.7m +step 01202/16704 (7.20%) | loss: 3.020632 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,079 | mfu: 50.69 | epoch: 1 | total time: 12.90m | eta: 167.7m +step 01203/16704 (7.20%) | loss: 3.038119 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,903 | mfu: 50.68 | epoch: 1 | total time: 12.91m | eta: 167.7m +step 01204/16704 (7.21%) | loss: 3.027926 | lrm: 1.00 | dt: 650.13ms | tok/sec: 806,434 | mfu: 50.40 | epoch: 1 | total time: 12.92m | eta: 167.7m +step 01205/16704 (7.21%) | loss: 3.027183 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,305 | mfu: 50.77 | epoch: 1 | total time: 12.93m | eta: 167.7m +step 01206/16704 (7.22%) | loss: 3.027967 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,278 | mfu: 50.64 | epoch: 1 | total time: 12.94m | eta: 167.7m +step 01207/16704 (7.23%) | loss: 3.027116 | lrm: 1.00 | dt: 
647.51ms | tok/sec: 809,692 | mfu: 50.61 | epoch: 1 | total time: 12.95m | eta: 167.7m +step 01208/16704 (7.23%) | loss: 3.023947 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,221 | mfu: 50.58 | epoch: 1 | total time: 12.96m | eta: 167.7m +step 01209/16704 (7.24%) | loss: 3.026224 | lrm: 1.00 | dt: 649.32ms | tok/sec: 807,443 | mfu: 50.47 | epoch: 1 | total time: 12.97m | eta: 167.6m +step 01210/16704 (7.24%) | loss: 3.027086 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 1 | total time: 12.98m | eta: 167.6m +step 01211/16704 (7.25%) | loss: 3.024804 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,899 | mfu: 50.62 | epoch: 1 | total time: 12.99m | eta: 167.6m +step 01212/16704 (7.26%) | loss: 3.030271 | lrm: 1.00 | dt: 649.31ms | tok/sec: 807,455 | mfu: 50.47 | epoch: 1 | total time: 13.00m | eta: 167.6m +step 01213/16704 (7.26%) | loss: 3.037971 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,409 | mfu: 50.65 | epoch: 1 | total time: 13.02m | eta: 167.6m +step 01214/16704 (7.27%) | loss: 3.041086 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,867 | mfu: 50.68 | epoch: 1 | total time: 13.03m | eta: 167.6m +step 01215/16704 (7.27%) | loss: 3.043466 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,758 | mfu: 50.67 | epoch: 1 | total time: 13.04m | eta: 167.6m +step 01216/16704 (7.28%) | loss: 3.051120 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 1 | total time: 13.05m | eta: 167.6m +step 01217/16704 (7.29%) | loss: 3.059467 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,860 | mfu: 50.62 | epoch: 1 | total time: 13.06m | eta: 167.5m +step 01218/16704 (7.29%) | loss: 3.056532 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,721 | mfu: 50.67 | epoch: 1 | total time: 13.07m | eta: 167.5m +step 01219/16704 (7.30%) | loss: 3.057045 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,669 | mfu: 50.79 | epoch: 1 | total time: 13.08m | eta: 167.5m +step 01220/16704 (7.30%) | loss: 3.052486 | lrm: 1.00 | dt: 647.59ms | tok/sec: 809,599 | mfu: 50.60 | epoch: 1 | total time: 13.09m | 
eta: 167.5m +step 01221/16704 (7.31%) | loss: 3.040817 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,569 | mfu: 50.72 | epoch: 1 | total time: 13.10m | eta: 167.5m +step 01222/16704 (7.32%) | loss: 3.040688 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,355 | mfu: 50.71 | epoch: 1 | total time: 13.11m | eta: 167.5m +step 01223/16704 (7.32%) | loss: 3.033974 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 1 | total time: 13.12m | eta: 167.5m +step 01224/16704 (7.33%) | loss: 3.024822 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,787 | mfu: 50.74 | epoch: 1 | total time: 13.13m | eta: 167.5m +step 01225/16704 (7.33%) | loss: 3.016369 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,765 | mfu: 50.61 | epoch: 1 | total time: 13.14m | eta: 167.5m +step 01226/16704 (7.34%) | loss: 3.025699 | lrm: 1.00 | dt: 648.94ms | tok/sec: 807,918 | mfu: 50.50 | epoch: 1 | total time: 13.16m | eta: 167.4m +step 01227/16704 (7.35%) | loss: 3.005559 | lrm: 1.00 | dt: 642.10ms | tok/sec: 816,526 | mfu: 51.03 | epoch: 1 | total time: 13.17m | eta: 167.4m +step 01228/16704 (7.35%) | loss: 3.002096 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,519 | mfu: 50.60 | epoch: 1 | total time: 13.18m | eta: 167.4m +step 01229/16704 (7.36%) | loss: 2.982067 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,111 | mfu: 50.63 | epoch: 1 | total time: 13.19m | eta: 167.4m +step 01230/16704 (7.36%) | loss: 2.982180 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 1 | total time: 13.20m | eta: 167.4m +step 01231/16704 (7.37%) | loss: 2.986668 | lrm: 1.00 | dt: 648.84ms | tok/sec: 808,032 | mfu: 50.50 | epoch: 1 | total time: 13.21m | eta: 167.4m +step 01232/16704 (7.38%) | loss: 2.991666 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 1 | total time: 13.22m | eta: 167.4m +step 01233/16704 (7.38%) | loss: 2.995028 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,943 | mfu: 50.69 | epoch: 1 | total time: 13.23m | eta: 167.4m +step 01234/16704 (7.39%) | loss: 2.993938 | lrm: 1.00 | dt: 
647.29ms | tok/sec: 809,970 | mfu: 50.62 | epoch: 1 | total time: 13.24m | eta: 167.4m +step 01235/16704 (7.39%) | loss: 3.000818 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,758 | mfu: 50.80 | epoch: 1 | total time: 13.25m | eta: 167.3m +step 01236/16704 (7.40%) | loss: 3.016748 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,405 | mfu: 50.78 | epoch: 1 | total time: 13.26m | eta: 167.3m +step 01237/16704 (7.41%) | loss: 3.000863 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,889 | mfu: 50.74 | epoch: 1 | total time: 13.27m | eta: 167.3m +step 01238/16704 (7.41%) | loss: 3.009982 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,120 | mfu: 50.82 | epoch: 1 | total time: 13.28m | eta: 167.3m +step 01239/16704 (7.42%) | loss: 3.011389 | lrm: 1.00 | dt: 652.26ms | tok/sec: 803,802 | mfu: 50.24 | epoch: 1 | total time: 13.30m | eta: 167.3m +step 01240/16704 (7.42%) | loss: 3.028206 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,340 | mfu: 50.83 | epoch: 1 | total time: 13.31m | eta: 167.3m +step 01241/16704 (7.43%) | loss: 3.028673 | lrm: 1.00 | dt: 648.89ms | tok/sec: 807,980 | mfu: 50.50 | epoch: 1 | total time: 13.32m | eta: 167.3m +step 01242/16704 (7.44%) | loss: 3.031016 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,532 | mfu: 50.66 | epoch: 1 | total time: 13.33m | eta: 167.3m +step 01243/16704 (7.44%) | loss: 3.025299 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,979 | mfu: 50.62 | epoch: 1 | total time: 13.34m | eta: 167.3m +step 01244/16704 (7.45%) | loss: 3.022364 | lrm: 1.00 | dt: 646.97ms | tok/sec: 810,370 | mfu: 50.65 | epoch: 1 | total time: 13.35m | eta: 167.2m +step 01245/16704 (7.45%) | loss: 3.017450 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 1 | total time: 13.36m | eta: 167.2m +step 01246/16704 (7.46%) | loss: 3.009631 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,538 | mfu: 50.60 | epoch: 1 | total time: 13.37m | eta: 167.2m +step 01247/16704 (7.47%) | loss: 3.000457 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,656 | mfu: 50.67 | epoch: 1 | total time: 13.38m | 
eta: 167.2m +step 01248/16704 (7.47%) | loss: 2.995424 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,698 | mfu: 50.79 | epoch: 1 | total time: 13.39m | eta: 167.2m +step 01249/16704 (7.48%) | loss: 2.994963 | lrm: 1.00 | dt: 649.12ms | tok/sec: 807,695 | mfu: 50.48 | epoch: 1 | total time: 13.40m | eta: 167.2m +Step 01250 | Validation bpb: 0.913046 +step 01250/16704 (7.48%) | loss: 3.003997 | lrm: 1.00 | dt: 649.48ms | tok/sec: 807,240 | mfu: 50.45 | epoch: 1 | total time: 13.41m | eta: 167.2m +step 01251/16704 (7.49%) | loss: 3.003086 | lrm: 1.00 | dt: 648.72ms | tok/sec: 808,187 | mfu: 50.51 | epoch: 1 | total time: 13.42m | eta: 167.2m +step 01252/16704 (7.50%) | loss: 2.998515 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,978 | mfu: 50.81 | epoch: 1 | total time: 13.44m | eta: 167.2m +step 01253/16704 (7.50%) | loss: 3.005587 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,272 | mfu: 50.58 | epoch: 1 | total time: 13.45m | eta: 167.1m +step 01254/16704 (7.51%) | loss: 3.011039 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,609 | mfu: 50.79 | epoch: 1 | total time: 13.46m | eta: 167.1m +step 01255/16704 (7.51%) | loss: 3.013805 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,821 | mfu: 50.93 | epoch: 1 | total time: 13.47m | eta: 167.1m +step 01256/16704 (7.52%) | loss: 3.017225 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 1 | total time: 13.48m | eta: 167.1m +step 01257/16704 (7.53%) | loss: 3.009747 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,755 | mfu: 50.61 | epoch: 1 | total time: 13.49m | eta: 167.1m +step 01258/16704 (7.53%) | loss: 2.998695 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,636 | mfu: 50.92 | epoch: 1 | total time: 13.50m | eta: 167.1m +step 01259/16704 (7.54%) | loss: 3.005548 | lrm: 1.00 | dt: 649.68ms | tok/sec: 806,994 | mfu: 50.44 | epoch: 1 | total time: 13.51m | eta: 167.1m +step 01260/16704 (7.54%) | loss: 2.998192 | lrm: 1.00 | dt: 646.95ms | tok/sec: 810,397 | mfu: 50.65 | epoch: 1 | total time: 13.52m | eta: 167.1m +step 01261/16704 
(7.55%) | loss: 2.995017 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,046 | mfu: 50.82 | epoch: 1 | total time: 13.53m | eta: 167.0m +step 01262/16704 (7.56%) | loss: 2.995588 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,095 | mfu: 50.63 | epoch: 1 | total time: 13.54m | eta: 167.0m +step 01263/16704 (7.56%) | loss: 2.994692 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 1 | total time: 13.55m | eta: 167.0m +step 01264/16704 (7.57%) | loss: 3.005313 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | total time: 13.56m | eta: 167.0m +step 01265/16704 (7.57%) | loss: 3.004710 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 1 | total time: 13.58m | eta: 167.0m +step 01266/16704 (7.58%) | loss: 2.997187 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,464 | mfu: 50.59 | epoch: 1 | total time: 13.59m | eta: 167.0m +step 01267/16704 (7.59%) | loss: 2.998195 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,218 | mfu: 50.70 | epoch: 1 | total time: 13.60m | eta: 167.0m +step 01268/16704 (7.59%) | loss: 2.991146 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,959 | mfu: 50.94 | epoch: 1 | total time: 13.61m | eta: 167.0m +step 01269/16704 (7.60%) | loss: 2.988283 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 1 | total time: 13.62m | eta: 167.0m +step 01270/16704 (7.60%) | loss: 2.986699 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,918 | mfu: 50.75 | epoch: 1 | total time: 13.63m | eta: 166.9m +step 01271/16704 (7.61%) | loss: 2.990545 | lrm: 1.00 | dt: 647.43ms | tok/sec: 809,799 | mfu: 50.61 | epoch: 1 | total time: 13.64m | eta: 166.9m +step 01272/16704 (7.61%) | loss: 2.986039 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,653 | mfu: 50.73 | epoch: 1 | total time: 13.65m | eta: 166.9m +step 01273/16704 (7.62%) | loss: 2.989622 | lrm: 1.00 | dt: 649.57ms | tok/sec: 807,124 | mfu: 50.45 | epoch: 1 | total time: 13.66m | eta: 166.9m +step 01274/16704 (7.63%) | loss: 2.983983 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,428 | 
mfu: 50.78 | epoch: 1 | total time: 13.67m | eta: 166.9m +step 01275/16704 (7.63%) | loss: 2.987357 | lrm: 1.00 | dt: 648.93ms | tok/sec: 807,927 | mfu: 50.50 | epoch: 1 | total time: 13.68m | eta: 166.9m +step 01276/16704 (7.64%) | loss: 2.988756 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,263 | mfu: 50.71 | epoch: 1 | total time: 13.69m | eta: 166.9m +step 01277/16704 (7.64%) | loss: 2.981047 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,523 | mfu: 50.85 | epoch: 1 | total time: 13.70m | eta: 166.9m +step 01278/16704 (7.65%) | loss: 2.983741 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,490 | mfu: 50.72 | epoch: 1 | total time: 13.72m | eta: 166.9m +step 01279/16704 (7.66%) | loss: 2.982173 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,717 | mfu: 50.61 | epoch: 1 | total time: 13.73m | eta: 166.8m +step 01280/16704 (7.66%) | loss: 2.973643 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,034 | mfu: 50.82 | epoch: 1 | total time: 13.74m | eta: 166.8m +step 01281/16704 (7.67%) | loss: 2.982728 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,473 | mfu: 50.66 | epoch: 1 | total time: 13.75m | eta: 166.8m +step 01282/16704 (7.67%) | loss: 2.981756 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 1 | total time: 13.76m | eta: 166.8m +step 01283/16704 (7.68%) | loss: 2.981481 | lrm: 1.00 | dt: 648.58ms | tok/sec: 808,362 | mfu: 50.52 | epoch: 1 | total time: 13.77m | eta: 166.8m +step 01284/16704 (7.69%) | loss: 2.974497 | lrm: 1.00 | dt: 648.74ms | tok/sec: 808,160 | mfu: 50.51 | epoch: 1 | total time: 13.78m | eta: 166.8m +step 01285/16704 (7.69%) | loss: 2.978417 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,779 | mfu: 50.80 | epoch: 1 | total time: 13.79m | eta: 166.8m +step 01286/16704 (7.70%) | loss: 2.976813 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,465 | mfu: 50.78 | epoch: 1 | total time: 13.80m | eta: 166.8m +step 01287/16704 (7.70%) | loss: 2.996239 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,467 | mfu: 50.66 | epoch: 1 | total time: 13.81m | eta: 166.8m +step 01288/16704 
(7.71%) | loss: 2.999752 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,849 | mfu: 50.68 | epoch: 1 | total time: 13.82m | eta: 166.7m +step 01289/16704 (7.72%) | loss: 3.005820 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,834 | mfu: 50.68 | epoch: 1 | total time: 13.83m | eta: 166.7m +step 01290/16704 (7.72%) | loss: 3.000060 | lrm: 1.00 | dt: 648.75ms | tok/sec: 808,157 | mfu: 50.51 | epoch: 1 | total time: 13.84m | eta: 166.7m +step 01291/16704 (7.73%) | loss: 2.990477 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,311 | mfu: 50.77 | epoch: 1 | total time: 13.86m | eta: 166.7m +step 01292/16704 (7.73%) | loss: 2.990234 | lrm: 1.00 | dt: 647.45ms | tok/sec: 809,768 | mfu: 50.61 | epoch: 1 | total time: 13.87m | eta: 166.7m +step 01293/16704 (7.74%) | loss: 2.993053 | lrm: 1.00 | dt: 649.04ms | tok/sec: 807,789 | mfu: 50.49 | epoch: 1 | total time: 13.88m | eta: 166.7m +step 01294/16704 (7.75%) | loss: 2.995474 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,691 | mfu: 50.79 | epoch: 1 | total time: 13.89m | eta: 166.7m +step 01295/16704 (7.75%) | loss: 2.990916 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,276 | mfu: 50.58 | epoch: 1 | total time: 13.90m | eta: 166.7m +step 01296/16704 (7.76%) | loss: 2.984314 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,501 | mfu: 50.78 | epoch: 1 | total time: 13.91m | eta: 166.7m +step 01297/16704 (7.76%) | loss: 2.999886 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,896 | mfu: 50.87 | epoch: 1 | total time: 13.92m | eta: 166.6m +step 01298/16704 (7.77%) | loss: 2.987460 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 1 | total time: 13.93m | eta: 166.6m +step 01299/16704 (7.78%) | loss: 2.986664 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,226 | mfu: 50.58 | epoch: 1 | total time: 13.94m | eta: 166.6m +step 01300/16704 (7.78%) | loss: 2.983496 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,960 | mfu: 50.75 | epoch: 1 | total time: 13.95m | eta: 166.6m +step 01301/16704 (7.79%) | loss: 2.982338 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,495 | 
mfu: 50.72 | epoch: 1 | total time: 13.96m | eta: 166.6m +step 01302/16704 (7.79%) | loss: 2.997444 | lrm: 1.00 | dt: 647.39ms | tok/sec: 809,852 | mfu: 50.62 | epoch: 1 | total time: 13.97m | eta: 166.6m +step 01303/16704 (7.80%) | loss: 3.005366 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,272 | mfu: 50.71 | epoch: 1 | total time: 13.98m | eta: 166.6m +step 01304/16704 (7.81%) | loss: 3.005475 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 1 | total time: 14.00m | eta: 166.6m +step 01305/16704 (7.81%) | loss: 3.003714 | lrm: 1.00 | dt: 647.03ms | tok/sec: 810,295 | mfu: 50.64 | epoch: 1 | total time: 14.01m | eta: 166.6m +step 01306/16704 (7.82%) | loss: 2.998787 | lrm: 1.00 | dt: 648.39ms | tok/sec: 808,598 | mfu: 50.54 | epoch: 1 | total time: 14.02m | eta: 166.5m +step 01307/16704 (7.82%) | loss: 3.004938 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,216 | mfu: 50.76 | epoch: 1 | total time: 14.03m | eta: 166.5m +step 01308/16704 (7.83%) | loss: 3.007690 | lrm: 1.00 | dt: 647.56ms | tok/sec: 809,633 | mfu: 50.60 | epoch: 1 | total time: 14.04m | eta: 166.5m +step 01309/16704 (7.84%) | loss: 3.017711 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,127 | mfu: 50.82 | epoch: 1 | total time: 14.05m | eta: 166.5m +step 01310/16704 (7.84%) | loss: 3.015888 | lrm: 1.00 | dt: 649.30ms | tok/sec: 807,463 | mfu: 50.47 | epoch: 1 | total time: 14.06m | eta: 166.5m +step 01311/16704 (7.85%) | loss: 3.011684 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,853 | mfu: 50.74 | epoch: 1 | total time: 14.07m | eta: 166.5m +step 01312/16704 (7.85%) | loss: 3.021346 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,744 | mfu: 50.74 | epoch: 1 | total time: 14.08m | eta: 166.5m +step 01313/16704 (7.86%) | loss: 3.018352 | lrm: 1.00 | dt: 649.66ms | tok/sec: 807,014 | mfu: 50.44 | epoch: 1 | total time: 14.09m | eta: 166.5m +step 01314/16704 (7.87%) | loss: 3.009714 | lrm: 1.00 | dt: 649.36ms | tok/sec: 807,393 | mfu: 50.46 | epoch: 1 | total time: 14.10m | eta: 166.5m +step 01315/16704 
(7.87%) | loss: 3.002607 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,121 | mfu: 50.76 | epoch: 1 | total time: 14.11m | eta: 166.4m +step 01316/16704 (7.88%) | loss: 3.010574 | lrm: 1.00 | dt: 649.44ms | tok/sec: 807,290 | mfu: 50.46 | epoch: 1 | total time: 14.13m | eta: 166.4m +step 01317/16704 (7.88%) | loss: 3.005207 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 1 | total time: 14.14m | eta: 166.4m +step 01318/16704 (7.89%) | loss: 2.992765 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,695 | mfu: 50.67 | epoch: 1 | total time: 14.15m | eta: 166.4m +step 01319/16704 (7.90%) | loss: 2.994952 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,355 | mfu: 50.71 | epoch: 1 | total time: 14.16m | eta: 166.4m +step 01320/16704 (7.90%) | loss: 2.996851 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,237 | mfu: 50.83 | epoch: 1 | total time: 14.17m | eta: 166.4m +step 01321/16704 (7.91%) | loss: 2.983835 | lrm: 1.00 | dt: 650.51ms | tok/sec: 805,964 | mfu: 50.37 | epoch: 1 | total time: 14.18m | eta: 166.4m +step 01322/16704 (7.91%) | loss: 2.984659 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,534 | mfu: 50.72 | epoch: 1 | total time: 14.19m | eta: 166.4m +step 01323/16704 (7.92%) | loss: 2.987255 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,057 | mfu: 50.63 | epoch: 1 | total time: 14.20m | eta: 166.3m +step 01324/16704 (7.93%) | loss: 2.981182 | lrm: 1.00 | dt: 650.43ms | tok/sec: 806,057 | mfu: 50.38 | epoch: 1 | total time: 14.21m | eta: 166.3m +step 01325/16704 (7.93%) | loss: 2.985747 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,585 | mfu: 50.79 | epoch: 1 | total time: 14.22m | eta: 166.3m +step 01326/16704 (7.94%) | loss: 2.986833 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,409 | mfu: 50.78 | epoch: 1 | total time: 14.23m | eta: 166.3m +step 01327/16704 (7.94%) | loss: 2.986224 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,898 | mfu: 50.62 | epoch: 1 | total time: 14.24m | eta: 166.3m +step 01328/16704 (7.95%) | loss: 2.989152 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,378 | 
mfu: 50.84 | epoch: 1 | total time: 14.25m | eta: 166.3m +step 01329/16704 (7.96%) | loss: 2.988412 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,511 | mfu: 50.66 | epoch: 1 | total time: 14.27m | eta: 166.3m +step 01330/16704 (7.96%) | loss: 2.995993 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,040 | mfu: 50.69 | epoch: 1 | total time: 14.28m | eta: 166.3m +step 01331/16704 (7.97%) | loss: 2.988853 | lrm: 1.00 | dt: 647.26ms | tok/sec: 810,010 | mfu: 50.63 | epoch: 1 | total time: 14.29m | eta: 166.3m +step 01332/16704 (7.97%) | loss: 2.977629 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,948 | mfu: 50.75 | epoch: 1 | total time: 14.30m | eta: 166.2m +step 01333/16704 (7.98%) | loss: 2.959743 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,883 | mfu: 50.68 | epoch: 1 | total time: 14.31m | eta: 166.2m +step 01334/16704 (7.99%) | loss: 2.945644 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,607 | mfu: 50.66 | epoch: 1 | total time: 14.32m | eta: 166.2m +step 01335/16704 (7.99%) | loss: 2.959180 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,736 | mfu: 50.80 | epoch: 1 | total time: 14.33m | eta: 166.2m +step 01336/16704 (8.00%) | loss: 2.962558 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,466 | mfu: 50.72 | epoch: 1 | total time: 14.34m | eta: 166.2m +step 01337/16704 (8.00%) | loss: 2.961189 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,145 | mfu: 50.76 | epoch: 1 | total time: 14.35m | eta: 166.2m +step 01338/16704 (8.01%) | loss: 2.958004 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,732 | mfu: 50.67 | epoch: 1 | total time: 14.36m | eta: 166.2m +step 01339/16704 (8.02%) | loss: 2.963342 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,242 | mfu: 50.83 | epoch: 1 | total time: 14.37m | eta: 166.2m +step 01340/16704 (8.02%) | loss: 2.961221 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,501 | mfu: 50.66 | epoch: 1 | total time: 14.38m | eta: 166.2m +step 01341/16704 (8.03%) | loss: 2.965275 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,453 | mfu: 50.72 | epoch: 1 | total time: 14.39m | eta: 166.1m +step 01342/16704 
(8.03%) | loss: 2.961440 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,520 | mfu: 50.60 | epoch: 1 | total time: 14.41m | eta: 166.1m +step 01343/16704 (8.04%) | loss: 2.952033 | lrm: 1.00 | dt: 647.39ms | tok/sec: 809,853 | mfu: 50.62 | epoch: 1 | total time: 14.42m | eta: 166.1m +step 01344/16704 (8.05%) | loss: 2.958340 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 1 | total time: 14.43m | eta: 166.1m +step 01345/16704 (8.05%) | loss: 2.964279 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,099 | mfu: 50.57 | epoch: 1 | total time: 14.44m | eta: 166.1m +step 01346/16704 (8.06%) | loss: 2.979842 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 1 | total time: 14.45m | eta: 166.1m +step 01347/16704 (8.06%) | loss: 2.986781 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,358 | mfu: 50.84 | epoch: 1 | total time: 14.46m | eta: 166.1m +step 01348/16704 (8.07%) | loss: 2.985790 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,591 | mfu: 50.66 | epoch: 1 | total time: 14.47m | eta: 166.1m +step 01349/16704 (8.08%) | loss: 3.002086 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,472 | mfu: 50.72 | epoch: 1 | total time: 14.48m | eta: 166.1m +step 01350/16704 (8.08%) | loss: 3.011421 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,461 | mfu: 50.59 | epoch: 1 | total time: 14.49m | eta: 166.0m +step 01351/16704 (8.09%) | loss: 3.013715 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 1 | total time: 14.50m | eta: 166.0m +step 01352/16704 (8.09%) | loss: 3.004280 | lrm: 1.00 | dt: 649.02ms | tok/sec: 807,813 | mfu: 50.49 | epoch: 1 | total time: 14.51m | eta: 166.0m +step 01353/16704 (8.10%) | loss: 2.996151 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,650 | mfu: 50.73 | epoch: 1 | total time: 14.52m | eta: 166.0m +step 01354/16704 (8.11%) | loss: 2.990812 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,292 | mfu: 50.83 | epoch: 1 | total time: 14.53m | eta: 166.0m +step 01355/16704 (8.11%) | loss: 2.996706 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,014 | 
mfu: 50.75 | epoch: 1 | total time: 14.55m | eta: 166.0m +step 01356/16704 (8.12%) | loss: 2.977712 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,141 | mfu: 50.70 | epoch: 1 | total time: 14.56m | eta: 166.0m +step 01357/16704 (8.12%) | loss: 2.980825 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,813 | mfu: 50.68 | epoch: 1 | total time: 14.57m | eta: 166.0m +step 01358/16704 (8.13%) | loss: 2.969919 | lrm: 1.00 | dt: 648.06ms | tok/sec: 809,011 | mfu: 50.56 | epoch: 1 | total time: 14.58m | eta: 166.0m +step 01359/16704 (8.14%) | loss: 2.970126 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,464 | mfu: 50.66 | epoch: 1 | total time: 14.59m | eta: 165.9m +step 01360/16704 (8.14%) | loss: 2.976982 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,229 | mfu: 50.83 | epoch: 1 | total time: 14.60m | eta: 165.9m +step 01361/16704 (8.15%) | loss: 2.981308 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,856 | mfu: 50.62 | epoch: 1 | total time: 14.61m | eta: 165.9m +step 01362/16704 (8.15%) | loss: 2.976187 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 1 | total time: 14.62m | eta: 165.9m +step 01363/16704 (8.16%) | loss: 2.978711 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,224 | mfu: 50.58 | epoch: 1 | total time: 14.63m | eta: 165.9m +step 01364/16704 (8.17%) | loss: 2.975866 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,068 | mfu: 50.76 | epoch: 1 | total time: 14.64m | eta: 165.9m +step 01365/16704 (8.17%) | loss: 2.973873 | lrm: 1.00 | dt: 649.37ms | tok/sec: 807,374 | mfu: 50.46 | epoch: 1 | total time: 14.65m | eta: 165.9m +step 01366/16704 (8.18%) | loss: 2.971268 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,636 | mfu: 50.85 | epoch: 1 | total time: 14.66m | eta: 165.9m +step 01367/16704 (8.18%) | loss: 2.982040 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,291 | mfu: 50.77 | epoch: 1 | total time: 14.67m | eta: 165.9m +step 01368/16704 (8.19%) | loss: 2.986604 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,398 | mfu: 50.71 | epoch: 1 | total time: 14.69m | eta: 165.8m +step 01369/16704 
(8.20%) | loss: 2.976211 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,892 | mfu: 50.68 | epoch: 1 | total time: 14.70m | eta: 165.8m +step 01370/16704 (8.20%) | loss: 2.983883 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,745 | mfu: 50.80 | epoch: 1 | total time: 14.71m | eta: 165.8m +step 01371/16704 (8.21%) | loss: 2.991426 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,773 | mfu: 50.74 | epoch: 1 | total time: 14.72m | eta: 165.8m +step 01372/16704 (8.21%) | loss: 3.004822 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,618 | mfu: 50.91 | epoch: 1 | total time: 14.73m | eta: 165.8m +step 01373/16704 (8.22%) | loss: 3.007611 | lrm: 1.00 | dt: 646.63ms | tok/sec: 810,805 | mfu: 50.68 | epoch: 1 | total time: 14.74m | eta: 165.8m +step 01374/16704 (8.23%) | loss: 3.009089 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,438 | mfu: 50.90 | epoch: 1 | total time: 14.75m | eta: 165.8m +step 01375/16704 (8.23%) | loss: 3.012723 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,469 | mfu: 50.84 | epoch: 1 | total time: 14.76m | eta: 165.8m +step 01376/16704 (8.24%) | loss: 2.996766 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,895 | mfu: 50.74 | epoch: 1 | total time: 14.77m | eta: 165.7m +step 01377/16704 (8.24%) | loss: 3.006297 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,508 | mfu: 50.72 | epoch: 1 | total time: 14.78m | eta: 165.7m +step 01378/16704 (8.25%) | loss: 3.004719 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,460 | mfu: 50.90 | epoch: 1 | total time: 14.79m | eta: 165.7m +step 01379/16704 (8.26%) | loss: 3.012336 | lrm: 1.00 | dt: 647.94ms | tok/sec: 809,165 | mfu: 50.57 | epoch: 1 | total time: 14.80m | eta: 165.7m +step 01380/16704 (8.26%) | loss: 3.011223 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,562 | mfu: 50.91 | epoch: 1 | total time: 14.81m | eta: 165.7m +step 01381/16704 (8.27%) | loss: 3.006356 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,006 | mfu: 50.75 | epoch: 1 | total time: 14.83m | eta: 165.7m +step 01382/16704 (8.27%) | loss: 3.012728 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,297 | 
mfu: 50.58 | epoch: 1 | total time: 14.84m | eta: 165.7m +step 01383/16704 (8.28%) | loss: 3.008346 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,217 | mfu: 50.89 | epoch: 1 | total time: 14.85m | eta: 165.7m +step 01384/16704 (8.29%) | loss: 3.004780 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,440 | mfu: 50.72 | epoch: 1 | total time: 14.86m | eta: 165.7m +step 01385/16704 (8.29%) | loss: 2.984800 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,673 | mfu: 50.67 | epoch: 1 | total time: 14.87m | eta: 165.6m +step 01386/16704 (8.30%) | loss: 2.992119 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,458 | mfu: 50.84 | epoch: 1 | total time: 14.88m | eta: 165.6m +step 01387/16704 (8.30%) | loss: 2.989034 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,054 | mfu: 50.75 | epoch: 1 | total time: 14.89m | eta: 165.6m +step 01388/16704 (8.31%) | loss: 3.004535 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,150 | mfu: 50.82 | epoch: 1 | total time: 14.90m | eta: 165.6m +step 01389/16704 (8.32%) | loss: 3.013589 | lrm: 1.00 | dt: 647.44ms | tok/sec: 809,781 | mfu: 50.61 | epoch: 1 | total time: 14.91m | eta: 165.6m +step 01390/16704 (8.32%) | loss: 3.002192 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,536 | mfu: 50.72 | epoch: 1 | total time: 14.92m | eta: 165.6m +step 01391/16704 (8.33%) | loss: 3.016400 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,222 | mfu: 50.83 | epoch: 1 | total time: 14.93m | eta: 165.6m +step 01392/16704 (8.33%) | loss: 3.021701 | lrm: 1.00 | dt: 647.76ms | tok/sec: 809,381 | mfu: 50.59 | epoch: 1 | total time: 14.94m | eta: 165.6m +step 01393/16704 (8.34%) | loss: 3.021782 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,202 | mfu: 50.89 | epoch: 1 | total time: 14.95m | eta: 165.6m +step 01394/16704 (8.35%) | loss: 3.018647 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,762 | mfu: 50.74 | epoch: 1 | total time: 14.96m | eta: 165.5m +step 01395/16704 (8.35%) | loss: 3.017958 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,630 | mfu: 50.79 | epoch: 1 | total time: 14.98m | eta: 165.5m +step 01396/16704 
(8.36%) | loss: 3.019164 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,007 | mfu: 50.81 | epoch: 1 | total time: 14.99m | eta: 165.5m +step 01397/16704 (8.36%) | loss: 3.007398 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.85 | epoch: 1 | total time: 15.00m | eta: 165.5m +step 01398/16704 (8.37%) | loss: 3.008218 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,050 | mfu: 50.75 | epoch: 1 | total time: 15.01m | eta: 165.5m +step 01399/16704 (8.38%) | loss: 3.006252 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,262 | mfu: 50.77 | epoch: 1 | total time: 15.02m | eta: 165.5m +step 01400/16704 (8.38%) | loss: 3.005115 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,749 | mfu: 50.67 | epoch: 1 | total time: 15.03m | eta: 165.5m +step 01401/16704 (8.39%) | loss: 3.006046 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,789 | mfu: 50.74 | epoch: 1 | total time: 15.04m | eta: 165.5m +step 01402/16704 (8.39%) | loss: 3.008032 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 1 | total time: 15.05m | eta: 165.5m +step 01403/16704 (8.40%) | loss: 3.010856 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,820 | mfu: 50.80 | epoch: 1 | total time: 15.06m | eta: 165.4m +step 01404/16704 (8.41%) | loss: 2.997199 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,269 | mfu: 50.64 | epoch: 1 | total time: 15.07m | eta: 165.4m +step 01405/16704 (8.41%) | loss: 2.987447 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,800 | mfu: 50.99 | epoch: 1 | total time: 15.08m | eta: 165.4m +step 01406/16704 (8.42%) | loss: 2.992154 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,464 | mfu: 50.59 | epoch: 1 | total time: 15.09m | eta: 165.4m +step 01407/16704 (8.42%) | loss: 2.990254 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,605 | mfu: 50.79 | epoch: 1 | total time: 15.10m | eta: 165.4m +step 01408/16704 (8.43%) | loss: 2.994390 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,606 | mfu: 50.66 | epoch: 1 | total time: 15.12m | eta: 165.4m +step 01409/16704 (8.44%) | loss: 2.997823 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,244 | 
mfu: 51.02 | epoch: 1 | total time: 15.13m | eta: 165.4m +step 01410/16704 (8.44%) | loss: 2.996732 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,806 | mfu: 50.80 | epoch: 1 | total time: 15.14m | eta: 165.4m +step 01411/16704 (8.45%) | loss: 2.991978 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,750 | mfu: 50.80 | epoch: 1 | total time: 15.15m | eta: 165.3m +step 01412/16704 (8.45%) | loss: 2.991392 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,177 | mfu: 50.70 | epoch: 1 | total time: 15.16m | eta: 165.3m +step 01413/16704 (8.46%) | loss: 2.978981 | lrm: 1.00 | dt: 647.21ms | tok/sec: 810,075 | mfu: 50.63 | epoch: 1 | total time: 15.17m | eta: 165.3m +step 01414/16704 (8.47%) | loss: 2.988397 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,485 | mfu: 50.72 | epoch: 1 | total time: 15.18m | eta: 165.3m +step 01415/16704 (8.47%) | loss: 2.980233 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,936 | mfu: 50.87 | epoch: 1 | total time: 15.19m | eta: 165.3m +step 01416/16704 (8.48%) | loss: 2.977562 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,216 | mfu: 50.70 | epoch: 1 | total time: 15.20m | eta: 165.3m +step 01417/16704 (8.48%) | loss: 2.987112 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 1 | total time: 15.21m | eta: 165.3m +step 01418/16704 (8.49%) | loss: 2.978412 | lrm: 1.00 | dt: 648.20ms | tok/sec: 808,833 | mfu: 50.55 | epoch: 1 | total time: 15.22m | eta: 165.3m +step 01419/16704 (8.49%) | loss: 2.972472 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,231 | mfu: 50.89 | epoch: 1 | total time: 15.23m | eta: 165.3m +step 01420/16704 (8.50%) | loss: 2.966603 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,228 | mfu: 50.95 | epoch: 1 | total time: 15.24m | eta: 165.2m +step 01421/16704 (8.51%) | loss: 2.979261 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,360 | mfu: 50.71 | epoch: 1 | total time: 15.26m | eta: 165.2m +step 01422/16704 (8.51%) | loss: 2.965978 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 1 | total time: 15.27m | eta: 165.2m +step 01423/16704 
(8.52%) | loss: 2.958855 | lrm: 1.00 | dt: 648.07ms | tok/sec: 809,003 | mfu: 50.56 | epoch: 1 | total time: 15.28m | eta: 165.2m +step 01424/16704 (8.52%) | loss: 2.961824 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,818 | mfu: 50.93 | epoch: 1 | total time: 15.29m | eta: 165.2m +step 01425/16704 (8.53%) | loss: 2.971077 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,758 | mfu: 50.74 | epoch: 1 | total time: 15.30m | eta: 165.2m +step 01426/16704 (8.54%) | loss: 2.965094 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,859 | mfu: 50.74 | epoch: 1 | total time: 15.31m | eta: 165.2m +step 01427/16704 (8.54%) | loss: 2.963820 | lrm: 1.00 | dt: 641.97ms | tok/sec: 816,689 | mfu: 51.04 | epoch: 1 | total time: 15.32m | eta: 165.2m +step 01428/16704 (8.55%) | loss: 2.961637 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 1 | total time: 15.33m | eta: 165.2m +step 01429/16704 (8.55%) | loss: 2.967844 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,740 | mfu: 50.92 | epoch: 1 | total time: 15.34m | eta: 165.1m +step 01430/16704 (8.56%) | loss: 2.971245 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,475 | mfu: 50.91 | epoch: 1 | total time: 15.35m | eta: 165.1m +step 01431/16704 (8.57%) | loss: 2.965942 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,706 | mfu: 50.67 | epoch: 1 | total time: 15.36m | eta: 165.1m +step 01432/16704 (8.57%) | loss: 2.961155 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,885 | mfu: 50.93 | epoch: 1 | total time: 15.37m | eta: 165.1m +step 01433/16704 (8.58%) | loss: 2.960561 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,963 | mfu: 50.81 | epoch: 1 | total time: 15.38m | eta: 165.1m +step 01434/16704 (8.58%) | loss: 2.962998 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,019 | mfu: 50.81 | epoch: 1 | total time: 15.40m | eta: 165.1m +step 01435/16704 (8.59%) | loss: 2.967679 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,189 | mfu: 50.70 | epoch: 1 | total time: 15.41m | eta: 165.1m +step 01436/16704 (8.60%) | loss: 2.959648 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,319 | 
mfu: 50.77 | epoch: 1 | total time: 15.42m | eta: 165.1m +step 01437/16704 (8.60%) | loss: 2.953984 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,942 | mfu: 50.75 | epoch: 1 | total time: 15.43m | eta: 165.1m +step 01438/16704 (8.61%) | loss: 2.944748 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,697 | mfu: 50.92 | epoch: 1 | total time: 15.44m | eta: 165.0m +step 01439/16704 (8.61%) | loss: 2.944523 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,451 | mfu: 50.72 | epoch: 1 | total time: 15.45m | eta: 165.0m +step 01440/16704 (8.62%) | loss: 2.954241 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 1 | total time: 15.46m | eta: 165.0m +step 01441/16704 (8.63%) | loss: 2.961636 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,172 | mfu: 50.76 | epoch: 1 | total time: 15.47m | eta: 165.0m +step 01442/16704 (8.63%) | loss: 2.963030 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,714 | mfu: 50.73 | epoch: 1 | total time: 15.48m | eta: 165.0m +step 01443/16704 (8.64%) | loss: 2.974229 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,693 | mfu: 50.61 | epoch: 1 | total time: 15.49m | eta: 165.0m +step 01444/16704 (8.64%) | loss: 2.965507 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,490 | mfu: 50.78 | epoch: 1 | total time: 15.50m | eta: 165.0m +step 01445/16704 (8.65%) | loss: 2.969352 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,402 | mfu: 50.84 | epoch: 1 | total time: 15.51m | eta: 165.0m +step 01446/16704 (8.66%) | loss: 2.956784 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,480 | mfu: 50.66 | epoch: 1 | total time: 15.52m | eta: 165.0m +step 01447/16704 (8.66%) | loss: 2.964979 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,820 | mfu: 50.80 | epoch: 1 | total time: 15.54m | eta: 164.9m +step 01448/16704 (8.67%) | loss: 2.956141 | lrm: 1.00 | dt: 649.17ms | tok/sec: 807,630 | mfu: 50.48 | epoch: 1 | total time: 15.55m | eta: 164.9m +step 01449/16704 (8.67%) | loss: 2.949262 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,558 | mfu: 50.72 | epoch: 1 | total time: 15.56m | eta: 164.9m +step 01450/16704 
(8.68%) | loss: 2.943057 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,441 | mfu: 50.53 | epoch: 1 | total time: 15.57m | eta: 164.9m +step 01451/16704 (8.69%) | loss: 2.937759 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,818 | mfu: 50.93 | epoch: 1 | total time: 15.58m | eta: 164.9m +step 01452/16704 (8.69%) | loss: 2.933792 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,134 | mfu: 50.88 | epoch: 1 | total time: 15.59m | eta: 164.9m +step 01453/16704 (8.70%) | loss: 2.938605 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,606 | mfu: 50.73 | epoch: 1 | total time: 15.60m | eta: 164.9m +step 01454/16704 (8.70%) | loss: 2.929538 | lrm: 1.00 | dt: 647.21ms | tok/sec: 810,072 | mfu: 50.63 | epoch: 1 | total time: 15.61m | eta: 164.9m +step 01455/16704 (8.71%) | loss: 2.936312 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 1 | total time: 15.62m | eta: 164.8m +step 01456/16704 (8.72%) | loss: 2.938009 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,673 | mfu: 50.73 | epoch: 1 | total time: 15.63m | eta: 164.8m +step 01457/16704 (8.72%) | loss: 2.943039 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,761 | mfu: 50.74 | epoch: 1 | total time: 15.64m | eta: 164.8m +step 01458/16704 (8.73%) | loss: 2.946468 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,748 | mfu: 50.80 | epoch: 1 | total time: 15.65m | eta: 164.8m +step 01459/16704 (8.73%) | loss: 2.949736 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,336 | mfu: 50.65 | epoch: 1 | total time: 15.66m | eta: 164.8m +step 01460/16704 (8.74%) | loss: 2.952527 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,748 | mfu: 50.80 | epoch: 1 | total time: 15.67m | eta: 164.8m +step 01461/16704 (8.75%) | loss: 2.965064 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 1 | total time: 15.69m | eta: 164.8m +step 01462/16704 (8.75%) | loss: 2.959492 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,459 | mfu: 50.65 | epoch: 1 | total time: 15.70m | eta: 164.8m +step 01463/16704 (8.76%) | loss: 2.951958 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,769 | 
mfu: 50.86 | epoch: 1 | total time: 15.71m | eta: 164.8m +step 01464/16704 (8.76%) | loss: 2.954387 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,874 | mfu: 50.81 | epoch: 1 | total time: 15.72m | eta: 164.7m +step 01465/16704 (8.77%) | loss: 2.945021 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 1 | total time: 15.73m | eta: 164.7m +step 01466/16704 (8.78%) | loss: 2.962203 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,529 | mfu: 50.72 | epoch: 1 | total time: 15.74m | eta: 164.7m +step 01467/16704 (8.78%) | loss: 2.971940 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 1 | total time: 15.75m | eta: 164.7m +step 01468/16704 (8.79%) | loss: 2.954712 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,826 | mfu: 50.74 | epoch: 1 | total time: 15.76m | eta: 164.7m +step 01469/16704 (8.79%) | loss: 2.954736 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,897 | mfu: 50.68 | epoch: 1 | total time: 15.77m | eta: 164.7m +step 01470/16704 (8.80%) | loss: 2.952744 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,555 | mfu: 50.91 | epoch: 1 | total time: 15.78m | eta: 164.7m +step 01471/16704 (8.81%) | loss: 2.948580 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,072 | mfu: 50.76 | epoch: 1 | total time: 15.79m | eta: 164.7m +step 01472/16704 (8.81%) | loss: 2.947007 | lrm: 1.00 | dt: 649.31ms | tok/sec: 807,452 | mfu: 50.47 | epoch: 1 | total time: 15.80m | eta: 164.7m +step 01473/16704 (8.82%) | loss: 2.948872 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,405 | mfu: 50.90 | epoch: 1 | total time: 15.81m | eta: 164.6m +step 01474/16704 (8.82%) | loss: 2.949962 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,494 | mfu: 50.66 | epoch: 1 | total time: 15.83m | eta: 164.6m +step 01475/16704 (8.83%) | loss: 2.959744 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,169 | mfu: 50.64 | epoch: 1 | total time: 15.84m | eta: 164.6m +step 01476/16704 (8.84%) | loss: 2.964577 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,858 | mfu: 50.80 | epoch: 1 | total time: 15.85m | eta: 164.6m +step 01477/16704 
(8.84%) | loss: 2.964162 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,208 | mfu: 50.76 | epoch: 1 | total time: 15.86m | eta: 164.6m +step 01478/16704 (8.85%) | loss: 2.971557 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,093 | mfu: 50.94 | epoch: 1 | total time: 15.87m | eta: 164.6m +step 01479/16704 (8.85%) | loss: 2.966828 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,673 | mfu: 50.67 | epoch: 1 | total time: 15.88m | eta: 164.6m +step 01480/16704 (8.86%) | loss: 2.963530 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,253 | mfu: 50.70 | epoch: 1 | total time: 15.89m | eta: 164.6m +step 01481/16704 (8.87%) | loss: 2.953331 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,423 | mfu: 50.97 | epoch: 1 | total time: 15.90m | eta: 164.6m +step 01482/16704 (8.87%) | loss: 2.942547 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 1 | total time: 15.91m | eta: 164.5m +step 01483/16704 (8.88%) | loss: 2.950687 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,336 | mfu: 50.83 | epoch: 1 | total time: 15.92m | eta: 164.5m +step 01484/16704 (8.88%) | loss: 2.966931 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,823 | mfu: 50.55 | epoch: 1 | total time: 15.93m | eta: 164.5m +step 01485/16704 (8.89%) | loss: 2.968322 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,890 | mfu: 50.68 | epoch: 1 | total time: 15.94m | eta: 164.5m +step 01486/16704 (8.90%) | loss: 2.968153 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,154 | mfu: 50.82 | epoch: 1 | total time: 15.95m | eta: 164.5m +step 01487/16704 (8.90%) | loss: 2.982418 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,122 | mfu: 50.76 | epoch: 1 | total time: 15.97m | eta: 164.5m +step 01488/16704 (8.91%) | loss: 2.967178 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,836 | mfu: 50.68 | epoch: 1 | total time: 15.98m | eta: 164.5m +step 01489/16704 (8.91%) | loss: 2.963193 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,249 | mfu: 50.89 | epoch: 1 | total time: 15.99m | eta: 164.5m +step 01490/16704 (8.92%) | loss: 2.958570 | lrm: 1.00 | dt: 649.02ms | tok/sec: 807,818 | 
mfu: 50.49 | epoch: 1 | total time: 16.00m | eta: 164.5m +step 01491/16704 (8.93%) | loss: 2.961283 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,377 | mfu: 51.02 | epoch: 1 | total time: 16.01m | eta: 164.4m +step 01492/16704 (8.93%) | loss: 2.942205 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,939 | mfu: 50.68 | epoch: 1 | total time: 16.02m | eta: 164.4m +step 01493/16704 (8.94%) | loss: 2.929154 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 1 | total time: 16.03m | eta: 164.4m +step 01494/16704 (8.94%) | loss: 2.935784 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,190 | mfu: 50.83 | epoch: 1 | total time: 16.04m | eta: 164.4m +step 01495/16704 (8.95%) | loss: 2.944649 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,740 | mfu: 50.67 | epoch: 1 | total time: 16.05m | eta: 164.4m +step 01496/16704 (8.96%) | loss: 2.945471 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,236 | mfu: 50.89 | epoch: 1 | total time: 16.06m | eta: 164.4m +step 01497/16704 (8.96%) | loss: 2.952955 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,257 | mfu: 50.77 | epoch: 1 | total time: 16.07m | eta: 164.4m +step 01498/16704 (8.97%) | loss: 2.962465 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,736 | mfu: 50.73 | epoch: 1 | total time: 16.08m | eta: 164.4m +step 01499/16704 (8.97%) | loss: 2.951917 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,002 | mfu: 50.69 | epoch: 1 | total time: 16.09m | eta: 164.4m +Step 01500 | Validation bpb: 0.901016 +step 01500/16704 (8.98%) | loss: 2.940585 | lrm: 1.00 | dt: 652.84ms | tok/sec: 803,085 | mfu: 50.19 | epoch: 1 | total time: 16.11m | eta: 164.3m +step 01501/16704 (8.99%) | loss: 2.949165 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,814 | mfu: 50.61 | epoch: 1 | total time: 16.12m | eta: 164.3m +step 01502/16704 (8.99%) | loss: 2.958371 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,715 | mfu: 50.61 | epoch: 1 | total time: 16.13m | eta: 164.3m +step 01503/16704 (9.00%) | loss: 2.959738 | lrm: 1.00 | dt: 648.00ms | tok/sec: 809,085 | mfu: 50.57 | epoch: 1 | total time: 
16.14m | eta: 164.3m +step 01504/16704 (9.00%) | loss: 2.974811 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,673 | mfu: 50.73 | epoch: 1 | total time: 16.15m | eta: 164.3m +step 01505/16704 (9.01%) | loss: 2.974422 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,631 | mfu: 50.92 | epoch: 1 | total time: 16.16m | eta: 164.3m +step 01506/16704 (9.02%) | loss: 2.974251 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,738 | mfu: 50.73 | epoch: 1 | total time: 16.17m | eta: 164.3m +step 01507/16704 (9.02%) | loss: 2.972267 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,645 | mfu: 50.73 | epoch: 1 | total time: 16.18m | eta: 164.3m +step 01508/16704 (9.03%) | loss: 2.979196 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,521 | mfu: 50.97 | epoch: 1 | total time: 16.19m | eta: 164.3m +step 01509/16704 (9.03%) | loss: 2.979306 | lrm: 1.00 | dt: 648.84ms | tok/sec: 808,038 | mfu: 50.50 | epoch: 1 | total time: 16.20m | eta: 164.2m +step 01510/16704 (9.04%) | loss: 2.985208 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,376 | mfu: 50.77 | epoch: 1 | total time: 16.21m | eta: 164.2m +step 01511/16704 (9.05%) | loss: 2.970987 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,322 | mfu: 50.90 | epoch: 1 | total time: 16.22m | eta: 164.2m +step 01512/16704 (9.05%) | loss: 2.981362 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,899 | mfu: 50.81 | epoch: 1 | total time: 16.23m | eta: 164.2m +step 01513/16704 (9.06%) | loss: 2.979282 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,912 | mfu: 50.68 | epoch: 1 | total time: 16.25m | eta: 164.2m +step 01514/16704 (9.06%) | loss: 2.976313 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,199 | mfu: 50.76 | epoch: 1 | total time: 16.26m | eta: 164.2m +step 01515/16704 (9.07%) | loss: 2.972544 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,184 | mfu: 50.76 | epoch: 1 | total time: 16.27m | eta: 164.2m +step 01516/16704 (9.08%) | loss: 2.981341 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,298 | mfu: 50.58 | epoch: 1 | total time: 16.28m | eta: 164.2m +step 01517/16704 (9.08%) | loss: 2.984636 | lrm: 1.00 
| dt: 644.25ms | tok/sec: 813,800 | mfu: 50.86 | epoch: 1 | total time: 16.29m | eta: 164.1m +step 01518/16704 (9.09%) | loss: 2.986642 | lrm: 1.00 | dt: 646.48ms | tok/sec: 810,988 | mfu: 50.69 | epoch: 1 | total time: 16.30m | eta: 164.1m +step 01519/16704 (9.09%) | loss: 2.986291 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 1 | total time: 16.31m | eta: 164.1m +step 01520/16704 (9.10%) | loss: 2.988612 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,746 | mfu: 50.80 | epoch: 1 | total time: 16.32m | eta: 164.1m +step 01521/16704 (9.11%) | loss: 2.996541 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,948 | mfu: 50.69 | epoch: 1 | total time: 16.33m | eta: 164.1m +step 01522/16704 (9.11%) | loss: 2.996823 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 1 | total time: 16.34m | eta: 164.1m +step 01523/16704 (9.12%) | loss: 2.991774 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,447 | mfu: 50.65 | epoch: 1 | total time: 16.35m | eta: 164.1m +step 01524/16704 (9.12%) | loss: 2.986034 | lrm: 1.00 | dt: 647.97ms | tok/sec: 809,125 | mfu: 50.57 | epoch: 1 | total time: 16.36m | eta: 164.1m +step 01525/16704 (9.13%) | loss: 2.989776 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,707 | mfu: 50.61 | epoch: 1 | total time: 16.37m | eta: 164.1m +step 01526/16704 (9.14%) | loss: 2.985069 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,622 | mfu: 50.73 | epoch: 1 | total time: 16.39m | eta: 164.0m +step 01527/16704 (9.14%) | loss: 2.982249 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,683 | mfu: 50.86 | epoch: 1 | total time: 16.40m | eta: 164.0m +step 01528/16704 (9.15%) | loss: 2.982276 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,679 | mfu: 50.79 | epoch: 1 | total time: 16.41m | eta: 164.0m +step 01529/16704 (9.15%) | loss: 2.973510 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,227 | mfu: 50.58 | epoch: 1 | total time: 16.42m | eta: 164.0m +step 01530/16704 (9.16%) | loss: 2.991618 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,517 | mfu: 50.97 | epoch: 1 | total time: 
16.43m | eta: 164.0m +step 01531/16704 (9.17%) | loss: 2.980821 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,877 | mfu: 50.68 | epoch: 1 | total time: 16.44m | eta: 164.0m +step 01532/16704 (9.17%) | loss: 2.988106 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,589 | mfu: 50.73 | epoch: 1 | total time: 16.45m | eta: 164.0m +step 01533/16704 (9.18%) | loss: 2.981173 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,417 | mfu: 50.78 | epoch: 1 | total time: 16.46m | eta: 164.0m +step 01534/16704 (9.18%) | loss: 2.963484 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,336 | mfu: 50.83 | epoch: 1 | total time: 16.47m | eta: 164.0m +step 01535/16704 (9.19%) | loss: 2.967434 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,693 | mfu: 50.73 | epoch: 1 | total time: 16.48m | eta: 163.9m +step 01536/16704 (9.20%) | loss: 2.962641 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,051 | mfu: 50.69 | epoch: 1 | total time: 16.49m | eta: 163.9m +step 01537/16704 (9.20%) | loss: 2.962803 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,270 | mfu: 50.83 | epoch: 1 | total time: 16.50m | eta: 163.9m +step 01538/16704 (9.21%) | loss: 2.963289 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,620 | mfu: 50.79 | epoch: 1 | total time: 16.51m | eta: 163.9m +step 01539/16704 (9.21%) | loss: 2.958754 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,967 | mfu: 50.87 | epoch: 1 | total time: 16.53m | eta: 163.9m +step 01540/16704 (9.22%) | loss: 2.951219 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | total time: 16.54m | eta: 163.9m +step 01541/16704 (9.23%) | loss: 2.954029 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,029 | mfu: 50.69 | epoch: 1 | total time: 16.55m | eta: 163.9m +step 01542/16704 (9.23%) | loss: 2.947178 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,782 | mfu: 50.80 | epoch: 1 | total time: 16.56m | eta: 163.9m +step 01543/16704 (9.24%) | loss: 2.928306 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,077 | mfu: 50.82 | epoch: 1 | total time: 16.57m | eta: 163.9m +step 01544/16704 (9.24%) | loss: 2.929147 | lrm: 1.00 
| dt: 644.63ms | tok/sec: 813,321 | mfu: 50.83 | epoch: 1 | total time: 16.58m | eta: 163.8m +step 01545/16704 (9.25%) | loss: 2.933061 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,945 | mfu: 50.81 | epoch: 1 | total time: 16.59m | eta: 163.8m +step 01546/16704 (9.26%) | loss: 2.938015 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,719 | mfu: 50.92 | epoch: 1 | total time: 16.60m | eta: 163.8m +step 01547/16704 (9.26%) | loss: 2.938903 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,159 | mfu: 50.64 | epoch: 1 | total time: 16.61m | eta: 163.8m +step 01548/16704 (9.27%) | loss: 2.941983 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,157 | mfu: 50.64 | epoch: 1 | total time: 16.62m | eta: 163.8m +step 01549/16704 (9.27%) | loss: 2.947071 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,574 | mfu: 50.79 | epoch: 1 | total time: 16.63m | eta: 163.8m +step 01550/16704 (9.28%) | loss: 2.951572 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,073 | mfu: 50.76 | epoch: 1 | total time: 16.64m | eta: 163.8m +step 01551/16704 (9.29%) | loss: 2.958261 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,112 | mfu: 50.82 | epoch: 1 | total time: 16.65m | eta: 163.8m +step 01552/16704 (9.29%) | loss: 2.964579 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,437 | mfu: 50.78 | epoch: 1 | total time: 16.67m | eta: 163.8m +step 01553/16704 (9.30%) | loss: 2.970527 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,942 | mfu: 50.69 | epoch: 1 | total time: 16.68m | eta: 163.7m +step 01554/16704 (9.30%) | loss: 2.980279 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,514 | mfu: 50.85 | epoch: 1 | total time: 16.69m | eta: 163.7m +step 01555/16704 (9.31%) | loss: 2.983703 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,849 | mfu: 50.87 | epoch: 1 | total time: 16.70m | eta: 163.7m +step 01556/16704 (9.32%) | loss: 2.974990 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,470 | mfu: 50.72 | epoch: 1 | total time: 16.71m | eta: 163.7m +step 01557/16704 (9.32%) | loss: 2.969601 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,558 | mfu: 50.91 | epoch: 1 | total time: 
16.72m | eta: 163.7m +step 01558/16704 (9.33%) | loss: 2.969788 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,645 | mfu: 50.67 | epoch: 1 | total time: 16.73m | eta: 163.7m +step 01559/16704 (9.33%) | loss: 2.968646 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,515 | mfu: 50.78 | epoch: 1 | total time: 16.74m | eta: 163.7m +step 01560/16704 (9.34%) | loss: 2.971453 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,067 | mfu: 50.82 | epoch: 1 | total time: 16.75m | eta: 163.7m +step 01561/16704 (9.35%) | loss: 2.966752 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,896 | mfu: 50.74 | epoch: 1 | total time: 16.76m | eta: 163.7m +step 01562/16704 (9.35%) | loss: 2.963098 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,638 | mfu: 50.85 | epoch: 1 | total time: 16.77m | eta: 163.6m +step 01563/16704 (9.36%) | loss: 2.957962 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 16.78m | eta: 163.6m +step 01564/16704 (9.36%) | loss: 2.958323 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 1 | total time: 16.79m | eta: 163.6m +step 01565/16704 (9.37%) | loss: 2.975652 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,351 | mfu: 50.71 | epoch: 1 | total time: 16.80m | eta: 163.6m +step 01566/16704 (9.38%) | loss: 2.976099 | lrm: 1.00 | dt: 649.10ms | tok/sec: 807,719 | mfu: 50.48 | epoch: 1 | total time: 16.82m | eta: 163.6m +step 01567/16704 (9.38%) | loss: 2.968154 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,421 | mfu: 50.72 | epoch: 1 | total time: 16.83m | eta: 163.6m +step 01568/16704 (9.39%) | loss: 2.970667 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 1 | total time: 16.84m | eta: 163.6m +step 01569/16704 (9.39%) | loss: 2.965195 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,226 | mfu: 50.64 | epoch: 1 | total time: 16.85m | eta: 163.6m +step 01570/16704 (9.40%) | loss: 2.964898 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 1 | total time: 16.86m | eta: 163.6m +step 01571/16704 (9.40%) | loss: 2.967723 | lrm: 1.00 
| dt: 645.29ms | tok/sec: 812,480 | mfu: 50.78 | epoch: 1 | total time: 16.87m | eta: 163.5m +step 01572/16704 (9.41%) | loss: 2.976651 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,474 | mfu: 50.72 | epoch: 1 | total time: 16.88m | eta: 163.5m +step 01573/16704 (9.42%) | loss: 2.952928 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,582 | mfu: 50.98 | epoch: 1 | total time: 16.89m | eta: 163.5m +step 01574/16704 (9.42%) | loss: 2.957372 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,345 | mfu: 50.84 | epoch: 1 | total time: 16.90m | eta: 163.5m +step 01575/16704 (9.43%) | loss: 2.956943 | lrm: 1.00 | dt: 647.17ms | tok/sec: 810,122 | mfu: 50.63 | epoch: 1 | total time: 16.91m | eta: 163.5m +step 01576/16704 (9.43%) | loss: 2.962597 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,909 | mfu: 50.75 | epoch: 1 | total time: 16.92m | eta: 163.5m +step 01577/16704 (9.44%) | loss: 2.969456 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,320 | mfu: 50.58 | epoch: 1 | total time: 16.93m | eta: 163.5m +step 01578/16704 (9.45%) | loss: 2.963182 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,128 | mfu: 50.76 | epoch: 1 | total time: 16.94m | eta: 163.5m +step 01579/16704 (9.45%) | loss: 2.966814 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,761 | mfu: 50.61 | epoch: 1 | total time: 16.96m | eta: 163.4m +step 01580/16704 (9.46%) | loss: 2.975736 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,734 | mfu: 50.67 | epoch: 1 | total time: 16.97m | eta: 163.4m +step 01581/16704 (9.46%) | loss: 2.965484 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,393 | mfu: 50.78 | epoch: 1 | total time: 16.98m | eta: 163.4m +step 01582/16704 (9.47%) | loss: 2.963883 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,606 | mfu: 50.79 | epoch: 1 | total time: 16.99m | eta: 163.4m +step 01583/16704 (9.48%) | loss: 2.965251 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,082 | mfu: 50.82 | epoch: 1 | total time: 17.00m | eta: 163.4m +step 01584/16704 (9.48%) | loss: 2.960985 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,169 | mfu: 50.57 | epoch: 1 | total time: 
17.01m | eta: 163.4m +step 01585/16704 (9.49%) | loss: 2.962139 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,633 | mfu: 50.73 | epoch: 1 | total time: 17.02m | eta: 163.4m +step 01586/16704 (9.49%) | loss: 2.952096 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,570 | mfu: 50.79 | epoch: 1 | total time: 17.03m | eta: 163.4m +step 01587/16704 (9.50%) | loss: 2.961645 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,495 | mfu: 50.72 | epoch: 1 | total time: 17.04m | eta: 163.4m +step 01588/16704 (9.51%) | loss: 2.963956 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,934 | mfu: 50.68 | epoch: 1 | total time: 17.05m | eta: 163.3m +step 01589/16704 (9.51%) | loss: 2.957641 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,290 | mfu: 50.71 | epoch: 1 | total time: 17.06m | eta: 163.3m +step 01590/16704 (9.52%) | loss: 2.953086 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,440 | mfu: 50.84 | epoch: 1 | total time: 17.07m | eta: 163.3m +step 01591/16704 (9.52%) | loss: 2.950109 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,177 | mfu: 50.76 | epoch: 1 | total time: 17.08m | eta: 163.3m +step 01592/16704 (9.53%) | loss: 2.940141 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,112 | mfu: 50.63 | epoch: 1 | total time: 17.10m | eta: 163.3m +step 01593/16704 (9.54%) | loss: 2.945628 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,837 | mfu: 50.74 | epoch: 1 | total time: 17.11m | eta: 163.3m +step 01594/16704 (9.54%) | loss: 2.937639 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,567 | mfu: 50.79 | epoch: 1 | total time: 17.12m | eta: 163.3m +step 01595/16704 (9.55%) | loss: 2.931448 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,676 | mfu: 50.67 | epoch: 1 | total time: 17.13m | eta: 163.3m +step 01596/16704 (9.55%) | loss: 2.941268 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,574 | mfu: 50.91 | epoch: 1 | total time: 17.14m | eta: 163.3m +step 01597/16704 (9.56%) | loss: 2.943522 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,731 | mfu: 50.86 | epoch: 1 | total time: 17.15m | eta: 163.2m +step 01598/16704 (9.57%) | loss: 2.932362 | lrm: 1.00 
| dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 1 | total time: 17.16m | eta: 163.2m +step 01599/16704 (9.57%) | loss: 2.937442 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,859 | mfu: 50.87 | epoch: 1 | total time: 17.17m | eta: 163.2m +step 01600/16704 (9.58%) | loss: 2.942580 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,647 | mfu: 50.60 | epoch: 1 | total time: 17.18m | eta: 163.2m +step 01601/16704 (9.58%) | loss: 2.946370 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 1 | total time: 17.19m | eta: 163.2m +step 01602/16704 (9.59%) | loss: 2.945316 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,217 | mfu: 50.89 | epoch: 1 | total time: 17.20m | eta: 163.2m +step 01603/16704 (9.60%) | loss: 2.943261 | lrm: 1.00 | dt: 648.00ms | tok/sec: 809,092 | mfu: 50.57 | epoch: 1 | total time: 17.21m | eta: 163.2m +step 01604/16704 (9.60%) | loss: 2.953429 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,201 | mfu: 50.95 | epoch: 1 | total time: 17.22m | eta: 163.2m +step 01605/16704 (9.61%) | loss: 2.954964 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,386 | mfu: 50.71 | epoch: 1 | total time: 17.24m | eta: 163.2m +step 01606/16704 (9.61%) | loss: 2.941814 | lrm: 1.00 | dt: 647.16ms | tok/sec: 810,138 | mfu: 50.63 | epoch: 1 | total time: 17.25m | eta: 163.1m +step 01607/16704 (9.62%) | loss: 2.958064 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 1 | total time: 17.26m | eta: 163.1m +step 01608/16704 (9.63%) | loss: 2.960686 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,756 | mfu: 50.67 | epoch: 1 | total time: 17.27m | eta: 163.1m +step 01609/16704 (9.63%) | loss: 2.964988 | lrm: 1.00 | dt: 648.41ms | tok/sec: 808,580 | mfu: 50.54 | epoch: 1 | total time: 17.28m | eta: 163.1m +step 01610/16704 (9.64%) | loss: 2.953951 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,132 | mfu: 50.76 | epoch: 1 | total time: 17.29m | eta: 163.1m +step 01611/16704 (9.64%) | loss: 2.964239 | lrm: 1.00 | dt: 647.71ms | tok/sec: 809,449 | mfu: 50.59 | epoch: 1 | total time: 
17.30m | eta: 163.1m +step 01612/16704 (9.65%) | loss: 2.950724 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,642 | mfu: 50.79 | epoch: 1 | total time: 17.31m | eta: 163.1m +step 01613/16704 (9.66%) | loss: 2.943521 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,323 | mfu: 50.65 | epoch: 1 | total time: 17.32m | eta: 163.1m +step 01614/16704 (9.66%) | loss: 2.950459 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 1 | total time: 17.33m | eta: 163.1m +step 01615/16704 (9.67%) | loss: 2.950730 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,981 | mfu: 50.81 | epoch: 1 | total time: 17.34m | eta: 163.0m +step 01616/16704 (9.67%) | loss: 2.952242 | lrm: 1.00 | dt: 647.66ms | tok/sec: 809,506 | mfu: 50.60 | epoch: 1 | total time: 17.35m | eta: 163.0m +step 01617/16704 (9.68%) | loss: 2.945676 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,720 | mfu: 50.80 | epoch: 1 | total time: 17.36m | eta: 163.0m +step 01618/16704 (9.69%) | loss: 2.947234 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,262 | mfu: 50.71 | epoch: 1 | total time: 17.38m | eta: 163.0m +step 01619/16704 (9.69%) | loss: 2.950690 | lrm: 1.00 | dt: 648.47ms | tok/sec: 808,497 | mfu: 50.53 | epoch: 1 | total time: 17.39m | eta: 163.0m +step 01620/16704 (9.70%) | loss: 2.946736 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,421 | mfu: 50.71 | epoch: 1 | total time: 17.40m | eta: 163.0m +step 01621/16704 (9.70%) | loss: 2.939941 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,913 | mfu: 50.75 | epoch: 1 | total time: 17.41m | eta: 163.0m +step 01622/16704 (9.71%) | loss: 2.941334 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,285 | mfu: 50.58 | epoch: 1 | total time: 17.42m | eta: 163.0m +step 01623/16704 (9.72%) | loss: 2.941100 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,626 | mfu: 50.67 | epoch: 1 | total time: 17.43m | eta: 163.0m +step 01624/16704 (9.72%) | loss: 2.949306 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,432 | mfu: 50.90 | epoch: 1 | total time: 17.44m | eta: 162.9m +step 01625/16704 (9.73%) | loss: 2.938686 | lrm: 1.00 
| dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 1 | total time: 17.45m | eta: 162.9m +step 01626/16704 (9.73%) | loss: 2.945869 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,210 | mfu: 50.83 | epoch: 1 | total time: 17.46m | eta: 162.9m +step 01627/16704 (9.74%) | loss: 2.935925 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 1 | total time: 17.47m | eta: 162.9m +step 01628/16704 (9.75%) | loss: 2.929741 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,455 | mfu: 50.78 | epoch: 1 | total time: 17.48m | eta: 162.9m +step 01629/16704 (9.75%) | loss: 2.929482 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,367 | mfu: 50.71 | epoch: 1 | total time: 17.49m | eta: 162.9m +step 01630/16704 (9.76%) | loss: 2.916509 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,720 | mfu: 50.73 | epoch: 1 | total time: 17.50m | eta: 162.9m +step 01631/16704 (9.76%) | loss: 2.921784 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,187 | mfu: 50.83 | epoch: 1 | total time: 17.52m | eta: 162.9m +step 01632/16704 (9.77%) | loss: 2.917918 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 1 | total time: 17.53m | eta: 162.9m +step 01633/16704 (9.78%) | loss: 2.934952 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,653 | mfu: 50.73 | epoch: 1 | total time: 17.54m | eta: 162.8m +step 01634/16704 (9.78%) | loss: 2.940368 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,898 | mfu: 50.81 | epoch: 1 | total time: 17.55m | eta: 162.8m +step 01635/16704 (9.79%) | loss: 2.939048 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,966 | mfu: 50.69 | epoch: 1 | total time: 17.56m | eta: 162.8m +step 01636/16704 (9.79%) | loss: 2.949789 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,342 | mfu: 50.65 | epoch: 1 | total time: 17.57m | eta: 162.8m +step 01637/16704 (9.80%) | loss: 2.970961 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,648 | mfu: 50.73 | epoch: 1 | total time: 17.58m | eta: 162.8m +step 01638/16704 (9.81%) | loss: 2.962295 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | total time: 
17.59m | eta: 162.8m +step 01639/16704 (9.81%) | loss: 2.948808 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,524 | mfu: 50.66 | epoch: 1 | total time: 17.60m | eta: 162.8m +step 01640/16704 (9.82%) | loss: 2.932915 | lrm: 1.00 | dt: 641.60ms | tok/sec: 817,156 | mfu: 51.07 | epoch: 1 | total time: 17.61m | eta: 162.8m +step 01641/16704 (9.82%) | loss: 2.936556 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,334 | mfu: 50.71 | epoch: 1 | total time: 17.62m | eta: 162.8m +step 01642/16704 (9.83%) | loss: 2.929081 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,218 | mfu: 50.76 | epoch: 1 | total time: 17.63m | eta: 162.7m +step 01643/16704 (9.84%) | loss: 2.926576 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,025 | mfu: 50.88 | epoch: 1 | total time: 17.64m | eta: 162.7m +step 01644/16704 (9.84%) | loss: 2.914015 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,413 | mfu: 50.78 | epoch: 1 | total time: 17.66m | eta: 162.7m +step 01645/16704 (9.85%) | loss: 2.913190 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,938 | mfu: 50.68 | epoch: 1 | total time: 17.67m | eta: 162.7m +step 01646/16704 (9.85%) | loss: 2.922020 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,666 | mfu: 50.79 | epoch: 1 | total time: 17.68m | eta: 162.7m +step 01647/16704 (9.86%) | loss: 2.933723 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,617 | mfu: 50.73 | epoch: 1 | total time: 17.69m | eta: 162.7m +step 01648/16704 (9.87%) | loss: 2.935933 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,483 | mfu: 50.72 | epoch: 1 | total time: 17.70m | eta: 162.7m +step 01649/16704 (9.87%) | loss: 2.932373 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 1 | total time: 17.71m | eta: 162.7m +step 01650/16704 (9.88%) | loss: 2.945675 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,936 | mfu: 50.68 | epoch: 1 | total time: 17.72m | eta: 162.7m +step 01651/16704 (9.88%) | loss: 2.925723 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,843 | mfu: 50.80 | epoch: 1 | total time: 17.73m | eta: 162.6m +step 01652/16704 (9.89%) | loss: 2.923064 | lrm: 1.00 
| dt: 646.57ms | tok/sec: 810,875 | mfu: 50.68 | epoch: 1 | total time: 17.74m | eta: 162.6m +step 01653/16704 (9.90%) | loss: 2.931599 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,850 | mfu: 50.80 | epoch: 1 | total time: 17.75m | eta: 162.6m +step 01654/16704 (9.90%) | loss: 2.929681 | lrm: 1.00 | dt: 649.06ms | tok/sec: 807,766 | mfu: 50.49 | epoch: 1 | total time: 17.76m | eta: 162.6m +step 01655/16704 (9.91%) | loss: 2.940325 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,514 | mfu: 50.85 | epoch: 1 | total time: 17.77m | eta: 162.6m +step 01656/16704 (9.91%) | loss: 2.939379 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,882 | mfu: 50.87 | epoch: 1 | total time: 17.78m | eta: 162.6m +step 01657/16704 (9.92%) | loss: 2.946698 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,039 | mfu: 50.63 | epoch: 1 | total time: 17.80m | eta: 162.6m +step 01658/16704 (9.93%) | loss: 2.948025 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,565 | mfu: 50.79 | epoch: 1 | total time: 17.81m | eta: 162.6m +step 01659/16704 (9.93%) | loss: 2.934710 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,076 | mfu: 50.69 | epoch: 1 | total time: 17.82m | eta: 162.6m +step 01660/16704 (9.94%) | loss: 2.939013 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,990 | mfu: 50.88 | epoch: 1 | total time: 17.83m | eta: 162.5m +step 01661/16704 (9.94%) | loss: 2.936913 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,132 | mfu: 50.82 | epoch: 1 | total time: 17.84m | eta: 162.5m +step 01662/16704 (9.95%) | loss: 2.921704 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,532 | mfu: 50.72 | epoch: 1 | total time: 17.85m | eta: 162.5m +step 01663/16704 (9.96%) | loss: 2.932908 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,844 | mfu: 50.68 | epoch: 1 | total time: 17.86m | eta: 162.5m +step 01664/16704 (9.96%) | loss: 2.936039 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,744 | mfu: 50.61 | epoch: 1 | total time: 17.87m | eta: 162.5m +step 01665/16704 (9.97%) | loss: 2.930627 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,386 | mfu: 50.84 | epoch: 1 | total time: 
17.88m | eta: 162.5m +step 01666/16704 (9.97%) | loss: 2.940763 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,178 | mfu: 50.76 | epoch: 1 | total time: 17.89m | eta: 162.5m +step 01667/16704 (9.98%) | loss: 2.949408 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,189 | mfu: 50.76 | epoch: 1 | total time: 17.90m | eta: 162.5m +step 01668/16704 (9.99%) | loss: 2.943184 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,948 | mfu: 50.75 | epoch: 1 | total time: 17.91m | eta: 162.5m +step 01669/16704 (9.99%) | loss: 2.950081 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,191 | mfu: 50.89 | epoch: 1 | total time: 17.92m | eta: 162.4m +step 01670/16704 (10.00%) | loss: 2.944021 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,364 | mfu: 50.77 | epoch: 1 | total time: 17.93m | eta: 162.4m +step 01671/16704 (10.00%) | loss: 2.948960 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,909 | mfu: 50.81 | epoch: 1 | total time: 17.95m | eta: 162.4m +step 01672/16704 (10.01%) | loss: 2.943592 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,531 | mfu: 50.78 | epoch: 1 | total time: 17.96m | eta: 162.4m +step 01673/16704 (10.02%) | loss: 2.955380 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 1 | total time: 17.97m | eta: 162.4m +step 01674/16704 (10.02%) | loss: 2.943410 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,418 | mfu: 50.84 | epoch: 1 | total time: 17.98m | eta: 162.4m +step 01675/16704 (10.03%) | loss: 2.950486 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,256 | mfu: 50.70 | epoch: 1 | total time: 17.99m | eta: 162.4m +step 01676/16704 (10.03%) | loss: 2.956729 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,388 | mfu: 50.90 | epoch: 1 | total time: 18.00m | eta: 162.4m +step 01677/16704 (10.04%) | loss: 2.958748 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,574 | mfu: 50.72 | epoch: 1 | total time: 18.01m | eta: 162.4m +step 01678/16704 (10.05%) | loss: 2.965913 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,194 | mfu: 50.83 | epoch: 1 | total time: 18.02m | eta: 162.3m +step 01679/16704 (10.05%) | loss: 2.957858 | 
lrm: 1.00 | dt: 646.24ms | tok/sec: 811,292 | mfu: 50.71 | epoch: 1 | total time: 18.03m | eta: 162.3m +step 01680/16704 (10.06%) | loss: 2.953164 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,271 | mfu: 50.77 | epoch: 1 | total time: 18.04m | eta: 162.3m +step 01681/16704 (10.06%) | loss: 2.946697 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,826 | mfu: 50.80 | epoch: 1 | total time: 18.05m | eta: 162.3m +step 01682/16704 (10.07%) | loss: 2.946479 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,460 | mfu: 50.78 | epoch: 1 | total time: 18.06m | eta: 162.3m +step 01683/16704 (10.08%) | loss: 2.950257 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,882 | mfu: 50.87 | epoch: 1 | total time: 18.07m | eta: 162.3m +step 01684/16704 (10.08%) | loss: 2.951910 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,880 | mfu: 50.87 | epoch: 1 | total time: 18.09m | eta: 162.3m +step 01685/16704 (10.09%) | loss: 2.967221 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | total time: 18.10m | eta: 162.3m +step 01686/16704 (10.09%) | loss: 2.957322 | lrm: 1.00 | dt: 645.83ms | tok/sec: 811,808 | mfu: 50.74 | epoch: 1 | total time: 18.11m | eta: 162.2m +step 01687/16704 (10.10%) | loss: 2.951599 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,185 | mfu: 50.83 | epoch: 1 | total time: 18.12m | eta: 162.2m +step 01688/16704 (10.11%) | loss: 2.951797 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,083 | mfu: 50.76 | epoch: 1 | total time: 18.13m | eta: 162.2m +step 01689/16704 (10.11%) | loss: 2.947338 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,457 | mfu: 50.78 | epoch: 1 | total time: 18.14m | eta: 162.2m +step 01690/16704 (10.12%) | loss: 2.953289 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 1 | total time: 18.15m | eta: 162.2m +step 01691/16704 (10.12%) | loss: 2.968796 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,535 | mfu: 50.72 | epoch: 1 | total time: 18.16m | eta: 162.2m +step 01692/16704 (10.13%) | loss: 2.964652 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | 
epoch: 1 | total time: 18.17m | eta: 162.2m +step 01693/16704 (10.14%) | loss: 2.945406 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,993 | mfu: 50.75 | epoch: 1 | total time: 18.18m | eta: 162.2m +step 01694/16704 (10.14%) | loss: 2.948260 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,581 | mfu: 50.66 | epoch: 1 | total time: 18.19m | eta: 162.2m +step 01695/16704 (10.15%) | loss: 2.947786 | lrm: 1.00 | dt: 649.00ms | tok/sec: 807,844 | mfu: 50.49 | epoch: 1 | total time: 18.20m | eta: 162.1m +step 01696/16704 (10.15%) | loss: 2.931688 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,448 | mfu: 50.84 | epoch: 1 | total time: 18.21m | eta: 162.1m +step 01697/16704 (10.16%) | loss: 2.931668 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,536 | mfu: 50.66 | epoch: 1 | total time: 18.23m | eta: 162.1m +step 01698/16704 (10.17%) | loss: 2.921363 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,165 | mfu: 50.70 | epoch: 1 | total time: 18.24m | eta: 162.1m +step 01699/16704 (10.17%) | loss: 2.925007 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,995 | mfu: 50.88 | epoch: 1 | total time: 18.25m | eta: 162.1m +step 01700/16704 (10.18%) | loss: 2.929579 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,331 | mfu: 50.77 | epoch: 1 | total time: 18.26m | eta: 162.1m +step 01701/16704 (10.18%) | loss: 2.934431 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,471 | mfu: 50.72 | epoch: 1 | total time: 18.27m | eta: 162.1m +step 01702/16704 (10.19%) | loss: 2.931905 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,432 | mfu: 50.90 | epoch: 1 | total time: 18.28m | eta: 162.1m +step 01703/16704 (10.20%) | loss: 2.932741 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,825 | mfu: 50.74 | epoch: 1 | total time: 18.29m | eta: 162.1m +step 01704/16704 (10.20%) | loss: 2.946988 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 18.30m | eta: 162.0m +step 01705/16704 (10.21%) | loss: 2.951175 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,995 | mfu: 50.75 | epoch: 1 | total time: 18.31m | eta: 162.0m +step 01706/16704 
(10.21%) | loss: 2.942199 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,259 | mfu: 50.83 | epoch: 1 | total time: 18.32m | eta: 162.0m +step 01707/16704 (10.22%) | loss: 2.939202 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 1 | total time: 18.33m | eta: 162.0m +step 01708/16704 (10.23%) | loss: 2.941651 | lrm: 1.00 | dt: 648.41ms | tok/sec: 808,574 | mfu: 50.54 | epoch: 1 | total time: 18.34m | eta: 162.0m +step 01709/16704 (10.23%) | loss: 2.927403 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,129 | mfu: 50.76 | epoch: 1 | total time: 18.35m | eta: 162.0m +step 01710/16704 (10.24%) | loss: 2.931525 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 1 | total time: 18.37m | eta: 162.0m +step 01711/16704 (10.24%) | loss: 2.927950 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,726 | mfu: 50.80 | epoch: 1 | total time: 18.38m | eta: 162.0m +step 01712/16704 (10.25%) | loss: 2.942045 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,366 | mfu: 50.77 | epoch: 1 | total time: 18.39m | eta: 162.0m +step 01713/16704 (10.26%) | loss: 2.935389 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,789 | mfu: 50.74 | epoch: 1 | total time: 18.40m | eta: 161.9m +step 01714/16704 (10.26%) | loss: 2.930220 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,083 | mfu: 50.82 | epoch: 1 | total time: 18.41m | eta: 161.9m +step 01715/16704 (10.27%) | loss: 2.928000 | lrm: 1.00 | dt: 648.45ms | tok/sec: 808,526 | mfu: 50.53 | epoch: 1 | total time: 18.42m | eta: 161.9m +step 01716/16704 (10.27%) | loss: 2.925303 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,670 | mfu: 50.79 | epoch: 1 | total time: 18.43m | eta: 161.9m +step 01717/16704 (10.28%) | loss: 2.913557 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,939 | mfu: 50.75 | epoch: 1 | total time: 18.44m | eta: 161.9m +step 01718/16704 (10.28%) | loss: 2.927500 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,171 | mfu: 50.64 | epoch: 1 | total time: 18.45m | eta: 161.9m +step 01719/16704 (10.29%) | loss: 2.921769 | lrm: 1.00 | dt: 645.05ms | 
tok/sec: 812,786 | mfu: 50.80 | epoch: 1 | total time: 18.46m | eta: 161.9m +step 01720/16704 (10.30%) | loss: 2.921818 | lrm: 1.00 | dt: 647.43ms | tok/sec: 809,797 | mfu: 50.61 | epoch: 1 | total time: 18.47m | eta: 161.9m +step 01721/16704 (10.30%) | loss: 2.928012 | lrm: 1.00 | dt: 651.07ms | tok/sec: 805,266 | mfu: 50.33 | epoch: 1 | total time: 18.48m | eta: 161.9m +step 01722/16704 (10.31%) | loss: 2.923715 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,242 | mfu: 50.83 | epoch: 1 | total time: 18.49m | eta: 161.8m +step 01723/16704 (10.31%) | loss: 2.917238 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,026 | mfu: 50.75 | epoch: 1 | total time: 18.51m | eta: 161.8m +step 01724/16704 (10.32%) | loss: 2.919916 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,812 | mfu: 50.61 | epoch: 1 | total time: 18.52m | eta: 161.8m +step 01725/16704 (10.33%) | loss: 2.928499 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,331 | mfu: 50.83 | epoch: 1 | total time: 18.53m | eta: 161.8m +step 01726/16704 (10.33%) | loss: 2.928039 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,944 | mfu: 50.69 | epoch: 1 | total time: 18.54m | eta: 161.8m +step 01727/16704 (10.34%) | loss: 2.924189 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,680 | mfu: 50.67 | epoch: 1 | total time: 18.55m | eta: 161.8m +step 01728/16704 (10.34%) | loss: 2.934336 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,413 | mfu: 50.90 | epoch: 1 | total time: 18.56m | eta: 161.8m +step 01729/16704 (10.35%) | loss: 2.927135 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,171 | mfu: 50.70 | epoch: 1 | total time: 18.57m | eta: 161.8m +step 01730/16704 (10.36%) | loss: 2.932201 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,700 | mfu: 50.73 | epoch: 1 | total time: 18.58m | eta: 161.8m +step 01731/16704 (10.36%) | loss: 2.911732 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,572 | mfu: 50.66 | epoch: 1 | total time: 18.59m | eta: 161.7m +step 01732/16704 (10.37%) | loss: 2.915088 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,322 | mfu: 50.77 | epoch: 1 | total time: 18.60m 
| eta: 161.7m +step 01733/16704 (10.37%) | loss: 2.917711 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,621 | mfu: 50.79 | epoch: 1 | total time: 18.61m | eta: 161.7m +step 01734/16704 (10.38%) | loss: 2.924640 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,917 | mfu: 50.87 | epoch: 1 | total time: 18.62m | eta: 161.7m +step 01735/16704 (10.39%) | loss: 2.917501 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,880 | mfu: 50.93 | epoch: 1 | total time: 18.63m | eta: 161.7m +step 01736/16704 (10.39%) | loss: 2.918351 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,750 | mfu: 50.80 | epoch: 1 | total time: 18.64m | eta: 161.7m +step 01737/16704 (10.40%) | loss: 2.913796 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,016 | mfu: 50.81 | epoch: 1 | total time: 18.66m | eta: 161.7m +step 01738/16704 (10.40%) | loss: 2.925812 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,227 | mfu: 50.83 | epoch: 1 | total time: 18.67m | eta: 161.7m +step 01739/16704 (10.41%) | loss: 2.928828 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,433 | mfu: 50.78 | epoch: 1 | total time: 18.68m | eta: 161.7m +step 01740/16704 (10.42%) | loss: 2.925159 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,098 | mfu: 50.57 | epoch: 1 | total time: 18.69m | eta: 161.6m +step 01741/16704 (10.42%) | loss: 2.954215 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,738 | mfu: 50.67 | epoch: 1 | total time: 18.70m | eta: 161.6m +step 01742/16704 (10.43%) | loss: 2.947247 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,929 | mfu: 50.93 | epoch: 1 | total time: 18.71m | eta: 161.6m +step 01743/16704 (10.43%) | loss: 2.943101 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,211 | mfu: 50.83 | epoch: 1 | total time: 18.72m | eta: 161.6m +step 01744/16704 (10.44%) | loss: 2.939839 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,204 | mfu: 50.76 | epoch: 1 | total time: 18.73m | eta: 161.6m +step 01745/16704 (10.45%) | loss: 2.937674 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,924 | mfu: 50.75 | epoch: 1 | total time: 18.74m | eta: 161.6m +step 01746/16704 (10.45%) | loss: 2.931059 | 
lrm: 1.00 | dt: 644.93ms | tok/sec: 812,938 | mfu: 50.81 | epoch: 1 | total time: 18.75m | eta: 161.6m +step 01747/16704 (10.46%) | loss: 2.938544 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,257 | mfu: 50.83 | epoch: 1 | total time: 18.76m | eta: 161.6m +step 01748/16704 (10.46%) | loss: 2.935142 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,814 | mfu: 50.74 | epoch: 1 | total time: 18.77m | eta: 161.6m +step 01749/16704 (10.47%) | loss: 2.931276 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,426 | mfu: 50.72 | epoch: 1 | total time: 18.78m | eta: 161.5m +Step 01750 | Validation bpb: 0.891558 +step 01750/16704 (10.48%) | loss: 2.930066 | lrm: 1.00 | dt: 653.16ms | tok/sec: 802,700 | mfu: 50.17 | epoch: 1 | total time: 18.80m | eta: 161.5m +step 01751/16704 (10.48%) | loss: 2.918410 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 1 | total time: 18.81m | eta: 161.5m +step 01752/16704 (10.49%) | loss: 2.926367 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,664 | mfu: 50.61 | epoch: 1 | total time: 18.82m | eta: 161.5m +step 01753/16704 (10.49%) | loss: 2.929963 | lrm: 1.00 | dt: 641.34ms | tok/sec: 817,493 | mfu: 51.09 | epoch: 1 | total time: 18.83m | eta: 161.5m +step 01754/16704 (10.50%) | loss: 2.919387 | lrm: 1.00 | dt: 646.96ms | tok/sec: 810,390 | mfu: 50.65 | epoch: 1 | total time: 18.84m | eta: 161.5m +step 01755/16704 (10.51%) | loss: 2.912108 | lrm: 1.00 | dt: 648.28ms | tok/sec: 808,734 | mfu: 50.55 | epoch: 1 | total time: 18.85m | eta: 161.5m +step 01756/16704 (10.51%) | loss: 2.915865 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,858 | mfu: 50.80 | epoch: 1 | total time: 18.86m | eta: 161.5m +step 01757/16704 (10.52%) | loss: 2.939083 | lrm: 1.00 | dt: 648.30ms | tok/sec: 808,705 | mfu: 50.55 | epoch: 1 | total time: 18.87m | eta: 161.5m +step 01758/16704 (10.52%) | loss: 2.936571 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,258 | mfu: 50.77 | epoch: 1 | total time: 18.88m | eta: 161.4m +step 01759/16704 (10.53%) | loss: 2.931713 | lrm: 1.00 | dt: 
647.57ms | tok/sec: 809,618 | mfu: 50.60 | epoch: 1 | total time: 18.89m | eta: 161.4m +step 01760/16704 (10.54%) | loss: 2.942811 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 1 | total time: 18.90m | eta: 161.4m +step 01761/16704 (10.54%) | loss: 2.928344 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,283 | mfu: 50.77 | epoch: 1 | total time: 18.91m | eta: 161.4m +step 01762/16704 (10.55%) | loss: 2.924408 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,734 | mfu: 50.80 | epoch: 1 | total time: 18.92m | eta: 161.4m +step 01763/16704 (10.55%) | loss: 2.916141 | lrm: 1.00 | dt: 648.10ms | tok/sec: 808,961 | mfu: 50.56 | epoch: 1 | total time: 18.94m | eta: 161.4m +step 01764/16704 (10.56%) | loss: 2.909826 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,902 | mfu: 50.62 | epoch: 1 | total time: 18.95m | eta: 161.4m +step 01765/16704 (10.57%) | loss: 2.898823 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,430 | mfu: 50.78 | epoch: 1 | total time: 18.96m | eta: 161.4m +step 01766/16704 (10.57%) | loss: 2.908833 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,252 | mfu: 50.64 | epoch: 1 | total time: 18.97m | eta: 161.4m +step 01767/16704 (10.58%) | loss: 2.928690 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,971 | mfu: 50.75 | epoch: 1 | total time: 18.98m | eta: 161.3m +step 01768/16704 (10.58%) | loss: 2.938019 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,821 | mfu: 50.68 | epoch: 1 | total time: 18.99m | eta: 161.3m +step 01769/16704 (10.59%) | loss: 2.935771 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,462 | mfu: 50.66 | epoch: 1 | total time: 19.00m | eta: 161.3m +step 01770/16704 (10.60%) | loss: 2.944236 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,538 | mfu: 50.66 | epoch: 1 | total time: 19.01m | eta: 161.3m +step 01771/16704 (10.60%) | loss: 2.940169 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,092 | mfu: 50.63 | epoch: 1 | total time: 19.02m | eta: 161.3m +step 01772/16704 (10.61%) | loss: 2.944252 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,628 | mfu: 50.73 | epoch: 1 | total 
time: 19.03m | eta: 161.3m +step 01773/16704 (10.61%) | loss: 2.938776 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,226 | mfu: 50.64 | epoch: 1 | total time: 19.04m | eta: 161.3m +step 01774/16704 (10.62%) | loss: 2.943205 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,096 | mfu: 50.69 | epoch: 1 | total time: 19.05m | eta: 161.3m +step 01775/16704 (10.63%) | loss: 2.933532 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,973 | mfu: 50.87 | epoch: 1 | total time: 19.06m | eta: 161.3m +step 01776/16704 (10.63%) | loss: 2.935658 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,286 | mfu: 50.77 | epoch: 1 | total time: 19.08m | eta: 161.2m +step 01777/16704 (10.64%) | loss: 2.933857 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,808 | mfu: 50.68 | epoch: 1 | total time: 19.09m | eta: 161.2m +step 01778/16704 (10.64%) | loss: 2.935661 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,406 | mfu: 50.71 | epoch: 1 | total time: 19.10m | eta: 161.2m +step 01779/16704 (10.65%) | loss: 2.921734 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,254 | mfu: 50.64 | epoch: 1 | total time: 19.11m | eta: 161.2m +step 01780/16704 (10.66%) | loss: 2.910967 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,277 | mfu: 50.83 | epoch: 1 | total time: 19.12m | eta: 161.2m +step 01781/16704 (10.66%) | loss: 2.928339 | lrm: 1.00 | dt: 650.13ms | tok/sec: 806,432 | mfu: 50.40 | epoch: 1 | total time: 19.13m | eta: 161.2m +step 01782/16704 (10.67%) | loss: 2.922585 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 1 | total time: 19.14m | eta: 161.2m +step 01783/16704 (10.67%) | loss: 2.916280 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,708 | mfu: 50.80 | epoch: 1 | total time: 19.15m | eta: 161.2m +step 01784/16704 (10.68%) | loss: 2.917482 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,452 | mfu: 50.72 | epoch: 1 | total time: 19.16m | eta: 161.2m +step 01785/16704 (10.69%) | loss: 2.916630 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,899 | mfu: 50.81 | epoch: 1 | total time: 19.17m | eta: 161.1m +step 01786/16704 (10.69%) | loss: 
2.916147 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,735 | mfu: 50.80 | epoch: 1 | total time: 19.18m | eta: 161.1m +step 01787/16704 (10.70%) | loss: 2.905741 | lrm: 1.00 | dt: 647.90ms | tok/sec: 809,211 | mfu: 50.58 | epoch: 1 | total time: 19.19m | eta: 161.1m +step 01788/16704 (10.70%) | loss: 2.899292 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,408 | mfu: 50.71 | epoch: 1 | total time: 19.20m | eta: 161.1m +step 01789/16704 (10.71%) | loss: 2.908529 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,241 | mfu: 50.77 | epoch: 1 | total time: 19.22m | eta: 161.1m +step 01790/16704 (10.72%) | loss: 2.916822 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,289 | mfu: 50.64 | epoch: 1 | total time: 19.23m | eta: 161.1m +step 01791/16704 (10.72%) | loss: 2.917218 | lrm: 1.00 | dt: 648.15ms | tok/sec: 808,898 | mfu: 50.56 | epoch: 1 | total time: 19.24m | eta: 161.1m +step 01792/16704 (10.73%) | loss: 2.919138 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,145 | mfu: 50.76 | epoch: 1 | total time: 19.25m | eta: 161.1m +step 01793/16704 (10.73%) | loss: 2.919161 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,757 | mfu: 50.80 | epoch: 1 | total time: 19.26m | eta: 161.1m +step 01794/16704 (10.74%) | loss: 2.923628 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,849 | mfu: 50.74 | epoch: 1 | total time: 19.27m | eta: 161.0m +step 01795/16704 (10.75%) | loss: 2.912623 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,402 | mfu: 50.84 | epoch: 1 | total time: 19.28m | eta: 161.0m +step 01796/16704 (10.75%) | loss: 2.906850 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,286 | mfu: 50.64 | epoch: 1 | total time: 19.29m | eta: 161.0m +step 01797/16704 (10.76%) | loss: 2.902148 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,877 | mfu: 50.68 | epoch: 1 | total time: 19.30m | eta: 161.0m +step 01798/16704 (10.76%) | loss: 2.918651 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 1 | total time: 19.31m | eta: 161.0m +step 01799/16704 (10.77%) | loss: 2.921176 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,837 | mfu: 
50.74 | epoch: 1 | total time: 19.32m | eta: 161.0m +step 01800/16704 (10.78%) | loss: 2.923417 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 1 | total time: 19.33m | eta: 161.0m +step 01801/16704 (10.78%) | loss: 2.931071 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,934 | mfu: 50.68 | epoch: 1 | total time: 19.34m | eta: 161.0m +step 01802/16704 (10.79%) | loss: 2.924602 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,741 | mfu: 50.67 | epoch: 1 | total time: 19.36m | eta: 161.0m +step 01803/16704 (10.79%) | loss: 2.923475 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,395 | mfu: 50.78 | epoch: 1 | total time: 19.37m | eta: 160.9m +step 01804/16704 (10.80%) | loss: 2.918076 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,395 | mfu: 50.78 | epoch: 1 | total time: 19.38m | eta: 160.9m +step 01805/16704 (10.81%) | loss: 2.910812 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,265 | mfu: 50.64 | epoch: 1 | total time: 19.39m | eta: 160.9m +step 01806/16704 (10.81%) | loss: 2.916263 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,162 | mfu: 50.76 | epoch: 1 | total time: 19.40m | eta: 160.9m +step 01807/16704 (10.82%) | loss: 2.927554 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,392 | mfu: 50.71 | epoch: 1 | total time: 19.41m | eta: 160.9m +step 01808/16704 (10.82%) | loss: 2.932700 | lrm: 1.00 | dt: 647.15ms | tok/sec: 810,155 | mfu: 50.64 | epoch: 1 | total time: 19.42m | eta: 160.9m +step 01809/16704 (10.83%) | loss: 2.932664 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,238 | mfu: 50.70 | epoch: 1 | total time: 19.43m | eta: 160.9m +step 01810/16704 (10.84%) | loss: 2.928922 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 1 | total time: 19.44m | eta: 160.9m +step 01811/16704 (10.84%) | loss: 2.945907 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,665 | mfu: 50.67 | epoch: 1 | total time: 19.45m | eta: 160.9m +step 01812/16704 (10.85%) | loss: 2.938326 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,523 | mfu: 50.66 | epoch: 1 | total time: 19.46m | eta: 160.8m +step 
01813/16704 (10.85%) | loss: 2.938682 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,632 | mfu: 50.79 | epoch: 1 | total time: 19.47m | eta: 160.8m +step 01814/16704 (10.86%) | loss: 2.935608 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,704 | mfu: 50.73 | epoch: 1 | total time: 19.48m | eta: 160.8m +step 01815/16704 (10.87%) | loss: 2.918780 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,208 | mfu: 50.83 | epoch: 1 | total time: 19.50m | eta: 160.8m +step 01816/16704 (10.87%) | loss: 2.920885 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,033 | mfu: 50.63 | epoch: 1 | total time: 19.51m | eta: 160.8m +step 01817/16704 (10.88%) | loss: 2.915160 | lrm: 1.00 | dt: 649.59ms | tok/sec: 807,109 | mfu: 50.45 | epoch: 1 | total time: 19.52m | eta: 160.8m +step 01818/16704 (10.88%) | loss: 2.918442 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 1 | total time: 19.53m | eta: 160.8m +step 01819/16704 (10.89%) | loss: 2.918386 | lrm: 1.00 | dt: 648.39ms | tok/sec: 808,605 | mfu: 50.54 | epoch: 1 | total time: 19.54m | eta: 160.8m +step 01820/16704 (10.90%) | loss: 2.914459 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,537 | mfu: 50.78 | epoch: 1 | total time: 19.55m | eta: 160.8m +step 01821/16704 (10.90%) | loss: 2.909288 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,508 | mfu: 50.78 | epoch: 1 | total time: 19.56m | eta: 160.7m +step 01822/16704 (10.91%) | loss: 2.907507 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,915 | mfu: 50.68 | epoch: 1 | total time: 19.57m | eta: 160.7m +step 01823/16704 (10.91%) | loss: 2.900514 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,081 | mfu: 50.69 | epoch: 1 | total time: 19.58m | eta: 160.7m +step 01824/16704 (10.92%) | loss: 2.900570 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,872 | mfu: 50.81 | epoch: 1 | total time: 19.59m | eta: 160.7m +step 01825/16704 (10.93%) | loss: 2.891721 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,502 | mfu: 50.66 | epoch: 1 | total time: 19.60m | eta: 160.7m +step 01826/16704 (10.93%) | loss: 2.891095 | lrm: 1.00 | dt: 
643.92ms | tok/sec: 814,217 | mfu: 50.89 | epoch: 1 | total time: 19.61m | eta: 160.7m +step 01827/16704 (10.94%) | loss: 2.908392 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 1 | total time: 19.62m | eta: 160.7m +step 01828/16704 (10.94%) | loss: 2.909495 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,834 | mfu: 50.80 | epoch: 1 | total time: 19.64m | eta: 160.7m +step 01829/16704 (10.95%) | loss: 2.905670 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,741 | mfu: 50.73 | epoch: 1 | total time: 19.65m | eta: 160.7m +step 01830/16704 (10.96%) | loss: 2.905873 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,039 | mfu: 50.75 | epoch: 1 | total time: 19.66m | eta: 160.6m +step 01831/16704 (10.96%) | loss: 2.929717 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,428 | mfu: 50.72 | epoch: 1 | total time: 19.67m | eta: 160.6m +step 01832/16704 (10.97%) | loss: 2.927021 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,843 | mfu: 50.74 | epoch: 1 | total time: 19.68m | eta: 160.6m +step 01833/16704 (10.97%) | loss: 2.918901 | lrm: 1.00 | dt: 648.93ms | tok/sec: 807,924 | mfu: 50.50 | epoch: 1 | total time: 19.69m | eta: 160.6m +step 01834/16704 (10.98%) | loss: 2.924443 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,843 | mfu: 50.74 | epoch: 1 | total time: 19.70m | eta: 160.6m +step 01835/16704 (10.99%) | loss: 2.913520 | lrm: 1.00 | dt: 649.22ms | tok/sec: 807,563 | mfu: 50.47 | epoch: 1 | total time: 19.71m | eta: 160.6m +step 01836/16704 (10.99%) | loss: 2.900823 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,838 | mfu: 50.80 | epoch: 1 | total time: 19.72m | eta: 160.6m +step 01837/16704 (11.00%) | loss: 2.894522 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,316 | mfu: 50.90 | epoch: 1 | total time: 19.73m | eta: 160.6m +step 01838/16704 (11.00%) | loss: 2.900691 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,348 | mfu: 50.71 | epoch: 1 | total time: 19.74m | eta: 160.6m +step 01839/16704 (11.01%) | loss: 2.909157 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,537 | mfu: 50.78 | epoch: 1 | total 
time: 19.75m | eta: 160.5m +step 01840/16704 (11.02%) | loss: 2.914110 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,065 | mfu: 50.82 | epoch: 1 | total time: 19.76m | eta: 160.5m +step 01841/16704 (11.02%) | loss: 2.900946 | lrm: 1.00 | dt: 647.63ms | tok/sec: 809,543 | mfu: 50.60 | epoch: 1 | total time: 19.78m | eta: 160.5m +step 01842/16704 (11.03%) | loss: 2.902845 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,779 | mfu: 50.55 | epoch: 1 | total time: 19.79m | eta: 160.5m +step 01843/16704 (11.03%) | loss: 2.907316 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,989 | mfu: 50.81 | epoch: 1 | total time: 19.80m | eta: 160.5m +step 01844/16704 (11.04%) | loss: 2.915421 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,139 | mfu: 50.70 | epoch: 1 | total time: 19.81m | eta: 160.5m +step 01845/16704 (11.05%) | loss: 2.908824 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,589 | mfu: 50.79 | epoch: 1 | total time: 19.82m | eta: 160.5m +step 01846/16704 (11.05%) | loss: 2.911153 | lrm: 1.00 | dt: 647.92ms | tok/sec: 809,189 | mfu: 50.58 | epoch: 1 | total time: 19.83m | eta: 160.5m +step 01847/16704 (11.06%) | loss: 2.909973 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,089 | mfu: 50.69 | epoch: 1 | total time: 19.84m | eta: 160.5m +step 01848/16704 (11.06%) | loss: 2.907116 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,646 | mfu: 50.79 | epoch: 1 | total time: 19.85m | eta: 160.5m +step 01849/16704 (11.07%) | loss: 2.915532 | lrm: 1.00 | dt: 647.34ms | tok/sec: 809,912 | mfu: 50.62 | epoch: 1 | total time: 19.86m | eta: 160.4m +step 01850/16704 (11.08%) | loss: 2.916799 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 1 | total time: 19.87m | eta: 160.4m +step 01851/16704 (11.08%) | loss: 2.908879 | lrm: 1.00 | dt: 648.89ms | tok/sec: 807,972 | mfu: 50.50 | epoch: 1 | total time: 19.88m | eta: 160.4m +step 01852/16704 (11.09%) | loss: 2.916494 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,367 | mfu: 50.77 | epoch: 1 | total time: 19.89m | eta: 160.4m +step 01853/16704 (11.09%) | loss: 
2.913701 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,157 | mfu: 50.82 | epoch: 1 | total time: 19.90m | eta: 160.4m +step 01854/16704 (11.10%) | loss: 2.911072 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,782 | mfu: 50.68 | epoch: 1 | total time: 19.92m | eta: 160.4m +step 01855/16704 (11.11%) | loss: 2.916160 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,856 | mfu: 50.62 | epoch: 1 | total time: 19.93m | eta: 160.4m +step 01856/16704 (11.11%) | loss: 2.918070 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,220 | mfu: 50.95 | epoch: 1 | total time: 19.94m | eta: 160.4m +step 01857/16704 (11.12%) | loss: 2.897199 | lrm: 1.00 | dt: 647.32ms | tok/sec: 809,931 | mfu: 50.62 | epoch: 1 | total time: 19.95m | eta: 160.4m +step 01858/16704 (11.12%) | loss: 2.901550 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,954 | mfu: 50.75 | epoch: 1 | total time: 19.96m | eta: 160.3m +step 01859/16704 (11.13%) | loss: 2.900679 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,284 | mfu: 50.96 | epoch: 1 | total time: 19.97m | eta: 160.3m +step 01860/16704 (11.14%) | loss: 2.908851 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,729 | mfu: 50.73 | epoch: 1 | total time: 19.98m | eta: 160.3m +step 01861/16704 (11.14%) | loss: 2.895710 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,706 | mfu: 50.61 | epoch: 1 | total time: 19.99m | eta: 160.3m +step 01862/16704 (11.15%) | loss: 2.902282 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,454 | mfu: 50.72 | epoch: 1 | total time: 20.00m | eta: 160.3m +step 01863/16704 (11.15%) | loss: 2.901763 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,522 | mfu: 50.60 | epoch: 1 | total time: 20.01m | eta: 160.3m +step 01864/16704 (11.16%) | loss: 2.914089 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,269 | mfu: 50.64 | epoch: 1 | total time: 20.02m | eta: 160.3m +step 01865/16704 (11.16%) | loss: 2.910916 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,299 | mfu: 50.58 | epoch: 1 | total time: 20.03m | eta: 160.3m +step 01866/16704 (11.17%) | loss: 2.906916 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,356 | mfu: 
50.84 | epoch: 1 | total time: 20.04m | eta: 160.3m +step 01867/16704 (11.18%) | loss: 2.914631 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,302 | mfu: 50.83 | epoch: 1 | total time: 20.06m | eta: 160.2m +step 01868/16704 (11.18%) | loss: 2.928288 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,179 | mfu: 50.64 | epoch: 1 | total time: 20.07m | eta: 160.2m +step 01869/16704 (11.19%) | loss: 2.927560 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,017 | mfu: 50.88 | epoch: 1 | total time: 20.08m | eta: 160.2m +step 01870/16704 (11.19%) | loss: 2.908077 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 20.09m | eta: 160.2m +step 01871/16704 (11.20%) | loss: 2.921978 | lrm: 1.00 | dt: 648.02ms | tok/sec: 809,064 | mfu: 50.57 | epoch: 1 | total time: 20.10m | eta: 160.2m +step 01872/16704 (11.21%) | loss: 2.915060 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,570 | mfu: 50.85 | epoch: 1 | total time: 20.11m | eta: 160.2m +step 01873/16704 (11.21%) | loss: 2.910660 | lrm: 1.00 | dt: 646.99ms | tok/sec: 810,351 | mfu: 50.65 | epoch: 1 | total time: 20.12m | eta: 160.2m +step 01874/16704 (11.22%) | loss: 2.927374 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | total time: 20.13m | eta: 160.2m +step 01875/16704 (11.22%) | loss: 2.932460 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,689 | mfu: 50.67 | epoch: 1 | total time: 20.14m | eta: 160.2m +step 01876/16704 (11.23%) | loss: 2.931987 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,460 | mfu: 50.72 | epoch: 1 | total time: 20.15m | eta: 160.1m +step 01877/16704 (11.24%) | loss: 2.950251 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,305 | mfu: 50.83 | epoch: 1 | total time: 20.16m | eta: 160.1m +step 01878/16704 (11.24%) | loss: 2.938173 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,678 | mfu: 50.73 | epoch: 1 | total time: 20.17m | eta: 160.1m +step 01879/16704 (11.25%) | loss: 2.932674 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,350 | mfu: 50.77 | epoch: 1 | total time: 20.18m | eta: 160.1m +step 
01880/16704 (11.25%) | loss: 2.928584 | lrm: 1.00 | dt: 649.57ms | tok/sec: 807,131 | mfu: 50.45 | epoch: 1 | total time: 20.20m | eta: 160.1m +step 01881/16704 (11.26%) | loss: 2.941896 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,693 | mfu: 50.67 | epoch: 1 | total time: 20.21m | eta: 160.1m +step 01882/16704 (11.27%) | loss: 2.942654 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,421 | mfu: 50.84 | epoch: 1 | total time: 20.22m | eta: 160.1m +step 01883/16704 (11.27%) | loss: 2.945855 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,241 | mfu: 50.77 | epoch: 1 | total time: 20.23m | eta: 160.1m +step 01884/16704 (11.28%) | loss: 2.943254 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,438 | mfu: 50.65 | epoch: 1 | total time: 20.24m | eta: 160.1m +step 01885/16704 (11.28%) | loss: 2.955446 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,429 | mfu: 50.65 | epoch: 1 | total time: 20.25m | eta: 160.0m +step 01886/16704 (11.29%) | loss: 2.948608 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,283 | mfu: 50.71 | epoch: 1 | total time: 20.26m | eta: 160.0m +step 01887/16704 (11.30%) | loss: 2.958264 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,704 | mfu: 50.67 | epoch: 1 | total time: 20.27m | eta: 160.0m +step 01888/16704 (11.30%) | loss: 2.954506 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,062 | mfu: 50.63 | epoch: 1 | total time: 20.28m | eta: 160.0m +step 01889/16704 (11.31%) | loss: 2.969726 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,688 | mfu: 50.79 | epoch: 1 | total time: 20.29m | eta: 160.0m +step 01890/16704 (11.31%) | loss: 2.972115 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,265 | mfu: 50.71 | epoch: 1 | total time: 20.30m | eta: 160.0m +step 01891/16704 (11.32%) | loss: 2.976963 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,140 | mfu: 50.76 | epoch: 1 | total time: 20.31m | eta: 160.0m +step 01892/16704 (11.33%) | loss: 2.979682 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 1 | total time: 20.32m | eta: 160.0m +step 01893/16704 (11.33%) | loss: 2.980773 | lrm: 1.00 | dt: 
647.96ms | tok/sec: 809,142 | mfu: 50.57 | epoch: 1 | total time: 20.34m | eta: 160.0m +step 01894/16704 (11.34%) | loss: 2.968875 | lrm: 1.00 | dt: 648.23ms | tok/sec: 808,799 | mfu: 50.55 | epoch: 1 | total time: 20.35m | eta: 159.9m +step 01895/16704 (11.34%) | loss: 2.970712 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,178 | mfu: 50.89 | epoch: 1 | total time: 20.36m | eta: 159.9m +step 01896/16704 (11.35%) | loss: 2.951006 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 20.37m | eta: 159.9m +step 01897/16704 (11.36%) | loss: 2.959552 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,934 | mfu: 50.56 | epoch: 1 | total time: 20.38m | eta: 159.9m +step 01898/16704 (11.36%) | loss: 2.959734 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,124 | mfu: 50.70 | epoch: 1 | total time: 20.39m | eta: 159.9m +step 01899/16704 (11.37%) | loss: 2.959689 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,687 | mfu: 50.79 | epoch: 1 | total time: 20.40m | eta: 159.9m +step 01900/16704 (11.37%) | loss: 2.963697 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,976 | mfu: 50.62 | epoch: 1 | total time: 20.41m | eta: 159.9m +step 01901/16704 (11.38%) | loss: 2.958490 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,841 | mfu: 50.74 | epoch: 1 | total time: 20.42m | eta: 159.9m +step 01902/16704 (11.39%) | loss: 2.942209 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,757 | mfu: 50.61 | epoch: 1 | total time: 20.43m | eta: 159.9m +step 01903/16704 (11.39%) | loss: 2.950326 | lrm: 1.00 | dt: 647.32ms | tok/sec: 809,935 | mfu: 50.62 | epoch: 1 | total time: 20.44m | eta: 159.8m +step 01904/16704 (11.40%) | loss: 2.945596 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,493 | mfu: 50.78 | epoch: 1 | total time: 20.45m | eta: 159.8m +step 01905/16704 (11.40%) | loss: 2.939372 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 1 | total time: 20.46m | eta: 159.8m +step 01906/16704 (11.41%) | loss: 2.926156 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,223 | mfu: 50.83 | epoch: 1 | total 
time: 20.48m | eta: 159.8m +step 01907/16704 (11.42%) | loss: 2.920434 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,074 | mfu: 50.57 | epoch: 1 | total time: 20.49m | eta: 159.8m +step 01908/16704 (11.42%) | loss: 2.916594 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,507 | mfu: 50.72 | epoch: 1 | total time: 20.50m | eta: 159.8m +step 01909/16704 (11.43%) | loss: 2.907652 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,280 | mfu: 50.58 | epoch: 1 | total time: 20.51m | eta: 159.8m +step 01910/16704 (11.43%) | loss: 2.934981 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 1 | total time: 20.52m | eta: 159.8m +step 01911/16704 (11.44%) | loss: 2.934173 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,889 | mfu: 50.62 | epoch: 1 | total time: 20.53m | eta: 159.8m +step 01912/16704 (11.45%) | loss: 2.923388 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,236 | mfu: 50.70 | epoch: 1 | total time: 20.54m | eta: 159.7m +step 01913/16704 (11.45%) | loss: 2.916282 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,204 | mfu: 50.83 | epoch: 1 | total time: 20.55m | eta: 159.7m +step 01914/16704 (11.46%) | loss: 2.903335 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,666 | mfu: 50.67 | epoch: 1 | total time: 20.56m | eta: 159.7m +step 01915/16704 (11.46%) | loss: 2.901629 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,406 | mfu: 50.84 | epoch: 1 | total time: 20.57m | eta: 159.7m +step 01916/16704 (11.47%) | loss: 2.893209 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,234 | mfu: 50.77 | epoch: 1 | total time: 20.58m | eta: 159.7m +step 01917/16704 (11.48%) | loss: 2.892568 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,650 | mfu: 50.60 | epoch: 1 | total time: 20.59m | eta: 159.7m +step 01918/16704 (11.48%) | loss: 2.894305 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,390 | mfu: 50.78 | epoch: 1 | total time: 20.60m | eta: 159.7m +step 01919/16704 (11.49%) | loss: 2.885828 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,166 | mfu: 50.64 | epoch: 1 | total time: 20.62m | eta: 159.7m +step 01920/16704 (11.49%) | loss: 
2.900324 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 1 | total time: 20.63m | eta: 159.7m +step 01921/16704 (11.50%) | loss: 2.896858 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 1 | total time: 20.64m | eta: 159.6m +step 01922/16704 (11.51%) | loss: 2.898213 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,077 | mfu: 50.69 | epoch: 1 | total time: 20.65m | eta: 159.6m +step 01923/16704 (11.51%) | loss: 2.902933 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,181 | mfu: 50.70 | epoch: 1 | total time: 20.66m | eta: 159.6m +step 01924/16704 (11.52%) | loss: 2.902494 | lrm: 1.00 | dt: 649.90ms | tok/sec: 806,727 | mfu: 50.42 | epoch: 1 | total time: 20.67m | eta: 159.6m +step 01925/16704 (11.52%) | loss: 2.899102 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,126 | mfu: 50.76 | epoch: 1 | total time: 20.68m | eta: 159.6m +step 01926/16704 (11.53%) | loss: 2.909895 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,947 | mfu: 50.87 | epoch: 1 | total time: 20.69m | eta: 159.6m +step 01927/16704 (11.54%) | loss: 2.907918 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,755 | mfu: 50.67 | epoch: 1 | total time: 20.70m | eta: 159.6m +step 01928/16704 (11.54%) | loss: 2.906273 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,592 | mfu: 50.73 | epoch: 1 | total time: 20.71m | eta: 159.6m +step 01929/16704 (11.55%) | loss: 2.915990 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,421 | mfu: 50.78 | epoch: 1 | total time: 20.72m | eta: 159.6m +step 01930/16704 (11.55%) | loss: 2.916297 | lrm: 1.00 | dt: 646.48ms | tok/sec: 810,993 | mfu: 50.69 | epoch: 1 | total time: 20.73m | eta: 159.5m +step 01931/16704 (11.56%) | loss: 2.915221 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,762 | mfu: 50.80 | epoch: 1 | total time: 20.74m | eta: 159.5m +step 01932/16704 (11.57%) | loss: 2.923463 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,899 | mfu: 50.81 | epoch: 1 | total time: 20.76m | eta: 159.5m +step 01933/16704 (11.57%) | loss: 2.923088 | lrm: 1.00 | dt: 647.40ms | tok/sec: 809,840 | mfu: 
50.62 | epoch: 1 | total time: 20.77m | eta: 159.5m +step 01934/16704 (11.58%) | loss: 2.917408 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,831 | mfu: 50.74 | epoch: 1 | total time: 20.78m | eta: 159.5m +step 01935/16704 (11.58%) | loss: 2.917191 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,448 | mfu: 50.72 | epoch: 1 | total time: 20.79m | eta: 159.5m +step 01936/16704 (11.59%) | loss: 2.912245 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,197 | mfu: 50.76 | epoch: 1 | total time: 20.80m | eta: 159.5m +step 01937/16704 (11.60%) | loss: 2.903245 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,530 | mfu: 50.72 | epoch: 1 | total time: 20.81m | eta: 159.5m +step 01938/16704 (11.60%) | loss: 2.901868 | lrm: 1.00 | dt: 648.27ms | tok/sec: 808,749 | mfu: 50.55 | epoch: 1 | total time: 20.82m | eta: 159.5m +step 01939/16704 (11.61%) | loss: 2.901713 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,922 | mfu: 50.68 | epoch: 1 | total time: 20.83m | eta: 159.4m +step 01940/16704 (11.61%) | loss: 2.900450 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,566 | mfu: 50.79 | epoch: 1 | total time: 20.84m | eta: 159.4m +step 01941/16704 (11.62%) | loss: 2.902249 | lrm: 1.00 | dt: 648.69ms | tok/sec: 808,225 | mfu: 50.52 | epoch: 1 | total time: 20.85m | eta: 159.4m +step 01942/16704 (11.63%) | loss: 2.900869 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,733 | mfu: 50.80 | epoch: 1 | total time: 20.86m | eta: 159.4m +step 01943/16704 (11.63%) | loss: 2.909025 | lrm: 1.00 | dt: 646.98ms | tok/sec: 810,364 | mfu: 50.65 | epoch: 1 | total time: 20.87m | eta: 159.4m +step 01944/16704 (11.64%) | loss: 2.910224 | lrm: 1.00 | dt: 648.03ms | tok/sec: 809,043 | mfu: 50.57 | epoch: 1 | total time: 20.89m | eta: 159.4m +step 01945/16704 (11.64%) | loss: 2.914702 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,443 | mfu: 50.72 | epoch: 1 | total time: 20.90m | eta: 159.4m +step 01946/16704 (11.65%) | loss: 2.905676 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,806 | mfu: 50.80 | epoch: 1 | total time: 20.91m | eta: 159.4m +step 
01947/16704 (11.66%) | loss: 2.902744 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,108 | mfu: 50.82 | epoch: 1 | total time: 20.92m | eta: 159.4m +step 01948/16704 (11.66%) | loss: 2.897467 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,073 | mfu: 50.57 | epoch: 1 | total time: 20.93m | eta: 159.3m +step 01949/16704 (11.67%) | loss: 2.891673 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,006 | mfu: 50.88 | epoch: 1 | total time: 20.94m | eta: 159.3m +step 01950/16704 (11.67%) | loss: 2.878821 | lrm: 1.00 | dt: 648.99ms | tok/sec: 807,853 | mfu: 50.49 | epoch: 1 | total time: 20.95m | eta: 159.3m +step 01951/16704 (11.68%) | loss: 2.889820 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,127 | mfu: 50.82 | epoch: 1 | total time: 20.96m | eta: 159.3m +step 01952/16704 (11.69%) | loss: 2.881769 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,245 | mfu: 50.64 | epoch: 1 | total time: 20.97m | eta: 159.3m +step 01953/16704 (11.69%) | loss: 2.893174 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,102 | mfu: 50.88 | epoch: 1 | total time: 20.98m | eta: 159.3m +step 01954/16704 (11.70%) | loss: 2.890402 | lrm: 1.00 | dt: 646.96ms | tok/sec: 810,382 | mfu: 50.65 | epoch: 1 | total time: 20.99m | eta: 159.3m +step 01955/16704 (11.70%) | loss: 2.892336 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,873 | mfu: 50.74 | epoch: 1 | total time: 21.00m | eta: 159.3m +step 01956/16704 (11.71%) | loss: 2.895359 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,223 | mfu: 50.83 | epoch: 1 | total time: 21.01m | eta: 159.3m +step 01957/16704 (11.72%) | loss: 2.912052 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,902 | mfu: 50.87 | epoch: 1 | total time: 21.02m | eta: 159.2m +step 01958/16704 (11.72%) | loss: 2.917083 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,349 | mfu: 50.71 | epoch: 1 | total time: 21.04m | eta: 159.2m +step 01959/16704 (11.73%) | loss: 2.917824 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,374 | mfu: 50.77 | epoch: 1 | total time: 21.05m | eta: 159.2m +step 01960/16704 (11.73%) | loss: 2.924335 | lrm: 1.00 | dt: 
646.23ms | tok/sec: 811,308 | mfu: 50.71 | epoch: 1 | total time: 21.06m | eta: 159.2m +step 01961/16704 (11.74%) | loss: 2.923466 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,757 | mfu: 50.67 | epoch: 1 | total time: 21.07m | eta: 159.2m +step 01962/16704 (11.75%) | loss: 2.927674 | lrm: 1.00 | dt: 649.21ms | tok/sec: 807,576 | mfu: 50.47 | epoch: 1 | total time: 21.08m | eta: 159.2m +step 01963/16704 (11.75%) | loss: 2.920034 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,755 | mfu: 50.67 | epoch: 1 | total time: 21.09m | eta: 159.2m +step 01964/16704 (11.76%) | loss: 2.928084 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,185 | mfu: 50.70 | epoch: 1 | total time: 21.10m | eta: 159.2m +step 01965/16704 (11.76%) | loss: 2.927372 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,093 | mfu: 50.69 | epoch: 1 | total time: 21.11m | eta: 159.2m +step 01966/16704 (11.77%) | loss: 2.925928 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,468 | mfu: 50.72 | epoch: 1 | total time: 21.12m | eta: 159.1m +step 01967/16704 (11.78%) | loss: 2.938805 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,552 | mfu: 50.79 | epoch: 1 | total time: 21.13m | eta: 159.1m +step 01968/16704 (11.78%) | loss: 2.933790 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,432 | mfu: 50.72 | epoch: 1 | total time: 21.14m | eta: 159.1m +step 01969/16704 (11.79%) | loss: 2.937080 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,113 | mfu: 50.63 | epoch: 1 | total time: 21.15m | eta: 159.1m +step 01970/16704 (11.79%) | loss: 2.930122 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,177 | mfu: 50.76 | epoch: 1 | total time: 21.16m | eta: 159.1m +step 01971/16704 (11.80%) | loss: 2.928366 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,226 | mfu: 50.77 | epoch: 1 | total time: 21.18m | eta: 159.1m +step 01972/16704 (11.81%) | loss: 2.929230 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,005 | mfu: 50.69 | epoch: 1 | total time: 21.19m | eta: 159.1m +step 01973/16704 (11.81%) | loss: 2.923863 | lrm: 1.00 | dt: 647.26ms | tok/sec: 810,008 | mfu: 50.63 | epoch: 1 | total 
time: 21.20m | eta: 159.1m +step 01974/16704 (11.82%) | loss: 2.906510 | lrm: 1.00 | dt: 647.75ms | tok/sec: 809,397 | mfu: 50.59 | epoch: 1 | total time: 21.21m | eta: 159.1m +step 01975/16704 (11.82%) | loss: 2.908294 | lrm: 1.00 | dt: 648.31ms | tok/sec: 808,701 | mfu: 50.55 | epoch: 1 | total time: 21.22m | eta: 159.1m +step 01976/16704 (11.83%) | loss: 2.902602 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,049 | mfu: 50.82 | epoch: 1 | total time: 21.23m | eta: 159.0m +step 01977/16704 (11.84%) | loss: 2.897728 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,122 | mfu: 50.70 | epoch: 1 | total time: 21.24m | eta: 159.0m +step 01978/16704 (11.84%) | loss: 2.892139 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,168 | mfu: 50.64 | epoch: 1 | total time: 21.25m | eta: 159.0m +step 01979/16704 (11.85%) | loss: 2.895557 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,269 | mfu: 51.02 | epoch: 1 | total time: 21.26m | eta: 159.0m +step 01980/16704 (11.85%) | loss: 2.902038 | lrm: 1.00 | dt: 647.66ms | tok/sec: 809,511 | mfu: 50.60 | epoch: 1 | total time: 21.27m | eta: 159.0m +step 01981/16704 (11.86%) | loss: 2.903230 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,353 | mfu: 50.77 | epoch: 1 | total time: 21.28m | eta: 159.0m +step 01982/16704 (11.87%) | loss: 2.903720 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,313 | mfu: 50.71 | epoch: 1 | total time: 21.29m | eta: 159.0m +step 01983/16704 (11.87%) | loss: 2.902919 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 1 | total time: 21.31m | eta: 159.0m +step 01984/16704 (11.88%) | loss: 2.910147 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,596 | mfu: 50.66 | epoch: 1 | total time: 21.32m | eta: 159.0m +step 01985/16704 (11.88%) | loss: 2.911602 | lrm: 1.00 | dt: 648.70ms | tok/sec: 808,208 | mfu: 50.51 | epoch: 1 | total time: 21.33m | eta: 158.9m +step 01986/16704 (11.89%) | loss: 2.903874 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,772 | mfu: 50.86 | epoch: 1 | total time: 21.34m | eta: 158.9m +step 01987/16704 (11.90%) | loss: 
2.897489 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,933 | mfu: 50.68 | epoch: 1 | total time: 21.35m | eta: 158.9m +step 01988/16704 (11.90%) | loss: 2.895132 | lrm: 1.00 | dt: 652.19ms | tok/sec: 803,892 | mfu: 50.24 | epoch: 1 | total time: 21.36m | eta: 158.9m +step 01989/16704 (11.91%) | loss: 2.889041 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,266 | mfu: 51.02 | epoch: 1 | total time: 21.37m | eta: 158.9m +step 01990/16704 (11.91%) | loss: 2.891695 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,897 | mfu: 50.68 | epoch: 1 | total time: 21.38m | eta: 158.9m +step 01991/16704 (11.92%) | loss: 2.890611 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,314 | mfu: 50.58 | epoch: 1 | total time: 21.39m | eta: 158.9m +step 01992/16704 (11.93%) | loss: 2.884165 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,154 | mfu: 50.70 | epoch: 1 | total time: 21.40m | eta: 158.9m +step 01993/16704 (11.93%) | loss: 2.895962 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 1 | total time: 21.41m | eta: 158.9m +step 01994/16704 (11.94%) | loss: 2.900283 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,260 | mfu: 50.89 | epoch: 1 | total time: 21.42m | eta: 158.8m +step 01995/16704 (11.94%) | loss: 2.903382 | lrm: 1.00 | dt: 648.79ms | tok/sec: 808,100 | mfu: 50.51 | epoch: 1 | total time: 21.43m | eta: 158.8m +step 01996/16704 (11.95%) | loss: 2.909112 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,948 | mfu: 50.69 | epoch: 1 | total time: 21.45m | eta: 158.8m +step 01997/16704 (11.96%) | loss: 2.897852 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,176 | mfu: 50.89 | epoch: 1 | total time: 21.46m | eta: 158.8m +step 01998/16704 (11.96%) | loss: 2.908278 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,355 | mfu: 50.77 | epoch: 1 | total time: 21.47m | eta: 158.8m +step 01999/16704 (11.97%) | loss: 2.900353 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,232 | mfu: 50.64 | epoch: 1 | total time: 21.48m | eta: 158.8m +[GC rank7] gen2: 13.9ms collected 0 objects +[GC rank6] gen2: 14.0ms collected 0 objects +[GC 
rank4] gen2: 14.4ms collected 0 objects +[GC rank0] gen2: 14.4ms collected 0 objects +[GC rank1] gen2: 14.5ms collected 0 objects +[GC rank3] gen2: 14.6ms collected 0 objects +[GC rank2] gen2: 14.7ms collected 0 objects +[GC rank5] gen2: 17.1ms collected 0 objects +Step 02000 | Validation bpb: 0.884278 +step 02000/16704 (11.97%) | loss: 2.886813 | lrm: 1.00 | dt: 648.81ms | tok/sec: 808,077 | mfu: 50.51 | epoch: 1 | total time: 21.49m | eta: 158.8m +step 02001/16704 (11.98%) | loss: 2.882872 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,440 | mfu: 50.53 | epoch: 1 | total time: 21.50m | eta: 158.8m +step 02002/16704 (11.99%) | loss: 2.882696 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,648 | mfu: 50.60 | epoch: 1 | total time: 21.51m | eta: 158.8m +step 02003/16704 (11.99%) | loss: 2.882854 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,131 | mfu: 50.76 | epoch: 1 | total time: 21.52m | eta: 158.7m +step 02004/16704 (12.00%) | loss: 2.877860 | lrm: 1.00 | dt: 650.42ms | tok/sec: 806,079 | mfu: 50.38 | epoch: 1 | total time: 21.53m | eta: 158.7m +step 02005/16704 (12.00%) | loss: 2.875306 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,425 | mfu: 50.78 | epoch: 1 | total time: 21.54m | eta: 158.7m +step 02006/16704 (12.01%) | loss: 2.878179 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,878 | mfu: 50.74 | epoch: 1 | total time: 21.55m | eta: 158.7m +step 02007/16704 (12.02%) | loss: 2.884633 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,483 | mfu: 50.59 | epoch: 1 | total time: 21.56m | eta: 158.7m +step 02008/16704 (12.02%) | loss: 2.875376 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,969 | mfu: 50.94 | epoch: 1 | total time: 21.57m | eta: 158.7m +step 02009/16704 (12.03%) | loss: 2.863301 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,079 | mfu: 50.82 | epoch: 1 | total time: 21.59m | eta: 158.7m +step 02010/16704 (12.03%) | loss: 2.866778 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,069 | mfu: 50.57 | epoch: 1 | total time: 21.60m | eta: 158.7m +step 02011/16704 (12.04%) | loss: 2.872467 | lrm: 1.00 
| dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 1 | total time: 21.61m | eta: 158.7m +step 02012/16704 (12.05%) | loss: 2.876610 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,684 | mfu: 50.54 | epoch: 1 | total time: 21.62m | eta: 158.6m +step 02013/16704 (12.05%) | loss: 2.876908 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,705 | mfu: 50.80 | epoch: 1 | total time: 21.63m | eta: 158.6m +step 02014/16704 (12.06%) | loss: 2.864359 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,110 | mfu: 50.76 | epoch: 1 | total time: 21.64m | eta: 158.6m +step 02015/16704 (12.06%) | loss: 2.863691 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,812 | mfu: 50.68 | epoch: 1 | total time: 21.65m | eta: 158.6m +step 02016/16704 (12.07%) | loss: 2.860976 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,885 | mfu: 50.74 | epoch: 1 | total time: 21.66m | eta: 158.6m +step 02017/16704 (12.07%) | loss: 2.868080 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 1 | total time: 21.67m | eta: 158.6m +step 02018/16704 (12.08%) | loss: 2.858419 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,246 | mfu: 50.64 | epoch: 1 | total time: 21.68m | eta: 158.6m +step 02019/16704 (12.09%) | loss: 2.859089 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,228 | mfu: 50.58 | epoch: 1 | total time: 21.69m | eta: 158.6m +step 02020/16704 (12.09%) | loss: 2.853877 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,184 | mfu: 50.76 | epoch: 1 | total time: 21.70m | eta: 158.6m +step 02021/16704 (12.10%) | loss: 2.864067 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 1 | total time: 21.71m | eta: 158.5m +step 02022/16704 (12.10%) | loss: 2.867764 | lrm: 1.00 | dt: 646.63ms | tok/sec: 810,802 | mfu: 50.68 | epoch: 1 | total time: 21.73m | eta: 158.5m +step 02023/16704 (12.11%) | loss: 2.874228 | lrm: 1.00 | dt: 647.88ms | tok/sec: 809,233 | mfu: 50.58 | epoch: 1 | total time: 21.74m | eta: 158.5m +step 02024/16704 (12.12%) | loss: 2.863361 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,411 | mfu: 50.65 | epoch: 1 | 
total time: 21.75m | eta: 158.5m +step 02025/16704 (12.12%) | loss: 2.852614 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,703 | mfu: 50.67 | epoch: 1 | total time: 21.76m | eta: 158.5m +step 02026/16704 (12.13%) | loss: 2.876896 | lrm: 1.00 | dt: 647.30ms | tok/sec: 809,956 | mfu: 50.62 | epoch: 1 | total time: 21.77m | eta: 158.5m +step 02027/16704 (12.13%) | loss: 2.891932 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,264 | mfu: 50.77 | epoch: 1 | total time: 21.78m | eta: 158.5m +step 02028/16704 (12.14%) | loss: 2.897032 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,996 | mfu: 50.75 | epoch: 1 | total time: 21.79m | eta: 158.5m +step 02029/16704 (12.15%) | loss: 2.894709 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,195 | mfu: 50.64 | epoch: 1 | total time: 21.80m | eta: 158.5m +step 02030/16704 (12.15%) | loss: 2.892224 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,053 | mfu: 50.69 | epoch: 1 | total time: 21.81m | eta: 158.4m +step 02031/16704 (12.16%) | loss: 2.887774 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,319 | mfu: 50.90 | epoch: 1 | total time: 21.82m | eta: 158.4m +step 02032/16704 (12.16%) | loss: 2.881786 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,828 | mfu: 50.68 | epoch: 1 | total time: 21.83m | eta: 158.4m +step 02033/16704 (12.17%) | loss: 2.890895 | lrm: 1.00 | dt: 647.74ms | tok/sec: 809,406 | mfu: 50.59 | epoch: 1 | total time: 21.84m | eta: 158.4m +step 02034/16704 (12.18%) | loss: 2.892872 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,842 | mfu: 50.80 | epoch: 1 | total time: 21.85m | eta: 158.4m +step 02035/16704 (12.18%) | loss: 2.902060 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,750 | mfu: 50.80 | epoch: 1 | total time: 21.87m | eta: 158.4m +step 02036/16704 (12.19%) | loss: 2.901430 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 1 | total time: 21.88m | eta: 158.4m +step 02037/16704 (12.19%) | loss: 2.894535 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,101 | mfu: 50.63 | epoch: 1 | total time: 21.89m | eta: 158.4m +step 02038/16704 (12.20%) | 
loss: 2.895884 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,730 | mfu: 50.73 | epoch: 1 | total time: 21.90m | eta: 158.4m +step 02039/16704 (12.21%) | loss: 2.910986 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,708 | mfu: 50.80 | epoch: 1 | total time: 21.91m | eta: 158.3m +step 02040/16704 (12.21%) | loss: 2.909505 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,811 | mfu: 50.68 | epoch: 1 | total time: 21.92m | eta: 158.3m +step 02041/16704 (12.22%) | loss: 2.902705 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,013 | mfu: 50.81 | epoch: 1 | total time: 21.93m | eta: 158.3m +step 02042/16704 (12.22%) | loss: 2.899651 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,447 | mfu: 50.72 | epoch: 1 | total time: 21.94m | eta: 158.3m +step 02043/16704 (12.23%) | loss: 2.900608 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,577 | mfu: 50.72 | epoch: 1 | total time: 21.95m | eta: 158.3m +step 02044/16704 (12.24%) | loss: 2.911591 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,569 | mfu: 50.79 | epoch: 1 | total time: 21.96m | eta: 158.3m +step 02045/16704 (12.24%) | loss: 2.923382 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,163 | mfu: 50.76 | epoch: 1 | total time: 21.97m | eta: 158.3m +step 02046/16704 (12.25%) | loss: 2.919035 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,746 | mfu: 50.61 | epoch: 1 | total time: 21.98m | eta: 158.3m +step 02047/16704 (12.25%) | loss: 2.925642 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,767 | mfu: 50.74 | epoch: 1 | total time: 21.99m | eta: 158.3m +step 02048/16704 (12.26%) | loss: 2.925773 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,316 | mfu: 50.77 | epoch: 1 | total time: 22.01m | eta: 158.2m +step 02049/16704 (12.27%) | loss: 2.927792 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,039 | mfu: 50.63 | epoch: 1 | total time: 22.02m | eta: 158.2m +step 02050/16704 (12.27%) | loss: 2.916585 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,189 | mfu: 50.70 | epoch: 1 | total time: 22.03m | eta: 158.2m +step 02051/16704 (12.28%) | loss: 2.925850 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,233 | 
mfu: 50.70 | epoch: 1 | total time: 22.04m | eta: 158.2m +step 02052/16704 (12.28%) | loss: 2.929070 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,209 | mfu: 50.64 | epoch: 1 | total time: 22.05m | eta: 158.2m +step 02053/16704 (12.29%) | loss: 2.929131 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,453 | mfu: 50.65 | epoch: 1 | total time: 22.06m | eta: 158.2m +step 02054/16704 (12.30%) | loss: 2.934015 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,405 | mfu: 50.84 | epoch: 1 | total time: 22.07m | eta: 158.2m +step 02055/16704 (12.30%) | loss: 2.926869 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,062 | mfu: 50.63 | epoch: 1 | total time: 22.08m | eta: 158.2m +step 02056/16704 (12.31%) | loss: 2.925187 | lrm: 1.00 | dt: 648.76ms | tok/sec: 808,144 | mfu: 50.51 | epoch: 1 | total time: 22.09m | eta: 158.2m +step 02057/16704 (12.31%) | loss: 2.929748 | lrm: 1.00 | dt: 646.48ms | tok/sec: 810,992 | mfu: 50.69 | epoch: 1 | total time: 22.10m | eta: 158.1m +step 02058/16704 (12.32%) | loss: 2.921831 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,305 | mfu: 50.77 | epoch: 1 | total time: 22.11m | eta: 158.1m +step 02059/16704 (12.33%) | loss: 2.922339 | lrm: 1.00 | dt: 649.15ms | tok/sec: 807,651 | mfu: 50.48 | epoch: 1 | total time: 22.12m | eta: 158.1m +step 02060/16704 (12.33%) | loss: 2.924286 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 1 | total time: 22.13m | eta: 158.1m +step 02061/16704 (12.34%) | loss: 2.940909 | lrm: 1.00 | dt: 646.70ms | tok/sec: 810,715 | mfu: 50.67 | epoch: 1 | total time: 22.15m | eta: 158.1m +step 02062/16704 (12.34%) | loss: 2.923379 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,925 | mfu: 50.68 | epoch: 1 | total time: 22.16m | eta: 158.1m +step 02063/16704 (12.35%) | loss: 2.912638 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 1 | total time: 22.17m | eta: 158.1m +step 02064/16704 (12.36%) | loss: 2.893158 | lrm: 1.00 | dt: 648.22ms | tok/sec: 808,815 | mfu: 50.55 | epoch: 1 | total time: 22.18m | eta: 158.1m +step 
02065/16704 (12.36%) | loss: 2.901531 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,561 | mfu: 50.79 | epoch: 1 | total time: 22.19m | eta: 158.1m +step 02066/16704 (12.37%) | loss: 2.897322 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,319 | mfu: 50.83 | epoch: 1 | total time: 22.20m | eta: 158.1m +step 02067/16704 (12.37%) | loss: 2.894217 | lrm: 1.00 | dt: 648.67ms | tok/sec: 808,246 | mfu: 50.52 | epoch: 1 | total time: 22.21m | eta: 158.0m +step 02068/16704 (12.38%) | loss: 2.893269 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,329 | mfu: 50.77 | epoch: 1 | total time: 22.22m | eta: 158.0m +step 02069/16704 (12.39%) | loss: 2.896975 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,463 | mfu: 50.72 | epoch: 1 | total time: 22.23m | eta: 158.0m +step 02070/16704 (12.39%) | loss: 2.886333 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,467 | mfu: 50.66 | epoch: 1 | total time: 22.24m | eta: 158.0m +step 02071/16704 (12.40%) | loss: 2.908385 | lrm: 1.00 | dt: 649.67ms | tok/sec: 807,002 | mfu: 50.44 | epoch: 1 | total time: 22.25m | eta: 158.0m +step 02072/16704 (12.40%) | loss: 2.903787 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,462 | mfu: 50.72 | epoch: 1 | total time: 22.26m | eta: 158.0m +step 02073/16704 (12.41%) | loss: 2.881992 | lrm: 1.00 | dt: 649.74ms | tok/sec: 806,924 | mfu: 50.43 | epoch: 1 | total time: 22.27m | eta: 158.0m +step 02074/16704 (12.42%) | loss: 2.890413 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,494 | mfu: 50.78 | epoch: 1 | total time: 22.29m | eta: 158.0m +step 02075/16704 (12.42%) | loss: 2.889523 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,771 | mfu: 50.67 | epoch: 1 | total time: 22.30m | eta: 158.0m +step 02076/16704 (12.43%) | loss: 2.894695 | lrm: 1.00 | dt: 648.55ms | tok/sec: 808,397 | mfu: 50.53 | epoch: 1 | total time: 22.31m | eta: 157.9m +step 02077/16704 (12.43%) | loss: 2.897052 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,285 | mfu: 50.83 | epoch: 1 | total time: 22.32m | eta: 157.9m +step 02078/16704 (12.44%) | loss: 2.894570 | lrm: 1.00 | dt: 
645.40ms | tok/sec: 812,351 | mfu: 50.77 | epoch: 1 | total time: 22.33m | eta: 157.9m +step 02079/16704 (12.45%) | loss: 2.890322 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,928 | mfu: 50.62 | epoch: 1 | total time: 22.34m | eta: 157.9m +step 02080/16704 (12.45%) | loss: 2.907113 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,101 | mfu: 50.82 | epoch: 1 | total time: 22.35m | eta: 157.9m +step 02081/16704 (12.46%) | loss: 2.907874 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,862 | mfu: 50.68 | epoch: 1 | total time: 22.36m | eta: 157.9m +step 02082/16704 (12.46%) | loss: 2.903634 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,575 | mfu: 50.66 | epoch: 1 | total time: 22.37m | eta: 157.9m +step 02083/16704 (12.47%) | loss: 2.913065 | lrm: 1.00 | dt: 647.90ms | tok/sec: 809,215 | mfu: 50.58 | epoch: 1 | total time: 22.38m | eta: 157.9m +step 02084/16704 (12.48%) | loss: 2.917008 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,938 | mfu: 50.68 | epoch: 1 | total time: 22.39m | eta: 157.9m +step 02085/16704 (12.48%) | loss: 2.908596 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 1 | total time: 22.40m | eta: 157.8m +step 02086/16704 (12.49%) | loss: 2.896919 | lrm: 1.00 | dt: 649.89ms | tok/sec: 806,736 | mfu: 50.42 | epoch: 1 | total time: 22.41m | eta: 157.8m +step 02087/16704 (12.49%) | loss: 2.909599 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,250 | mfu: 50.70 | epoch: 1 | total time: 22.43m | eta: 157.8m +step 02088/16704 (12.50%) | loss: 2.907294 | lrm: 1.00 | dt: 648.73ms | tok/sec: 808,170 | mfu: 50.51 | epoch: 1 | total time: 22.44m | eta: 157.8m +step 02089/16704 (12.51%) | loss: 2.915720 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,356 | mfu: 50.84 | epoch: 1 | total time: 22.45m | eta: 157.8m +step 02090/16704 (12.51%) | loss: 2.918425 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,305 | mfu: 50.83 | epoch: 1 | total time: 22.46m | eta: 157.8m +step 02091/16704 (12.52%) | loss: 2.913407 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,244 | mfu: 50.70 | epoch: 1 | total 
time: 22.47m | eta: 157.8m +step 02092/16704 (12.52%) | loss: 2.911673 | lrm: 1.00 | dt: 647.69ms | tok/sec: 809,474 | mfu: 50.59 | epoch: 1 | total time: 22.48m | eta: 157.8m +step 02093/16704 (12.53%) | loss: 2.903492 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,897 | mfu: 50.74 | epoch: 1 | total time: 22.49m | eta: 157.8m +step 02094/16704 (12.54%) | loss: 2.903167 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,382 | mfu: 50.71 | epoch: 1 | total time: 22.50m | eta: 157.7m +step 02095/16704 (12.54%) | loss: 2.899278 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 1 | total time: 22.51m | eta: 157.7m +step 02096/16704 (12.55%) | loss: 2.913675 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,095 | mfu: 50.82 | epoch: 1 | total time: 22.52m | eta: 157.7m +step 02097/16704 (12.55%) | loss: 2.902709 | lrm: 1.00 | dt: 648.05ms | tok/sec: 809,029 | mfu: 50.57 | epoch: 1 | total time: 22.53m | eta: 157.7m +step 02098/16704 (12.56%) | loss: 2.887214 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,074 | mfu: 50.76 | epoch: 1 | total time: 22.54m | eta: 157.7m +step 02099/16704 (12.57%) | loss: 2.890018 | lrm: 1.00 | dt: 648.70ms | tok/sec: 808,214 | mfu: 50.51 | epoch: 1 | total time: 22.55m | eta: 157.7m +step 02100/16704 (12.57%) | loss: 2.885829 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,849 | mfu: 50.68 | epoch: 1 | total time: 22.57m | eta: 157.7m +step 02101/16704 (12.58%) | loss: 2.892185 | lrm: 1.00 | dt: 648.43ms | tok/sec: 808,545 | mfu: 50.54 | epoch: 1 | total time: 22.58m | eta: 157.7m +step 02102/16704 (12.58%) | loss: 2.888349 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,075 | mfu: 50.82 | epoch: 1 | total time: 22.59m | eta: 157.7m +step 02103/16704 (12.59%) | loss: 2.892086 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,248 | mfu: 50.70 | epoch: 1 | total time: 22.60m | eta: 157.6m +step 02104/16704 (12.60%) | loss: 2.894928 | lrm: 1.00 | dt: 647.38ms | tok/sec: 809,859 | mfu: 50.62 | epoch: 1 | total time: 22.61m | eta: 157.6m +step 02105/16704 (12.60%) | loss: 
2.913883 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,975 | mfu: 50.69 | epoch: 1 | total time: 22.62m | eta: 157.6m +step 02106/16704 (12.61%) | loss: 2.920322 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,663 | mfu: 50.73 | epoch: 1 | total time: 22.63m | eta: 157.6m +step 02107/16704 (12.61%) | loss: 2.919371 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,673 | mfu: 50.73 | epoch: 1 | total time: 22.64m | eta: 157.6m +step 02108/16704 (12.62%) | loss: 2.925163 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,821 | mfu: 50.80 | epoch: 1 | total time: 22.65m | eta: 157.6m +step 02109/16704 (12.63%) | loss: 2.911432 | lrm: 1.00 | dt: 650.33ms | tok/sec: 806,193 | mfu: 50.39 | epoch: 1 | total time: 22.66m | eta: 157.6m +step 02110/16704 (12.63%) | loss: 2.916603 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,558 | mfu: 50.85 | epoch: 1 | total time: 22.67m | eta: 157.6m +step 02111/16704 (12.64%) | loss: 2.914597 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 1 | total time: 22.68m | eta: 157.6m +step 02112/16704 (12.64%) | loss: 2.913071 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,293 | mfu: 50.71 | epoch: 1 | total time: 22.69m | eta: 157.5m +step 02113/16704 (12.65%) | loss: 2.904280 | lrm: 1.00 | dt: 649.55ms | tok/sec: 807,153 | mfu: 50.45 | epoch: 1 | total time: 22.71m | eta: 157.5m +step 02114/16704 (12.66%) | loss: 2.905660 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,906 | mfu: 50.68 | epoch: 1 | total time: 22.72m | eta: 157.5m +step 02115/16704 (12.66%) | loss: 2.915210 | lrm: 1.00 | dt: 648.36ms | tok/sec: 808,637 | mfu: 50.54 | epoch: 1 | total time: 22.73m | eta: 157.5m +step 02116/16704 (12.67%) | loss: 2.907268 | lrm: 1.00 | dt: 647.67ms | tok/sec: 809,496 | mfu: 50.59 | epoch: 1 | total time: 22.74m | eta: 157.5m +step 02117/16704 (12.67%) | loss: 2.901897 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,094 | mfu: 50.82 | epoch: 1 | total time: 22.75m | eta: 157.5m +step 02118/16704 (12.68%) | loss: 2.912124 | lrm: 1.00 | dt: 649.48ms | tok/sec: 807,237 | mfu: 
50.45 | epoch: 1 | total time: 22.76m | eta: 157.5m +step 02119/16704 (12.69%) | loss: 2.901265 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 1 | total time: 22.77m | eta: 157.5m +step 02120/16704 (12.69%) | loss: 2.918248 | lrm: 1.00 | dt: 648.05ms | tok/sec: 809,029 | mfu: 50.57 | epoch: 1 | total time: 22.78m | eta: 157.5m +step 02121/16704 (12.70%) | loss: 2.911456 | lrm: 1.00 | dt: 647.66ms | tok/sec: 809,511 | mfu: 50.60 | epoch: 1 | total time: 22.79m | eta: 157.4m +step 02122/16704 (12.70%) | loss: 2.912203 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,470 | mfu: 50.91 | epoch: 1 | total time: 22.80m | eta: 157.4m +step 02123/16704 (12.71%) | loss: 2.908943 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,731 | mfu: 50.73 | epoch: 1 | total time: 22.81m | eta: 157.4m +step 02124/16704 (12.72%) | loss: 2.902155 | lrm: 1.00 | dt: 649.10ms | tok/sec: 807,721 | mfu: 50.48 | epoch: 1 | total time: 22.82m | eta: 157.4m +step 02125/16704 (12.72%) | loss: 2.893524 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 1 | total time: 22.84m | eta: 157.4m +step 02126/16704 (12.73%) | loss: 2.896294 | lrm: 1.00 | dt: 648.05ms | tok/sec: 809,028 | mfu: 50.57 | epoch: 1 | total time: 22.85m | eta: 157.4m +step 02127/16704 (12.73%) | loss: 2.900988 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 1 | total time: 22.86m | eta: 157.4m +step 02128/16704 (12.74%) | loss: 2.895649 | lrm: 1.00 | dt: 647.77ms | tok/sec: 809,371 | mfu: 50.59 | epoch: 1 | total time: 22.87m | eta: 157.4m +step 02129/16704 (12.75%) | loss: 2.897524 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,697 | mfu: 50.86 | epoch: 1 | total time: 22.88m | eta: 157.4m +step 02130/16704 (12.75%) | loss: 2.899333 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,094 | mfu: 50.69 | epoch: 1 | total time: 22.89m | eta: 157.4m +step 02131/16704 (12.76%) | loss: 2.914201 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,203 | mfu: 50.70 | epoch: 1 | total time: 22.90m | eta: 157.3m +step 
02132/16704 (12.76%) | loss: 2.910303 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,246 | mfu: 50.70 | epoch: 1 | total time: 22.91m | eta: 157.3m +step 02133/16704 (12.77%) | loss: 2.906749 | lrm: 1.00 | dt: 647.78ms | tok/sec: 809,355 | mfu: 50.59 | epoch: 1 | total time: 22.92m | eta: 157.3m +step 02134/16704 (12.78%) | loss: 2.912840 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,878 | mfu: 50.74 | epoch: 1 | total time: 22.93m | eta: 157.3m +step 02135/16704 (12.78%) | loss: 2.907868 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,519 | mfu: 50.72 | epoch: 1 | total time: 22.94m | eta: 157.3m +step 02136/16704 (12.79%) | loss: 2.893662 | lrm: 1.00 | dt: 648.29ms | tok/sec: 808,724 | mfu: 50.55 | epoch: 1 | total time: 22.95m | eta: 157.3m +step 02137/16704 (12.79%) | loss: 2.892965 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,131 | mfu: 50.70 | epoch: 1 | total time: 22.96m | eta: 157.3m +step 02138/16704 (12.80%) | loss: 2.906936 | lrm: 1.00 | dt: 650.00ms | tok/sec: 806,595 | mfu: 50.41 | epoch: 1 | total time: 22.98m | eta: 157.3m +step 02139/16704 (12.81%) | loss: 2.911627 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,127 | mfu: 50.76 | epoch: 1 | total time: 22.99m | eta: 157.3m +step 02140/16704 (12.81%) | loss: 2.910478 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,855 | mfu: 50.80 | epoch: 1 | total time: 23.00m | eta: 157.2m +step 02141/16704 (12.82%) | loss: 2.920679 | lrm: 1.00 | dt: 649.52ms | tok/sec: 807,191 | mfu: 50.45 | epoch: 1 | total time: 23.01m | eta: 157.2m +step 02142/16704 (12.82%) | loss: 2.917681 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,370 | mfu: 50.90 | epoch: 1 | total time: 23.02m | eta: 157.2m +step 02143/16704 (12.83%) | loss: 2.920368 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,493 | mfu: 50.78 | epoch: 1 | total time: 23.03m | eta: 157.2m +step 02144/16704 (12.84%) | loss: 2.921432 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,979 | mfu: 50.75 | epoch: 1 | total time: 23.04m | eta: 157.2m +step 02145/16704 (12.84%) | loss: 2.914814 | lrm: 1.00 | dt: 
647.80ms | tok/sec: 809,332 | mfu: 50.58 | epoch: 1 | total time: 23.05m | eta: 157.2m +step 02146/16704 (12.85%) | loss: 2.910224 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,776 | mfu: 50.74 | epoch: 1 | total time: 23.06m | eta: 157.2m +step 02147/16704 (12.85%) | loss: 2.902366 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,527 | mfu: 50.66 | epoch: 1 | total time: 23.07m | eta: 157.2m +step 02148/16704 (12.86%) | loss: 2.903530 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,886 | mfu: 50.74 | epoch: 1 | total time: 23.08m | eta: 157.2m +step 02149/16704 (12.87%) | loss: 2.903855 | lrm: 1.00 | dt: 648.23ms | tok/sec: 808,798 | mfu: 50.55 | epoch: 1 | total time: 23.09m | eta: 157.1m +step 02150/16704 (12.87%) | loss: 2.901559 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,994 | mfu: 50.88 | epoch: 1 | total time: 23.10m | eta: 157.1m +step 02151/16704 (12.88%) | loss: 2.888289 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,777 | mfu: 50.55 | epoch: 1 | total time: 23.12m | eta: 157.1m +step 02152/16704 (12.88%) | loss: 2.891775 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,498 | mfu: 50.78 | epoch: 1 | total time: 23.13m | eta: 157.1m +step 02153/16704 (12.89%) | loss: 2.875467 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,879 | mfu: 50.74 | epoch: 1 | total time: 23.14m | eta: 157.1m +step 02154/16704 (12.90%) | loss: 2.878701 | lrm: 1.00 | dt: 646.63ms | tok/sec: 810,796 | mfu: 50.68 | epoch: 1 | total time: 23.15m | eta: 157.1m +step 02155/16704 (12.90%) | loss: 2.876910 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,509 | mfu: 50.78 | epoch: 1 | total time: 23.16m | eta: 157.1m +step 02156/16704 (12.91%) | loss: 2.877351 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,151 | mfu: 50.70 | epoch: 1 | total time: 23.17m | eta: 157.1m +step 02157/16704 (12.91%) | loss: 2.882782 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,424 | mfu: 50.59 | epoch: 1 | total time: 23.18m | eta: 157.1m +step 02158/16704 (12.92%) | loss: 2.888799 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,868 | mfu: 50.74 | epoch: 1 | total 
time: 23.19m | eta: 157.0m +step 02159/16704 (12.93%) | loss: 2.882962 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,352 | mfu: 50.71 | epoch: 1 | total time: 23.20m | eta: 157.0m +step 02160/16704 (12.93%) | loss: 2.894928 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,837 | mfu: 50.74 | epoch: 1 | total time: 23.21m | eta: 157.0m +step 02161/16704 (12.94%) | loss: 2.901018 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 1 | total time: 23.22m | eta: 157.0m +step 02162/16704 (12.94%) | loss: 2.895248 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,752 | mfu: 50.67 | epoch: 1 | total time: 23.23m | eta: 157.0m +step 02163/16704 (12.95%) | loss: 2.900285 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,077 | mfu: 50.76 | epoch: 1 | total time: 23.24m | eta: 157.0m +step 02164/16704 (12.95%) | loss: 2.881777 | lrm: 1.00 | dt: 648.65ms | tok/sec: 808,272 | mfu: 50.52 | epoch: 1 | total time: 23.26m | eta: 157.0m +step 02165/16704 (12.96%) | loss: 2.870547 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,464 | mfu: 50.78 | epoch: 1 | total time: 23.27m | eta: 157.0m +step 02166/16704 (12.97%) | loss: 2.853205 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,148 | mfu: 50.70 | epoch: 1 | total time: 23.28m | eta: 157.0m +step 02167/16704 (12.97%) | loss: 2.873964 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,776 | mfu: 50.67 | epoch: 1 | total time: 23.29m | eta: 156.9m +step 02168/16704 (12.98%) | loss: 2.875693 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,217 | mfu: 50.83 | epoch: 1 | total time: 23.30m | eta: 156.9m +step 02169/16704 (12.98%) | loss: 2.870322 | lrm: 1.00 | dt: 649.29ms | tok/sec: 807,482 | mfu: 50.47 | epoch: 1 | total time: 23.31m | eta: 156.9m +step 02170/16704 (12.99%) | loss: 2.880824 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 1 | total time: 23.32m | eta: 156.9m +step 02171/16704 (13.00%) | loss: 2.886582 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,049 | mfu: 50.88 | epoch: 1 | total time: 23.33m | eta: 156.9m +step 02172/16704 (13.00%) | loss: 
2.883918 | lrm: 1.00 | dt: 647.94ms | tok/sec: 809,165 | mfu: 50.57 | epoch: 1 | total time: 23.34m | eta: 156.9m +step 02173/16704 (13.01%) | loss: 2.889107 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,776 | mfu: 50.67 | epoch: 1 | total time: 23.35m | eta: 156.9m +step 02174/16704 (13.01%) | loss: 2.889763 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,548 | mfu: 50.85 | epoch: 1 | total time: 23.36m | eta: 156.9m +step 02175/16704 (13.02%) | loss: 2.884546 | lrm: 1.00 | dt: 648.86ms | tok/sec: 808,017 | mfu: 50.50 | epoch: 1 | total time: 23.37m | eta: 156.9m +step 02176/16704 (13.03%) | loss: 2.872471 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,302 | mfu: 50.71 | epoch: 1 | total time: 23.38m | eta: 156.8m +step 02177/16704 (13.03%) | loss: 2.875735 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,203 | mfu: 50.70 | epoch: 1 | total time: 23.40m | eta: 156.8m +step 02178/16704 (13.04%) | loss: 2.872267 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,368 | mfu: 50.71 | epoch: 1 | total time: 23.41m | eta: 156.8m +step 02179/16704 (13.04%) | loss: 2.885161 | lrm: 1.00 | dt: 648.97ms | tok/sec: 807,875 | mfu: 50.49 | epoch: 1 | total time: 23.42m | eta: 156.8m +step 02180/16704 (13.05%) | loss: 2.880372 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,867 | mfu: 50.87 | epoch: 1 | total time: 23.43m | eta: 156.8m +step 02181/16704 (13.06%) | loss: 2.875358 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,003 | mfu: 50.63 | epoch: 1 | total time: 23.44m | eta: 156.8m +step 02182/16704 (13.06%) | loss: 2.874538 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,295 | mfu: 50.58 | epoch: 1 | total time: 23.45m | eta: 156.8m +step 02183/16704 (13.07%) | loss: 2.878646 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,454 | mfu: 50.65 | epoch: 1 | total time: 23.46m | eta: 156.8m +step 02184/16704 (13.07%) | loss: 2.879340 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,660 | mfu: 50.67 | epoch: 1 | total time: 23.47m | eta: 156.8m +step 02185/16704 (13.08%) | loss: 2.881000 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,005 | mfu: 
50.69 | epoch: 1 | total time: 23.48m | eta: 156.7m +step 02186/16704 (13.09%) | loss: 2.881779 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,462 | mfu: 50.72 | epoch: 1 | total time: 23.49m | eta: 156.7m +step 02187/16704 (13.09%) | loss: 2.878833 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,055 | mfu: 50.69 | epoch: 1 | total time: 23.50m | eta: 156.7m +step 02188/16704 (13.10%) | loss: 2.878729 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,865 | mfu: 50.74 | epoch: 1 | total time: 23.51m | eta: 156.7m +step 02189/16704 (13.10%) | loss: 2.858271 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,749 | mfu: 50.74 | epoch: 1 | total time: 23.52m | eta: 156.7m +step 02190/16704 (13.11%) | loss: 2.870871 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,090 | mfu: 50.69 | epoch: 1 | total time: 23.54m | eta: 156.7m +step 02191/16704 (13.12%) | loss: 2.886064 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,438 | mfu: 50.53 | epoch: 1 | total time: 23.55m | eta: 156.7m +step 02192/16704 (13.12%) | loss: 2.889337 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,678 | mfu: 50.79 | epoch: 1 | total time: 23.56m | eta: 156.7m +step 02193/16704 (13.13%) | loss: 2.886295 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,627 | mfu: 50.60 | epoch: 1 | total time: 23.57m | eta: 156.7m +step 02194/16704 (13.13%) | loss: 2.907028 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 1 | total time: 23.58m | eta: 156.7m +step 02195/16704 (13.14%) | loss: 2.899269 | lrm: 1.00 | dt: 647.79ms | tok/sec: 809,351 | mfu: 50.59 | epoch: 1 | total time: 23.59m | eta: 156.6m +step 02196/16704 (13.15%) | loss: 2.891604 | lrm: 1.00 | dt: 648.48ms | tok/sec: 808,483 | mfu: 50.53 | epoch: 1 | total time: 23.60m | eta: 156.6m +step 02197/16704 (13.15%) | loss: 2.891938 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,194 | mfu: 50.64 | epoch: 1 | total time: 23.61m | eta: 156.6m +step 02198/16704 (13.16%) | loss: 2.889422 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,109 | mfu: 50.88 | epoch: 1 | total time: 23.62m | eta: 156.6m +step 
02199/16704 (13.16%) | loss: 2.904354 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,326 | mfu: 50.65 | epoch: 1 | total time: 23.63m | eta: 156.6m +step 02200/16704 (13.17%) | loss: 2.874834 | lrm: 1.00 | dt: 647.21ms | tok/sec: 810,069 | mfu: 50.63 | epoch: 1 | total time: 23.64m | eta: 156.6m +step 02201/16704 (13.18%) | loss: 2.887309 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,303 | mfu: 50.71 | epoch: 1 | total time: 23.65m | eta: 156.6m +step 02202/16704 (13.18%) | loss: 2.885280 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,892 | mfu: 50.74 | epoch: 1 | total time: 23.66m | eta: 156.6m +step 02203/16704 (13.19%) | loss: 2.893571 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,441 | mfu: 50.84 | epoch: 1 | total time: 23.68m | eta: 156.6m +step 02204/16704 (13.19%) | loss: 2.895407 | lrm: 1.00 | dt: 647.15ms | tok/sec: 810,153 | mfu: 50.64 | epoch: 1 | total time: 23.69m | eta: 156.5m +step 02205/16704 (13.20%) | loss: 2.875056 | lrm: 1.00 | dt: 647.45ms | tok/sec: 809,768 | mfu: 50.61 | epoch: 1 | total time: 23.70m | eta: 156.5m +step 02206/16704 (13.21%) | loss: 2.879452 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,321 | mfu: 50.58 | epoch: 1 | total time: 23.71m | eta: 156.5m +step 02207/16704 (13.21%) | loss: 2.873244 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,217 | mfu: 50.89 | epoch: 1 | total time: 23.72m | eta: 156.5m +step 02208/16704 (13.22%) | loss: 2.873696 | lrm: 1.00 | dt: 648.11ms | tok/sec: 808,954 | mfu: 50.56 | epoch: 1 | total time: 23.73m | eta: 156.5m +step 02209/16704 (13.22%) | loss: 2.875625 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,027 | mfu: 50.82 | epoch: 1 | total time: 23.74m | eta: 156.5m +step 02210/16704 (13.23%) | loss: 2.880923 | lrm: 1.00 | dt: 647.17ms | tok/sec: 810,120 | mfu: 50.63 | epoch: 1 | total time: 23.75m | eta: 156.5m +step 02211/16704 (13.24%) | loss: 2.887681 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,986 | mfu: 50.75 | epoch: 1 | total time: 23.76m | eta: 156.5m +step 02212/16704 (13.24%) | loss: 2.887430 | lrm: 1.00 | dt: 
643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 1 | total time: 23.77m | eta: 156.5m +step 02213/16704 (13.25%) | loss: 2.886905 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,452 | mfu: 50.72 | epoch: 1 | total time: 23.78m | eta: 156.4m +step 02214/16704 (13.25%) | loss: 2.875928 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,187 | mfu: 50.76 | epoch: 1 | total time: 23.79m | eta: 156.4m +step 02215/16704 (13.26%) | loss: 2.882060 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,315 | mfu: 50.83 | epoch: 1 | total time: 23.80m | eta: 156.4m +step 02216/16704 (13.27%) | loss: 2.879654 | lrm: 1.00 | dt: 647.86ms | tok/sec: 809,263 | mfu: 50.58 | epoch: 1 | total time: 23.82m | eta: 156.4m +step 02217/16704 (13.27%) | loss: 2.893807 | lrm: 1.00 | dt: 648.20ms | tok/sec: 808,840 | mfu: 50.55 | epoch: 1 | total time: 23.83m | eta: 156.4m +step 02218/16704 (13.28%) | loss: 2.889405 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,246 | mfu: 50.77 | epoch: 1 | total time: 23.84m | eta: 156.4m +step 02219/16704 (13.28%) | loss: 2.891518 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,858 | mfu: 50.74 | epoch: 1 | total time: 23.85m | eta: 156.4m +step 02220/16704 (13.29%) | loss: 2.886940 | lrm: 1.00 | dt: 648.81ms | tok/sec: 808,072 | mfu: 50.51 | epoch: 1 | total time: 23.86m | eta: 156.4m +step 02221/16704 (13.30%) | loss: 2.877072 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,228 | mfu: 50.83 | epoch: 1 | total time: 23.87m | eta: 156.4m +step 02222/16704 (13.30%) | loss: 2.874409 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,069 | mfu: 50.57 | epoch: 1 | total time: 23.88m | eta: 156.3m +step 02223/16704 (13.31%) | loss: 2.884451 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 1 | total time: 23.89m | eta: 156.3m +step 02224/16704 (13.31%) | loss: 2.883712 | lrm: 1.00 | dt: 648.15ms | tok/sec: 808,901 | mfu: 50.56 | epoch: 1 | total time: 23.90m | eta: 156.3m +step 02225/16704 (13.32%) | loss: 2.871760 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,313 | mfu: 50.77 | epoch: 1 | total 
time: 23.91m | eta: 156.3m +step 02226/16704 (13.33%) | loss: 2.893253 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,303 | mfu: 50.77 | epoch: 1 | total time: 23.92m | eta: 156.3m +step 02227/16704 (13.33%) | loss: 2.897885 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,399 | mfu: 50.71 | epoch: 1 | total time: 23.93m | eta: 156.3m +step 02228/16704 (13.34%) | loss: 2.896978 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,542 | mfu: 50.66 | epoch: 1 | total time: 23.94m | eta: 156.3m +step 02229/16704 (13.34%) | loss: 2.912360 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,952 | mfu: 50.81 | epoch: 1 | total time: 23.96m | eta: 156.3m +step 02230/16704 (13.35%) | loss: 2.908017 | lrm: 1.00 | dt: 646.70ms | tok/sec: 810,710 | mfu: 50.67 | epoch: 1 | total time: 23.97m | eta: 156.3m +step 02231/16704 (13.36%) | loss: 2.893476 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,944 | mfu: 50.75 | epoch: 1 | total time: 23.98m | eta: 156.2m +step 02232/16704 (13.36%) | loss: 2.895262 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,875 | mfu: 50.81 | epoch: 1 | total time: 23.99m | eta: 156.2m +step 02233/16704 (13.37%) | loss: 2.894540 | lrm: 1.00 | dt: 648.27ms | tok/sec: 808,754 | mfu: 50.55 | epoch: 1 | total time: 24.00m | eta: 156.2m +step 02234/16704 (13.37%) | loss: 2.893735 | lrm: 1.00 | dt: 649.23ms | tok/sec: 807,547 | mfu: 50.47 | epoch: 1 | total time: 24.01m | eta: 156.2m +step 02235/16704 (13.38%) | loss: 2.896142 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,363 | mfu: 50.71 | epoch: 1 | total time: 24.02m | eta: 156.2m +step 02236/16704 (13.39%) | loss: 2.897266 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,226 | mfu: 50.77 | epoch: 1 | total time: 24.03m | eta: 156.2m +step 02237/16704 (13.39%) | loss: 2.881858 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,255 | mfu: 50.70 | epoch: 1 | total time: 24.04m | eta: 156.2m +step 02238/16704 (13.40%) | loss: 2.881933 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,190 | mfu: 50.70 | epoch: 1 | total time: 24.05m | eta: 156.2m +step 02239/16704 (13.40%) | loss: 
2.874342 | lrm: 1.00 | dt: 648.44ms | tok/sec: 808,539 | mfu: 50.53 | epoch: 1 | total time: 24.06m | eta: 156.2m +step 02240/16704 (13.41%) | loss: 2.860730 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,497 | mfu: 50.66 | epoch: 1 | total time: 24.07m | eta: 156.1m +step 02241/16704 (13.42%) | loss: 2.859484 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 1 | total time: 24.08m | eta: 156.1m +step 02242/16704 (13.42%) | loss: 2.874162 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,377 | mfu: 50.71 | epoch: 1 | total time: 24.10m | eta: 156.1m +step 02243/16704 (13.43%) | loss: 2.874546 | lrm: 1.00 | dt: 647.39ms | tok/sec: 809,849 | mfu: 50.62 | epoch: 1 | total time: 24.11m | eta: 156.1m +step 02244/16704 (13.43%) | loss: 2.876566 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,272 | mfu: 50.77 | epoch: 1 | total time: 24.12m | eta: 156.1m +step 02245/16704 (13.44%) | loss: 2.879457 | lrm: 1.00 | dt: 647.78ms | tok/sec: 809,361 | mfu: 50.59 | epoch: 1 | total time: 24.13m | eta: 156.1m +step 02246/16704 (13.45%) | loss: 2.883371 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,595 | mfu: 50.85 | epoch: 1 | total time: 24.14m | eta: 156.1m +step 02247/16704 (13.45%) | loss: 2.888411 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 1 | total time: 24.15m | eta: 156.1m +step 02248/16704 (13.46%) | loss: 2.885172 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,732 | mfu: 50.80 | epoch: 1 | total time: 24.16m | eta: 156.1m +step 02249/16704 (13.46%) | loss: 2.897734 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,090 | mfu: 50.69 | epoch: 1 | total time: 24.17m | eta: 156.0m +Step 02250 | Validation bpb: 0.877760 +step 02250/16704 (13.47%) | loss: 2.885789 | lrm: 1.00 | dt: 649.73ms | tok/sec: 806,933 | mfu: 50.43 | epoch: 1 | total time: 24.18m | eta: 156.0m +step 02251/16704 (13.48%) | loss: 2.894021 | lrm: 1.00 | dt: 650.01ms | tok/sec: 806,590 | mfu: 50.41 | epoch: 1 | total time: 24.19m | eta: 156.0m +step 02252/16704 (13.48%) | loss: 2.888505 | lrm: 1.00 | 
dt: 644.34ms | tok/sec: 813,676 | mfu: 50.86 | epoch: 1 | total time: 24.20m | eta: 156.0m +step 02253/16704 (13.49%) | loss: 2.897144 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,901 | mfu: 50.87 | epoch: 1 | total time: 24.21m | eta: 156.0m +step 02254/16704 (13.49%) | loss: 2.899135 | lrm: 1.00 | dt: 648.69ms | tok/sec: 808,230 | mfu: 50.52 | epoch: 1 | total time: 24.22m | eta: 156.0m +step 02255/16704 (13.50%) | loss: 2.915951 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,714 | mfu: 50.80 | epoch: 1 | total time: 24.24m | eta: 156.0m +step 02256/16704 (13.51%) | loss: 2.908689 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,254 | mfu: 50.83 | epoch: 1 | total time: 24.25m | eta: 156.0m +step 02257/16704 (13.51%) | loss: 2.907222 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,440 | mfu: 50.53 | epoch: 1 | total time: 24.26m | eta: 156.0m +step 02258/16704 (13.52%) | loss: 2.894877 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,012 | mfu: 50.88 | epoch: 1 | total time: 24.27m | eta: 155.9m +step 02259/16704 (13.52%) | loss: 2.898546 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 1 | total time: 24.28m | eta: 155.9m +step 02260/16704 (13.53%) | loss: 2.894267 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,501 | mfu: 50.66 | epoch: 1 | total time: 24.29m | eta: 155.9m +step 02261/16704 (13.54%) | loss: 2.884105 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,206 | mfu: 50.76 | epoch: 1 | total time: 24.30m | eta: 155.9m +step 02262/16704 (13.54%) | loss: 2.892333 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 1 | total time: 24.31m | eta: 155.9m +step 02263/16704 (13.55%) | loss: 2.916027 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,905 | mfu: 50.75 | epoch: 1 | total time: 24.32m | eta: 155.9m +step 02264/16704 (13.55%) | loss: 2.915093 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,170 | mfu: 50.70 | epoch: 1 | total time: 24.33m | eta: 155.9m +step 02265/16704 (13.56%) | loss: 2.913024 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,008 | mfu: 50.81 | epoch: 1 | 
total time: 24.34m | eta: 155.9m +step 02266/16704 (13.57%) | loss: 2.920863 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,714 | mfu: 50.86 | epoch: 1 | total time: 24.35m | eta: 155.9m +step 02267/16704 (13.57%) | loss: 2.910276 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,947 | mfu: 50.62 | epoch: 1 | total time: 24.36m | eta: 155.9m +step 02268/16704 (13.58%) | loss: 2.911223 | lrm: 1.00 | dt: 648.33ms | tok/sec: 808,679 | mfu: 50.54 | epoch: 1 | total time: 24.38m | eta: 155.8m +step 02269/16704 (13.58%) | loss: 2.908279 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,070 | mfu: 50.76 | epoch: 1 | total time: 24.39m | eta: 155.8m +step 02270/16704 (13.59%) | loss: 2.917973 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,362 | mfu: 50.71 | epoch: 1 | total time: 24.40m | eta: 155.8m +step 02271/16704 (13.60%) | loss: 2.922334 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,237 | mfu: 50.70 | epoch: 1 | total time: 24.41m | eta: 155.8m +step 02272/16704 (13.60%) | loss: 2.917881 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,325 | mfu: 50.77 | epoch: 1 | total time: 24.42m | eta: 155.8m +step 02273/16704 (13.61%) | loss: 2.913145 | lrm: 1.00 | dt: 648.29ms | tok/sec: 808,727 | mfu: 50.55 | epoch: 1 | total time: 24.43m | eta: 155.8m +step 02274/16704 (13.61%) | loss: 2.906682 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,509 | mfu: 50.72 | epoch: 1 | total time: 24.44m | eta: 155.8m +step 02275/16704 (13.62%) | loss: 2.876069 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,810 | mfu: 50.80 | epoch: 1 | total time: 24.45m | eta: 155.8m +step 02276/16704 (13.63%) | loss: 2.877974 | lrm: 1.00 | dt: 648.38ms | tok/sec: 808,612 | mfu: 50.54 | epoch: 1 | total time: 24.46m | eta: 155.8m +step 02277/16704 (13.63%) | loss: 2.881285 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,435 | mfu: 50.90 | epoch: 1 | total time: 24.47m | eta: 155.7m +step 02278/16704 (13.64%) | loss: 2.883638 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,690 | mfu: 50.73 | epoch: 1 | total time: 24.48m | eta: 155.7m +step 02279/16704 (13.64%) | 
loss: 2.894787 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,833 | mfu: 50.80 | epoch: 1 | total time: 24.49m | eta: 155.7m +step 02280/16704 (13.65%) | loss: 2.889846 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,427 | mfu: 50.84 | epoch: 1 | total time: 24.50m | eta: 155.7m +step 02281/16704 (13.66%) | loss: 2.880261 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,047 | mfu: 50.82 | epoch: 1 | total time: 24.52m | eta: 155.7m +step 02282/16704 (13.66%) | loss: 2.864307 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 1 | total time: 24.53m | eta: 155.7m +step 02283/16704 (13.67%) | loss: 2.871802 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,582 | mfu: 50.73 | epoch: 1 | total time: 24.54m | eta: 155.7m +step 02284/16704 (13.67%) | loss: 2.875189 | lrm: 1.00 | dt: 648.91ms | tok/sec: 807,953 | mfu: 50.50 | epoch: 1 | total time: 24.55m | eta: 155.7m +step 02285/16704 (13.68%) | loss: 2.881130 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,845 | mfu: 50.87 | epoch: 1 | total time: 24.56m | eta: 155.7m +step 02286/16704 (13.69%) | loss: 2.856855 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,688 | mfu: 50.73 | epoch: 1 | total time: 24.57m | eta: 155.6m +step 02287/16704 (13.69%) | loss: 2.866655 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,160 | mfu: 50.70 | epoch: 1 | total time: 24.58m | eta: 155.6m +step 02288/16704 (13.70%) | loss: 2.882490 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,144 | mfu: 50.70 | epoch: 1 | total time: 24.59m | eta: 155.6m +step 02289/16704 (13.70%) | loss: 2.868820 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,736 | mfu: 50.67 | epoch: 1 | total time: 24.60m | eta: 155.6m +step 02290/16704 (13.71%) | loss: 2.874141 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,880 | mfu: 50.68 | epoch: 1 | total time: 24.61m | eta: 155.6m +step 02291/16704 (13.72%) | loss: 2.874880 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 1 | total time: 24.62m | eta: 155.6m +step 02292/16704 (13.72%) | loss: 2.873848 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,280 | 
mfu: 50.71 | epoch: 1 | total time: 24.63m | eta: 155.6m +step 02293/16704 (13.73%) | loss: 2.873828 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,454 | mfu: 50.65 | epoch: 1 | total time: 24.64m | eta: 155.6m +step 02294/16704 (13.73%) | loss: 2.897006 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,021 | mfu: 50.75 | epoch: 1 | total time: 24.66m | eta: 155.6m +step 02295/16704 (13.74%) | loss: 2.898370 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,946 | mfu: 50.75 | epoch: 1 | total time: 24.67m | eta: 155.5m +step 02296/16704 (13.75%) | loss: 2.889291 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,643 | mfu: 50.67 | epoch: 1 | total time: 24.68m | eta: 155.5m +step 02297/16704 (13.75%) | loss: 2.878499 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,003 | mfu: 50.63 | epoch: 1 | total time: 24.69m | eta: 155.5m +step 02298/16704 (13.76%) | loss: 2.880774 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,198 | mfu: 50.70 | epoch: 1 | total time: 24.70m | eta: 155.5m +step 02299/16704 (13.76%) | loss: 2.878163 | lrm: 1.00 | dt: 648.88ms | tok/sec: 807,990 | mfu: 50.50 | epoch: 1 | total time: 24.71m | eta: 155.5m +step 02300/16704 (13.77%) | loss: 2.887576 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,963 | mfu: 50.81 | epoch: 1 | total time: 24.72m | eta: 155.5m +step 02301/16704 (13.78%) | loss: 2.879163 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,429 | mfu: 50.59 | epoch: 1 | total time: 24.73m | eta: 155.5m +step 02302/16704 (13.78%) | loss: 2.880066 | lrm: 1.00 | dt: 649.20ms | tok/sec: 807,590 | mfu: 50.48 | epoch: 1 | total time: 24.74m | eta: 155.5m +step 02303/16704 (13.79%) | loss: 2.880805 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,411 | mfu: 50.84 | epoch: 1 | total time: 24.75m | eta: 155.5m +step 02304/16704 (13.79%) | loss: 2.891941 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,701 | mfu: 50.61 | epoch: 1 | total time: 24.76m | eta: 155.4m +step 02305/16704 (13.80%) | loss: 2.886444 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,855 | mfu: 50.74 | epoch: 1 | total time: 24.77m | eta: 155.4m +step 
02306/16704 (13.81%) | loss: 2.883815 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,259 | mfu: 50.64 | epoch: 1 | total time: 24.78m | eta: 155.4m +step 02307/16704 (13.81%) | loss: 2.891121 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,463 | mfu: 50.72 | epoch: 1 | total time: 24.80m | eta: 155.4m +step 02308/16704 (13.82%) | loss: 2.859753 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,513 | mfu: 50.66 | epoch: 1 | total time: 24.81m | eta: 155.4m +step 02309/16704 (13.82%) | loss: 2.864216 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,091 | mfu: 50.82 | epoch: 1 | total time: 24.82m | eta: 155.4m +step 02310/16704 (13.83%) | loss: 2.859204 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,622 | mfu: 50.60 | epoch: 1 | total time: 24.83m | eta: 155.4m +step 02311/16704 (13.84%) | loss: 2.855730 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,111 | mfu: 50.76 | epoch: 1 | total time: 24.84m | eta: 155.4m +step 02312/16704 (13.84%) | loss: 2.865866 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,093 | mfu: 50.69 | epoch: 1 | total time: 24.85m | eta: 155.4m +step 02313/16704 (13.85%) | loss: 2.882149 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,255 | mfu: 50.70 | epoch: 1 | total time: 24.86m | eta: 155.3m +step 02314/16704 (13.85%) | loss: 2.884135 | lrm: 1.00 | dt: 648.58ms | tok/sec: 808,358 | mfu: 50.52 | epoch: 1 | total time: 24.87m | eta: 155.3m +step 02315/16704 (13.86%) | loss: 2.882398 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,672 | mfu: 50.67 | epoch: 1 | total time: 24.88m | eta: 155.3m +step 02316/16704 (13.86%) | loss: 2.897542 | lrm: 1.00 | dt: 648.14ms | tok/sec: 808,911 | mfu: 50.56 | epoch: 1 | total time: 24.89m | eta: 155.3m +step 02317/16704 (13.87%) | loss: 2.903617 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,108 | mfu: 50.76 | epoch: 1 | total time: 24.90m | eta: 155.3m +step 02318/16704 (13.88%) | loss: 2.895375 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,192 | mfu: 50.70 | epoch: 1 | total time: 24.91m | eta: 155.3m +step 02319/16704 (13.88%) | loss: 2.898186 | lrm: 1.00 | dt: 
645.05ms | tok/sec: 812,787 | mfu: 50.80 | epoch: 1 | total time: 24.93m | eta: 155.3m +step 02320/16704 (13.89%) | loss: 2.889174 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,093 | mfu: 50.76 | epoch: 1 | total time: 24.94m | eta: 155.3m +step 02321/16704 (13.89%) | loss: 2.883631 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,326 | mfu: 50.83 | epoch: 1 | total time: 24.95m | eta: 155.3m +step 02322/16704 (13.90%) | loss: 2.874561 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,957 | mfu: 50.69 | epoch: 1 | total time: 24.96m | eta: 155.2m +step 02323/16704 (13.91%) | loss: 2.880075 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,279 | mfu: 50.77 | epoch: 1 | total time: 24.97m | eta: 155.2m +step 02324/16704 (13.91%) | loss: 2.887928 | lrm: 1.00 | dt: 648.94ms | tok/sec: 807,912 | mfu: 50.50 | epoch: 1 | total time: 24.98m | eta: 155.2m +step 02325/16704 (13.92%) | loss: 2.894722 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,205 | mfu: 50.83 | epoch: 1 | total time: 24.99m | eta: 155.2m +step 02326/16704 (13.92%) | loss: 2.887893 | lrm: 1.00 | dt: 646.96ms | tok/sec: 810,384 | mfu: 50.65 | epoch: 1 | total time: 25.00m | eta: 155.2m +step 02327/16704 (13.93%) | loss: 2.886682 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 1 | total time: 25.01m | eta: 155.2m +step 02328/16704 (13.94%) | loss: 2.883400 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,429 | mfu: 50.84 | epoch: 1 | total time: 25.02m | eta: 155.2m +step 02329/16704 (13.94%) | loss: 2.878655 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,979 | mfu: 50.62 | epoch: 1 | total time: 25.03m | eta: 155.2m +step 02330/16704 (13.95%) | loss: 2.894124 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,442 | mfu: 50.84 | epoch: 1 | total time: 25.04m | eta: 155.2m +step 02331/16704 (13.95%) | loss: 2.889476 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 1 | total time: 25.05m | eta: 155.2m +step 02332/16704 (13.96%) | loss: 2.894143 | lrm: 1.00 | dt: 648.99ms | tok/sec: 807,847 | mfu: 50.49 | epoch: 1 | total 
time: 25.06m | eta: 155.1m +step 02333/16704 (13.97%) | loss: 2.883378 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,012 | mfu: 50.81 | epoch: 1 | total time: 25.08m | eta: 155.1m +step 02334/16704 (13.97%) | loss: 2.895618 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,830 | mfu: 50.74 | epoch: 1 | total time: 25.09m | eta: 155.1m +step 02335/16704 (13.98%) | loss: 2.877333 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,570 | mfu: 50.60 | epoch: 1 | total time: 25.10m | eta: 155.1m +step 02336/16704 (13.98%) | loss: 2.882695 | lrm: 1.00 | dt: 648.35ms | tok/sec: 808,649 | mfu: 50.54 | epoch: 1 | total time: 25.11m | eta: 155.1m +step 02337/16704 (13.99%) | loss: 2.880851 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,667 | mfu: 50.73 | epoch: 1 | total time: 25.12m | eta: 155.1m +step 02338/16704 (14.00%) | loss: 2.873475 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,177 | mfu: 50.82 | epoch: 1 | total time: 25.13m | eta: 155.1m +step 02339/16704 (14.00%) | loss: 2.860487 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,159 | mfu: 50.64 | epoch: 1 | total time: 25.14m | eta: 155.1m +step 02340/16704 (14.01%) | loss: 2.849810 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,297 | mfu: 50.96 | epoch: 1 | total time: 25.15m | eta: 155.1m +step 02341/16704 (14.01%) | loss: 2.863922 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,153 | mfu: 50.76 | epoch: 1 | total time: 25.16m | eta: 155.0m +step 02342/16704 (14.02%) | loss: 2.869613 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,557 | mfu: 50.66 | epoch: 1 | total time: 25.17m | eta: 155.0m +step 02343/16704 (14.03%) | loss: 2.876873 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,276 | mfu: 50.71 | epoch: 1 | total time: 25.18m | eta: 155.0m +step 02344/16704 (14.03%) | loss: 2.870839 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,981 | mfu: 50.69 | epoch: 1 | total time: 25.19m | eta: 155.0m +step 02345/16704 (14.04%) | loss: 2.882979 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,775 | mfu: 50.67 | epoch: 1 | total time: 25.20m | eta: 155.0m +step 02346/16704 (14.04%) | loss: 
2.875416 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,602 | mfu: 50.91 | epoch: 1 | total time: 25.22m | eta: 155.0m +step 02347/16704 (14.05%) | loss: 2.880810 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,525 | mfu: 50.66 | epoch: 1 | total time: 25.23m | eta: 155.0m +step 02348/16704 (14.06%) | loss: 2.876504 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 1 | total time: 25.24m | eta: 155.0m +step 02349/16704 (14.06%) | loss: 2.877390 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,402 | mfu: 50.71 | epoch: 1 | total time: 25.25m | eta: 155.0m +step 02350/16704 (14.07%) | loss: 2.892378 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,336 | mfu: 50.83 | epoch: 1 | total time: 25.26m | eta: 154.9m +step 02351/16704 (14.07%) | loss: 2.888400 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,384 | mfu: 50.84 | epoch: 1 | total time: 25.27m | eta: 154.9m +step 02352/16704 (14.08%) | loss: 2.881063 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,176 | mfu: 50.76 | epoch: 1 | total time: 25.28m | eta: 154.9m +step 02353/16704 (14.09%) | loss: 2.877930 | lrm: 1.00 | dt: 648.45ms | tok/sec: 808,521 | mfu: 50.53 | epoch: 1 | total time: 25.29m | eta: 154.9m +step 02354/16704 (14.09%) | loss: 2.880926 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,268 | mfu: 50.77 | epoch: 1 | total time: 25.30m | eta: 154.9m +step 02355/16704 (14.10%) | loss: 2.877450 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,108 | mfu: 50.63 | epoch: 1 | total time: 25.31m | eta: 154.9m +step 02356/16704 (14.10%) | loss: 2.871898 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,823 | mfu: 50.55 | epoch: 1 | total time: 25.32m | eta: 154.9m +step 02357/16704 (14.11%) | loss: 2.864727 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,844 | mfu: 50.93 | epoch: 1 | total time: 25.33m | eta: 154.9m +step 02358/16704 (14.12%) | loss: 2.879645 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,767 | mfu: 50.80 | epoch: 1 | total time: 25.34m | eta: 154.9m +step 02359/16704 (14.12%) | loss: 2.884947 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,831 | mfu: 
50.80 | epoch: 1 | total time: 25.36m | eta: 154.8m +step 02360/16704 (14.13%) | loss: 2.882445 | lrm: 1.00 | dt: 647.27ms | tok/sec: 809,993 | mfu: 50.63 | epoch: 1 | total time: 25.37m | eta: 154.8m +step 02361/16704 (14.13%) | loss: 2.885062 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,567 | mfu: 50.66 | epoch: 1 | total time: 25.38m | eta: 154.8m +step 02362/16704 (14.14%) | loss: 2.893307 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,193 | mfu: 50.70 | epoch: 1 | total time: 25.39m | eta: 154.8m +step 02363/16704 (14.15%) | loss: 2.889413 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 1 | total time: 25.40m | eta: 154.8m +step 02364/16704 (14.15%) | loss: 2.895294 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,385 | mfu: 50.84 | epoch: 1 | total time: 25.41m | eta: 154.8m +step 02365/16704 (14.16%) | loss: 2.893451 | lrm: 1.00 | dt: 647.59ms | tok/sec: 809,593 | mfu: 50.60 | epoch: 1 | total time: 25.42m | eta: 154.8m +step 02366/16704 (14.16%) | loss: 2.900856 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,236 | mfu: 50.64 | epoch: 1 | total time: 25.43m | eta: 154.8m +step 02367/16704 (14.17%) | loss: 2.905943 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,044 | mfu: 50.82 | epoch: 1 | total time: 25.44m | eta: 154.8m +step 02368/16704 (14.18%) | loss: 2.903384 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,036 | mfu: 50.69 | epoch: 1 | total time: 25.45m | eta: 154.7m +step 02369/16704 (14.18%) | loss: 2.893972 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,654 | mfu: 50.79 | epoch: 1 | total time: 25.46m | eta: 154.7m +step 02370/16704 (14.19%) | loss: 2.883429 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,409 | mfu: 50.78 | epoch: 1 | total time: 25.47m | eta: 154.7m +step 02371/16704 (14.19%) | loss: 2.881622 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,421 | mfu: 50.65 | epoch: 1 | total time: 25.48m | eta: 154.7m +step 02372/16704 (14.20%) | loss: 2.869910 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 1 | total time: 25.50m | eta: 154.7m +step 
02373/16704 (14.21%) | loss: 2.876461 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 1 | total time: 25.51m | eta: 154.7m +step 02374/16704 (14.21%) | loss: 2.878757 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,847 | mfu: 50.74 | epoch: 1 | total time: 25.52m | eta: 154.7m +step 02375/16704 (14.22%) | loss: 2.887225 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,941 | mfu: 50.68 | epoch: 1 | total time: 25.53m | eta: 154.7m +step 02376/16704 (14.22%) | loss: 2.886748 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,896 | mfu: 50.68 | epoch: 1 | total time: 25.54m | eta: 154.7m +step 02377/16704 (14.23%) | loss: 2.901208 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,245 | mfu: 50.70 | epoch: 1 | total time: 25.55m | eta: 154.6m +step 02378/16704 (14.24%) | loss: 2.889644 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,574 | mfu: 50.66 | epoch: 1 | total time: 25.56m | eta: 154.6m +step 02379/16704 (14.24%) | loss: 2.891730 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,320 | mfu: 50.71 | epoch: 1 | total time: 25.57m | eta: 154.6m +step 02380/16704 (14.25%) | loss: 2.892447 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,302 | mfu: 50.83 | epoch: 1 | total time: 25.58m | eta: 154.6m +step 02381/16704 (14.25%) | loss: 2.890550 | lrm: 1.00 | dt: 647.16ms | tok/sec: 810,141 | mfu: 50.63 | epoch: 1 | total time: 25.59m | eta: 154.6m +step 02382/16704 (14.26%) | loss: 2.897064 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,871 | mfu: 50.74 | epoch: 1 | total time: 25.60m | eta: 154.6m +step 02383/16704 (14.27%) | loss: 2.892976 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,161 | mfu: 50.89 | epoch: 1 | total time: 25.61m | eta: 154.6m +step 02384/16704 (14.27%) | loss: 2.886788 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,300 | mfu: 50.77 | epoch: 1 | total time: 25.62m | eta: 154.6m +step 02385/16704 (14.28%) | loss: 2.881472 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,542 | mfu: 50.79 | epoch: 1 | total time: 25.64m | eta: 154.6m +step 02386/16704 (14.28%) | loss: 2.880357 | lrm: 1.00 | dt: 
647.88ms | tok/sec: 809,236 | mfu: 50.58 | epoch: 1 | total time: 25.65m | eta: 154.5m +step 02387/16704 (14.29%) | loss: 2.877742 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,616 | mfu: 50.79 | epoch: 1 | total time: 25.66m | eta: 154.5m +step 02388/16704 (14.30%) | loss: 2.876980 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,639 | mfu: 50.73 | epoch: 1 | total time: 25.67m | eta: 154.5m +step 02389/16704 (14.30%) | loss: 2.881770 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,535 | mfu: 50.72 | epoch: 1 | total time: 25.68m | eta: 154.5m +step 02390/16704 (14.31%) | loss: 2.879942 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,591 | mfu: 50.73 | epoch: 1 | total time: 25.69m | eta: 154.5m +step 02391/16704 (14.31%) | loss: 2.880427 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,928 | mfu: 50.81 | epoch: 1 | total time: 25.70m | eta: 154.5m +step 02392/16704 (14.32%) | loss: 2.882039 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,366 | mfu: 50.77 | epoch: 1 | total time: 25.71m | eta: 154.5m +step 02393/16704 (14.33%) | loss: 2.867729 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,774 | mfu: 50.67 | epoch: 1 | total time: 25.72m | eta: 154.5m +step 02394/16704 (14.33%) | loss: 2.857310 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,860 | mfu: 50.87 | epoch: 1 | total time: 25.73m | eta: 154.5m +step 02395/16704 (14.34%) | loss: 2.863454 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,850 | mfu: 50.68 | epoch: 1 | total time: 25.74m | eta: 154.4m +step 02396/16704 (14.34%) | loss: 2.871663 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,235 | mfu: 50.83 | epoch: 1 | total time: 25.75m | eta: 154.4m +step 02397/16704 (14.35%) | loss: 2.892512 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,955 | mfu: 50.81 | epoch: 1 | total time: 25.76m | eta: 154.4m +step 02398/16704 (14.36%) | loss: 2.880321 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,578 | mfu: 50.72 | epoch: 1 | total time: 25.78m | eta: 154.4m +step 02399/16704 (14.36%) | loss: 2.876939 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,355 | mfu: 50.90 | epoch: 1 | total 
time: 25.79m | eta: 154.4m +step 02400/16704 (14.37%) | loss: 2.865366 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,084 | mfu: 50.63 | epoch: 1 | total time: 25.80m | eta: 154.4m +step 02401/16704 (14.37%) | loss: 2.871333 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,683 | mfu: 50.67 | epoch: 1 | total time: 25.81m | eta: 154.4m +step 02402/16704 (14.38%) | loss: 2.886330 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,080 | mfu: 50.76 | epoch: 1 | total time: 25.82m | eta: 154.4m +step 02403/16704 (14.39%) | loss: 2.885120 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,512 | mfu: 50.72 | epoch: 1 | total time: 25.83m | eta: 154.4m +step 02404/16704 (14.39%) | loss: 2.883640 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,016 | mfu: 50.81 | epoch: 1 | total time: 25.84m | eta: 154.3m +step 02405/16704 (14.40%) | loss: 2.882925 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,113 | mfu: 50.76 | epoch: 1 | total time: 25.85m | eta: 154.3m +step 02406/16704 (14.40%) | loss: 2.891376 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,556 | mfu: 50.72 | epoch: 1 | total time: 25.86m | eta: 154.3m +step 02407/16704 (14.41%) | loss: 2.880891 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 1 | total time: 25.87m | eta: 154.3m +step 02408/16704 (14.42%) | loss: 2.876298 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,338 | mfu: 50.83 | epoch: 1 | total time: 25.88m | eta: 154.3m +step 02409/16704 (14.42%) | loss: 2.868878 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,740 | mfu: 50.67 | epoch: 1 | total time: 25.89m | eta: 154.3m +step 02410/16704 (14.43%) | loss: 2.861981 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,945 | mfu: 50.81 | epoch: 1 | total time: 25.90m | eta: 154.3m +step 02411/16704 (14.43%) | loss: 2.867646 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,672 | mfu: 50.79 | epoch: 1 | total time: 25.92m | eta: 154.3m +step 02412/16704 (14.44%) | loss: 2.857800 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,330 | mfu: 50.65 | epoch: 1 | total time: 25.93m | eta: 154.3m +step 02413/16704 (14.45%) | loss: 
2.874721 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,778 | mfu: 50.67 | epoch: 1 | total time: 25.94m | eta: 154.3m +step 02414/16704 (14.45%) | loss: 2.881077 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,208 | mfu: 50.70 | epoch: 1 | total time: 25.95m | eta: 154.2m +step 02415/16704 (14.46%) | loss: 2.878467 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,119 | mfu: 50.76 | epoch: 1 | total time: 25.96m | eta: 154.2m +step 02416/16704 (14.46%) | loss: 2.886750 | lrm: 1.00 | dt: 648.45ms | tok/sec: 808,528 | mfu: 50.53 | epoch: 1 | total time: 25.97m | eta: 154.2m +step 02417/16704 (14.47%) | loss: 2.884117 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 1 | total time: 25.98m | eta: 154.2m +step 02418/16704 (14.48%) | loss: 2.888338 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,605 | mfu: 50.73 | epoch: 1 | total time: 25.99m | eta: 154.2m +step 02419/16704 (14.48%) | loss: 2.879569 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,725 | mfu: 50.80 | epoch: 1 | total time: 26.00m | eta: 154.2m +step 02420/16704 (14.49%) | loss: 2.876813 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,077 | mfu: 50.82 | epoch: 1 | total time: 26.01m | eta: 154.2m +step 02421/16704 (14.49%) | loss: 2.885844 | lrm: 1.00 | dt: 648.07ms | tok/sec: 808,999 | mfu: 50.56 | epoch: 1 | total time: 26.02m | eta: 154.2m +step 02422/16704 (14.50%) | loss: 2.888723 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 1 | total time: 26.03m | eta: 154.2m +step 02423/16704 (14.51%) | loss: 2.887732 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,976 | mfu: 50.62 | epoch: 1 | total time: 26.04m | eta: 154.1m +step 02424/16704 (14.51%) | loss: 2.893164 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 1 | total time: 26.06m | eta: 154.1m +step 02425/16704 (14.52%) | loss: 2.885517 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,474 | mfu: 50.72 | epoch: 1 | total time: 26.07m | eta: 154.1m +step 02426/16704 (14.52%) | loss: 2.880987 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,876 | mfu: 
50.68 | epoch: 1 | total time: 26.08m | eta: 154.1m +step 02427/16704 (14.53%) | loss: 2.882450 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,782 | mfu: 50.74 | epoch: 1 | total time: 26.09m | eta: 154.1m +step 02428/16704 (14.54%) | loss: 2.887984 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 1 | total time: 26.10m | eta: 154.1m +step 02429/16704 (14.54%) | loss: 2.888481 | lrm: 1.00 | dt: 647.78ms | tok/sec: 809,362 | mfu: 50.59 | epoch: 1 | total time: 26.11m | eta: 154.1m +step 02430/16704 (14.55%) | loss: 2.896183 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,353 | mfu: 50.77 | epoch: 1 | total time: 26.12m | eta: 154.1m +step 02431/16704 (14.55%) | loss: 2.905946 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,495 | mfu: 50.78 | epoch: 1 | total time: 26.13m | eta: 154.1m +step 02432/16704 (14.56%) | loss: 2.900930 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,902 | mfu: 50.68 | epoch: 1 | total time: 26.14m | eta: 154.0m +step 02433/16704 (14.57%) | loss: 2.896544 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,598 | mfu: 50.79 | epoch: 1 | total time: 26.15m | eta: 154.0m +step 02434/16704 (14.57%) | loss: 2.905959 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,607 | mfu: 50.66 | epoch: 1 | total time: 26.16m | eta: 154.0m +step 02435/16704 (14.58%) | loss: 2.893290 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,752 | mfu: 50.67 | epoch: 1 | total time: 26.17m | eta: 154.0m +step 02436/16704 (14.58%) | loss: 2.891300 | lrm: 1.00 | dt: 651.11ms | tok/sec: 805,223 | mfu: 50.33 | epoch: 1 | total time: 26.18m | eta: 154.0m +step 02437/16704 (14.59%) | loss: 2.904969 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 1 | total time: 26.20m | eta: 154.0m +step 02438/16704 (14.60%) | loss: 2.902183 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,711 | mfu: 50.61 | epoch: 1 | total time: 26.21m | eta: 154.0m +step 02439/16704 (14.60%) | loss: 2.905937 | lrm: 1.00 | dt: 648.34ms | tok/sec: 808,661 | mfu: 50.54 | epoch: 1 | total time: 26.22m | eta: 154.0m +step 
02440/16704 (14.61%) | loss: 2.900678 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,170 | mfu: 50.95 | epoch: 1 | total time: 26.23m | eta: 154.0m +step 02441/16704 (14.61%) | loss: 2.917759 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,495 | mfu: 50.66 | epoch: 1 | total time: 26.24m | eta: 153.9m +step 02442/16704 (14.62%) | loss: 2.905786 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,234 | mfu: 50.77 | epoch: 1 | total time: 26.25m | eta: 153.9m +step 02443/16704 (14.63%) | loss: 2.914343 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,001 | mfu: 50.63 | epoch: 1 | total time: 26.26m | eta: 153.9m +step 02444/16704 (14.63%) | loss: 2.904835 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,533 | mfu: 50.78 | epoch: 1 | total time: 26.27m | eta: 153.9m +step 02445/16704 (14.64%) | loss: 2.907693 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,984 | mfu: 50.81 | epoch: 1 | total time: 26.28m | eta: 153.9m +step 02446/16704 (14.64%) | loss: 2.898516 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 1 | total time: 26.29m | eta: 153.9m +step 02447/16704 (14.65%) | loss: 2.897521 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,959 | mfu: 50.75 | epoch: 1 | total time: 26.30m | eta: 153.9m +step 02448/16704 (14.66%) | loss: 2.875553 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,528 | mfu: 50.85 | epoch: 1 | total time: 26.31m | eta: 153.9m +step 02449/16704 (14.66%) | loss: 2.875602 | lrm: 1.00 | dt: 649.87ms | tok/sec: 806,759 | mfu: 50.42 | epoch: 1 | total time: 26.32m | eta: 153.9m +step 02450/16704 (14.67%) | loss: 2.875573 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,699 | mfu: 50.86 | epoch: 1 | total time: 26.34m | eta: 153.8m +step 02451/16704 (14.67%) | loss: 2.875760 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,193 | mfu: 50.58 | epoch: 1 | total time: 26.35m | eta: 153.8m +step 02452/16704 (14.68%) | loss: 2.884920 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 1 | total time: 26.36m | eta: 153.8m +step 02453/16704 (14.69%) | loss: 2.880862 | lrm: 1.00 | dt: 
646.19ms | tok/sec: 811,355 | mfu: 50.71 | epoch: 1 | total time: 26.37m | eta: 153.8m +step 02454/16704 (14.69%) | loss: 2.904104 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,201 | mfu: 50.58 | epoch: 1 | total time: 26.38m | eta: 153.8m +step 02455/16704 (14.70%) | loss: 2.904703 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,680 | mfu: 50.86 | epoch: 1 | total time: 26.39m | eta: 153.8m +step 02456/16704 (14.70%) | loss: 2.897579 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,305 | mfu: 50.58 | epoch: 1 | total time: 26.40m | eta: 153.8m +step 02457/16704 (14.71%) | loss: 2.894162 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,580 | mfu: 50.85 | epoch: 1 | total time: 26.41m | eta: 153.8m +step 02458/16704 (14.72%) | loss: 2.891994 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,896 | mfu: 50.68 | epoch: 1 | total time: 26.42m | eta: 153.8m +step 02459/16704 (14.72%) | loss: 2.881365 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,037 | mfu: 50.75 | epoch: 1 | total time: 26.43m | eta: 153.7m +step 02460/16704 (14.73%) | loss: 2.872277 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,894 | mfu: 50.81 | epoch: 1 | total time: 26.44m | eta: 153.7m +step 02461/16704 (14.73%) | loss: 2.868388 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,132 | mfu: 50.76 | epoch: 1 | total time: 26.45m | eta: 153.7m +step 02462/16704 (14.74%) | loss: 2.855277 | lrm: 1.00 | dt: 649.75ms | tok/sec: 806,912 | mfu: 50.43 | epoch: 1 | total time: 26.46m | eta: 153.7m +step 02463/16704 (14.74%) | loss: 2.851946 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,547 | mfu: 50.85 | epoch: 1 | total time: 26.48m | eta: 153.7m +step 02464/16704 (14.75%) | loss: 2.848667 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,057 | mfu: 50.69 | epoch: 1 | total time: 26.49m | eta: 153.7m +step 02465/16704 (14.76%) | loss: 2.847093 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 1 | total time: 26.50m | eta: 153.7m +step 02466/16704 (14.76%) | loss: 2.853403 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,994 | mfu: 50.75 | epoch: 1 | total 
time: 26.51m | eta: 153.7m +step 02467/16704 (14.77%) | loss: 2.857450 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,969 | mfu: 50.87 | epoch: 1 | total time: 26.52m | eta: 153.7m +step 02468/16704 (14.77%) | loss: 2.866524 | lrm: 1.00 | dt: 647.78ms | tok/sec: 809,355 | mfu: 50.59 | epoch: 1 | total time: 26.53m | eta: 153.6m +step 02469/16704 (14.78%) | loss: 2.866163 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,129 | mfu: 50.70 | epoch: 1 | total time: 26.54m | eta: 153.6m +step 02470/16704 (14.79%) | loss: 2.865480 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 1 | total time: 26.55m | eta: 153.6m +step 02471/16704 (14.79%) | loss: 2.867054 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,825 | mfu: 50.80 | epoch: 1 | total time: 26.56m | eta: 153.6m +step 02472/16704 (14.80%) | loss: 2.857423 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 26.57m | eta: 153.6m +step 02473/16704 (14.80%) | loss: 2.851012 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,796 | mfu: 50.80 | epoch: 1 | total time: 26.58m | eta: 153.6m +step 02474/16704 (14.81%) | loss: 2.854837 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,026 | mfu: 50.75 | epoch: 1 | total time: 26.59m | eta: 153.6m +step 02475/16704 (14.82%) | loss: 2.867348 | lrm: 1.00 | dt: 650.19ms | tok/sec: 806,356 | mfu: 50.40 | epoch: 1 | total time: 26.60m | eta: 153.6m +step 02476/16704 (14.82%) | loss: 2.866002 | lrm: 1.00 | dt: 641.86ms | tok/sec: 816,820 | mfu: 51.05 | epoch: 1 | total time: 26.62m | eta: 153.6m +step 02477/16704 (14.83%) | loss: 2.867455 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 1 | total time: 26.63m | eta: 153.6m +step 02478/16704 (14.83%) | loss: 2.874911 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 1 | total time: 26.64m | eta: 153.5m +step 02479/16704 (14.84%) | loss: 2.864885 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,387 | mfu: 50.84 | epoch: 1 | total time: 26.65m | eta: 153.5m +step 02480/16704 (14.85%) | loss: 
2.869529 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 1 | total time: 26.66m | eta: 153.5m +step 02481/16704 (14.85%) | loss: 2.864307 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,427 | mfu: 50.59 | epoch: 1 | total time: 26.67m | eta: 153.5m +step 02482/16704 (14.86%) | loss: 2.870834 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,807 | mfu: 50.93 | epoch: 1 | total time: 26.68m | eta: 153.5m +step 02483/16704 (14.86%) | loss: 2.888788 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,270 | mfu: 50.71 | epoch: 1 | total time: 26.69m | eta: 153.5m +step 02484/16704 (14.87%) | loss: 2.884118 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,829 | mfu: 50.80 | epoch: 1 | total time: 26.70m | eta: 153.5m +step 02485/16704 (14.88%) | loss: 2.887216 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,497 | mfu: 50.78 | epoch: 1 | total time: 26.71m | eta: 153.5m +step 02486/16704 (14.88%) | loss: 2.873853 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,363 | mfu: 50.90 | epoch: 1 | total time: 26.72m | eta: 153.5m +step 02487/16704 (14.89%) | loss: 2.865304 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,940 | mfu: 50.56 | epoch: 1 | total time: 26.73m | eta: 153.4m +step 02488/16704 (14.89%) | loss: 2.875038 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,156 | mfu: 50.70 | epoch: 1 | total time: 26.74m | eta: 153.4m +step 02489/16704 (14.90%) | loss: 2.883778 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,167 | mfu: 50.89 | epoch: 1 | total time: 26.76m | eta: 153.4m +step 02490/16704 (14.91%) | loss: 2.863368 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 1 | total time: 26.77m | eta: 153.4m +step 02491/16704 (14.91%) | loss: 2.859318 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 1 | total time: 26.78m | eta: 153.4m +step 02492/16704 (14.92%) | loss: 2.858018 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 1 | total time: 26.79m | eta: 153.4m +step 02493/16704 (14.92%) | loss: 2.852406 | lrm: 1.00 | dt: 647.62ms | tok/sec: 809,557 | mfu: 
50.60 | epoch: 1 | total time: 26.80m | eta: 153.4m +step 02494/16704 (14.93%) | loss: 2.858791 | lrm: 1.00 | dt: 646.47ms | tok/sec: 810,996 | mfu: 50.69 | epoch: 1 | total time: 26.81m | eta: 153.4m +step 02495/16704 (14.94%) | loss: 2.847298 | lrm: 1.00 | dt: 646.47ms | tok/sec: 810,997 | mfu: 50.69 | epoch: 1 | total time: 26.82m | eta: 153.4m +step 02496/16704 (14.94%) | loss: 2.844444 | lrm: 1.00 | dt: 648.88ms | tok/sec: 807,988 | mfu: 50.50 | epoch: 1 | total time: 26.83m | eta: 153.3m +step 02497/16704 (14.95%) | loss: 2.835204 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,776 | mfu: 50.86 | epoch: 1 | total time: 26.84m | eta: 153.3m +step 02498/16704 (14.95%) | loss: 2.837978 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 1 | total time: 26.85m | eta: 153.3m +step 02499/16704 (14.96%) | loss: 2.835986 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,008 | mfu: 50.69 | epoch: 1 | total time: 26.86m | eta: 153.3m +Step 02500 | Validation bpb: 0.872341 +step 02500/16704 (14.97%) | loss: 2.837607 | lrm: 1.00 | dt: 648.57ms | tok/sec: 808,371 | mfu: 50.52 | epoch: 1 | total time: 26.87m | eta: 153.3m +step 02501/16704 (14.97%) | loss: 2.839999 | lrm: 1.00 | dt: 648.20ms | tok/sec: 808,832 | mfu: 50.55 | epoch: 1 | total time: 26.88m | eta: 153.3m +step 02502/16704 (14.98%) | loss: 2.843642 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,620 | mfu: 50.60 | epoch: 1 | total time: 26.90m | eta: 153.3m +step 02503/16704 (14.98%) | loss: 2.840963 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,895 | mfu: 50.99 | epoch: 1 | total time: 26.91m | eta: 153.3m +step 02504/16704 (14.99%) | loss: 2.837940 | lrm: 1.00 | dt: 649.12ms | tok/sec: 807,695 | mfu: 50.48 | epoch: 1 | total time: 26.92m | eta: 153.3m +step 02505/16704 (15.00%) | loss: 2.829258 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 1 | total time: 26.93m | eta: 153.2m +step 02506/16704 (15.00%) | loss: 2.832137 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,697 | mfu: 50.61 | epoch: 1 | 
total time: 26.94m | eta: 153.2m +step 02507/16704 (15.01%) | loss: 2.844652 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,978 | mfu: 50.62 | epoch: 1 | total time: 26.95m | eta: 153.2m +step 02508/16704 (15.01%) | loss: 2.846307 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,435 | mfu: 50.90 | epoch: 1 | total time: 26.96m | eta: 153.2m +step 02509/16704 (15.02%) | loss: 2.855745 | lrm: 1.00 | dt: 648.19ms | tok/sec: 808,849 | mfu: 50.55 | epoch: 1 | total time: 26.97m | eta: 153.2m +step 02510/16704 (15.03%) | loss: 2.869926 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,542 | mfu: 50.85 | epoch: 1 | total time: 26.98m | eta: 153.2m +step 02511/16704 (15.03%) | loss: 2.871525 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,402 | mfu: 50.71 | epoch: 1 | total time: 26.99m | eta: 153.2m +step 02512/16704 (15.04%) | loss: 2.880238 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 1 | total time: 27.00m | eta: 153.2m +step 02513/16704 (15.04%) | loss: 2.878063 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,925 | mfu: 50.81 | epoch: 1 | total time: 27.01m | eta: 153.2m +step 02514/16704 (15.05%) | loss: 2.871616 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,706 | mfu: 50.67 | epoch: 1 | total time: 27.02m | eta: 153.1m +step 02515/16704 (15.06%) | loss: 2.877767 | lrm: 1.00 | dt: 648.62ms | tok/sec: 808,314 | mfu: 50.52 | epoch: 1 | total time: 27.04m | eta: 153.1m +step 02516/16704 (15.06%) | loss: 2.879302 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,681 | mfu: 50.92 | epoch: 1 | total time: 27.05m | eta: 153.1m +step 02517/16704 (15.07%) | loss: 2.883111 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,200 | mfu: 50.58 | epoch: 1 | total time: 27.06m | eta: 153.1m +step 02518/16704 (15.07%) | loss: 2.872757 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,763 | mfu: 50.92 | epoch: 1 | total time: 27.07m | eta: 153.1m +step 02519/16704 (15.08%) | loss: 2.864716 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,919 | mfu: 50.62 | epoch: 1 | total time: 27.08m | eta: 153.1m +step 02520/16704 (15.09%) | 
loss: 2.867403 | lrm: 1.00 | dt: 650.16ms | tok/sec: 806,392 | mfu: 50.40 | epoch: 1 | total time: 27.09m | eta: 153.1m +step 02521/16704 (15.09%) | loss: 2.856842 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,301 | mfu: 50.89 | epoch: 1 | total time: 27.10m | eta: 153.1m +step 02522/16704 (15.10%) | loss: 2.853462 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,579 | mfu: 50.66 | epoch: 1 | total time: 27.11m | eta: 153.1m +step 02523/16704 (15.10%) | loss: 2.855834 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,287 | mfu: 50.83 | epoch: 1 | total time: 27.12m | eta: 153.0m +step 02524/16704 (15.11%) | loss: 2.862577 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 1 | total time: 27.13m | eta: 153.0m +step 02525/16704 (15.12%) | loss: 2.872142 | lrm: 1.00 | dt: 648.73ms | tok/sec: 808,179 | mfu: 50.51 | epoch: 1 | total time: 27.14m | eta: 153.0m +step 02526/16704 (15.12%) | loss: 2.890011 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 1 | total time: 27.15m | eta: 153.0m +step 02527/16704 (15.13%) | loss: 2.882869 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,639 | mfu: 50.73 | epoch: 1 | total time: 27.16m | eta: 153.0m +step 02528/16704 (15.13%) | loss: 2.871790 | lrm: 1.00 | dt: 649.61ms | tok/sec: 807,084 | mfu: 50.44 | epoch: 1 | total time: 27.18m | eta: 153.0m +step 02529/16704 (15.14%) | loss: 2.868385 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,192 | mfu: 50.83 | epoch: 1 | total time: 27.19m | eta: 153.0m +step 02530/16704 (15.15%) | loss: 2.872728 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,785 | mfu: 50.74 | epoch: 1 | total time: 27.20m | eta: 153.0m +step 02531/16704 (15.15%) | loss: 2.871164 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,135 | mfu: 50.88 | epoch: 1 | total time: 27.21m | eta: 153.0m +step 02532/16704 (15.16%) | loss: 2.879851 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 1 | total time: 27.22m | eta: 152.9m +step 02533/16704 (15.16%) | loss: 2.883068 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,499 | 
mfu: 50.78 | epoch: 1 | total time: 27.23m | eta: 152.9m +step 02534/16704 (15.17%) | loss: 2.876764 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,889 | mfu: 50.74 | epoch: 1 | total time: 27.24m | eta: 152.9m +step 02535/16704 (15.18%) | loss: 2.871963 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,533 | mfu: 50.66 | epoch: 1 | total time: 27.25m | eta: 152.9m +step 02536/16704 (15.18%) | loss: 2.883678 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,671 | mfu: 50.79 | epoch: 1 | total time: 27.26m | eta: 152.9m +step 02537/16704 (15.19%) | loss: 2.886887 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,142 | mfu: 50.76 | epoch: 1 | total time: 27.27m | eta: 152.9m +step 02538/16704 (15.19%) | loss: 2.872574 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,330 | mfu: 50.90 | epoch: 1 | total time: 27.28m | eta: 152.9m +step 02539/16704 (15.20%) | loss: 2.855726 | lrm: 1.00 | dt: 649.01ms | tok/sec: 807,830 | mfu: 50.49 | epoch: 1 | total time: 27.29m | eta: 152.9m +step 02540/16704 (15.21%) | loss: 2.853600 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,220 | mfu: 50.95 | epoch: 1 | total time: 27.30m | eta: 152.9m +step 02541/16704 (15.21%) | loss: 2.851528 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,949 | mfu: 50.75 | epoch: 1 | total time: 27.32m | eta: 152.8m +step 02542/16704 (15.22%) | loss: 2.853928 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,705 | mfu: 50.67 | epoch: 1 | total time: 27.33m | eta: 152.8m +step 02543/16704 (15.22%) | loss: 2.856883 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,252 | mfu: 50.95 | epoch: 1 | total time: 27.34m | eta: 152.8m +step 02544/16704 (15.23%) | loss: 2.856786 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,773 | mfu: 50.55 | epoch: 1 | total time: 27.35m | eta: 152.8m +step 02545/16704 (15.24%) | loss: 2.851179 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 1 | total time: 27.36m | eta: 152.8m +step 02546/16704 (15.24%) | loss: 2.852363 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,697 | mfu: 50.73 | epoch: 1 | total time: 27.37m | eta: 152.8m +step 
02547/16704 (15.25%) | loss: 2.854657 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,905 | mfu: 50.81 | epoch: 1 | total time: 27.38m | eta: 152.8m +step 02548/16704 (15.25%) | loss: 2.861316 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,153 | mfu: 50.95 | epoch: 1 | total time: 27.39m | eta: 152.8m +step 02549/16704 (15.26%) | loss: 2.863904 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,489 | mfu: 50.78 | epoch: 1 | total time: 27.40m | eta: 152.8m +step 02550/16704 (15.27%) | loss: 2.864919 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,187 | mfu: 50.76 | epoch: 1 | total time: 27.41m | eta: 152.8m +step 02551/16704 (15.27%) | loss: 2.863048 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 1 | total time: 27.42m | eta: 152.7m +step 02552/16704 (15.28%) | loss: 2.864664 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,734 | mfu: 50.67 | epoch: 1 | total time: 27.43m | eta: 152.7m +step 02553/16704 (15.28%) | loss: 2.871040 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,273 | mfu: 50.58 | epoch: 1 | total time: 27.44m | eta: 152.7m +step 02554/16704 (15.29%) | loss: 2.865941 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,719 | mfu: 50.67 | epoch: 1 | total time: 27.45m | eta: 152.7m +step 02555/16704 (15.30%) | loss: 2.867170 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,293 | mfu: 50.71 | epoch: 1 | total time: 27.47m | eta: 152.7m +step 02556/16704 (15.30%) | loss: 2.880947 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,099 | mfu: 50.88 | epoch: 1 | total time: 27.48m | eta: 152.7m +step 02557/16704 (15.31%) | loss: 2.883607 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,648 | mfu: 50.60 | epoch: 1 | total time: 27.49m | eta: 152.7m +step 02558/16704 (15.31%) | loss: 2.887957 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,647 | mfu: 50.67 | epoch: 1 | total time: 27.50m | eta: 152.7m +step 02559/16704 (15.32%) | loss: 2.887507 | lrm: 1.00 | dt: 642.72ms | tok/sec: 815,735 | mfu: 50.98 | epoch: 1 | total time: 27.51m | eta: 152.7m +step 02560/16704 (15.33%) | loss: 2.883237 | lrm: 1.00 | dt: 
648.00ms | tok/sec: 809,084 | mfu: 50.57 | epoch: 1 | total time: 27.52m | eta: 152.6m +step 02561/16704 (15.33%) | loss: 2.886582 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,309 | mfu: 50.83 | epoch: 1 | total time: 27.53m | eta: 152.6m +step 02562/16704 (15.34%) | loss: 2.887931 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,336 | mfu: 50.71 | epoch: 1 | total time: 27.54m | eta: 152.6m +step 02563/16704 (15.34%) | loss: 2.892898 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,447 | mfu: 50.78 | epoch: 1 | total time: 27.55m | eta: 152.6m +step 02564/16704 (15.35%) | loss: 2.892134 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,456 | mfu: 50.78 | epoch: 1 | total time: 27.56m | eta: 152.6m +step 02565/16704 (15.36%) | loss: 2.888689 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,426 | mfu: 50.78 | epoch: 1 | total time: 27.57m | eta: 152.6m +step 02566/16704 (15.36%) | loss: 2.878954 | lrm: 1.00 | dt: 650.78ms | tok/sec: 805,634 | mfu: 50.35 | epoch: 1 | total time: 27.58m | eta: 152.6m +step 02567/16704 (15.37%) | loss: 2.871682 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,692 | mfu: 50.73 | epoch: 1 | total time: 27.59m | eta: 152.6m +step 02568/16704 (15.37%) | loss: 2.869208 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,415 | mfu: 50.78 | epoch: 1 | total time: 27.61m | eta: 152.6m +step 02569/16704 (15.38%) | loss: 2.879330 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,626 | mfu: 50.60 | epoch: 1 | total time: 27.62m | eta: 152.5m +step 02570/16704 (15.39%) | loss: 2.893642 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,086 | mfu: 50.88 | epoch: 1 | total time: 27.63m | eta: 152.5m +step 02571/16704 (15.39%) | loss: 2.901039 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,301 | mfu: 50.77 | epoch: 1 | total time: 27.64m | eta: 152.5m +step 02572/16704 (15.40%) | loss: 2.893509 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,395 | mfu: 50.71 | epoch: 1 | total time: 27.65m | eta: 152.5m +step 02573/16704 (15.40%) | loss: 2.889836 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,943 | mfu: 50.94 | epoch: 1 | total 
time: 27.66m | eta: 152.5m +step 02574/16704 (15.41%) | loss: 2.897555 | lrm: 1.00 | dt: 647.83ms | tok/sec: 809,296 | mfu: 50.58 | epoch: 1 | total time: 27.67m | eta: 152.5m +step 02575/16704 (15.42%) | loss: 2.894600 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,085 | mfu: 50.82 | epoch: 1 | total time: 27.68m | eta: 152.5m +step 02576/16704 (15.42%) | loss: 2.880120 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,147 | mfu: 50.82 | epoch: 1 | total time: 27.69m | eta: 152.5m +step 02577/16704 (15.43%) | loss: 2.869366 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,258 | mfu: 50.64 | epoch: 1 | total time: 27.70m | eta: 152.5m +step 02578/16704 (15.43%) | loss: 2.872944 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,401 | mfu: 50.78 | epoch: 1 | total time: 27.71m | eta: 152.4m +step 02579/16704 (15.44%) | loss: 2.868405 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,329 | mfu: 50.65 | epoch: 1 | total time: 27.72m | eta: 152.4m +step 02580/16704 (15.45%) | loss: 2.860930 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,761 | mfu: 50.80 | epoch: 1 | total time: 27.73m | eta: 152.4m +step 02581/16704 (15.45%) | loss: 2.866185 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,523 | mfu: 50.72 | epoch: 1 | total time: 27.75m | eta: 152.4m +step 02582/16704 (15.46%) | loss: 2.877739 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,604 | mfu: 50.73 | epoch: 1 | total time: 27.76m | eta: 152.4m +step 02583/16704 (15.46%) | loss: 2.874448 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,371 | mfu: 50.84 | epoch: 1 | total time: 27.77m | eta: 152.4m +step 02584/16704 (15.47%) | loss: 2.872067 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,148 | mfu: 50.82 | epoch: 1 | total time: 27.78m | eta: 152.4m +step 02585/16704 (15.48%) | loss: 2.876005 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,816 | mfu: 50.99 | epoch: 1 | total time: 27.79m | eta: 152.4m +step 02586/16704 (15.48%) | loss: 2.869172 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,188 | mfu: 50.76 | epoch: 1 | total time: 27.80m | eta: 152.4m +step 02587/16704 (15.49%) | loss: 
2.844521 | lrm: 1.00 | dt: 647.21ms | tok/sec: 810,076 | mfu: 50.63 | epoch: 1 | total time: 27.81m | eta: 152.3m +step 02588/16704 (15.49%) | loss: 2.852914 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,361 | mfu: 50.77 | epoch: 1 | total time: 27.82m | eta: 152.3m +step 02589/16704 (15.50%) | loss: 2.854813 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,747 | mfu: 50.74 | epoch: 1 | total time: 27.83m | eta: 152.3m +step 02590/16704 (15.51%) | loss: 2.869980 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 1 | total time: 27.84m | eta: 152.3m +step 02591/16704 (15.51%) | loss: 2.874754 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,545 | mfu: 50.85 | epoch: 1 | total time: 27.85m | eta: 152.3m +step 02592/16704 (15.52%) | loss: 2.873101 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,655 | mfu: 50.73 | epoch: 1 | total time: 27.86m | eta: 152.3m +step 02593/16704 (15.52%) | loss: 2.866738 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,613 | mfu: 50.79 | epoch: 1 | total time: 27.87m | eta: 152.3m +step 02594/16704 (15.53%) | loss: 2.859987 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,119 | mfu: 50.76 | epoch: 1 | total time: 27.89m | eta: 152.3m +step 02595/16704 (15.54%) | loss: 2.880283 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,084 | mfu: 50.82 | epoch: 1 | total time: 27.90m | eta: 152.3m +step 02596/16704 (15.54%) | loss: 2.876459 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,771 | mfu: 50.74 | epoch: 1 | total time: 27.91m | eta: 152.2m +step 02597/16704 (15.55%) | loss: 2.881088 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,940 | mfu: 50.50 | epoch: 1 | total time: 27.92m | eta: 152.2m +step 02598/16704 (15.55%) | loss: 2.885267 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,674 | mfu: 50.73 | epoch: 1 | total time: 27.93m | eta: 152.2m +step 02599/16704 (15.56%) | loss: 2.879703 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,586 | mfu: 50.66 | epoch: 1 | total time: 27.94m | eta: 152.2m +step 02600/16704 (15.57%) | loss: 2.871735 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,005 | mfu: 
50.69 | epoch: 1 | total time: 27.95m | eta: 152.2m +step 02601/16704 (15.57%) | loss: 2.859515 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,275 | mfu: 50.83 | epoch: 1 | total time: 27.96m | eta: 152.2m +step 02602/16704 (15.58%) | loss: 2.857094 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,759 | mfu: 50.67 | epoch: 1 | total time: 27.97m | eta: 152.2m +step 02603/16704 (15.58%) | loss: 2.854380 | lrm: 1.00 | dt: 648.04ms | tok/sec: 809,034 | mfu: 50.57 | epoch: 1 | total time: 27.98m | eta: 152.2m +step 02604/16704 (15.59%) | loss: 2.835655 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,224 | mfu: 50.89 | epoch: 1 | total time: 27.99m | eta: 152.2m +step 02605/16704 (15.60%) | loss: 2.832759 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,307 | mfu: 50.58 | epoch: 1 | total time: 28.00m | eta: 152.1m +step 02606/16704 (15.60%) | loss: 2.834809 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,730 | mfu: 50.73 | epoch: 1 | total time: 28.01m | eta: 152.1m +step 02607/16704 (15.61%) | loss: 2.841155 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,621 | mfu: 50.60 | epoch: 1 | total time: 28.03m | eta: 152.1m +step 02608/16704 (15.61%) | loss: 2.856465 | lrm: 1.00 | dt: 648.07ms | tok/sec: 808,997 | mfu: 50.56 | epoch: 1 | total time: 28.04m | eta: 152.1m +step 02609/16704 (15.62%) | loss: 2.853365 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,176 | mfu: 50.70 | epoch: 1 | total time: 28.05m | eta: 152.1m +step 02610/16704 (15.62%) | loss: 2.849645 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,902 | mfu: 50.75 | epoch: 1 | total time: 28.06m | eta: 152.1m +step 02611/16704 (15.63%) | loss: 2.847323 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,794 | mfu: 50.80 | epoch: 1 | total time: 28.07m | eta: 152.1m +step 02612/16704 (15.64%) | loss: 2.856512 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,850 | mfu: 50.80 | epoch: 1 | total time: 28.08m | eta: 152.1m +step 02613/16704 (15.64%) | loss: 2.856801 | lrm: 1.00 | dt: 649.90ms | tok/sec: 806,718 | mfu: 50.42 | epoch: 1 | total time: 28.09m | eta: 152.1m +step 
02614/16704 (15.65%) | loss: 2.868364 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,703 | mfu: 50.73 | epoch: 1 | total time: 28.10m | eta: 152.1m +step 02615/16704 (15.65%) | loss: 2.875553 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,887 | mfu: 50.74 | epoch: 1 | total time: 28.11m | eta: 152.0m +step 02616/16704 (15.66%) | loss: 2.879224 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,075 | mfu: 50.82 | epoch: 1 | total time: 28.12m | eta: 152.0m +step 02617/16704 (15.67%) | loss: 2.876285 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,115 | mfu: 50.88 | epoch: 1 | total time: 28.13m | eta: 152.0m +step 02618/16704 (15.67%) | loss: 2.879775 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,281 | mfu: 50.83 | epoch: 1 | total time: 28.14m | eta: 152.0m +step 02619/16704 (15.68%) | loss: 2.871200 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,654 | mfu: 50.73 | epoch: 1 | total time: 28.15m | eta: 152.0m +step 02620/16704 (15.68%) | loss: 2.870733 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,560 | mfu: 50.85 | epoch: 1 | total time: 28.17m | eta: 152.0m +step 02621/16704 (15.69%) | loss: 2.867938 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,328 | mfu: 50.83 | epoch: 1 | total time: 28.18m | eta: 152.0m +step 02622/16704 (15.70%) | loss: 2.858500 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,748 | mfu: 50.86 | epoch: 1 | total time: 28.19m | eta: 152.0m +step 02623/16704 (15.70%) | loss: 2.857768 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,205 | mfu: 50.83 | epoch: 1 | total time: 28.20m | eta: 152.0m +step 02624/16704 (15.71%) | loss: 2.853314 | lrm: 1.00 | dt: 647.72ms | tok/sec: 809,442 | mfu: 50.59 | epoch: 1 | total time: 28.21m | eta: 151.9m +step 02625/16704 (15.71%) | loss: 2.861185 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,086 | mfu: 50.82 | epoch: 1 | total time: 28.22m | eta: 151.9m +step 02626/16704 (15.72%) | loss: 2.856754 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,815 | mfu: 50.74 | epoch: 1 | total time: 28.23m | eta: 151.9m +step 02627/16704 (15.73%) | loss: 2.857162 | lrm: 1.00 | dt: 
646.65ms | tok/sec: 810,770 | mfu: 50.67 | epoch: 1 | total time: 28.24m | eta: 151.9m +step 02628/16704 (15.73%) | loss: 2.866191 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,766 | mfu: 50.86 | epoch: 1 | total time: 28.25m | eta: 151.9m +step 02629/16704 (15.74%) | loss: 2.864760 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,354 | mfu: 50.84 | epoch: 1 | total time: 28.26m | eta: 151.9m +step 02630/16704 (15.74%) | loss: 2.852484 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,740 | mfu: 50.67 | epoch: 1 | total time: 28.27m | eta: 151.9m +step 02631/16704 (15.75%) | loss: 2.855819 | lrm: 1.00 | dt: 647.67ms | tok/sec: 809,497 | mfu: 50.59 | epoch: 1 | total time: 28.28m | eta: 151.9m +step 02632/16704 (15.76%) | loss: 2.849221 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 1 | total time: 28.29m | eta: 151.9m +step 02633/16704 (15.76%) | loss: 2.854666 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,069 | mfu: 50.69 | epoch: 1 | total time: 28.31m | eta: 151.8m +step 02634/16704 (15.77%) | loss: 2.845434 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,797 | mfu: 50.80 | epoch: 1 | total time: 28.32m | eta: 151.8m +step 02635/16704 (15.77%) | loss: 2.836813 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,232 | mfu: 50.83 | epoch: 1 | total time: 28.33m | eta: 151.8m +step 02636/16704 (15.78%) | loss: 2.834629 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,765 | mfu: 50.80 | epoch: 1 | total time: 28.34m | eta: 151.8m +step 02637/16704 (15.79%) | loss: 2.847530 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 1 | total time: 28.35m | eta: 151.8m +step 02638/16704 (15.79%) | loss: 2.848241 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,796 | mfu: 50.80 | epoch: 1 | total time: 28.36m | eta: 151.8m +step 02639/16704 (15.80%) | loss: 2.856372 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,096 | mfu: 50.69 | epoch: 1 | total time: 28.37m | eta: 151.8m +step 02640/16704 (15.80%) | loss: 2.850626 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 1 | total 
time: 28.38m | eta: 151.8m +step 02641/16704 (15.81%) | loss: 2.842087 | lrm: 1.00 | dt: 650.93ms | tok/sec: 805,447 | mfu: 50.34 | epoch: 1 | total time: 28.39m | eta: 151.8m +step 02642/16704 (15.82%) | loss: 2.837726 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,326 | mfu: 50.77 | epoch: 1 | total time: 28.40m | eta: 151.7m +step 02643/16704 (15.82%) | loss: 2.851914 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,237 | mfu: 50.95 | epoch: 1 | total time: 28.41m | eta: 151.7m +step 02644/16704 (15.83%) | loss: 2.852072 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,813 | mfu: 50.68 | epoch: 1 | total time: 28.42m | eta: 151.7m +step 02645/16704 (15.83%) | loss: 2.846414 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,881 | mfu: 50.62 | epoch: 1 | total time: 28.43m | eta: 151.7m +step 02646/16704 (15.84%) | loss: 2.864034 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,169 | mfu: 50.89 | epoch: 1 | total time: 28.45m | eta: 151.7m +step 02647/16704 (15.85%) | loss: 2.880572 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,639 | mfu: 50.79 | epoch: 1 | total time: 28.46m | eta: 151.7m +step 02648/16704 (15.85%) | loss: 2.878160 | lrm: 1.00 | dt: 647.03ms | tok/sec: 810,299 | mfu: 50.64 | epoch: 1 | total time: 28.47m | eta: 151.7m +step 02649/16704 (15.86%) | loss: 2.880112 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,163 | mfu: 51.01 | epoch: 1 | total time: 28.48m | eta: 151.7m +step 02650/16704 (15.86%) | loss: 2.876471 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,611 | mfu: 50.73 | epoch: 1 | total time: 28.49m | eta: 151.7m +step 02651/16704 (15.87%) | loss: 2.869447 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,515 | mfu: 50.97 | epoch: 1 | total time: 28.50m | eta: 151.6m +step 02652/16704 (15.88%) | loss: 2.848814 | lrm: 1.00 | dt: 647.94ms | tok/sec: 809,158 | mfu: 50.57 | epoch: 1 | total time: 28.51m | eta: 151.6m +step 02653/16704 (15.88%) | loss: 2.841607 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,971 | mfu: 50.94 | epoch: 1 | total time: 28.52m | eta: 151.6m +step 02654/16704 (15.89%) | loss: 
2.839502 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,746 | mfu: 50.80 | epoch: 1 | total time: 28.53m | eta: 151.6m +step 02655/16704 (15.89%) | loss: 2.852198 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,143 | mfu: 50.82 | epoch: 1 | total time: 28.54m | eta: 151.6m +step 02656/16704 (15.90%) | loss: 2.849588 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,815 | mfu: 50.86 | epoch: 1 | total time: 28.55m | eta: 151.6m +step 02657/16704 (15.91%) | loss: 2.850884 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,948 | mfu: 50.62 | epoch: 1 | total time: 28.56m | eta: 151.6m +step 02658/16704 (15.91%) | loss: 2.856627 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,569 | mfu: 50.72 | epoch: 1 | total time: 28.57m | eta: 151.6m +step 02659/16704 (15.92%) | loss: 2.845410 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,270 | mfu: 50.71 | epoch: 1 | total time: 28.58m | eta: 151.6m +step 02660/16704 (15.92%) | loss: 2.861006 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 1 | total time: 28.60m | eta: 151.5m +step 02661/16704 (15.93%) | loss: 2.869118 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,514 | mfu: 50.85 | epoch: 1 | total time: 28.61m | eta: 151.5m +step 02662/16704 (15.94%) | loss: 2.868681 | lrm: 1.00 | dt: 649.14ms | tok/sec: 807,664 | mfu: 50.48 | epoch: 1 | total time: 28.62m | eta: 151.5m +step 02663/16704 (15.94%) | loss: 2.854550 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,941 | mfu: 50.56 | epoch: 1 | total time: 28.63m | eta: 151.5m +step 02664/16704 (15.95%) | loss: 2.860097 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,134 | mfu: 50.88 | epoch: 1 | total time: 28.64m | eta: 151.5m +step 02665/16704 (15.95%) | loss: 2.847950 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,164 | mfu: 50.70 | epoch: 1 | total time: 28.65m | eta: 151.5m +step 02666/16704 (15.96%) | loss: 2.843236 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,851 | mfu: 50.74 | epoch: 1 | total time: 28.66m | eta: 151.5m +step 02667/16704 (15.97%) | loss: 2.838346 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,463 | mfu: 
50.84 | epoch: 1 | total time: 28.67m | eta: 151.5m +step 02668/16704 (15.97%) | loss: 2.832073 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,766 | mfu: 50.61 | epoch: 1 | total time: 28.68m | eta: 151.5m +step 02669/16704 (15.98%) | loss: 2.830780 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,625 | mfu: 50.79 | epoch: 1 | total time: 28.69m | eta: 151.4m +step 02670/16704 (15.98%) | loss: 2.851338 | lrm: 1.00 | dt: 649.02ms | tok/sec: 807,808 | mfu: 50.49 | epoch: 1 | total time: 28.70m | eta: 151.4m +step 02671/16704 (15.99%) | loss: 2.846828 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,974 | mfu: 50.69 | epoch: 1 | total time: 28.71m | eta: 151.4m +step 02672/16704 (16.00%) | loss: 2.843957 | lrm: 1.00 | dt: 647.71ms | tok/sec: 809,445 | mfu: 50.59 | epoch: 1 | total time: 28.72m | eta: 151.4m +step 02673/16704 (16.00%) | loss: 2.848644 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,663 | mfu: 50.86 | epoch: 1 | total time: 28.74m | eta: 151.4m +step 02674/16704 (16.01%) | loss: 2.856145 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,744 | mfu: 50.86 | epoch: 1 | total time: 28.75m | eta: 151.4m +step 02675/16704 (16.01%) | loss: 2.849191 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,969 | mfu: 50.75 | epoch: 1 | total time: 28.76m | eta: 151.4m +step 02676/16704 (16.02%) | loss: 2.849439 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,129 | mfu: 50.70 | epoch: 1 | total time: 28.77m | eta: 151.4m +step 02677/16704 (16.03%) | loss: 2.844803 | lrm: 1.00 | dt: 647.90ms | tok/sec: 809,216 | mfu: 50.58 | epoch: 1 | total time: 28.78m | eta: 151.4m +step 02678/16704 (16.03%) | loss: 2.842406 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 1 | total time: 28.79m | eta: 151.3m +step 02679/16704 (16.04%) | loss: 2.846560 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,061 | mfu: 50.88 | epoch: 1 | total time: 28.80m | eta: 151.3m +step 02680/16704 (16.04%) | loss: 2.848871 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,151 | mfu: 50.82 | epoch: 1 | total time: 28.81m | eta: 151.3m +step 
02681/16704 (16.05%) | loss: 2.864630 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,198 | mfu: 50.76 | epoch: 1 | total time: 28.82m | eta: 151.3m +step 02682/16704 (16.06%) | loss: 2.868438 | lrm: 1.00 | dt: 648.34ms | tok/sec: 808,660 | mfu: 50.54 | epoch: 1 | total time: 28.83m | eta: 151.3m +step 02683/16704 (16.06%) | loss: 2.858153 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,001 | mfu: 50.63 | epoch: 1 | total time: 28.84m | eta: 151.3m +step 02684/16704 (16.07%) | loss: 2.865611 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,326 | mfu: 50.83 | epoch: 1 | total time: 28.85m | eta: 151.3m +step 02685/16704 (16.07%) | loss: 2.871014 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,236 | mfu: 50.70 | epoch: 1 | total time: 28.86m | eta: 151.3m +step 02686/16704 (16.08%) | loss: 2.874917 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,965 | mfu: 50.75 | epoch: 1 | total time: 28.88m | eta: 151.3m +step 02687/16704 (16.09%) | loss: 2.861395 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,853 | mfu: 50.87 | epoch: 1 | total time: 28.89m | eta: 151.3m +step 02688/16704 (16.09%) | loss: 2.848458 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,609 | mfu: 50.73 | epoch: 1 | total time: 28.90m | eta: 151.2m +step 02689/16704 (16.10%) | loss: 2.842295 | lrm: 1.00 | dt: 646.70ms | tok/sec: 810,708 | mfu: 50.67 | epoch: 1 | total time: 28.91m | eta: 151.2m +step 02690/16704 (16.10%) | loss: 2.854868 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,820 | mfu: 50.80 | epoch: 1 | total time: 28.92m | eta: 151.2m +step 02691/16704 (16.11%) | loss: 2.859320 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.85 | epoch: 1 | total time: 28.93m | eta: 151.2m +step 02692/16704 (16.12%) | loss: 2.868885 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,233 | mfu: 50.70 | epoch: 1 | total time: 28.94m | eta: 151.2m +step 02693/16704 (16.12%) | loss: 2.864187 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 1 | total time: 28.95m | eta: 151.2m +step 02694/16704 (16.13%) | loss: 2.883090 | lrm: 1.00 | dt: 
647.05ms | tok/sec: 810,275 | mfu: 50.64 | epoch: 1 | total time: 28.96m | eta: 151.2m +step 02695/16704 (16.13%) | loss: 2.866766 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,938 | mfu: 50.81 | epoch: 1 | total time: 28.97m | eta: 151.2m +step 02696/16704 (16.14%) | loss: 2.860793 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,215 | mfu: 50.83 | epoch: 1 | total time: 28.98m | eta: 151.2m +step 02697/16704 (16.15%) | loss: 2.860590 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,165 | mfu: 50.70 | epoch: 1 | total time: 28.99m | eta: 151.1m +step 02698/16704 (16.15%) | loss: 2.862165 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,929 | mfu: 50.87 | epoch: 1 | total time: 29.00m | eta: 151.1m +step 02699/16704 (16.16%) | loss: 2.851709 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,487 | mfu: 50.78 | epoch: 1 | total time: 29.02m | eta: 151.1m +step 02700/16704 (16.16%) | loss: 2.854159 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,336 | mfu: 50.77 | epoch: 1 | total time: 29.03m | eta: 151.1m +step 02701/16704 (16.17%) | loss: 2.845815 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,334 | mfu: 50.83 | epoch: 1 | total time: 29.04m | eta: 151.1m +step 02702/16704 (16.18%) | loss: 2.846095 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 1 | total time: 29.05m | eta: 151.1m +step 02703/16704 (16.18%) | loss: 2.856977 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,630 | mfu: 50.73 | epoch: 1 | total time: 29.06m | eta: 151.1m +step 02704/16704 (16.19%) | loss: 2.866672 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,470 | mfu: 50.78 | epoch: 1 | total time: 29.07m | eta: 151.1m +step 02705/16704 (16.19%) | loss: 2.862919 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,338 | mfu: 50.77 | epoch: 1 | total time: 29.08m | eta: 151.1m +step 02706/16704 (16.20%) | loss: 2.864770 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,559 | mfu: 50.85 | epoch: 1 | total time: 29.09m | eta: 151.0m +step 02707/16704 (16.21%) | loss: 2.862111 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,075 | mfu: 50.57 | epoch: 1 | total 
time: 29.10m | eta: 151.0m +step 02708/16704 (16.21%) | loss: 2.865283 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,312 | mfu: 50.83 | epoch: 1 | total time: 29.11m | eta: 151.0m +step 02709/16704 (16.22%) | loss: 2.871107 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,969 | mfu: 50.81 | epoch: 1 | total time: 29.12m | eta: 151.0m +step 02710/16704 (16.22%) | loss: 2.876660 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,616 | mfu: 50.66 | epoch: 1 | total time: 29.13m | eta: 151.0m +step 02711/16704 (16.23%) | loss: 2.876729 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,710 | mfu: 50.86 | epoch: 1 | total time: 29.14m | eta: 151.0m +step 02712/16704 (16.24%) | loss: 2.883662 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,219 | mfu: 50.58 | epoch: 1 | total time: 29.16m | eta: 151.0m +step 02713/16704 (16.24%) | loss: 2.876991 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,331 | mfu: 50.77 | epoch: 1 | total time: 29.17m | eta: 151.0m +step 02714/16704 (16.25%) | loss: 2.884588 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,946 | mfu: 50.75 | epoch: 1 | total time: 29.18m | eta: 151.0m +step 02715/16704 (16.25%) | loss: 2.876837 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,956 | mfu: 50.69 | epoch: 1 | total time: 29.19m | eta: 150.9m +step 02716/16704 (16.26%) | loss: 2.871330 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 1 | total time: 29.20m | eta: 150.9m +step 02717/16704 (16.27%) | loss: 2.858866 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 1 | total time: 29.21m | eta: 150.9m +step 02718/16704 (16.27%) | loss: 2.862855 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,185 | mfu: 50.76 | epoch: 1 | total time: 29.22m | eta: 150.9m +step 02719/16704 (16.28%) | loss: 2.855051 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,687 | mfu: 50.79 | epoch: 1 | total time: 29.23m | eta: 150.9m +step 02720/16704 (16.28%) | loss: 2.852524 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,576 | mfu: 50.79 | epoch: 1 | total time: 29.24m | eta: 150.9m +step 02721/16704 (16.29%) | loss: 
2.851232 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,319 | mfu: 50.65 | epoch: 1 | total time: 29.25m | eta: 150.9m +step 02722/16704 (16.30%) | loss: 2.848296 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,760 | mfu: 50.99 | epoch: 1 | total time: 29.26m | eta: 150.9m +step 02723/16704 (16.30%) | loss: 2.854469 | lrm: 1.00 | dt: 647.03ms | tok/sec: 810,300 | mfu: 50.64 | epoch: 1 | total time: 29.27m | eta: 150.9m +step 02724/16704 (16.31%) | loss: 2.847714 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,275 | mfu: 50.58 | epoch: 1 | total time: 29.28m | eta: 150.8m +step 02725/16704 (16.31%) | loss: 2.847217 | lrm: 1.00 | dt: 648.40ms | tok/sec: 808,583 | mfu: 50.54 | epoch: 1 | total time: 29.30m | eta: 150.8m +step 02726/16704 (16.32%) | loss: 2.849932 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,198 | mfu: 50.83 | epoch: 1 | total time: 29.31m | eta: 150.8m +step 02727/16704 (16.33%) | loss: 2.863004 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,031 | mfu: 50.88 | epoch: 1 | total time: 29.32m | eta: 150.8m +step 02728/16704 (16.33%) | loss: 2.862234 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,014 | mfu: 50.88 | epoch: 1 | total time: 29.33m | eta: 150.8m +step 02729/16704 (16.34%) | loss: 2.874408 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,604 | mfu: 50.73 | epoch: 1 | total time: 29.34m | eta: 150.8m +step 02730/16704 (16.34%) | loss: 2.867981 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,239 | mfu: 50.95 | epoch: 1 | total time: 29.35m | eta: 150.8m +step 02731/16704 (16.35%) | loss: 2.875440 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,751 | mfu: 50.80 | epoch: 1 | total time: 29.36m | eta: 150.8m +step 02732/16704 (16.36%) | loss: 2.870647 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,577 | mfu: 50.60 | epoch: 1 | total time: 29.37m | eta: 150.8m +step 02733/16704 (16.36%) | loss: 2.877359 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,494 | mfu: 50.72 | epoch: 1 | total time: 29.38m | eta: 150.7m +step 02734/16704 (16.37%) | loss: 2.876195 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,296 | mfu: 
50.83 | epoch: 1 | total time: 29.39m | eta: 150.7m +step 02735/16704 (16.37%) | loss: 2.868330 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,527 | mfu: 50.72 | epoch: 1 | total time: 29.40m | eta: 150.7m +step 02736/16704 (16.38%) | loss: 2.858342 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,605 | mfu: 50.79 | epoch: 1 | total time: 29.41m | eta: 150.7m +step 02737/16704 (16.39%) | loss: 2.867516 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,773 | mfu: 50.55 | epoch: 1 | total time: 29.42m | eta: 150.7m +step 02738/16704 (16.39%) | loss: 2.861130 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,515 | mfu: 50.72 | epoch: 1 | total time: 29.44m | eta: 150.7m +step 02739/16704 (16.40%) | loss: 2.863552 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,733 | mfu: 50.86 | epoch: 1 | total time: 29.45m | eta: 150.7m +step 02740/16704 (16.40%) | loss: 2.874140 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,909 | mfu: 50.75 | epoch: 1 | total time: 29.46m | eta: 150.7m +step 02741/16704 (16.41%) | loss: 2.876297 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,352 | mfu: 50.71 | epoch: 1 | total time: 29.47m | eta: 150.7m +step 02742/16704 (16.42%) | loss: 2.877940 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,351 | mfu: 50.84 | epoch: 1 | total time: 29.48m | eta: 150.6m +step 02743/16704 (16.42%) | loss: 2.872441 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,555 | mfu: 50.97 | epoch: 1 | total time: 29.49m | eta: 150.6m +step 02744/16704 (16.43%) | loss: 2.867023 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,416 | mfu: 50.90 | epoch: 1 | total time: 29.50m | eta: 150.6m +step 02745/16704 (16.43%) | loss: 2.859105 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,604 | mfu: 50.85 | epoch: 1 | total time: 29.51m | eta: 150.6m +step 02746/16704 (16.44%) | loss: 2.848377 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,472 | mfu: 50.78 | epoch: 1 | total time: 29.52m | eta: 150.6m +step 02747/16704 (16.45%) | loss: 2.870346 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,491 | mfu: 50.78 | epoch: 1 | total time: 29.53m | eta: 150.6m +step 
02748/16704 (16.45%) | loss: 2.864891 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,049 | mfu: 50.88 | epoch: 1 | total time: 29.54m | eta: 150.6m +step 02749/16704 (16.46%) | loss: 2.865387 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,705 | mfu: 50.73 | epoch: 1 | total time: 29.55m | eta: 150.6m +Step 02750 | Validation bpb: 0.867204 +step 02750/16704 (16.46%) | loss: 2.855417 | lrm: 1.00 | dt: 648.00ms | tok/sec: 809,081 | mfu: 50.57 | epoch: 1 | total time: 29.56m | eta: 150.6m +step 02751/16704 (16.47%) | loss: 2.873536 | lrm: 1.00 | dt: 649.14ms | tok/sec: 807,671 | mfu: 50.48 | epoch: 1 | total time: 29.57m | eta: 150.6m +step 02752/16704 (16.48%) | loss: 2.862492 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,877 | mfu: 50.74 | epoch: 1 | total time: 29.59m | eta: 150.5m +step 02753/16704 (16.48%) | loss: 2.855455 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,115 | mfu: 50.88 | epoch: 1 | total time: 29.60m | eta: 150.5m +step 02754/16704 (16.49%) | loss: 2.870322 | lrm: 1.00 | dt: 649.38ms | tok/sec: 807,370 | mfu: 50.46 | epoch: 1 | total time: 29.61m | eta: 150.5m +step 02755/16704 (16.49%) | loss: 2.867729 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,239 | mfu: 50.95 | epoch: 1 | total time: 29.62m | eta: 150.5m +step 02756/16704 (16.50%) | loss: 2.853626 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,400 | mfu: 50.71 | epoch: 1 | total time: 29.63m | eta: 150.5m +step 02757/16704 (16.51%) | loss: 2.846298 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,253 | mfu: 50.77 | epoch: 1 | total time: 29.64m | eta: 150.5m +step 02758/16704 (16.51%) | loss: 2.832541 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,482 | mfu: 50.91 | epoch: 1 | total time: 29.65m | eta: 150.5m +step 02759/16704 (16.52%) | loss: 2.828395 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,095 | mfu: 50.82 | epoch: 1 | total time: 29.66m | eta: 150.5m +step 02760/16704 (16.52%) | loss: 2.825751 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,477 | mfu: 50.84 | epoch: 1 | total time: 29.67m | eta: 150.5m +step 02761/16704 (16.53%) | 
loss: 2.825877 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,609 | mfu: 50.66 | epoch: 1 | total time: 29.68m | eta: 150.4m +step 02762/16704 (16.53%) | loss: 2.817792 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 1 | total time: 29.69m | eta: 150.4m +step 02763/16704 (16.54%) | loss: 2.816773 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,101 | mfu: 50.94 | epoch: 1 | total time: 29.70m | eta: 150.4m +step 02764/16704 (16.55%) | loss: 2.815156 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,585 | mfu: 50.73 | epoch: 1 | total time: 29.71m | eta: 150.4m +step 02765/16704 (16.55%) | loss: 2.819284 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,203 | mfu: 50.70 | epoch: 1 | total time: 29.73m | eta: 150.4m +step 02766/16704 (16.56%) | loss: 2.812723 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,138 | mfu: 50.95 | epoch: 1 | total time: 29.74m | eta: 150.4m +step 02767/16704 (16.56%) | loss: 2.822265 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,268 | mfu: 50.77 | epoch: 1 | total time: 29.75m | eta: 150.4m +step 02768/16704 (16.57%) | loss: 2.826749 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,699 | mfu: 50.86 | epoch: 1 | total time: 29.76m | eta: 150.4m +step 02769/16704 (16.58%) | loss: 2.825725 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 1 | total time: 29.77m | eta: 150.4m +step 02770/16704 (16.58%) | loss: 2.821268 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,052 | mfu: 50.69 | epoch: 1 | total time: 29.78m | eta: 150.3m +step 02771/16704 (16.59%) | loss: 2.817066 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,221 | mfu: 50.83 | epoch: 1 | total time: 29.79m | eta: 150.3m +step 02772/16704 (16.59%) | loss: 2.814161 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,220 | mfu: 50.70 | epoch: 1 | total time: 29.80m | eta: 150.3m +step 02773/16704 (16.60%) | loss: 2.804356 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,508 | mfu: 50.78 | epoch: 1 | total time: 29.81m | eta: 150.3m +step 02774/16704 (16.61%) | loss: 2.803879 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,979 | 
mfu: 50.69 | epoch: 1 | total time: 29.82m | eta: 150.3m +step 02775/16704 (16.61%) | loss: 2.812177 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,799 | mfu: 50.80 | epoch: 1 | total time: 29.83m | eta: 150.3m +step 02776/16704 (16.62%) | loss: 2.802711 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,477 | mfu: 50.84 | epoch: 1 | total time: 29.84m | eta: 150.3m +step 02777/16704 (16.62%) | loss: 2.788316 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,187 | mfu: 50.83 | epoch: 1 | total time: 29.85m | eta: 150.3m +step 02778/16704 (16.63%) | loss: 2.788927 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 1 | total time: 29.87m | eta: 150.3m +step 02779/16704 (16.64%) | loss: 2.784196 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,204 | mfu: 50.83 | epoch: 1 | total time: 29.88m | eta: 150.2m +step 02780/16704 (16.64%) | loss: 2.785891 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,730 | mfu: 50.67 | epoch: 1 | total time: 29.89m | eta: 150.2m +step 02781/16704 (16.65%) | loss: 2.771076 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,340 | mfu: 50.77 | epoch: 1 | total time: 29.90m | eta: 150.2m +step 02782/16704 (16.65%) | loss: 2.782664 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,066 | mfu: 50.88 | epoch: 1 | total time: 29.91m | eta: 150.2m +step 02783/16704 (16.66%) | loss: 2.785752 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 1 | total time: 29.92m | eta: 150.2m +step 02784/16704 (16.67%) | loss: 2.794594 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 1 | total time: 29.93m | eta: 150.2m +step 02785/16704 (16.67%) | loss: 2.806782 | lrm: 1.00 | dt: 647.45ms | tok/sec: 809,772 | mfu: 50.61 | epoch: 1 | total time: 29.94m | eta: 150.2m +step 02786/16704 (16.68%) | loss: 2.813016 | lrm: 1.00 | dt: 641.90ms | tok/sec: 816,769 | mfu: 51.05 | epoch: 1 | total time: 29.95m | eta: 150.2m +step 02787/16704 (16.68%) | loss: 2.817957 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,823 | mfu: 50.80 | epoch: 1 | total time: 29.96m | eta: 150.2m +step 
02788/16704 (16.69%) | loss: 2.810290 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,530 | mfu: 50.72 | epoch: 1 | total time: 29.97m | eta: 150.1m +step 02789/16704 (16.70%) | loss: 2.826375 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,285 | mfu: 50.71 | epoch: 1 | total time: 29.98m | eta: 150.1m +step 02790/16704 (16.70%) | loss: 2.821025 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 1 | total time: 29.99m | eta: 150.1m +step 02791/16704 (16.71%) | loss: 2.815874 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,021 | mfu: 50.75 | epoch: 1 | total time: 30.00m | eta: 150.1m +step 02792/16704 (16.71%) | loss: 2.818747 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 50.75 | epoch: 1 | total time: 30.02m | eta: 150.1m +step 02793/16704 (16.72%) | loss: 2.817057 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,933 | mfu: 50.81 | epoch: 1 | total time: 30.03m | eta: 150.1m +step 02794/16704 (16.73%) | loss: 2.824930 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 1 | total time: 30.04m | eta: 150.1m +step 02795/16704 (16.73%) | loss: 2.825867 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,861 | mfu: 50.74 | epoch: 1 | total time: 30.05m | eta: 150.1m +step 02796/16704 (16.74%) | loss: 2.810149 | lrm: 1.00 | dt: 647.15ms | tok/sec: 810,151 | mfu: 50.64 | epoch: 1 | total time: 30.06m | eta: 150.1m +step 02797/16704 (16.74%) | loss: 2.824593 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,619 | mfu: 50.98 | epoch: 1 | total time: 30.07m | eta: 150.0m +step 02798/16704 (16.75%) | loss: 2.819992 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,488 | mfu: 50.78 | epoch: 1 | total time: 30.08m | eta: 150.0m +step 02799/16704 (16.76%) | loss: 2.823667 | lrm: 1.00 | dt: 649.26ms | tok/sec: 807,512 | mfu: 50.47 | epoch: 1 | total time: 30.09m | eta: 150.0m +step 02800/16704 (16.76%) | loss: 2.828764 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,158 | mfu: 50.70 | epoch: 1 | total time: 30.10m | eta: 150.0m +step 02801/16704 (16.77%) | loss: 2.852499 | lrm: 1.00 | dt: 
643.99ms | tok/sec: 814,127 | mfu: 50.88 | epoch: 1 | total time: 30.11m | eta: 150.0m +step 02802/16704 (16.77%) | loss: 2.844451 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,939 | mfu: 50.68 | epoch: 1 | total time: 30.12m | eta: 150.0m +step 02803/16704 (16.78%) | loss: 2.847503 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 1 | total time: 30.13m | eta: 150.0m +step 02804/16704 (16.79%) | loss: 2.850316 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,929 | mfu: 50.68 | epoch: 1 | total time: 30.14m | eta: 150.0m +step 02805/16704 (16.79%) | loss: 2.854534 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,311 | mfu: 50.58 | epoch: 1 | total time: 30.16m | eta: 150.0m +step 02806/16704 (16.80%) | loss: 2.846243 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,228 | mfu: 50.95 | epoch: 1 | total time: 30.17m | eta: 149.9m +step 02807/16704 (16.80%) | loss: 2.842185 | lrm: 1.00 | dt: 647.16ms | tok/sec: 810,137 | mfu: 50.63 | epoch: 1 | total time: 30.18m | eta: 149.9m +step 02808/16704 (16.81%) | loss: 2.850672 | lrm: 1.00 | dt: 646.38ms | tok/sec: 811,113 | mfu: 50.70 | epoch: 1 | total time: 30.19m | eta: 149.9m +step 02809/16704 (16.82%) | loss: 2.851377 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,081 | mfu: 50.76 | epoch: 1 | total time: 30.20m | eta: 149.9m +step 02810/16704 (16.82%) | loss: 2.861251 | lrm: 1.00 | dt: 649.67ms | tok/sec: 807,008 | mfu: 50.44 | epoch: 1 | total time: 30.21m | eta: 149.9m +step 02811/16704 (16.83%) | loss: 2.858590 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,554 | mfu: 50.79 | epoch: 1 | total time: 30.22m | eta: 149.9m +step 02812/16704 (16.83%) | loss: 2.857923 | lrm: 1.00 | dt: 649.98ms | tok/sec: 806,617 | mfu: 50.41 | epoch: 1 | total time: 30.23m | eta: 149.9m +step 02813/16704 (16.84%) | loss: 2.857311 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 1 | total time: 30.24m | eta: 149.9m +step 02814/16704 (16.85%) | loss: 2.861515 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,492 | mfu: 50.72 | epoch: 1 | total 
time: 30.25m | eta: 149.9m +step 02815/16704 (16.85%) | loss: 2.838466 | lrm: 1.00 | dt: 649.44ms | tok/sec: 807,292 | mfu: 50.46 | epoch: 1 | total time: 30.26m | eta: 149.9m +step 02816/16704 (16.86%) | loss: 2.842581 | lrm: 1.00 | dt: 649.34ms | tok/sec: 807,413 | mfu: 50.46 | epoch: 1 | total time: 30.27m | eta: 149.8m +step 02817/16704 (16.86%) | loss: 2.848269 | lrm: 1.00 | dt: 648.66ms | tok/sec: 808,259 | mfu: 50.52 | epoch: 1 | total time: 30.29m | eta: 149.8m +step 02818/16704 (16.87%) | loss: 2.853589 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,643 | mfu: 50.60 | epoch: 1 | total time: 30.30m | eta: 149.8m +step 02819/16704 (16.88%) | loss: 2.864659 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,290 | mfu: 50.71 | epoch: 1 | total time: 30.31m | eta: 149.8m +step 02820/16704 (16.88%) | loss: 2.868512 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,327 | mfu: 50.58 | epoch: 1 | total time: 30.32m | eta: 149.8m +step 02821/16704 (16.89%) | loss: 2.864039 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,731 | mfu: 50.73 | epoch: 1 | total time: 30.33m | eta: 149.8m +step 02822/16704 (16.89%) | loss: 2.857317 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,237 | mfu: 50.64 | epoch: 1 | total time: 30.34m | eta: 149.8m +step 02823/16704 (16.90%) | loss: 2.859139 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,430 | mfu: 50.84 | epoch: 1 | total time: 30.35m | eta: 149.8m +step 02824/16704 (16.91%) | loss: 2.850530 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,764 | mfu: 50.61 | epoch: 1 | total time: 30.36m | eta: 149.8m +step 02825/16704 (16.91%) | loss: 2.852225 | lrm: 1.00 | dt: 650.55ms | tok/sec: 805,918 | mfu: 50.37 | epoch: 1 | total time: 30.37m | eta: 149.7m +step 02826/16704 (16.92%) | loss: 2.851781 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,285 | mfu: 50.77 | epoch: 1 | total time: 30.38m | eta: 149.7m +step 02827/16704 (16.92%) | loss: 2.863126 | lrm: 1.00 | dt: 646.99ms | tok/sec: 810,353 | mfu: 50.65 | epoch: 1 | total time: 30.39m | eta: 149.7m +step 02828/16704 (16.93%) | loss: 
2.853190 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,545 | mfu: 50.72 | epoch: 1 | total time: 30.40m | eta: 149.7m +step 02829/16704 (16.94%) | loss: 2.851762 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,823 | mfu: 50.87 | epoch: 1 | total time: 30.41m | eta: 149.7m +step 02830/16704 (16.94%) | loss: 2.835204 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,616 | mfu: 50.73 | epoch: 1 | total time: 30.43m | eta: 149.7m +step 02831/16704 (16.95%) | loss: 2.828153 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 30.44m | eta: 149.7m +step 02832/16704 (16.95%) | loss: 2.823068 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,292 | mfu: 50.64 | epoch: 1 | total time: 30.45m | eta: 149.7m +step 02833/16704 (16.96%) | loss: 2.823702 | lrm: 1.00 | dt: 647.74ms | tok/sec: 809,413 | mfu: 50.59 | epoch: 1 | total time: 30.46m | eta: 149.7m +step 02834/16704 (16.97%) | loss: 2.828582 | lrm: 1.00 | dt: 648.78ms | tok/sec: 808,115 | mfu: 50.51 | epoch: 1 | total time: 30.47m | eta: 149.6m +step 02835/16704 (16.97%) | loss: 2.839076 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,843 | mfu: 50.68 | epoch: 1 | total time: 30.48m | eta: 149.6m +step 02836/16704 (16.98%) | loss: 2.857856 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,118 | mfu: 50.76 | epoch: 1 | total time: 30.49m | eta: 149.6m +step 02837/16704 (16.98%) | loss: 2.854467 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,970 | mfu: 50.62 | epoch: 1 | total time: 30.50m | eta: 149.6m +step 02838/16704 (16.99%) | loss: 2.862325 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,516 | mfu: 50.66 | epoch: 1 | total time: 30.51m | eta: 149.6m +step 02839/16704 (17.00%) | loss: 2.862579 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 1 | total time: 30.52m | eta: 149.6m +step 02840/16704 (17.00%) | loss: 2.859935 | lrm: 1.00 | dt: 648.53ms | tok/sec: 808,420 | mfu: 50.53 | epoch: 1 | total time: 30.53m | eta: 149.6m +step 02841/16704 (17.01%) | loss: 2.856220 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,392 | mfu: 
50.84 | epoch: 1 | total time: 30.54m | eta: 149.6m +step 02842/16704 (17.01%) | loss: 2.860994 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,764 | mfu: 50.74 | epoch: 1 | total time: 30.55m | eta: 149.6m +step 02843/16704 (17.02%) | loss: 2.864415 | lrm: 1.00 | dt: 649.19ms | tok/sec: 807,607 | mfu: 50.48 | epoch: 1 | total time: 30.57m | eta: 149.5m +step 02844/16704 (17.03%) | loss: 2.864675 | lrm: 1.00 | dt: 642.76ms | tok/sec: 815,685 | mfu: 50.98 | epoch: 1 | total time: 30.58m | eta: 149.5m +step 02845/16704 (17.03%) | loss: 2.854789 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,839 | mfu: 50.68 | epoch: 1 | total time: 30.59m | eta: 149.5m +step 02846/16704 (17.04%) | loss: 2.856376 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,605 | mfu: 50.60 | epoch: 1 | total time: 30.60m | eta: 149.5m +step 02847/16704 (17.04%) | loss: 2.854340 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,731 | mfu: 50.73 | epoch: 1 | total time: 30.61m | eta: 149.5m +step 02848/16704 (17.05%) | loss: 2.838072 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,884 | mfu: 50.74 | epoch: 1 | total time: 30.62m | eta: 149.5m +step 02849/16704 (17.06%) | loss: 2.840900 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,328 | mfu: 50.71 | epoch: 1 | total time: 30.63m | eta: 149.5m +step 02850/16704 (17.06%) | loss: 2.829854 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,203 | mfu: 50.76 | epoch: 1 | total time: 30.64m | eta: 149.5m +step 02851/16704 (17.07%) | loss: 2.825264 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,624 | mfu: 50.60 | epoch: 1 | total time: 30.65m | eta: 149.5m +step 02852/16704 (17.07%) | loss: 2.827803 | lrm: 1.00 | dt: 646.38ms | tok/sec: 811,116 | mfu: 50.70 | epoch: 1 | total time: 30.66m | eta: 149.4m +step 02853/16704 (17.08%) | loss: 2.833672 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,757 | mfu: 50.61 | epoch: 1 | total time: 30.67m | eta: 149.4m +step 02854/16704 (17.09%) | loss: 2.838727 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,379 | mfu: 50.71 | epoch: 1 | total time: 30.68m | eta: 149.4m +step 
02855/16704 (17.09%) | loss: 2.851889 | lrm: 1.00 | dt: 647.01ms | tok/sec: 810,325 | mfu: 50.65 | epoch: 1 | total time: 30.69m | eta: 149.4m +step 02856/16704 (17.10%) | loss: 2.847501 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 1 | total time: 30.71m | eta: 149.4m +step 02857/16704 (17.10%) | loss: 2.853346 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,932 | mfu: 50.87 | epoch: 1 | total time: 30.72m | eta: 149.4m +step 02858/16704 (17.11%) | loss: 2.847543 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,617 | mfu: 50.60 | epoch: 1 | total time: 30.73m | eta: 149.4m +step 02859/16704 (17.12%) | loss: 2.844361 | lrm: 1.00 | dt: 647.80ms | tok/sec: 809,340 | mfu: 50.58 | epoch: 1 | total time: 30.74m | eta: 149.4m +step 02860/16704 (17.12%) | loss: 2.827543 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,309 | mfu: 50.58 | epoch: 1 | total time: 30.75m | eta: 149.4m +step 02861/16704 (17.13%) | loss: 2.828613 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,950 | mfu: 50.69 | epoch: 1 | total time: 30.76m | eta: 149.4m +step 02862/16704 (17.13%) | loss: 2.827040 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,774 | mfu: 50.55 | epoch: 1 | total time: 30.77m | eta: 149.3m +step 02863/16704 (17.14%) | loss: 2.841164 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,794 | mfu: 50.86 | epoch: 1 | total time: 30.78m | eta: 149.3m +step 02864/16704 (17.15%) | loss: 2.848806 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 1 | total time: 30.79m | eta: 149.3m +step 02865/16704 (17.15%) | loss: 2.841346 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,853 | mfu: 50.74 | epoch: 1 | total time: 30.80m | eta: 149.3m +step 02866/16704 (17.16%) | loss: 2.848070 | lrm: 1.00 | dt: 647.87ms | tok/sec: 809,249 | mfu: 50.58 | epoch: 1 | total time: 30.81m | eta: 149.3m +step 02867/16704 (17.16%) | loss: 2.864404 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.86 | epoch: 1 | total time: 30.82m | eta: 149.3m +step 02868/16704 (17.17%) | loss: 2.855685 | lrm: 1.00 | dt: 
650.28ms | tok/sec: 806,246 | mfu: 50.39 | epoch: 1 | total time: 30.83m | eta: 149.3m +step 02869/16704 (17.18%) | loss: 2.831194 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 1 | total time: 30.85m | eta: 149.3m +step 02870/16704 (17.18%) | loss: 2.856452 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,528 | mfu: 50.60 | epoch: 1 | total time: 30.86m | eta: 149.3m +step 02871/16704 (17.19%) | loss: 2.850610 | lrm: 1.00 | dt: 649.50ms | tok/sec: 807,215 | mfu: 50.45 | epoch: 1 | total time: 30.87m | eta: 149.2m +step 02872/16704 (17.19%) | loss: 2.855800 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,590 | mfu: 50.91 | epoch: 1 | total time: 30.88m | eta: 149.2m +step 02873/16704 (17.20%) | loss: 2.846641 | lrm: 1.00 | dt: 648.97ms | tok/sec: 807,881 | mfu: 50.49 | epoch: 1 | total time: 30.89m | eta: 149.2m +step 02874/16704 (17.21%) | loss: 2.846078 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,163 | mfu: 50.82 | epoch: 1 | total time: 30.90m | eta: 149.2m +step 02875/16704 (17.21%) | loss: 2.842764 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,824 | mfu: 50.99 | epoch: 1 | total time: 30.91m | eta: 149.2m +step 02876/16704 (17.22%) | loss: 2.840343 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,662 | mfu: 50.67 | epoch: 1 | total time: 30.92m | eta: 149.2m +step 02877/16704 (17.22%) | loss: 2.838108 | lrm: 1.00 | dt: 647.90ms | tok/sec: 809,208 | mfu: 50.58 | epoch: 1 | total time: 30.93m | eta: 149.2m +step 02878/16704 (17.23%) | loss: 2.841084 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,450 | mfu: 50.97 | epoch: 1 | total time: 30.94m | eta: 149.2m +step 02879/16704 (17.24%) | loss: 2.848034 | lrm: 1.00 | dt: 649.08ms | tok/sec: 807,744 | mfu: 50.49 | epoch: 1 | total time: 30.95m | eta: 149.2m +step 02880/16704 (17.24%) | loss: 2.861218 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,304 | mfu: 50.71 | epoch: 1 | total time: 30.96m | eta: 149.1m +step 02881/16704 (17.25%) | loss: 2.862367 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,283 | mfu: 50.71 | epoch: 1 | total 
time: 30.97m | eta: 149.1m +step 02882/16704 (17.25%) | loss: 2.852773 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,757 | mfu: 50.74 | epoch: 1 | total time: 30.99m | eta: 149.1m +step 02883/16704 (17.26%) | loss: 2.842032 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,861 | mfu: 50.93 | epoch: 1 | total time: 31.00m | eta: 149.1m +step 02884/16704 (17.27%) | loss: 2.839787 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,627 | mfu: 50.79 | epoch: 1 | total time: 31.01m | eta: 149.1m +step 02885/16704 (17.27%) | loss: 2.838853 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,990 | mfu: 50.75 | epoch: 1 | total time: 31.02m | eta: 149.1m +step 02886/16704 (17.28%) | loss: 2.860905 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,218 | mfu: 50.76 | epoch: 1 | total time: 31.03m | eta: 149.1m +step 02887/16704 (17.28%) | loss: 2.868451 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 1 | total time: 31.04m | eta: 149.1m +step 02888/16704 (17.29%) | loss: 2.873885 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,411 | mfu: 50.78 | epoch: 1 | total time: 31.05m | eta: 149.1m +step 02889/16704 (17.30%) | loss: 2.861961 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,660 | mfu: 50.92 | epoch: 1 | total time: 31.06m | eta: 149.0m +step 02890/16704 (17.30%) | loss: 2.871286 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,175 | mfu: 50.64 | epoch: 1 | total time: 31.07m | eta: 149.0m +step 02891/16704 (17.31%) | loss: 2.872733 | lrm: 1.00 | dt: 647.25ms | tok/sec: 810,028 | mfu: 50.63 | epoch: 1 | total time: 31.08m | eta: 149.0m +step 02892/16704 (17.31%) | loss: 2.870110 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,933 | mfu: 50.87 | epoch: 1 | total time: 31.09m | eta: 149.0m +step 02893/16704 (17.32%) | loss: 2.876443 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,785 | mfu: 50.93 | epoch: 1 | total time: 31.10m | eta: 149.0m +step 02894/16704 (17.33%) | loss: 2.874214 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,391 | mfu: 50.71 | epoch: 1 | total time: 31.11m | eta: 149.0m +step 02895/16704 (17.33%) | loss: 
2.867532 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,253 | mfu: 51.02 | epoch: 1 | total time: 31.13m | eta: 149.0m +step 02896/16704 (17.34%) | loss: 2.856461 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,002 | mfu: 50.69 | epoch: 1 | total time: 31.14m | eta: 149.0m +step 02897/16704 (17.34%) | loss: 2.860881 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,760 | mfu: 50.74 | epoch: 1 | total time: 31.15m | eta: 149.0m +step 02898/16704 (17.35%) | loss: 2.860111 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,542 | mfu: 50.79 | epoch: 1 | total time: 31.16m | eta: 148.9m +step 02899/16704 (17.36%) | loss: 2.855346 | lrm: 1.00 | dt: 648.61ms | tok/sec: 808,328 | mfu: 50.52 | epoch: 1 | total time: 31.17m | eta: 148.9m +step 02900/16704 (17.36%) | loss: 2.842556 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 1 | total time: 31.18m | eta: 148.9m +step 02901/16704 (17.37%) | loss: 2.846174 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,360 | mfu: 50.77 | epoch: 1 | total time: 31.19m | eta: 148.9m +step 02902/16704 (17.37%) | loss: 2.836656 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,519 | mfu: 50.66 | epoch: 1 | total time: 31.20m | eta: 148.9m +step 02903/16704 (17.38%) | loss: 2.832556 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,771 | mfu: 50.92 | epoch: 1 | total time: 31.21m | eta: 148.9m +step 02904/16704 (17.39%) | loss: 2.814803 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,239 | mfu: 50.83 | epoch: 1 | total time: 31.22m | eta: 148.9m +step 02905/16704 (17.39%) | loss: 2.826725 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,526 | mfu: 50.72 | epoch: 1 | total time: 31.23m | eta: 148.9m +step 02906/16704 (17.40%) | loss: 2.826914 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,608 | mfu: 50.79 | epoch: 1 | total time: 31.24m | eta: 148.9m +step 02907/16704 (17.40%) | loss: 2.820584 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,437 | mfu: 50.78 | epoch: 1 | total time: 31.25m | eta: 148.8m +step 02908/16704 (17.41%) | loss: 2.831957 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,374 | mfu: 
50.71 | epoch: 1 | total time: 31.27m | eta: 148.8m +step 02909/16704 (17.41%) | loss: 2.823541 | lrm: 1.00 | dt: 648.33ms | tok/sec: 808,668 | mfu: 50.54 | epoch: 1 | total time: 31.28m | eta: 148.8m +step 02910/16704 (17.42%) | loss: 2.822016 | lrm: 1.00 | dt: 649.06ms | tok/sec: 807,770 | mfu: 50.49 | epoch: 1 | total time: 31.29m | eta: 148.8m +step 02911/16704 (17.43%) | loss: 2.818168 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,862 | mfu: 50.81 | epoch: 1 | total time: 31.30m | eta: 148.8m +step 02912/16704 (17.43%) | loss: 2.825225 | lrm: 1.00 | dt: 648.42ms | tok/sec: 808,568 | mfu: 50.54 | epoch: 1 | total time: 31.31m | eta: 148.8m +step 02913/16704 (17.44%) | loss: 2.820681 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,618 | mfu: 50.79 | epoch: 1 | total time: 31.32m | eta: 148.8m +step 02914/16704 (17.44%) | loss: 2.829505 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | epoch: 1 | total time: 31.33m | eta: 148.8m +step 02915/16704 (17.45%) | loss: 2.833925 | lrm: 1.00 | dt: 647.86ms | tok/sec: 809,257 | mfu: 50.58 | epoch: 1 | total time: 31.34m | eta: 148.8m +step 02916/16704 (17.46%) | loss: 2.825187 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,789 | mfu: 50.86 | epoch: 1 | total time: 31.35m | eta: 148.8m +step 02917/16704 (17.46%) | loss: 2.825523 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,717 | mfu: 50.80 | epoch: 1 | total time: 31.36m | eta: 148.7m +step 02918/16704 (17.47%) | loss: 2.811668 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,671 | mfu: 50.79 | epoch: 1 | total time: 31.37m | eta: 148.7m +step 02919/16704 (17.47%) | loss: 2.818815 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,420 | mfu: 50.71 | epoch: 1 | total time: 31.38m | eta: 148.7m +step 02920/16704 (17.48%) | loss: 2.798264 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,725 | mfu: 50.86 | epoch: 1 | total time: 31.39m | eta: 148.7m +step 02921/16704 (17.49%) | loss: 2.794161 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 1 | total time: 31.41m | eta: 148.7m +step 
02922/16704 (17.49%) | loss: 2.798844 | lrm: 1.00 | dt: 647.17ms | tok/sec: 810,127 | mfu: 50.63 | epoch: 1 | total time: 31.42m | eta: 148.7m +step 02923/16704 (17.50%) | loss: 2.786526 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,499 | mfu: 50.66 | epoch: 1 | total time: 31.43m | eta: 148.7m +step 02924/16704 (17.50%) | loss: 2.779277 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,233 | mfu: 50.77 | epoch: 1 | total time: 31.44m | eta: 148.7m +step 02925/16704 (17.51%) | loss: 2.786078 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 1 | total time: 31.45m | eta: 148.7m +step 02926/16704 (17.52%) | loss: 2.783054 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,258 | mfu: 50.89 | epoch: 1 | total time: 31.46m | eta: 148.6m +step 02927/16704 (17.52%) | loss: 2.791968 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,669 | mfu: 50.67 | epoch: 1 | total time: 31.47m | eta: 148.6m +step 02928/16704 (17.53%) | loss: 2.789845 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,094 | mfu: 50.76 | epoch: 1 | total time: 31.48m | eta: 148.6m +step 02929/16704 (17.53%) | loss: 2.783214 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 1 | total time: 31.49m | eta: 148.6m +step 02930/16704 (17.54%) | loss: 2.796398 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,931 | mfu: 50.68 | epoch: 1 | total time: 31.50m | eta: 148.6m +step 02931/16704 (17.55%) | loss: 2.796561 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,812 | mfu: 50.86 | epoch: 1 | total time: 31.51m | eta: 148.6m +step 02932/16704 (17.55%) | loss: 2.791400 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,097 | mfu: 50.76 | epoch: 1 | total time: 31.52m | eta: 148.6m +step 02933/16704 (17.56%) | loss: 2.792542 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,890 | mfu: 50.68 | epoch: 1 | total time: 31.53m | eta: 148.6m +step 02934/16704 (17.56%) | loss: 2.800964 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,620 | mfu: 50.66 | epoch: 1 | total time: 31.55m | eta: 148.6m +step 02935/16704 (17.57%) | loss: 2.805987 | lrm: 1.00 | dt: 
645.52ms | tok/sec: 812,198 | mfu: 50.76 | epoch: 1 | total time: 31.56m | eta: 148.5m +step 02936/16704 (17.58%) | loss: 2.804742 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,363 | mfu: 50.71 | epoch: 1 | total time: 31.57m | eta: 148.5m +step 02937/16704 (17.58%) | loss: 2.804985 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,414 | mfu: 50.84 | epoch: 1 | total time: 31.58m | eta: 148.5m +step 02938/16704 (17.59%) | loss: 2.819477 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,907 | mfu: 50.75 | epoch: 1 | total time: 31.59m | eta: 148.5m +step 02939/16704 (17.59%) | loss: 2.826660 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,974 | mfu: 50.75 | epoch: 1 | total time: 31.60m | eta: 148.5m +step 02940/16704 (17.60%) | loss: 2.817524 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,449 | mfu: 50.84 | epoch: 1 | total time: 31.61m | eta: 148.5m +step 02941/16704 (17.61%) | loss: 2.827751 | lrm: 1.00 | dt: 647.80ms | tok/sec: 809,340 | mfu: 50.58 | epoch: 1 | total time: 31.62m | eta: 148.5m +step 02942/16704 (17.61%) | loss: 2.825150 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,790 | mfu: 50.99 | epoch: 1 | total time: 31.63m | eta: 148.5m +step 02943/16704 (17.62%) | loss: 2.820812 | lrm: 1.00 | dt: 647.98ms | tok/sec: 809,115 | mfu: 50.57 | epoch: 1 | total time: 31.64m | eta: 148.5m +step 02944/16704 (17.62%) | loss: 2.829093 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,445 | mfu: 50.78 | epoch: 1 | total time: 31.65m | eta: 148.4m +step 02945/16704 (17.63%) | loss: 2.838004 | lrm: 1.00 | dt: 646.95ms | tok/sec: 810,398 | mfu: 50.65 | epoch: 1 | total time: 31.66m | eta: 148.4m +step 02946/16704 (17.64%) | loss: 2.839967 | lrm: 1.00 | dt: 641.98ms | tok/sec: 816,672 | mfu: 51.04 | epoch: 1 | total time: 31.67m | eta: 148.4m +step 02947/16704 (17.64%) | loss: 2.845544 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,433 | mfu: 50.65 | epoch: 1 | total time: 31.68m | eta: 148.4m +step 02948/16704 (17.65%) | loss: 2.868376 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 1 | total 
time: 31.70m | eta: 148.4m +step 02949/16704 (17.65%) | loss: 2.860502 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,432 | mfu: 50.65 | epoch: 1 | total time: 31.71m | eta: 148.4m +step 02950/16704 (17.66%) | loss: 2.856429 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,482 | mfu: 50.78 | epoch: 1 | total time: 31.72m | eta: 148.4m +step 02951/16704 (17.67%) | loss: 2.856846 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,071 | mfu: 50.82 | epoch: 1 | total time: 31.73m | eta: 148.4m +step 02952/16704 (17.67%) | loss: 2.863648 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,945 | mfu: 50.75 | epoch: 1 | total time: 31.74m | eta: 148.4m +step 02953/16704 (17.68%) | loss: 2.861288 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,154 | mfu: 50.95 | epoch: 1 | total time: 31.75m | eta: 148.3m +step 02954/16704 (17.68%) | loss: 2.855340 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,904 | mfu: 50.62 | epoch: 1 | total time: 31.76m | eta: 148.3m +step 02955/16704 (17.69%) | loss: 2.851487 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 1 | total time: 31.77m | eta: 148.3m +step 02956/16704 (17.70%) | loss: 2.853743 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,885 | mfu: 50.74 | epoch: 1 | total time: 31.78m | eta: 148.3m +step 02957/16704 (17.70%) | loss: 2.851690 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,408 | mfu: 50.65 | epoch: 1 | total time: 31.79m | eta: 148.3m +step 02958/16704 (17.71%) | loss: 2.860292 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,841 | mfu: 50.87 | epoch: 1 | total time: 31.80m | eta: 148.3m +step 02959/16704 (17.71%) | loss: 2.858648 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,450 | mfu: 50.84 | epoch: 1 | total time: 31.81m | eta: 148.3m +step 02960/16704 (17.72%) | loss: 2.855863 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,246 | mfu: 50.77 | epoch: 1 | total time: 31.82m | eta: 148.3m +step 02961/16704 (17.73%) | loss: 2.865955 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,746 | mfu: 50.74 | epoch: 1 | total time: 31.84m | eta: 148.3m +step 02962/16704 (17.73%) | loss: 
2.864411 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,554 | mfu: 50.91 | epoch: 1 | total time: 31.85m | eta: 148.2m +step 02963/16704 (17.74%) | loss: 2.859129 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,975 | mfu: 50.75 | epoch: 1 | total time: 31.86m | eta: 148.2m +step 02964/16704 (17.74%) | loss: 2.852350 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,666 | mfu: 50.67 | epoch: 1 | total time: 31.87m | eta: 148.2m +step 02965/16704 (17.75%) | loss: 2.840109 | lrm: 1.00 | dt: 642.44ms | tok/sec: 816,090 | mfu: 51.01 | epoch: 1 | total time: 31.88m | eta: 148.2m +step 02966/16704 (17.76%) | loss: 2.839661 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 1 | total time: 31.89m | eta: 148.2m +step 02967/16704 (17.76%) | loss: 2.839436 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,960 | mfu: 50.69 | epoch: 1 | total time: 31.90m | eta: 148.2m +step 02968/16704 (17.77%) | loss: 2.833923 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,706 | mfu: 50.86 | epoch: 1 | total time: 31.91m | eta: 148.2m +step 02969/16704 (17.77%) | loss: 2.837298 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,515 | mfu: 50.78 | epoch: 1 | total time: 31.92m | eta: 148.2m +step 02970/16704 (17.78%) | loss: 2.843136 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 1 | total time: 31.93m | eta: 148.2m +step 02971/16704 (17.79%) | loss: 2.841009 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,116 | mfu: 50.76 | epoch: 1 | total time: 31.94m | eta: 148.2m +step 02972/16704 (17.79%) | loss: 2.831271 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,032 | mfu: 50.69 | epoch: 1 | total time: 31.95m | eta: 148.1m +step 02973/16704 (17.80%) | loss: 2.841834 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,863 | mfu: 50.81 | epoch: 1 | total time: 31.96m | eta: 148.1m +step 02974/16704 (17.80%) | loss: 2.859732 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,878 | mfu: 50.68 | epoch: 1 | total time: 31.98m | eta: 148.1m +step 02975/16704 (17.81%) | loss: 2.848370 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,595 | mfu: 
50.66 | epoch: 1 | total time: 31.99m | eta: 148.1m +step 02976/16704 (17.82%) | loss: 2.847299 | lrm: 1.00 | dt: 647.62ms | tok/sec: 809,560 | mfu: 50.60 | epoch: 1 | total time: 32.00m | eta: 148.1m +step 02977/16704 (17.82%) | loss: 2.853597 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,852 | mfu: 50.80 | epoch: 1 | total time: 32.01m | eta: 148.1m +step 02978/16704 (17.83%) | loss: 2.862155 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,005 | mfu: 50.69 | epoch: 1 | total time: 32.02m | eta: 148.1m +step 02979/16704 (17.83%) | loss: 2.861726 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,859 | mfu: 50.74 | epoch: 1 | total time: 32.03m | eta: 148.1m +step 02980/16704 (17.84%) | loss: 2.868372 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,961 | mfu: 50.81 | epoch: 1 | total time: 32.04m | eta: 148.1m +step 02981/16704 (17.85%) | loss: 2.863918 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,338 | mfu: 50.77 | epoch: 1 | total time: 32.05m | eta: 148.0m +step 02982/16704 (17.85%) | loss: 2.854834 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,703 | mfu: 50.67 | epoch: 1 | total time: 32.06m | eta: 148.0m +step 02983/16704 (17.86%) | loss: 2.857751 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,609 | mfu: 50.85 | epoch: 1 | total time: 32.07m | eta: 148.0m +step 02984/16704 (17.86%) | loss: 2.865692 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,318 | mfu: 50.65 | epoch: 1 | total time: 32.08m | eta: 148.0m +step 02985/16704 (17.87%) | loss: 2.854175 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,483 | mfu: 50.91 | epoch: 1 | total time: 32.09m | eta: 148.0m +step 02986/16704 (17.88%) | loss: 2.846843 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,006 | mfu: 50.69 | epoch: 1 | total time: 32.10m | eta: 148.0m +step 02987/16704 (17.88%) | loss: 2.845110 | lrm: 1.00 | dt: 642.10ms | tok/sec: 816,526 | mfu: 51.03 | epoch: 1 | total time: 32.12m | eta: 148.0m +step 02988/16704 (17.89%) | loss: 2.838047 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,030 | mfu: 50.69 | epoch: 1 | total time: 32.13m | eta: 148.0m +step 
02989/16704 (17.89%) | loss: 2.820725 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,787 | mfu: 50.68 | epoch: 1 | total time: 32.14m | eta: 148.0m +step 02990/16704 (17.90%) | loss: 2.821038 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,792 | mfu: 50.80 | epoch: 1 | total time: 32.15m | eta: 147.9m +step 02991/16704 (17.91%) | loss: 2.822021 | lrm: 1.00 | dt: 646.70ms | tok/sec: 810,718 | mfu: 50.67 | epoch: 1 | total time: 32.16m | eta: 147.9m +step 02992/16704 (17.91%) | loss: 2.811811 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 1 | total time: 32.17m | eta: 147.9m +step 02993/16704 (17.92%) | loss: 2.811422 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,982 | mfu: 50.75 | epoch: 1 | total time: 32.18m | eta: 147.9m +step 02994/16704 (17.92%) | loss: 2.812342 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 1 | total time: 32.19m | eta: 147.9m +step 02995/16704 (17.93%) | loss: 2.813669 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,281 | mfu: 50.64 | epoch: 1 | total time: 32.20m | eta: 147.9m +step 02996/16704 (17.94%) | loss: 2.819027 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,825 | mfu: 50.80 | epoch: 1 | total time: 32.21m | eta: 147.9m +step 02997/16704 (17.94%) | loss: 2.814544 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,528 | mfu: 50.66 | epoch: 1 | total time: 32.22m | eta: 147.9m +step 02998/16704 (17.95%) | loss: 2.814393 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,177 | mfu: 50.89 | epoch: 1 | total time: 32.23m | eta: 147.9m +step 02999/16704 (17.95%) | loss: 2.820569 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 1 | total time: 32.24m | eta: 147.8m +Step 03000 | Validation bpb: 0.863480 +Downloading https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip... +Downloaded to /root/.cache/nanochat/eval_bundle.zip +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.3827 | centered: 0.1769 | time: 23.00s +Evaluating: jeopardy (10-shot, type: language_modeling)... 
accuracy: 0.0132 | centered: 0.0132 | time: 5.03s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.3637 | centered: 0.3637 | time: 46.92s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.5699 | centered: 0.4265 | time: 6.31s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.2969 | centered: 0.0626 | time: 3.04s +Evaluating: copa (0-shot, type: multiple_choice)... accuracy: 0.6600 | centered: 0.3200 | time: 0.28s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.3653 | centered: 0.2066 | time: 3.18s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.6600 | centered: 0.3199 | time: 4.87s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... accuracy: 0.3240 | centered: 0.0987 | time: 1.22s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.2847 | centered: 0.2847 | time: 11.64s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.3779 | centered: 0.1706 | time: 35.53s +Evaluating: winograd (0-shot, type: schema)... accuracy: 0.5861 | centered: 0.1722 | time: 0.64s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5383 | centered: 0.0766 | time: 2.75s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.0970 | centered: 0.0970 | time: 2.64s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.2522 | centered: 0.0652 | time: 0.80s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.3576 | centered: 0.3576 | time: 3.24s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... accuracy: 0.1381 | centered: 0.1381 | time: 0.54s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0000 | centered: 0.0000 | time: 0.09s +Evaluating: squad (10-shot, type: language_modeling)... 
accuracy: 0.1528 | centered: 0.1528 | time: 29.90s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.1561 | centered: 0.1561 | time: 19.13s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.5813 | centered: -0.1017 | time: 10.93s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... accuracy: 0.2521 | centered: 0.1772 | time: 59.80s +Step 03000 | CORE metric: 0.1697 +step 03000/16704 (17.96%) | loss: 2.824082 | lrm: 1.00 | dt: 628.30ms | tok/sec: 834,453 | mfu: 52.15 | epoch: 1 | total time: 32.25m | eta: 147.8m +step 03001/16704 (17.97%) | loss: 2.833200 | lrm: 1.00 | dt: 648.50ms | tok/sec: 808,464 | mfu: 50.53 | epoch: 1 | total time: 32.27m | eta: 147.8m +step 03002/16704 (17.97%) | loss: 2.847645 | lrm: 1.00 | dt: 640.35ms | tok/sec: 818,748 | mfu: 51.17 | epoch: 1 | total time: 32.28m | eta: 147.8m +step 03003/16704 (17.98%) | loss: 2.848879 | lrm: 1.00 | dt: 641.37ms | tok/sec: 817,454 | mfu: 51.09 | epoch: 1 | total time: 32.29m | eta: 147.8m +step 03004/16704 (17.98%) | loss: 2.845411 | lrm: 1.00 | dt: 648.50ms | tok/sec: 808,463 | mfu: 50.53 | epoch: 1 | total time: 32.30m | eta: 147.8m +step 03005/16704 (17.99%) | loss: 2.841670 | lrm: 1.00 | dt: 637.14ms | tok/sec: 822,875 | mfu: 51.43 | epoch: 1 | total time: 32.31m | eta: 147.8m +step 03006/16704 (18.00%) | loss: 2.837677 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,887 | mfu: 50.87 | epoch: 1 | total time: 32.32m | eta: 147.8m +step 03007/16704 (18.00%) | loss: 2.843861 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,874 | mfu: 50.81 | epoch: 1 | total time: 32.33m | eta: 147.8m +step 03008/16704 (18.01%) | loss: 2.841424 | lrm: 1.00 | dt: 638.23ms | tok/sec: 821,466 | mfu: 51.34 | epoch: 1 | total time: 32.34m | eta: 147.7m +step 03009/16704 (18.01%) | loss: 2.852384 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,850 | mfu: 50.68 | epoch: 1 | total time: 32.35m | eta: 147.7m +step 03010/16704 (18.02%) | loss: 2.847530 | lrm: 1.00 | dt: 638.97ms | 
tok/sec: 820,523 | mfu: 51.28 | epoch: 1 | total time: 32.36m | eta: 147.7m +step 03011/16704 (18.03%) | loss: 2.847113 | lrm: 1.00 | dt: 640.16ms | tok/sec: 818,995 | mfu: 51.19 | epoch: 1 | total time: 32.37m | eta: 147.7m +step 03012/16704 (18.03%) | loss: 2.842346 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,763 | mfu: 50.92 | epoch: 1 | total time: 32.38m | eta: 147.7m +step 03013/16704 (18.04%) | loss: 2.856318 | lrm: 1.00 | dt: 640.94ms | tok/sec: 818,003 | mfu: 51.13 | epoch: 1 | total time: 32.39m | eta: 147.7m +step 03014/16704 (18.04%) | loss: 2.854731 | lrm: 1.00 | dt: 641.84ms | tok/sec: 816,851 | mfu: 51.05 | epoch: 1 | total time: 32.40m | eta: 147.7m +step 03015/16704 (18.05%) | loss: 2.852506 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,311 | mfu: 50.96 | epoch: 1 | total time: 32.42m | eta: 147.7m +step 03016/16704 (18.06%) | loss: 2.856417 | lrm: 1.00 | dt: 640.51ms | tok/sec: 818,541 | mfu: 51.16 | epoch: 1 | total time: 32.43m | eta: 147.7m +step 03017/16704 (18.06%) | loss: 2.869634 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,858 | mfu: 50.99 | epoch: 1 | total time: 32.44m | eta: 147.6m +step 03018/16704 (18.07%) | loss: 2.853586 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,292 | mfu: 50.83 | epoch: 1 | total time: 32.45m | eta: 147.6m +step 03019/16704 (18.07%) | loss: 2.842535 | lrm: 1.00 | dt: 641.51ms | tok/sec: 817,276 | mfu: 51.08 | epoch: 1 | total time: 32.46m | eta: 147.6m +step 03020/16704 (18.08%) | loss: 2.850407 | lrm: 1.00 | dt: 641.22ms | tok/sec: 817,642 | mfu: 51.10 | epoch: 1 | total time: 32.47m | eta: 147.6m +step 03021/16704 (18.09%) | loss: 2.851223 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,860 | mfu: 50.93 | epoch: 1 | total time: 32.48m | eta: 147.6m +step 03022/16704 (18.09%) | loss: 2.845955 | lrm: 1.00 | dt: 640.19ms | tok/sec: 818,953 | mfu: 51.19 | epoch: 1 | total time: 32.49m | eta: 147.6m +step 03023/16704 (18.10%) | loss: 2.843763 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,699 | mfu: 50.98 | epoch: 1 | total time: 32.50m 
| eta: 147.6m +step 03024/16704 (18.10%) | loss: 2.845982 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,801 | mfu: 50.99 | epoch: 1 | total time: 32.51m | eta: 147.6m +step 03025/16704 (18.11%) | loss: 2.848454 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,127 | mfu: 50.88 | epoch: 1 | total time: 32.52m | eta: 147.6m +step 03026/16704 (18.12%) | loss: 2.843690 | lrm: 1.00 | dt: 641.38ms | tok/sec: 817,439 | mfu: 51.09 | epoch: 1 | total time: 32.53m | eta: 147.5m +step 03027/16704 (18.12%) | loss: 2.839999 | lrm: 1.00 | dt: 641.79ms | tok/sec: 816,915 | mfu: 51.06 | epoch: 1 | total time: 32.54m | eta: 147.5m +step 03028/16704 (18.13%) | loss: 2.831349 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,693 | mfu: 50.79 | epoch: 1 | total time: 32.55m | eta: 147.5m +step 03029/16704 (18.13%) | loss: 2.825789 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,115 | mfu: 50.88 | epoch: 1 | total time: 32.57m | eta: 147.5m +step 03030/16704 (18.14%) | loss: 2.817800 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,660 | mfu: 51.04 | epoch: 1 | total time: 32.58m | eta: 147.5m +step 03031/16704 (18.15%) | loss: 2.822544 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,359 | mfu: 50.96 | epoch: 1 | total time: 32.59m | eta: 147.5m +step 03032/16704 (18.15%) | loss: 2.820007 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,960 | mfu: 50.87 | epoch: 1 | total time: 32.60m | eta: 147.5m +step 03033/16704 (18.16%) | loss: 2.810974 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,471 | mfu: 50.97 | epoch: 1 | total time: 32.61m | eta: 147.5m +step 03034/16704 (18.16%) | loss: 2.820175 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,332 | mfu: 50.90 | epoch: 1 | total time: 32.62m | eta: 147.5m +step 03035/16704 (18.17%) | loss: 2.810219 | lrm: 1.00 | dt: 643.26ms | tok/sec: 815,052 | mfu: 50.94 | epoch: 1 | total time: 32.63m | eta: 147.4m +step 03036/16704 (18.18%) | loss: 2.802871 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 1 | total time: 32.64m | eta: 147.4m +step 03037/16704 (18.18%) | loss: 2.808497 | 
lrm: 1.00 | dt: 643.77ms | tok/sec: 814,403 | mfu: 50.90 | epoch: 1 | total time: 32.65m | eta: 147.4m +step 03038/16704 (18.19%) | loss: 2.811391 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,711 | mfu: 50.92 | epoch: 1 | total time: 32.66m | eta: 147.4m +step 03039/16704 (18.19%) | loss: 2.823334 | lrm: 1.00 | dt: 641.78ms | tok/sec: 816,927 | mfu: 51.06 | epoch: 1 | total time: 32.67m | eta: 147.4m +step 03040/16704 (18.20%) | loss: 2.813559 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 1 | total time: 32.68m | eta: 147.4m +step 03041/16704 (18.21%) | loss: 2.818258 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 1 | total time: 32.69m | eta: 147.4m +step 03042/16704 (18.21%) | loss: 2.809183 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,885 | mfu: 50.93 | epoch: 1 | total time: 32.70m | eta: 147.4m +step 03043/16704 (18.22%) | loss: 2.806574 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,062 | mfu: 50.94 | epoch: 1 | total time: 32.72m | eta: 147.4m +step 03044/16704 (18.22%) | loss: 2.810831 | lrm: 1.00 | dt: 641.12ms | tok/sec: 817,764 | mfu: 51.11 | epoch: 1 | total time: 32.73m | eta: 147.3m +step 03045/16704 (18.23%) | loss: 2.813747 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,601 | mfu: 50.85 | epoch: 1 | total time: 32.74m | eta: 147.3m +step 03046/16704 (18.24%) | loss: 2.812221 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,971 | mfu: 50.81 | epoch: 1 | total time: 32.75m | eta: 147.3m +step 03047/16704 (18.24%) | loss: 2.814924 | lrm: 1.00 | dt: 643.30ms | tok/sec: 815,000 | mfu: 50.94 | epoch: 1 | total time: 32.76m | eta: 147.3m +step 03048/16704 (18.25%) | loss: 2.814389 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,262 | mfu: 50.89 | epoch: 1 | total time: 32.77m | eta: 147.3m +step 03049/16704 (18.25%) | loss: 2.830385 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,883 | mfu: 50.87 | epoch: 1 | total time: 32.78m | eta: 147.3m +step 03050/16704 (18.26%) | loss: 2.830385 | lrm: 1.00 | dt: 641.46ms | tok/sec: 817,338 | mfu: 51.08 | 
epoch: 1 | total time: 32.79m | eta: 147.3m +step 03051/16704 (18.27%) | loss: 2.863933 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,474 | mfu: 50.84 | epoch: 1 | total time: 32.80m | eta: 147.3m +step 03052/16704 (18.27%) | loss: 2.858365 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,663 | mfu: 50.92 | epoch: 1 | total time: 32.81m | eta: 147.3m +step 03053/16704 (18.28%) | loss: 2.851256 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,207 | mfu: 50.83 | epoch: 1 | total time: 32.82m | eta: 147.2m +step 03054/16704 (18.28%) | loss: 2.857458 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,895 | mfu: 50.81 | epoch: 1 | total time: 32.83m | eta: 147.2m +step 03055/16704 (18.29%) | loss: 2.860093 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,394 | mfu: 50.78 | epoch: 1 | total time: 32.84m | eta: 147.2m +step 03056/16704 (18.30%) | loss: 2.852807 | lrm: 1.00 | dt: 641.59ms | tok/sec: 817,166 | mfu: 51.07 | epoch: 1 | total time: 32.85m | eta: 147.2m +step 03057/16704 (18.30%) | loss: 2.866336 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,737 | mfu: 50.92 | epoch: 1 | total time: 32.87m | eta: 147.2m +step 03058/16704 (18.31%) | loss: 2.870432 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 1 | total time: 32.88m | eta: 147.2m +step 03059/16704 (18.31%) | loss: 2.860785 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,065 | mfu: 50.94 | epoch: 1 | total time: 32.89m | eta: 147.2m +step 03060/16704 (18.32%) | loss: 2.848826 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,389 | mfu: 50.71 | epoch: 1 | total time: 32.90m | eta: 147.2m +step 03061/16704 (18.32%) | loss: 2.848571 | lrm: 1.00 | dt: 640.61ms | tok/sec: 818,426 | mfu: 51.15 | epoch: 1 | total time: 32.91m | eta: 147.2m +step 03062/16704 (18.33%) | loss: 2.847755 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,384 | mfu: 50.96 | epoch: 1 | total time: 32.92m | eta: 147.1m +step 03063/16704 (18.34%) | loss: 2.838467 | lrm: 1.00 | dt: 647.12ms | tok/sec: 810,187 | mfu: 50.64 | epoch: 1 | total time: 32.93m | eta: 147.1m +step 03064/16704 
(18.34%) | loss: 2.855908 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,340 | mfu: 50.96 | epoch: 1 | total time: 32.94m | eta: 147.1m +step 03065/16704 (18.35%) | loss: 2.862540 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,268 | mfu: 50.71 | epoch: 1 | total time: 32.95m | eta: 147.1m +step 03066/16704 (18.35%) | loss: 2.852026 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,470 | mfu: 50.97 | epoch: 1 | total time: 32.96m | eta: 147.1m +step 03067/16704 (18.36%) | loss: 2.843533 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,596 | mfu: 50.79 | epoch: 1 | total time: 32.97m | eta: 147.1m +step 03068/16704 (18.37%) | loss: 2.864504 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,973 | mfu: 50.94 | epoch: 1 | total time: 32.98m | eta: 147.1m +step 03069/16704 (18.37%) | loss: 2.866902 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,159 | mfu: 50.82 | epoch: 1 | total time: 32.99m | eta: 147.1m +step 03070/16704 (18.38%) | loss: 2.862997 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,309 | mfu: 50.77 | epoch: 1 | total time: 33.01m | eta: 147.1m +step 03071/16704 (18.38%) | loss: 2.873031 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,310 | mfu: 50.90 | epoch: 1 | total time: 33.02m | eta: 147.0m +step 03072/16704 (18.39%) | loss: 2.873567 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,588 | mfu: 50.73 | epoch: 1 | total time: 33.03m | eta: 147.0m +step 03073/16704 (18.40%) | loss: 2.868646 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,048 | mfu: 50.69 | epoch: 1 | total time: 33.04m | eta: 147.0m +step 03074/16704 (18.40%) | loss: 2.866662 | lrm: 1.00 | dt: 641.54ms | tok/sec: 817,239 | mfu: 51.08 | epoch: 1 | total time: 33.05m | eta: 147.0m +step 03075/16704 (18.41%) | loss: 2.867257 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 1 | total time: 33.06m | eta: 147.0m +step 03076/16704 (18.41%) | loss: 2.884966 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,399 | mfu: 50.96 | epoch: 1 | total time: 33.07m | eta: 147.0m +step 03077/16704 (18.42%) | loss: 2.888151 | lrm: 1.00 | dt: 644.72ms | 
tok/sec: 813,199 | mfu: 50.83 | epoch: 1 | total time: 33.08m | eta: 147.0m +step 03078/16704 (18.43%) | loss: 2.874514 | lrm: 1.00 | dt: 641.95ms | tok/sec: 816,710 | mfu: 51.05 | epoch: 1 | total time: 33.09m | eta: 147.0m +step 03079/16704 (18.43%) | loss: 2.869035 | lrm: 1.00 | dt: 642.05ms | tok/sec: 816,578 | mfu: 51.04 | epoch: 1 | total time: 33.10m | eta: 147.0m +step 03080/16704 (18.44%) | loss: 2.864642 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,739 | mfu: 50.92 | epoch: 1 | total time: 33.11m | eta: 146.9m +step 03081/16704 (18.44%) | loss: 2.861390 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,904 | mfu: 51.00 | epoch: 1 | total time: 33.12m | eta: 146.9m +step 03082/16704 (18.45%) | loss: 2.856209 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,878 | mfu: 50.81 | epoch: 1 | total time: 33.13m | eta: 146.9m +step 03083/16704 (18.46%) | loss: 2.857684 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,214 | mfu: 50.89 | epoch: 1 | total time: 33.14m | eta: 146.9m +step 03084/16704 (18.46%) | loss: 2.856939 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,312 | mfu: 50.90 | epoch: 1 | total time: 33.16m | eta: 146.9m +step 03085/16704 (18.47%) | loss: 2.847605 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 1 | total time: 33.17m | eta: 146.9m +step 03086/16704 (18.47%) | loss: 2.845897 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,048 | mfu: 50.88 | epoch: 1 | total time: 33.18m | eta: 146.9m +step 03087/16704 (18.48%) | loss: 2.843490 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,279 | mfu: 51.08 | epoch: 1 | total time: 33.19m | eta: 146.9m +step 03088/16704 (18.49%) | loss: 2.834326 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,196 | mfu: 50.83 | epoch: 1 | total time: 33.20m | eta: 146.9m +step 03089/16704 (18.49%) | loss: 2.819958 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,508 | mfu: 50.85 | epoch: 1 | total time: 33.21m | eta: 146.8m +step 03090/16704 (18.50%) | loss: 2.832911 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 1 | total time: 33.22m 
| eta: 146.8m +step 03091/16704 (18.50%) | loss: 2.829811 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,528 | mfu: 50.91 | epoch: 1 | total time: 33.23m | eta: 146.8m +step 03092/16704 (18.51%) | loss: 2.827043 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,087 | mfu: 50.88 | epoch: 1 | total time: 33.24m | eta: 146.8m +step 03093/16704 (18.52%) | loss: 2.835386 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,662 | mfu: 50.92 | epoch: 1 | total time: 33.25m | eta: 146.8m +step 03094/16704 (18.52%) | loss: 2.820843 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,322 | mfu: 50.83 | epoch: 1 | total time: 33.26m | eta: 146.8m +step 03095/16704 (18.53%) | loss: 2.824434 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 1 | total time: 33.27m | eta: 146.8m +step 03096/16704 (18.53%) | loss: 2.837273 | lrm: 1.00 | dt: 642.18ms | tok/sec: 816,412 | mfu: 51.03 | epoch: 1 | total time: 33.28m | eta: 146.8m +step 03097/16704 (18.54%) | loss: 2.834394 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,729 | mfu: 50.92 | epoch: 1 | total time: 33.29m | eta: 146.8m +step 03098/16704 (18.55%) | loss: 2.829914 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,335 | mfu: 50.83 | epoch: 1 | total time: 33.31m | eta: 146.7m +step 03099/16704 (18.55%) | loss: 2.823980 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,182 | mfu: 50.95 | epoch: 1 | total time: 33.32m | eta: 146.7m +step 03100/16704 (18.56%) | loss: 2.829750 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,251 | mfu: 50.77 | epoch: 1 | total time: 33.33m | eta: 146.7m +step 03101/16704 (18.56%) | loss: 2.826326 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,732 | mfu: 50.92 | epoch: 1 | total time: 33.34m | eta: 146.7m +step 03102/16704 (18.57%) | loss: 2.824885 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,269 | mfu: 50.96 | epoch: 1 | total time: 33.35m | eta: 146.7m +step 03103/16704 (18.58%) | loss: 2.827399 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,695 | mfu: 50.98 | epoch: 1 | total time: 33.36m | eta: 146.7m +step 03104/16704 (18.58%) | loss: 2.825234 | 
lrm: 1.00 | dt: 643.24ms | tok/sec: 815,072 | mfu: 50.94 | epoch: 1 | total time: 33.37m | eta: 146.7m +step 03105/16704 (18.59%) | loss: 2.829138 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,720 | mfu: 50.92 | epoch: 1 | total time: 33.38m | eta: 146.7m +step 03106/16704 (18.59%) | loss: 2.831713 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,869 | mfu: 51.06 | epoch: 1 | total time: 33.39m | eta: 146.7m +step 03107/16704 (18.60%) | loss: 2.835257 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,824 | mfu: 50.87 | epoch: 1 | total time: 33.40m | eta: 146.6m +step 03108/16704 (18.61%) | loss: 2.840539 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 1 | total time: 33.41m | eta: 146.6m +step 03109/16704 (18.61%) | loss: 2.839702 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,222 | mfu: 50.95 | epoch: 1 | total time: 33.42m | eta: 146.6m +step 03110/16704 (18.62%) | loss: 2.834813 | lrm: 1.00 | dt: 650.32ms | tok/sec: 806,195 | mfu: 50.39 | epoch: 1 | total time: 33.43m | eta: 146.6m +step 03111/16704 (18.62%) | loss: 2.835579 | lrm: 1.00 | dt: 641.90ms | tok/sec: 816,771 | mfu: 51.05 | epoch: 1 | total time: 33.45m | eta: 146.6m +step 03112/16704 (18.63%) | loss: 2.836615 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,292 | mfu: 50.83 | epoch: 1 | total time: 33.46m | eta: 146.6m +step 03113/16704 (18.64%) | loss: 2.836582 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,416 | mfu: 50.78 | epoch: 1 | total time: 33.47m | eta: 146.6m +step 03114/16704 (18.64%) | loss: 2.837344 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,194 | mfu: 50.83 | epoch: 1 | total time: 33.48m | eta: 146.6m +step 03115/16704 (18.65%) | loss: 2.837030 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,884 | mfu: 50.81 | epoch: 1 | total time: 33.49m | eta: 146.6m +step 03116/16704 (18.65%) | loss: 2.819999 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,251 | mfu: 51.02 | epoch: 1 | total time: 33.50m | eta: 146.5m +step 03117/16704 (18.66%) | loss: 2.814051 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,325 | mfu: 50.83 | 
epoch: 1 | total time: 33.51m | eta: 146.5m +step 03118/16704 (18.67%) | loss: 2.823420 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,597 | mfu: 50.91 | epoch: 1 | total time: 33.52m | eta: 146.5m +step 03119/16704 (18.67%) | loss: 2.822140 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,629 | mfu: 50.73 | epoch: 1 | total time: 33.53m | eta: 146.5m +step 03120/16704 (18.68%) | loss: 2.826308 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,193 | mfu: 51.01 | epoch: 1 | total time: 33.54m | eta: 146.5m +step 03121/16704 (18.68%) | loss: 2.806992 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,040 | mfu: 50.75 | epoch: 1 | total time: 33.55m | eta: 146.5m +step 03122/16704 (18.69%) | loss: 2.824740 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,131 | mfu: 50.95 | epoch: 1 | total time: 33.56m | eta: 146.5m +step 03123/16704 (18.70%) | loss: 2.822867 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,021 | mfu: 50.81 | epoch: 1 | total time: 33.57m | eta: 146.5m +step 03124/16704 (18.70%) | loss: 2.835715 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 1 | total time: 33.58m | eta: 146.5m +step 03125/16704 (18.71%) | loss: 2.849642 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,208 | mfu: 50.83 | epoch: 1 | total time: 33.60m | eta: 146.5m +step 03126/16704 (18.71%) | loss: 2.848173 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 1 | total time: 33.61m | eta: 146.4m +step 03127/16704 (18.72%) | loss: 2.856510 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,013 | mfu: 50.94 | epoch: 1 | total time: 33.62m | eta: 146.4m +step 03128/16704 (18.73%) | loss: 2.844584 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,371 | mfu: 50.84 | epoch: 1 | total time: 33.63m | eta: 146.4m +step 03129/16704 (18.73%) | loss: 2.852375 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,294 | mfu: 50.77 | epoch: 1 | total time: 33.64m | eta: 146.4m +step 03130/16704 (18.74%) | loss: 2.847656 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,158 | mfu: 50.89 | epoch: 1 | total time: 33.65m | eta: 146.4m +step 03131/16704 
(18.74%) | loss: 2.851378 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,990 | mfu: 50.88 | epoch: 1 | total time: 33.66m | eta: 146.4m +step 03132/16704 (18.75%) | loss: 2.840992 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 1 | total time: 33.67m | eta: 146.4m +step 03133/16704 (18.76%) | loss: 2.832882 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,914 | mfu: 50.87 | epoch: 1 | total time: 33.68m | eta: 146.4m +step 03134/16704 (18.76%) | loss: 2.827527 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,210 | mfu: 50.83 | epoch: 1 | total time: 33.69m | eta: 146.4m +step 03135/16704 (18.77%) | loss: 2.820130 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,929 | mfu: 50.81 | epoch: 1 | total time: 33.70m | eta: 146.3m +step 03136/16704 (18.77%) | loss: 2.806357 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 1 | total time: 33.71m | eta: 146.3m +step 03137/16704 (18.78%) | loss: 2.806697 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 1 | total time: 33.72m | eta: 146.3m +step 03138/16704 (18.79%) | loss: 2.819110 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,508 | mfu: 50.78 | epoch: 1 | total time: 33.74m | eta: 146.3m +step 03139/16704 (18.79%) | loss: 2.812823 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,376 | mfu: 50.77 | epoch: 1 | total time: 33.75m | eta: 146.3m +step 03140/16704 (18.80%) | loss: 2.809653 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 1 | total time: 33.76m | eta: 146.3m +step 03141/16704 (18.80%) | loss: 2.816756 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,304 | mfu: 50.77 | epoch: 1 | total time: 33.77m | eta: 146.3m +step 03142/16704 (18.81%) | loss: 2.830741 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,727 | mfu: 50.92 | epoch: 1 | total time: 33.78m | eta: 146.3m +step 03143/16704 (18.82%) | loss: 2.826037 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,838 | mfu: 50.87 | epoch: 1 | total time: 33.79m | eta: 146.3m +step 03144/16704 (18.82%) | loss: 2.831180 | lrm: 1.00 | dt: 644.71ms | 
tok/sec: 813,212 | mfu: 50.83 | epoch: 1 | total time: 33.80m | eta: 146.2m +step 03145/16704 (18.83%) | loss: 2.824864 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,925 | mfu: 51.00 | epoch: 1 | total time: 33.81m | eta: 146.2m +step 03146/16704 (18.83%) | loss: 2.826329 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 1 | total time: 33.82m | eta: 146.2m +step 03147/16704 (18.84%) | loss: 2.812589 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,971 | mfu: 50.94 | epoch: 1 | total time: 33.83m | eta: 146.2m +step 03148/16704 (18.85%) | loss: 2.806524 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,852 | mfu: 50.93 | epoch: 1 | total time: 33.84m | eta: 146.2m +step 03149/16704 (18.85%) | loss: 2.805560 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,758 | mfu: 50.92 | epoch: 1 | total time: 33.85m | eta: 146.2m +step 03150/16704 (18.86%) | loss: 2.813393 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,115 | mfu: 50.82 | epoch: 1 | total time: 33.86m | eta: 146.2m +step 03151/16704 (18.86%) | loss: 2.819025 | lrm: 1.00 | dt: 644.09ms | tok/sec: 814,002 | mfu: 50.88 | epoch: 1 | total time: 33.87m | eta: 146.2m +step 03152/16704 (18.87%) | loss: 2.831814 | lrm: 1.00 | dt: 641.82ms | tok/sec: 816,882 | mfu: 51.06 | epoch: 1 | total time: 33.89m | eta: 146.2m +step 03153/16704 (18.88%) | loss: 2.835359 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,752 | mfu: 50.80 | epoch: 1 | total time: 33.90m | eta: 146.1m +step 03154/16704 (18.88%) | loss: 2.833811 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,671 | mfu: 50.79 | epoch: 1 | total time: 33.91m | eta: 146.1m +step 03155/16704 (18.89%) | loss: 2.828361 | lrm: 1.00 | dt: 641.48ms | tok/sec: 817,307 | mfu: 51.08 | epoch: 1 | total time: 33.92m | eta: 146.1m +step 03156/16704 (18.89%) | loss: 2.820218 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,295 | mfu: 50.96 | epoch: 1 | total time: 33.93m | eta: 146.1m +step 03157/16704 (18.90%) | loss: 2.813870 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,450 | mfu: 50.97 | epoch: 1 | total time: 33.94m 
| eta: 146.1m +step 03158/16704 (18.91%) | loss: 2.823755 | lrm: 1.00 | dt: 641.35ms | tok/sec: 817,478 | mfu: 51.09 | epoch: 1 | total time: 33.95m | eta: 146.1m +step 03159/16704 (18.91%) | loss: 2.823267 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,342 | mfu: 50.84 | epoch: 1 | total time: 33.96m | eta: 146.1m +step 03160/16704 (18.92%) | loss: 2.824507 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,797 | mfu: 50.86 | epoch: 1 | total time: 33.97m | eta: 146.1m +step 03161/16704 (18.92%) | loss: 2.829573 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,601 | mfu: 50.91 | epoch: 1 | total time: 33.98m | eta: 146.1m +step 03162/16704 (18.93%) | loss: 2.825030 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,554 | mfu: 50.85 | epoch: 1 | total time: 33.99m | eta: 146.0m +step 03163/16704 (18.94%) | loss: 2.836133 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,174 | mfu: 50.95 | epoch: 1 | total time: 34.00m | eta: 146.0m +step 03164/16704 (18.94%) | loss: 2.839331 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 1 | total time: 34.01m | eta: 146.0m +step 03165/16704 (18.95%) | loss: 2.827595 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,809 | mfu: 50.86 | epoch: 1 | total time: 34.02m | eta: 146.0m +step 03166/16704 (18.95%) | loss: 2.829060 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,007 | mfu: 50.94 | epoch: 1 | total time: 34.04m | eta: 146.0m +step 03167/16704 (18.96%) | loss: 2.834996 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,721 | mfu: 50.92 | epoch: 1 | total time: 34.05m | eta: 146.0m +step 03168/16704 (18.97%) | loss: 2.839497 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,011 | mfu: 50.75 | epoch: 1 | total time: 34.06m | eta: 146.0m +step 03169/16704 (18.97%) | loss: 2.832617 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,971 | mfu: 50.94 | epoch: 1 | total time: 34.07m | eta: 146.0m +step 03170/16704 (18.98%) | loss: 2.837453 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,649 | mfu: 50.79 | epoch: 1 | total time: 34.08m | eta: 146.0m +step 03171/16704 (18.98%) | loss: 2.843363 | 
lrm: 1.00 | dt: 645.55ms | tok/sec: 812,162 | mfu: 50.76 | epoch: 1 | total time: 34.09m | eta: 145.9m +step 03172/16704 (18.99%) | loss: 2.842696 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,211 | mfu: 50.95 | epoch: 1 | total time: 34.10m | eta: 145.9m +step 03173/16704 (19.00%) | loss: 2.850996 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,765 | mfu: 50.80 | epoch: 1 | total time: 34.11m | eta: 145.9m +step 03174/16704 (19.00%) | loss: 2.858802 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,247 | mfu: 50.89 | epoch: 1 | total time: 34.12m | eta: 145.9m +step 03175/16704 (19.01%) | loss: 2.853662 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,419 | mfu: 50.96 | epoch: 1 | total time: 34.13m | eta: 145.9m +step 03176/16704 (19.01%) | loss: 2.867381 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,905 | mfu: 50.93 | epoch: 1 | total time: 34.14m | eta: 145.9m +step 03177/16704 (19.02%) | loss: 2.856640 | lrm: 1.00 | dt: 642.01ms | tok/sec: 816,634 | mfu: 51.04 | epoch: 1 | total time: 34.15m | eta: 145.9m +step 03178/16704 (19.03%) | loss: 2.834569 | lrm: 1.00 | dt: 641.52ms | tok/sec: 817,260 | mfu: 51.08 | epoch: 1 | total time: 34.16m | eta: 145.9m +step 03179/16704 (19.03%) | loss: 2.823073 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,865 | mfu: 50.87 | epoch: 1 | total time: 34.18m | eta: 145.9m +step 03180/16704 (19.04%) | loss: 2.820145 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,964 | mfu: 50.94 | epoch: 1 | total time: 34.19m | eta: 145.8m +step 03181/16704 (19.04%) | loss: 2.811539 | lrm: 1.00 | dt: 641.79ms | tok/sec: 816,911 | mfu: 51.06 | epoch: 1 | total time: 34.20m | eta: 145.8m +step 03182/16704 (19.05%) | loss: 2.817238 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 50.75 | epoch: 1 | total time: 34.21m | eta: 145.8m +step 03183/16704 (19.06%) | loss: 2.806505 | lrm: 1.00 | dt: 642.25ms | tok/sec: 816,326 | mfu: 51.02 | epoch: 1 | total time: 34.22m | eta: 145.8m +step 03184/16704 (19.06%) | loss: 2.803828 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,234 | mfu: 50.89 | 
epoch: 1 | total time: 34.23m | eta: 145.8m +step 03185/16704 (19.07%) | loss: 2.810168 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,496 | mfu: 50.91 | epoch: 1 | total time: 34.24m | eta: 145.8m +step 03186/16704 (19.07%) | loss: 2.803820 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,239 | mfu: 50.83 | epoch: 1 | total time: 34.25m | eta: 145.8m +step 03187/16704 (19.08%) | loss: 2.813472 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,125 | mfu: 50.82 | epoch: 1 | total time: 34.26m | eta: 145.8m +step 03188/16704 (19.09%) | loss: 2.818663 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,264 | mfu: 50.96 | epoch: 1 | total time: 34.27m | eta: 145.8m +step 03189/16704 (19.09%) | loss: 2.825441 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 1 | total time: 34.28m | eta: 145.7m +step 03190/16704 (19.10%) | loss: 2.806631 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,193 | mfu: 51.01 | epoch: 1 | total time: 34.29m | eta: 145.7m +step 03191/16704 (19.10%) | loss: 2.809125 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,175 | mfu: 50.82 | epoch: 1 | total time: 34.30m | eta: 145.7m +step 03192/16704 (19.11%) | loss: 2.804063 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,323 | mfu: 50.90 | epoch: 1 | total time: 34.31m | eta: 145.7m +step 03193/16704 (19.12%) | loss: 2.808912 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,899 | mfu: 50.99 | epoch: 1 | total time: 34.33m | eta: 145.7m +step 03194/16704 (19.12%) | loss: 2.809133 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,861 | mfu: 50.99 | epoch: 1 | total time: 34.34m | eta: 145.7m +step 03195/16704 (19.13%) | loss: 2.802549 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,861 | mfu: 50.80 | epoch: 1 | total time: 34.35m | eta: 145.7m +step 03196/16704 (19.13%) | loss: 2.808884 | lrm: 1.00 | dt: 642.80ms | tok/sec: 815,631 | mfu: 50.98 | epoch: 1 | total time: 34.36m | eta: 145.7m +step 03197/16704 (19.14%) | loss: 2.813856 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,942 | mfu: 50.94 | epoch: 1 | total time: 34.37m | eta: 145.7m +step 03198/16704 
(19.15%) | loss: 2.831264 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,532 | mfu: 50.91 | epoch: 1 | total time: 34.38m | eta: 145.6m +step 03199/16704 (19.15%) | loss: 2.817618 | lrm: 1.00 | dt: 642.34ms | tok/sec: 816,214 | mfu: 51.01 | epoch: 1 | total time: 34.39m | eta: 145.6m +step 03200/16704 (19.16%) | loss: 2.820383 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,201 | mfu: 50.95 | epoch: 1 | total time: 34.40m | eta: 145.6m +step 03201/16704 (19.16%) | loss: 2.830499 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 1 | total time: 34.41m | eta: 145.6m +step 03202/16704 (19.17%) | loss: 2.834494 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,375 | mfu: 50.71 | epoch: 1 | total time: 34.42m | eta: 145.6m +step 03203/16704 (19.18%) | loss: 2.839070 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,099 | mfu: 50.94 | epoch: 1 | total time: 34.43m | eta: 145.6m +step 03204/16704 (19.18%) | loss: 2.830960 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,410 | mfu: 50.84 | epoch: 1 | total time: 34.44m | eta: 145.6m +step 03205/16704 (19.19%) | loss: 2.820388 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 1 | total time: 34.45m | eta: 145.6m +step 03206/16704 (19.19%) | loss: 2.826568 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,302 | mfu: 50.96 | epoch: 1 | total time: 34.46m | eta: 145.6m +step 03207/16704 (19.20%) | loss: 2.828345 | lrm: 1.00 | dt: 641.88ms | tok/sec: 816,799 | mfu: 51.05 | epoch: 1 | total time: 34.48m | eta: 145.5m +step 03208/16704 (19.20%) | loss: 2.819903 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 1 | total time: 34.49m | eta: 145.5m +step 03209/16704 (19.21%) | loss: 2.824886 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,611 | mfu: 50.91 | epoch: 1 | total time: 34.50m | eta: 145.5m +step 03210/16704 (19.22%) | loss: 2.842964 | lrm: 1.00 | dt: 646.48ms | tok/sec: 810,986 | mfu: 50.69 | epoch: 1 | total time: 34.51m | eta: 145.5m +step 03211/16704 (19.22%) | loss: 2.836680 | lrm: 1.00 | dt: 644.00ms | 
tok/sec: 814,106 | mfu: 50.88 | epoch: 1 | total time: 34.52m | eta: 145.5m +step 03212/16704 (19.23%) | loss: 2.828026 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,312 | mfu: 51.02 | epoch: 1 | total time: 34.53m | eta: 145.5m +step 03213/16704 (19.23%) | loss: 2.830073 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,226 | mfu: 50.89 | epoch: 1 | total time: 34.54m | eta: 145.5m +step 03214/16704 (19.24%) | loss: 2.830103 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,530 | mfu: 50.91 | epoch: 1 | total time: 34.55m | eta: 145.5m +step 03215/16704 (19.25%) | loss: 2.823035 | lrm: 1.00 | dt: 642.09ms | tok/sec: 816,538 | mfu: 51.03 | epoch: 1 | total time: 34.56m | eta: 145.5m +step 03216/16704 (19.25%) | loss: 2.827892 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 1 | total time: 34.57m | eta: 145.4m +step 03217/16704 (19.26%) | loss: 2.823784 | lrm: 1.00 | dt: 642.06ms | tok/sec: 816,571 | mfu: 51.04 | epoch: 1 | total time: 34.58m | eta: 145.4m +step 03218/16704 (19.26%) | loss: 2.821002 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,300 | mfu: 50.83 | epoch: 1 | total time: 34.59m | eta: 145.4m +step 03219/16704 (19.27%) | loss: 2.822346 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 1 | total time: 34.60m | eta: 145.4m +step 03220/16704 (19.28%) | loss: 2.821862 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,747 | mfu: 50.92 | epoch: 1 | total time: 34.61m | eta: 145.4m +step 03221/16704 (19.28%) | loss: 2.804328 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 1 | total time: 34.63m | eta: 145.4m +step 03222/16704 (19.29%) | loss: 2.818250 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,829 | mfu: 50.87 | epoch: 1 | total time: 34.64m | eta: 145.4m +step 03223/16704 (19.29%) | loss: 2.826165 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,155 | mfu: 51.01 | epoch: 1 | total time: 34.65m | eta: 145.4m +step 03224/16704 (19.30%) | loss: 2.821272 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,077 | mfu: 51.01 | epoch: 1 | total time: 34.66m 
| eta: 145.4m +step 03225/16704 (19.31%) | loss: 2.829858 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,973 | mfu: 51.00 | epoch: 1 | total time: 34.67m | eta: 145.3m +step 03226/16704 (19.31%) | loss: 2.824679 | lrm: 1.00 | dt: 641.80ms | tok/sec: 816,898 | mfu: 51.06 | epoch: 1 | total time: 34.68m | eta: 145.3m +step 03227/16704 (19.32%) | loss: 2.824122 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,844 | mfu: 50.74 | epoch: 1 | total time: 34.69m | eta: 145.3m +step 03228/16704 (19.32%) | loss: 2.820977 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,868 | mfu: 50.81 | epoch: 1 | total time: 34.70m | eta: 145.3m +step 03229/16704 (19.33%) | loss: 2.814104 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,860 | mfu: 50.99 | epoch: 1 | total time: 34.71m | eta: 145.3m +step 03230/16704 (19.34%) | loss: 2.822548 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 1 | total time: 34.72m | eta: 145.3m +step 03231/16704 (19.34%) | loss: 2.831427 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,845 | mfu: 50.99 | epoch: 1 | total time: 34.73m | eta: 145.3m +step 03232/16704 (19.35%) | loss: 2.832846 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,052 | mfu: 50.88 | epoch: 1 | total time: 34.74m | eta: 145.3m +step 03233/16704 (19.35%) | loss: 2.843734 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,954 | mfu: 50.87 | epoch: 1 | total time: 34.75m | eta: 145.3m +step 03234/16704 (19.36%) | loss: 2.832092 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,386 | mfu: 50.96 | epoch: 1 | total time: 34.77m | eta: 145.3m +step 03235/16704 (19.37%) | loss: 2.842404 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,185 | mfu: 50.89 | epoch: 1 | total time: 34.78m | eta: 145.2m +step 03236/16704 (19.37%) | loss: 2.834224 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,000 | mfu: 50.81 | epoch: 1 | total time: 34.79m | eta: 145.2m +step 03237/16704 (19.38%) | loss: 2.832942 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,839 | mfu: 50.99 | epoch: 1 | total time: 34.80m | eta: 145.2m +step 03238/16704 (19.38%) | loss: 2.822148 | 
lrm: 1.00 | dt: 644.47ms | tok/sec: 813,513 | mfu: 50.85 | epoch: 1 | total time: 34.81m | eta: 145.2m +step 03239/16704 (19.39%) | loss: 2.821646 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,723 | mfu: 50.67 | epoch: 1 | total time: 34.82m | eta: 145.2m +step 03240/16704 (19.40%) | loss: 2.820732 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,874 | mfu: 50.87 | epoch: 1 | total time: 34.83m | eta: 145.2m +step 03241/16704 (19.40%) | loss: 2.826756 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,462 | mfu: 50.91 | epoch: 1 | total time: 34.84m | eta: 145.2m +step 03242/16704 (19.41%) | loss: 2.843589 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,375 | mfu: 50.90 | epoch: 1 | total time: 34.85m | eta: 145.2m +step 03243/16704 (19.41%) | loss: 2.833643 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,267 | mfu: 50.71 | epoch: 1 | total time: 34.86m | eta: 145.2m +step 03244/16704 (19.42%) | loss: 2.826745 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,864 | mfu: 50.93 | epoch: 1 | total time: 34.87m | eta: 145.1m +step 03245/16704 (19.43%) | loss: 2.829221 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,464 | mfu: 50.84 | epoch: 1 | total time: 34.88m | eta: 145.1m +step 03246/16704 (19.43%) | loss: 2.829597 | lrm: 1.00 | dt: 641.80ms | tok/sec: 816,900 | mfu: 51.06 | epoch: 1 | total time: 34.89m | eta: 145.1m +step 03247/16704 (19.44%) | loss: 2.832937 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 1 | total time: 34.90m | eta: 145.1m +step 03248/16704 (19.44%) | loss: 2.816623 | lrm: 1.00 | dt: 642.22ms | tok/sec: 816,367 | mfu: 51.02 | epoch: 1 | total time: 34.92m | eta: 145.1m +step 03249/16704 (19.45%) | loss: 2.813881 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,725 | mfu: 50.92 | epoch: 1 | total time: 34.93m | eta: 145.1m +Step 03250 | Validation bpb: 0.859749 +step 03250/16704 (19.46%) | loss: 2.815039 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,691 | mfu: 50.54 | epoch: 1 | total time: 34.94m | eta: 145.1m +step 03251/16704 (19.46%) | loss: 2.814850 | lrm: 1.00 | dt: 
643.82ms | tok/sec: 814,337 | mfu: 50.90 | epoch: 1 | total time: 34.95m | eta: 145.1m +step 03252/16704 (19.47%) | loss: 2.826319 | lrm: 1.00 | dt: 649.39ms | tok/sec: 807,360 | mfu: 50.46 | epoch: 1 | total time: 34.96m | eta: 145.1m +step 03253/16704 (19.47%) | loss: 2.820748 | lrm: 1.00 | dt: 641.34ms | tok/sec: 817,482 | mfu: 51.09 | epoch: 1 | total time: 34.97m | eta: 145.0m +step 03254/16704 (19.48%) | loss: 2.810415 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,430 | mfu: 50.84 | epoch: 1 | total time: 34.98m | eta: 145.0m +step 03255/16704 (19.49%) | loss: 2.817326 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,879 | mfu: 50.74 | epoch: 1 | total time: 34.99m | eta: 145.0m +step 03256/16704 (19.49%) | loss: 2.820209 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,681 | mfu: 50.92 | epoch: 1 | total time: 35.00m | eta: 145.0m +step 03257/16704 (19.50%) | loss: 2.820018 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,318 | mfu: 50.83 | epoch: 1 | total time: 35.01m | eta: 145.0m +step 03258/16704 (19.50%) | loss: 2.817487 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,743 | mfu: 50.86 | epoch: 1 | total time: 35.02m | eta: 145.0m +step 03259/16704 (19.51%) | loss: 2.826238 | lrm: 1.00 | dt: 642.68ms | tok/sec: 815,783 | mfu: 50.99 | epoch: 1 | total time: 35.03m | eta: 145.0m +step 03260/16704 (19.52%) | loss: 2.837428 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,854 | mfu: 50.68 | epoch: 1 | total time: 35.04m | eta: 145.0m +step 03261/16704 (19.52%) | loss: 2.819683 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,242 | mfu: 50.64 | epoch: 1 | total time: 35.06m | eta: 145.0m +step 03262/16704 (19.53%) | loss: 2.826066 | lrm: 1.00 | dt: 640.53ms | tok/sec: 818,525 | mfu: 51.16 | epoch: 1 | total time: 35.07m | eta: 144.9m +step 03263/16704 (19.53%) | loss: 2.827254 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,458 | mfu: 50.78 | epoch: 1 | total time: 35.08m | eta: 144.9m +step 03264/16704 (19.54%) | loss: 2.822923 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,669 | mfu: 50.86 | epoch: 1 | total 
time: 35.09m | eta: 144.9m +step 03265/16704 (19.55%) | loss: 2.819318 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,753 | mfu: 50.80 | epoch: 1 | total time: 35.10m | eta: 144.9m +step 03266/16704 (19.55%) | loss: 2.815418 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,965 | mfu: 50.94 | epoch: 1 | total time: 35.11m | eta: 144.9m +step 03267/16704 (19.56%) | loss: 2.815615 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,356 | mfu: 50.84 | epoch: 1 | total time: 35.12m | eta: 144.9m +step 03268/16704 (19.56%) | loss: 2.822261 | lrm: 1.00 | dt: 641.76ms | tok/sec: 816,955 | mfu: 51.06 | epoch: 1 | total time: 35.13m | eta: 144.9m +step 03269/16704 (19.57%) | loss: 2.821400 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,681 | mfu: 50.86 | epoch: 1 | total time: 35.14m | eta: 144.9m +step 03270/16704 (19.58%) | loss: 2.817469 | lrm: 1.00 | dt: 640.87ms | tok/sec: 818,083 | mfu: 51.13 | epoch: 1 | total time: 35.15m | eta: 144.9m +step 03271/16704 (19.58%) | loss: 2.823906 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,445 | mfu: 50.97 | epoch: 1 | total time: 35.16m | eta: 144.8m +step 03272/16704 (19.59%) | loss: 2.827147 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 1 | total time: 35.17m | eta: 144.8m +step 03273/16704 (19.59%) | loss: 2.829956 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 1 | total time: 35.18m | eta: 144.8m +step 03274/16704 (19.60%) | loss: 2.833053 | lrm: 1.00 | dt: 641.89ms | tok/sec: 816,792 | mfu: 51.05 | epoch: 1 | total time: 35.19m | eta: 144.8m +step 03275/16704 (19.61%) | loss: 2.819981 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,469 | mfu: 50.84 | epoch: 1 | total time: 35.21m | eta: 144.8m +step 03276/16704 (19.61%) | loss: 2.821099 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,252 | mfu: 50.95 | epoch: 1 | total time: 35.22m | eta: 144.8m +step 03277/16704 (19.62%) | loss: 2.825730 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,831 | mfu: 50.68 | epoch: 1 | total time: 35.23m | eta: 144.8m +step 03278/16704 (19.62%) | loss: 
2.811126 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,227 | mfu: 50.95 | epoch: 1 | total time: 35.24m | eta: 144.8m +step 03279/16704 (19.63%) | loss: 2.821387 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,443 | mfu: 50.84 | epoch: 1 | total time: 35.25m | eta: 144.8m +step 03280/16704 (19.64%) | loss: 2.822641 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,077 | mfu: 50.88 | epoch: 1 | total time: 35.26m | eta: 144.7m +step 03281/16704 (19.64%) | loss: 2.818658 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,146 | mfu: 50.89 | epoch: 1 | total time: 35.27m | eta: 144.7m +step 03282/16704 (19.65%) | loss: 2.816653 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,083 | mfu: 50.94 | epoch: 1 | total time: 35.28m | eta: 144.7m +step 03283/16704 (19.65%) | loss: 2.811085 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,842 | mfu: 50.93 | epoch: 1 | total time: 35.29m | eta: 144.7m +step 03284/16704 (19.66%) | loss: 2.815307 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,070 | mfu: 50.94 | epoch: 1 | total time: 35.30m | eta: 144.7m +step 03285/16704 (19.67%) | loss: 2.824579 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,844 | mfu: 50.80 | epoch: 1 | total time: 35.31m | eta: 144.7m +step 03286/16704 (19.67%) | loss: 2.827238 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,695 | mfu: 50.86 | epoch: 1 | total time: 35.32m | eta: 144.7m +step 03287/16704 (19.68%) | loss: 2.827212 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,490 | mfu: 50.97 | epoch: 1 | total time: 35.33m | eta: 144.7m +step 03288/16704 (19.68%) | loss: 2.825337 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,843 | mfu: 50.68 | epoch: 1 | total time: 35.34m | eta: 144.7m +step 03289/16704 (19.69%) | loss: 2.807836 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,425 | mfu: 50.97 | epoch: 1 | total time: 35.36m | eta: 144.6m +step 03290/16704 (19.70%) | loss: 2.809143 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,792 | mfu: 50.86 | epoch: 1 | total time: 35.37m | eta: 144.6m +step 03291/16704 (19.70%) | loss: 2.821971 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,677 | mfu: 
50.86 | epoch: 1 | total time: 35.38m | eta: 144.6m +step 03292/16704 (19.71%) | loss: 2.824765 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,918 | mfu: 50.75 | epoch: 1 | total time: 35.39m | eta: 144.6m +step 03293/16704 (19.71%) | loss: 2.816329 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,953 | mfu: 50.94 | epoch: 1 | total time: 35.40m | eta: 144.6m +step 03294/16704 (19.72%) | loss: 2.818723 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,535 | mfu: 50.85 | epoch: 1 | total time: 35.41m | eta: 144.6m +step 03295/16704 (19.73%) | loss: 2.803897 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,581 | mfu: 50.72 | epoch: 1 | total time: 35.42m | eta: 144.6m +step 03296/16704 (19.73%) | loss: 2.807294 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,159 | mfu: 50.95 | epoch: 1 | total time: 35.43m | eta: 144.6m +step 03297/16704 (19.74%) | loss: 2.802294 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,487 | mfu: 50.84 | epoch: 1 | total time: 35.44m | eta: 144.6m +step 03298/16704 (19.74%) | loss: 2.793292 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,770 | mfu: 50.86 | epoch: 1 | total time: 35.45m | eta: 144.5m +step 03299/16704 (19.75%) | loss: 2.796320 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,329 | mfu: 50.90 | epoch: 1 | total time: 35.46m | eta: 144.5m +step 03300/16704 (19.76%) | loss: 2.798256 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,595 | mfu: 50.73 | epoch: 1 | total time: 35.47m | eta: 144.5m +step 03301/16704 (19.76%) | loss: 2.791493 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,492 | mfu: 50.97 | epoch: 1 | total time: 35.48m | eta: 144.5m +step 03302/16704 (19.77%) | loss: 2.795332 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 1 | total time: 35.50m | eta: 144.5m +step 03303/16704 (19.77%) | loss: 2.805705 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 1 | total time: 35.51m | eta: 144.5m +step 03304/16704 (19.78%) | loss: 2.807230 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,434 | mfu: 50.84 | epoch: 1 | total time: 35.52m | eta: 144.5m +step 
03305/16704 (19.79%) | loss: 2.811575 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,095 | mfu: 50.94 | epoch: 1 | total time: 35.53m | eta: 144.5m +step 03306/16704 (19.79%) | loss: 2.803294 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,552 | mfu: 51.04 | epoch: 1 | total time: 35.54m | eta: 144.5m +step 03307/16704 (19.80%) | loss: 2.805719 | lrm: 1.00 | dt: 642.34ms | tok/sec: 816,213 | mfu: 51.01 | epoch: 1 | total time: 35.55m | eta: 144.4m +step 03308/16704 (19.80%) | loss: 2.806500 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,471 | mfu: 50.84 | epoch: 1 | total time: 35.56m | eta: 144.4m +step 03309/16704 (19.81%) | loss: 2.816391 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,417 | mfu: 50.90 | epoch: 1 | total time: 35.57m | eta: 144.4m +step 03310/16704 (19.82%) | loss: 2.824064 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,696 | mfu: 50.79 | epoch: 1 | total time: 35.58m | eta: 144.4m +step 03311/16704 (19.82%) | loss: 2.833754 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,036 | mfu: 50.75 | epoch: 1 | total time: 35.59m | eta: 144.4m +step 03312/16704 (19.83%) | loss: 2.832908 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,241 | mfu: 50.89 | epoch: 1 | total time: 35.60m | eta: 144.4m +step 03313/16704 (19.83%) | loss: 2.841006 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 1 | total time: 35.61m | eta: 144.4m +step 03314/16704 (19.84%) | loss: 2.847557 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,164 | mfu: 50.76 | epoch: 1 | total time: 35.62m | eta: 144.4m +step 03315/16704 (19.85%) | loss: 2.843121 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,965 | mfu: 50.87 | epoch: 1 | total time: 35.63m | eta: 144.4m +step 03316/16704 (19.85%) | loss: 2.848207 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 1 | total time: 35.65m | eta: 144.4m +step 03317/16704 (19.86%) | loss: 2.845035 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,507 | mfu: 50.97 | epoch: 1 | total time: 35.66m | eta: 144.3m +step 03318/16704 (19.86%) | loss: 2.834005 | lrm: 1.00 | dt: 
644.60ms | tok/sec: 813,356 | mfu: 50.84 | epoch: 1 | total time: 35.67m | eta: 144.3m +step 03319/16704 (19.87%) | loss: 2.831225 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,576 | mfu: 50.91 | epoch: 1 | total time: 35.68m | eta: 144.3m +step 03320/16704 (19.88%) | loss: 2.826779 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,484 | mfu: 50.84 | epoch: 1 | total time: 35.69m | eta: 144.3m +step 03321/16704 (19.88%) | loss: 2.820668 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,911 | mfu: 50.87 | epoch: 1 | total time: 35.70m | eta: 144.3m +step 03322/16704 (19.89%) | loss: 2.836190 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,498 | mfu: 50.84 | epoch: 1 | total time: 35.71m | eta: 144.3m +step 03323/16704 (19.89%) | loss: 2.837125 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 1 | total time: 35.72m | eta: 144.3m +step 03324/16704 (19.90%) | loss: 2.832266 | lrm: 1.00 | dt: 640.29ms | tok/sec: 818,823 | mfu: 51.18 | epoch: 1 | total time: 35.73m | eta: 144.3m +step 03325/16704 (19.91%) | loss: 2.837103 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,908 | mfu: 50.81 | epoch: 1 | total time: 35.74m | eta: 144.3m +step 03326/16704 (19.91%) | loss: 2.839780 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,227 | mfu: 50.89 | epoch: 1 | total time: 35.75m | eta: 144.2m +step 03327/16704 (19.92%) | loss: 2.835803 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,928 | mfu: 50.93 | epoch: 1 | total time: 35.76m | eta: 144.2m +step 03328/16704 (19.92%) | loss: 2.833348 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,444 | mfu: 50.84 | epoch: 1 | total time: 35.77m | eta: 144.2m +step 03329/16704 (19.93%) | loss: 2.832525 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,566 | mfu: 50.91 | epoch: 1 | total time: 35.79m | eta: 144.2m +step 03330/16704 (19.94%) | loss: 2.829154 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,037 | mfu: 50.69 | epoch: 1 | total time: 35.80m | eta: 144.2m +step 03331/16704 (19.94%) | loss: 2.825528 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,808 | mfu: 50.93 | epoch: 1 | total 
time: 35.81m | eta: 144.2m +step 03332/16704 (19.95%) | loss: 2.826616 | lrm: 1.00 | dt: 641.29ms | tok/sec: 817,552 | mfu: 51.10 | epoch: 1 | total time: 35.82m | eta: 144.2m +step 03333/16704 (19.95%) | loss: 2.824503 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,668 | mfu: 50.67 | epoch: 1 | total time: 35.83m | eta: 144.2m +step 03334/16704 (19.96%) | loss: 2.833640 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,295 | mfu: 50.96 | epoch: 1 | total time: 35.84m | eta: 144.2m +step 03335/16704 (19.97%) | loss: 2.831475 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,428 | mfu: 50.84 | epoch: 1 | total time: 35.85m | eta: 144.1m +step 03336/16704 (19.97%) | loss: 2.836160 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,587 | mfu: 50.79 | epoch: 1 | total time: 35.86m | eta: 144.1m +step 03337/16704 (19.98%) | loss: 2.823778 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,052 | mfu: 50.75 | epoch: 1 | total time: 35.87m | eta: 144.1m +step 03338/16704 (19.98%) | loss: 2.838052 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,809 | mfu: 50.80 | epoch: 1 | total time: 35.88m | eta: 144.1m +step 03339/16704 (19.99%) | loss: 2.839186 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,377 | mfu: 50.96 | epoch: 1 | total time: 35.89m | eta: 144.1m +step 03340/16704 (20.00%) | loss: 2.837527 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,676 | mfu: 50.86 | epoch: 1 | total time: 35.90m | eta: 144.1m +step 03341/16704 (20.00%) | loss: 2.836122 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 1 | total time: 35.91m | eta: 144.1m +step 03342/16704 (20.01%) | loss: 2.832530 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,024 | mfu: 50.82 | epoch: 1 | total time: 35.92m | eta: 144.1m +step 03343/16704 (20.01%) | loss: 2.831090 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,268 | mfu: 50.96 | epoch: 1 | total time: 35.94m | eta: 144.1m +step 03344/16704 (20.02%) | loss: 2.833226 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,187 | mfu: 50.76 | epoch: 1 | total time: 35.95m | eta: 144.0m +step 03345/16704 (20.03%) | loss: 
2.834312 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,145 | mfu: 50.82 | epoch: 1 | total time: 35.96m | eta: 144.0m +step 03346/16704 (20.03%) | loss: 2.838834 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,922 | mfu: 50.93 | epoch: 1 | total time: 35.97m | eta: 144.0m +step 03347/16704 (20.04%) | loss: 2.835832 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,622 | mfu: 50.85 | epoch: 1 | total time: 35.98m | eta: 144.0m +step 03348/16704 (20.04%) | loss: 2.823887 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,935 | mfu: 50.93 | epoch: 1 | total time: 35.99m | eta: 144.0m +step 03349/16704 (20.05%) | loss: 2.824975 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 1 | total time: 36.00m | eta: 144.0m +step 03350/16704 (20.06%) | loss: 2.826054 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,689 | mfu: 50.92 | epoch: 1 | total time: 36.01m | eta: 144.0m +step 03351/16704 (20.06%) | loss: 2.840238 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,127 | mfu: 50.88 | epoch: 1 | total time: 36.02m | eta: 144.0m +step 03352/16704 (20.07%) | loss: 2.837142 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,909 | mfu: 51.00 | epoch: 1 | total time: 36.03m | eta: 144.0m +step 03353/16704 (20.07%) | loss: 2.844109 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,496 | mfu: 50.91 | epoch: 1 | total time: 36.04m | eta: 143.9m +step 03354/16704 (20.08%) | loss: 2.846656 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,488 | mfu: 50.91 | epoch: 1 | total time: 36.05m | eta: 143.9m +step 03355/16704 (20.09%) | loss: 2.846957 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 1 | total time: 36.06m | eta: 143.9m +step 03356/16704 (20.09%) | loss: 2.854125 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 1 | total time: 36.07m | eta: 143.9m +step 03357/16704 (20.10%) | loss: 2.861393 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,002 | mfu: 50.75 | epoch: 1 | total time: 36.09m | eta: 143.9m +step 03358/16704 (20.10%) | loss: 2.863914 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,615 | mfu: 
50.79 | epoch: 1 | total time: 36.10m | eta: 143.9m +step 03359/16704 (20.11%) | loss: 2.857895 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,216 | mfu: 50.89 | epoch: 1 | total time: 36.11m | eta: 143.9m +step 03360/16704 (20.11%) | loss: 2.871196 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,118 | mfu: 50.76 | epoch: 1 | total time: 36.12m | eta: 143.9m +step 03361/16704 (20.12%) | loss: 2.864981 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,526 | mfu: 50.97 | epoch: 1 | total time: 36.13m | eta: 143.9m +step 03362/16704 (20.13%) | loss: 2.846396 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,719 | mfu: 50.86 | epoch: 1 | total time: 36.14m | eta: 143.8m +step 03363/16704 (20.13%) | loss: 2.834354 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,939 | mfu: 50.81 | epoch: 1 | total time: 36.15m | eta: 143.8m +step 03364/16704 (20.14%) | loss: 2.828537 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 1 | total time: 36.16m | eta: 143.8m +step 03365/16704 (20.14%) | loss: 2.835076 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,703 | mfu: 50.80 | epoch: 1 | total time: 36.17m | eta: 143.8m +step 03366/16704 (20.15%) | loss: 2.835915 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,045 | mfu: 50.88 | epoch: 1 | total time: 36.18m | eta: 143.8m +step 03367/16704 (20.16%) | loss: 2.830970 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,885 | mfu: 50.81 | epoch: 1 | total time: 36.19m | eta: 143.8m +step 03368/16704 (20.16%) | loss: 2.821838 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,317 | mfu: 50.65 | epoch: 1 | total time: 36.20m | eta: 143.8m +step 03369/16704 (20.17%) | loss: 2.817726 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,436 | mfu: 50.72 | epoch: 1 | total time: 36.21m | eta: 143.8m +step 03370/16704 (20.17%) | loss: 2.816695 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,004 | mfu: 50.88 | epoch: 1 | total time: 36.23m | eta: 143.8m +step 03371/16704 (20.18%) | loss: 2.834811 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,035 | mfu: 50.69 | epoch: 1 | total time: 36.24m | eta: 143.7m +step 
03372/16704 (20.19%) | loss: 2.842746 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,340 | mfu: 50.90 | epoch: 1 | total time: 36.25m | eta: 143.7m +step 03373/16704 (20.19%) | loss: 2.840492 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,446 | mfu: 50.72 | epoch: 1 | total time: 36.26m | eta: 143.7m +step 03374/16704 (20.20%) | loss: 2.830038 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,232 | mfu: 50.95 | epoch: 1 | total time: 36.27m | eta: 143.7m +step 03375/16704 (20.20%) | loss: 2.827368 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 1 | total time: 36.28m | eta: 143.7m +step 03376/16704 (20.21%) | loss: 2.829995 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,716 | mfu: 50.73 | epoch: 1 | total time: 36.29m | eta: 143.7m +step 03377/16704 (20.22%) | loss: 2.832901 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,746 | mfu: 50.74 | epoch: 1 | total time: 36.30m | eta: 143.7m +step 03378/16704 (20.22%) | loss: 2.847580 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,258 | mfu: 50.89 | epoch: 1 | total time: 36.31m | eta: 143.7m +step 03379/16704 (20.23%) | loss: 2.835523 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,602 | mfu: 50.79 | epoch: 1 | total time: 36.32m | eta: 143.7m +step 03380/16704 (20.23%) | loss: 2.830684 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,055 | mfu: 50.69 | epoch: 1 | total time: 36.33m | eta: 143.6m +step 03381/16704 (20.24%) | loss: 2.825101 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,799 | mfu: 50.93 | epoch: 1 | total time: 36.34m | eta: 143.6m +step 03382/16704 (20.25%) | loss: 2.828461 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,907 | mfu: 50.68 | epoch: 1 | total time: 36.35m | eta: 143.6m +step 03383/16704 (20.25%) | loss: 2.828795 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,769 | mfu: 50.86 | epoch: 1 | total time: 36.37m | eta: 143.6m +step 03384/16704 (20.26%) | loss: 2.838985 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,092 | mfu: 50.76 | epoch: 1 | total time: 36.38m | eta: 143.6m +step 03385/16704 (20.26%) | loss: 2.841553 | lrm: 1.00 | dt: 
644.03ms | tok/sec: 814,079 | mfu: 50.88 | epoch: 1 | total time: 36.39m | eta: 143.6m +step 03386/16704 (20.27%) | loss: 2.838946 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 1 | total time: 36.40m | eta: 143.6m +step 03387/16704 (20.28%) | loss: 2.853634 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,656 | mfu: 50.79 | epoch: 1 | total time: 36.41m | eta: 143.6m +step 03388/16704 (20.28%) | loss: 2.849605 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,375 | mfu: 50.84 | epoch: 1 | total time: 36.42m | eta: 143.6m +step 03389/16704 (20.29%) | loss: 2.845670 | lrm: 1.00 | dt: 642.04ms | tok/sec: 816,591 | mfu: 51.04 | epoch: 1 | total time: 36.43m | eta: 143.6m +step 03390/16704 (20.29%) | loss: 2.831379 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,737 | mfu: 50.61 | epoch: 1 | total time: 36.44m | eta: 143.5m +step 03391/16704 (20.30%) | loss: 2.818390 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 1 | total time: 36.45m | eta: 143.5m +step 03392/16704 (20.31%) | loss: 2.819345 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,592 | mfu: 50.91 | epoch: 1 | total time: 36.46m | eta: 143.5m +step 03393/16704 (20.31%) | loss: 2.806335 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,206 | mfu: 50.70 | epoch: 1 | total time: 36.47m | eta: 143.5m +step 03394/16704 (20.32%) | loss: 2.819903 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,167 | mfu: 50.76 | epoch: 1 | total time: 36.48m | eta: 143.5m +step 03395/16704 (20.32%) | loss: 2.826968 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 1 | total time: 36.49m | eta: 143.5m +step 03396/16704 (20.33%) | loss: 2.821792 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,229 | mfu: 50.64 | epoch: 1 | total time: 36.50m | eta: 143.5m +step 03397/16704 (20.34%) | loss: 2.830263 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,854 | mfu: 50.74 | epoch: 1 | total time: 36.52m | eta: 143.5m +step 03398/16704 (20.34%) | loss: 2.829163 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,881 | mfu: 50.93 | epoch: 1 | total 
time: 36.53m | eta: 143.5m +step 03399/16704 (20.35%) | loss: 2.810001 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,967 | mfu: 50.87 | epoch: 1 | total time: 36.54m | eta: 143.4m +step 03400/16704 (20.35%) | loss: 2.828601 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,088 | mfu: 50.94 | epoch: 1 | total time: 36.55m | eta: 143.4m +step 03401/16704 (20.36%) | loss: 2.830592 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,355 | mfu: 50.90 | epoch: 1 | total time: 36.56m | eta: 143.4m +step 03402/16704 (20.37%) | loss: 2.836742 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,404 | mfu: 50.84 | epoch: 1 | total time: 36.57m | eta: 143.4m +step 03403/16704 (20.37%) | loss: 2.838762 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,970 | mfu: 51.00 | epoch: 1 | total time: 36.58m | eta: 143.4m +step 03404/16704 (20.38%) | loss: 2.835539 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,243 | mfu: 50.77 | epoch: 1 | total time: 36.59m | eta: 143.4m +step 03405/16704 (20.38%) | loss: 2.843658 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,235 | mfu: 50.95 | epoch: 1 | total time: 36.60m | eta: 143.4m +step 03406/16704 (20.39%) | loss: 2.835802 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 1 | total time: 36.61m | eta: 143.4m +step 03407/16704 (20.40%) | loss: 2.830860 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,020 | mfu: 50.69 | epoch: 1 | total time: 36.62m | eta: 143.4m +step 03408/16704 (20.40%) | loss: 2.833585 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,648 | mfu: 50.98 | epoch: 1 | total time: 36.63m | eta: 143.3m +step 03409/16704 (20.41%) | loss: 2.834502 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,328 | mfu: 50.83 | epoch: 1 | total time: 36.64m | eta: 143.3m +step 03410/16704 (20.41%) | loss: 2.839589 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.86 | epoch: 1 | total time: 36.66m | eta: 143.3m +step 03411/16704 (20.42%) | loss: 2.856162 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,329 | mfu: 50.83 | epoch: 1 | total time: 36.67m | eta: 143.3m +step 03412/16704 (20.43%) | loss: 
2.852695 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,723 | mfu: 50.80 | epoch: 1 | total time: 36.68m | eta: 143.3m +step 03413/16704 (20.43%) | loss: 2.831507 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 1 | total time: 36.69m | eta: 143.3m +step 03414/16704 (20.44%) | loss: 2.827905 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,851 | mfu: 50.93 | epoch: 1 | total time: 36.70m | eta: 143.3m +step 03415/16704 (20.44%) | loss: 2.837189 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,795 | mfu: 50.80 | epoch: 1 | total time: 36.71m | eta: 143.3m +step 03416/16704 (20.45%) | loss: 2.849744 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,616 | mfu: 50.91 | epoch: 1 | total time: 36.72m | eta: 143.3m +step 03417/16704 (20.46%) | loss: 2.843250 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,447 | mfu: 50.78 | epoch: 1 | total time: 36.73m | eta: 143.2m +step 03418/16704 (20.46%) | loss: 2.844155 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,245 | mfu: 50.83 | epoch: 1 | total time: 36.74m | eta: 143.2m +step 03419/16704 (20.47%) | loss: 2.846709 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,483 | mfu: 50.91 | epoch: 1 | total time: 36.75m | eta: 143.2m +step 03420/16704 (20.47%) | loss: 2.834267 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,461 | mfu: 50.65 | epoch: 1 | total time: 36.76m | eta: 143.2m +step 03421/16704 (20.48%) | loss: 2.829708 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,624 | mfu: 50.85 | epoch: 1 | total time: 36.77m | eta: 143.2m +step 03422/16704 (20.49%) | loss: 2.823699 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,694 | mfu: 50.73 | epoch: 1 | total time: 36.78m | eta: 143.2m +step 03423/16704 (20.49%) | loss: 2.813597 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,834 | mfu: 50.80 | epoch: 1 | total time: 36.79m | eta: 143.2m +step 03424/16704 (20.50%) | loss: 2.821753 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,374 | mfu: 50.90 | epoch: 1 | total time: 36.81m | eta: 143.2m +step 03425/16704 (20.50%) | loss: 2.821745 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,415 | mfu: 
50.90 | epoch: 1 | total time: 36.82m | eta: 143.2m +step 03426/16704 (20.51%) | loss: 2.838619 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 36.83m | eta: 143.1m +step 03427/16704 (20.52%) | loss: 2.830065 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,644 | mfu: 50.85 | epoch: 1 | total time: 36.84m | eta: 143.1m +step 03428/16704 (20.52%) | loss: 2.825364 | lrm: 1.00 | dt: 641.67ms | tok/sec: 817,069 | mfu: 51.07 | epoch: 1 | total time: 36.85m | eta: 143.1m +step 03429/16704 (20.53%) | loss: 2.813132 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,104 | mfu: 50.76 | epoch: 1 | total time: 36.86m | eta: 143.1m +step 03430/16704 (20.53%) | loss: 2.806263 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,741 | mfu: 50.86 | epoch: 1 | total time: 36.87m | eta: 143.1m +step 03431/16704 (20.54%) | loss: 2.798183 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,882 | mfu: 50.74 | epoch: 1 | total time: 36.88m | eta: 143.1m +step 03432/16704 (20.55%) | loss: 2.813470 | lrm: 1.00 | dt: 648.09ms | tok/sec: 808,972 | mfu: 50.56 | epoch: 1 | total time: 36.89m | eta: 143.1m +step 03433/16704 (20.55%) | loss: 2.810625 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,273 | mfu: 50.89 | epoch: 1 | total time: 36.90m | eta: 143.1m +step 03434/16704 (20.56%) | loss: 2.819950 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,085 | mfu: 50.88 | epoch: 1 | total time: 36.91m | eta: 143.1m +step 03435/16704 (20.56%) | loss: 2.823302 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,965 | mfu: 50.69 | epoch: 1 | total time: 36.92m | eta: 143.0m +step 03436/16704 (20.57%) | loss: 2.817564 | lrm: 1.00 | dt: 641.39ms | tok/sec: 817,421 | mfu: 51.09 | epoch: 1 | total time: 36.93m | eta: 143.0m +step 03437/16704 (20.58%) | loss: 2.815782 | lrm: 1.00 | dt: 646.96ms | tok/sec: 810,391 | mfu: 50.65 | epoch: 1 | total time: 36.95m | eta: 143.0m +step 03438/16704 (20.58%) | loss: 2.816063 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,089 | mfu: 50.88 | epoch: 1 | total time: 36.96m | eta: 143.0m +step 
03439/16704 (20.59%) | loss: 2.810232 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,737 | mfu: 50.86 | epoch: 1 | total time: 36.97m | eta: 143.0m +step 03440/16704 (20.59%) | loss: 2.807033 | lrm: 1.00 | dt: 649.37ms | tok/sec: 807,384 | mfu: 50.46 | epoch: 1 | total time: 36.98m | eta: 143.0m +step 03441/16704 (20.60%) | loss: 2.810789 | lrm: 1.00 | dt: 642.47ms | tok/sec: 816,051 | mfu: 51.00 | epoch: 1 | total time: 36.99m | eta: 143.0m +step 03442/16704 (20.61%) | loss: 2.822882 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,473 | mfu: 50.91 | epoch: 1 | total time: 37.00m | eta: 143.0m +step 03443/16704 (20.61%) | loss: 2.827333 | lrm: 1.00 | dt: 642.66ms | tok/sec: 815,810 | mfu: 50.99 | epoch: 1 | total time: 37.01m | eta: 143.0m +step 03444/16704 (20.62%) | loss: 2.835378 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,064 | mfu: 50.76 | epoch: 1 | total time: 37.02m | eta: 143.0m +step 03445/16704 (20.62%) | loss: 2.828065 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,449 | mfu: 50.78 | epoch: 1 | total time: 37.03m | eta: 142.9m +step 03446/16704 (20.63%) | loss: 2.814613 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 1 | total time: 37.04m | eta: 142.9m +step 03447/16704 (20.64%) | loss: 2.820548 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,489 | mfu: 50.78 | epoch: 1 | total time: 37.05m | eta: 142.9m +step 03448/16704 (20.64%) | loss: 2.814366 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,108 | mfu: 50.76 | epoch: 1 | total time: 37.06m | eta: 142.9m +step 03449/16704 (20.65%) | loss: 2.814693 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,204 | mfu: 51.01 | epoch: 1 | total time: 37.07m | eta: 142.9m +step 03450/16704 (20.65%) | loss: 2.807061 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,455 | mfu: 50.72 | epoch: 1 | total time: 37.08m | eta: 142.9m +step 03451/16704 (20.66%) | loss: 2.807501 | lrm: 1.00 | dt: 641.90ms | tok/sec: 816,780 | mfu: 51.05 | epoch: 1 | total time: 37.10m | eta: 142.9m +step 03452/16704 (20.67%) | loss: 2.817682 | lrm: 1.00 | dt: 
647.15ms | tok/sec: 810,154 | mfu: 50.64 | epoch: 1 | total time: 37.11m | eta: 142.9m +step 03453/16704 (20.67%) | loss: 2.808961 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,605 | mfu: 50.79 | epoch: 1 | total time: 37.12m | eta: 142.9m +step 03454/16704 (20.68%) | loss: 2.813798 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 1 | total time: 37.13m | eta: 142.8m +step 03455/16704 (20.68%) | loss: 2.824632 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,734 | mfu: 50.80 | epoch: 1 | total time: 37.14m | eta: 142.8m +step 03456/16704 (20.69%) | loss: 2.824785 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 1 | total time: 37.15m | eta: 142.8m +step 03457/16704 (20.70%) | loss: 2.811974 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,298 | mfu: 50.71 | epoch: 1 | total time: 37.16m | eta: 142.8m +step 03458/16704 (20.70%) | loss: 2.826435 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,131 | mfu: 50.88 | epoch: 1 | total time: 37.17m | eta: 142.8m +step 03459/16704 (20.71%) | loss: 2.824204 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 1 | total time: 37.18m | eta: 142.8m +step 03460/16704 (20.71%) | loss: 2.821066 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,888 | mfu: 50.68 | epoch: 1 | total time: 37.19m | eta: 142.8m +step 03461/16704 (20.72%) | loss: 2.808560 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,177 | mfu: 50.89 | epoch: 1 | total time: 37.20m | eta: 142.8m +step 03462/16704 (20.73%) | loss: 2.801015 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,954 | mfu: 50.75 | epoch: 1 | total time: 37.21m | eta: 142.8m +step 03463/16704 (20.73%) | loss: 2.798321 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,662 | mfu: 50.67 | epoch: 1 | total time: 37.22m | eta: 142.7m +step 03464/16704 (20.74%) | loss: 2.800999 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,766 | mfu: 50.92 | epoch: 1 | total time: 37.24m | eta: 142.7m +step 03465/16704 (20.74%) | loss: 2.805111 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,784 | mfu: 50.68 | epoch: 1 | total 
time: 37.25m | eta: 142.7m +step 03466/16704 (20.75%) | loss: 2.805811 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,432 | mfu: 50.72 | epoch: 1 | total time: 37.26m | eta: 142.7m +step 03467/16704 (20.76%) | loss: 2.806107 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,339 | mfu: 50.96 | epoch: 1 | total time: 37.27m | eta: 142.7m +step 03468/16704 (20.76%) | loss: 2.814817 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,327 | mfu: 50.58 | epoch: 1 | total time: 37.28m | eta: 142.7m +step 03469/16704 (20.77%) | loss: 2.809667 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,438 | mfu: 50.97 | epoch: 1 | total time: 37.29m | eta: 142.7m +step 03470/16704 (20.77%) | loss: 2.798771 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,804 | mfu: 50.93 | epoch: 1 | total time: 37.30m | eta: 142.7m +step 03471/16704 (20.78%) | loss: 2.798707 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,333 | mfu: 50.71 | epoch: 1 | total time: 37.31m | eta: 142.7m +step 03472/16704 (20.79%) | loss: 2.813755 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,396 | mfu: 50.90 | epoch: 1 | total time: 37.32m | eta: 142.6m +step 03473/16704 (20.79%) | loss: 2.819434 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,213 | mfu: 50.76 | epoch: 1 | total time: 37.33m | eta: 142.6m +step 03474/16704 (20.80%) | loss: 2.818428 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,622 | mfu: 50.92 | epoch: 1 | total time: 37.34m | eta: 142.6m +step 03475/16704 (20.80%) | loss: 2.835031 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,321 | mfu: 50.83 | epoch: 1 | total time: 37.35m | eta: 142.6m +step 03476/16704 (20.81%) | loss: 2.825996 | lrm: 1.00 | dt: 648.18ms | tok/sec: 808,862 | mfu: 50.56 | epoch: 1 | total time: 37.36m | eta: 142.6m +step 03477/16704 (20.82%) | loss: 2.822146 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,850 | mfu: 50.74 | epoch: 1 | total time: 37.38m | eta: 142.6m +step 03478/16704 (20.82%) | loss: 2.820585 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,856 | mfu: 50.93 | epoch: 1 | total time: 37.39m | eta: 142.6m +step 03479/16704 (20.83%) | loss: 
2.822887 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 1 | total time: 37.40m | eta: 142.6m +step 03480/16704 (20.83%) | loss: 2.820800 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,856 | mfu: 50.87 | epoch: 1 | total time: 37.41m | eta: 142.6m +step 03481/16704 (20.84%) | loss: 2.837796 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,247 | mfu: 50.77 | epoch: 1 | total time: 37.42m | eta: 142.5m +step 03482/16704 (20.85%) | loss: 2.836605 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,715 | mfu: 50.73 | epoch: 1 | total time: 37.43m | eta: 142.5m +step 03483/16704 (20.85%) | loss: 2.845435 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,763 | mfu: 50.67 | epoch: 1 | total time: 37.44m | eta: 142.5m +step 03484/16704 (20.86%) | loss: 2.851592 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,485 | mfu: 50.84 | epoch: 1 | total time: 37.45m | eta: 142.5m +step 03485/16704 (20.86%) | loss: 2.855473 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,725 | mfu: 50.80 | epoch: 1 | total time: 37.46m | eta: 142.5m +step 03486/16704 (20.87%) | loss: 2.844378 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,569 | mfu: 50.91 | epoch: 1 | total time: 37.47m | eta: 142.5m +step 03487/16704 (20.88%) | loss: 2.845996 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,387 | mfu: 50.90 | epoch: 1 | total time: 37.48m | eta: 142.5m +step 03488/16704 (20.88%) | loss: 2.831708 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,200 | mfu: 50.70 | epoch: 1 | total time: 37.49m | eta: 142.5m +step 03489/16704 (20.89%) | loss: 2.836095 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 37.50m | eta: 142.5m +step 03490/16704 (20.89%) | loss: 2.838768 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 1 | total time: 37.51m | eta: 142.4m +step 03491/16704 (20.90%) | loss: 2.831022 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,162 | mfu: 50.82 | epoch: 1 | total time: 37.53m | eta: 142.4m +step 03492/16704 (20.91%) | loss: 2.822963 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,825 | mfu: 
50.80 | epoch: 1 | total time: 37.54m | eta: 142.4m +step 03493/16704 (20.91%) | loss: 2.825448 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,331 | mfu: 50.83 | epoch: 1 | total time: 37.55m | eta: 142.4m +step 03494/16704 (20.92%) | loss: 2.826121 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,515 | mfu: 50.72 | epoch: 1 | total time: 37.56m | eta: 142.4m +step 03495/16704 (20.92%) | loss: 2.819076 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,654 | mfu: 50.92 | epoch: 1 | total time: 37.57m | eta: 142.4m +step 03496/16704 (20.93%) | loss: 2.817753 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,669 | mfu: 50.73 | epoch: 1 | total time: 37.58m | eta: 142.4m +step 03497/16704 (20.94%) | loss: 2.807603 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 1 | total time: 37.59m | eta: 142.4m +step 03498/16704 (20.94%) | loss: 2.809489 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,440 | mfu: 50.78 | epoch: 1 | total time: 37.60m | eta: 142.4m +step 03499/16704 (20.95%) | loss: 2.824995 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,581 | mfu: 50.85 | epoch: 1 | total time: 37.61m | eta: 142.4m +Step 03500 | Validation bpb: 0.856126 +step 03500/16704 (20.95%) | loss: 2.833467 | lrm: 1.00 | dt: 648.46ms | tok/sec: 808,509 | mfu: 50.53 | epoch: 1 | total time: 37.62m | eta: 142.3m +step 03501/16704 (20.96%) | loss: 2.823958 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 1 | total time: 37.63m | eta: 142.3m +step 03502/16704 (20.97%) | loss: 2.830634 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,629 | mfu: 50.73 | epoch: 1 | total time: 37.64m | eta: 142.3m +step 03503/16704 (20.97%) | loss: 2.838265 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,168 | mfu: 51.01 | epoch: 1 | total time: 37.65m | eta: 142.3m +step 03504/16704 (20.98%) | loss: 2.836215 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,964 | mfu: 50.87 | epoch: 1 | total time: 37.67m | eta: 142.3m +step 03505/16704 (20.98%) | loss: 2.838604 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,485 | mfu: 50.78 | epoch: 1 | 
total time: 37.68m | eta: 142.3m +step 03506/16704 (20.99%) | loss: 2.845538 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,724 | mfu: 51.05 | epoch: 1 | total time: 37.69m | eta: 142.3m +step 03507/16704 (20.99%) | loss: 2.841541 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,443 | mfu: 50.78 | epoch: 1 | total time: 37.70m | eta: 142.3m +step 03508/16704 (21.00%) | loss: 2.835653 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,932 | mfu: 50.93 | epoch: 1 | total time: 37.71m | eta: 142.3m +step 03509/16704 (21.01%) | loss: 2.825034 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,447 | mfu: 50.84 | epoch: 1 | total time: 37.72m | eta: 142.2m +step 03510/16704 (21.01%) | loss: 2.824926 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,721 | mfu: 50.98 | epoch: 1 | total time: 37.73m | eta: 142.2m +step 03511/16704 (21.02%) | loss: 2.822305 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,012 | mfu: 50.69 | epoch: 1 | total time: 37.74m | eta: 142.2m +step 03512/16704 (21.02%) | loss: 2.829301 | lrm: 1.00 | dt: 641.67ms | tok/sec: 817,062 | mfu: 51.07 | epoch: 1 | total time: 37.75m | eta: 142.2m +step 03513/16704 (21.03%) | loss: 2.816831 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,422 | mfu: 50.90 | epoch: 1 | total time: 37.76m | eta: 142.2m +step 03514/16704 (21.04%) | loss: 2.825944 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,412 | mfu: 50.90 | epoch: 1 | total time: 37.77m | eta: 142.2m +step 03515/16704 (21.04%) | loss: 2.832851 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,139 | mfu: 50.88 | epoch: 1 | total time: 37.78m | eta: 142.2m +step 03516/16704 (21.05%) | loss: 2.821651 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,324 | mfu: 50.77 | epoch: 1 | total time: 37.79m | eta: 142.2m +step 03517/16704 (21.05%) | loss: 2.821772 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,451 | mfu: 50.90 | epoch: 1 | total time: 37.81m | eta: 142.2m +step 03518/16704 (21.06%) | loss: 2.829250 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,081 | mfu: 51.01 | epoch: 1 | total time: 37.82m | eta: 142.1m +step 03519/16704 (21.07%) | 
loss: 2.835931 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,040 | mfu: 50.88 | epoch: 1 | total time: 37.83m | eta: 142.1m +step 03520/16704 (21.07%) | loss: 2.831535 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 1 | total time: 37.84m | eta: 142.1m +step 03521/16704 (21.08%) | loss: 2.829638 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 1 | total time: 37.85m | eta: 142.1m +step 03522/16704 (21.08%) | loss: 2.845029 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,543 | mfu: 50.85 | epoch: 1 | total time: 37.86m | eta: 142.1m +step 03523/16704 (21.09%) | loss: 2.840531 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 1 | total time: 37.87m | eta: 142.1m +step 03524/16704 (21.10%) | loss: 2.840632 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,509 | mfu: 50.85 | epoch: 1 | total time: 37.88m | eta: 142.1m +step 03525/16704 (21.10%) | loss: 2.831904 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,628 | mfu: 50.85 | epoch: 1 | total time: 37.89m | eta: 142.1m +step 03526/16704 (21.11%) | loss: 2.835436 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,206 | mfu: 50.70 | epoch: 1 | total time: 37.90m | eta: 142.1m +step 03527/16704 (21.11%) | loss: 2.833093 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,454 | mfu: 50.78 | epoch: 1 | total time: 37.91m | eta: 142.0m +step 03528/16704 (21.12%) | loss: 2.827009 | lrm: 1.00 | dt: 641.74ms | tok/sec: 816,977 | mfu: 51.06 | epoch: 1 | total time: 37.92m | eta: 142.0m +step 03529/16704 (21.13%) | loss: 2.826000 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,805 | mfu: 50.93 | epoch: 1 | total time: 37.93m | eta: 142.0m +step 03530/16704 (21.13%) | loss: 2.827511 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,465 | mfu: 50.91 | epoch: 1 | total time: 37.94m | eta: 142.0m +step 03531/16704 (21.14%) | loss: 2.825610 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 1 | total time: 37.96m | eta: 142.0m +step 03532/16704 (21.14%) | loss: 2.822634 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,227 | 
mfu: 50.58 | epoch: 1 | total time: 37.97m | eta: 142.0m +step 03533/16704 (21.15%) | loss: 2.820364 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 1 | total time: 37.98m | eta: 142.0m +step 03534/16704 (21.16%) | loss: 2.820748 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,573 | mfu: 50.66 | epoch: 1 | total time: 37.99m | eta: 142.0m +step 03535/16704 (21.16%) | loss: 2.821685 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,740 | mfu: 50.61 | epoch: 1 | total time: 38.00m | eta: 142.0m +step 03536/16704 (21.17%) | loss: 2.814033 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,615 | mfu: 50.73 | epoch: 1 | total time: 38.01m | eta: 141.9m +step 03537/16704 (21.17%) | loss: 2.799040 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,946 | mfu: 50.94 | epoch: 1 | total time: 38.02m | eta: 141.9m +step 03538/16704 (21.18%) | loss: 2.783908 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,564 | mfu: 50.91 | epoch: 1 | total time: 38.03m | eta: 141.9m +step 03539/16704 (21.19%) | loss: 2.783730 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,599 | mfu: 50.79 | epoch: 1 | total time: 38.04m | eta: 141.9m +step 03540/16704 (21.19%) | loss: 2.801906 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,678 | mfu: 50.92 | epoch: 1 | total time: 38.05m | eta: 141.9m +step 03541/16704 (21.20%) | loss: 2.801520 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 1 | total time: 38.06m | eta: 141.9m +step 03542/16704 (21.20%) | loss: 2.806751 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,853 | mfu: 50.74 | epoch: 1 | total time: 38.07m | eta: 141.9m +step 03543/16704 (21.21%) | loss: 2.801028 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,859 | mfu: 50.80 | epoch: 1 | total time: 38.08m | eta: 141.9m +step 03544/16704 (21.22%) | loss: 2.810127 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,805 | mfu: 50.80 | epoch: 1 | total time: 38.10m | eta: 141.9m +step 03545/16704 (21.22%) | loss: 2.812637 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,664 | mfu: 50.79 | epoch: 1 | total time: 38.11m | eta: 141.8m +step 
03546/16704 (21.23%) | loss: 2.800746 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,426 | mfu: 50.78 | epoch: 1 | total time: 38.12m | eta: 141.8m +step 03547/16704 (21.23%) | loss: 2.811316 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,203 | mfu: 50.89 | epoch: 1 | total time: 38.13m | eta: 141.8m +step 03548/16704 (21.24%) | loss: 2.809917 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,916 | mfu: 51.00 | epoch: 1 | total time: 38.14m | eta: 141.8m +step 03549/16704 (21.25%) | loss: 2.812071 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,318 | mfu: 50.96 | epoch: 1 | total time: 38.15m | eta: 141.8m +step 03550/16704 (21.25%) | loss: 2.826651 | lrm: 1.00 | dt: 640.58ms | tok/sec: 818,463 | mfu: 51.16 | epoch: 1 | total time: 38.16m | eta: 141.8m +step 03551/16704 (21.26%) | loss: 2.832990 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,525 | mfu: 50.91 | epoch: 1 | total time: 38.17m | eta: 141.8m +step 03552/16704 (21.26%) | loss: 2.824193 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,708 | mfu: 50.86 | epoch: 1 | total time: 38.18m | eta: 141.8m +step 03553/16704 (21.27%) | loss: 2.821227 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.84 | epoch: 1 | total time: 38.19m | eta: 141.8m +step 03554/16704 (21.28%) | loss: 2.812992 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 1 | total time: 38.20m | eta: 141.7m +step 03555/16704 (21.28%) | loss: 2.806455 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,489 | mfu: 50.97 | epoch: 1 | total time: 38.21m | eta: 141.7m +step 03556/16704 (21.29%) | loss: 2.810726 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,203 | mfu: 50.95 | epoch: 1 | total time: 38.22m | eta: 141.7m +step 03557/16704 (21.29%) | loss: 2.811711 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,691 | mfu: 50.98 | epoch: 1 | total time: 38.23m | eta: 141.7m +step 03558/16704 (21.30%) | loss: 2.800337 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,022 | mfu: 50.75 | epoch: 1 | total time: 38.25m | eta: 141.7m +step 03559/16704 (21.31%) | loss: 2.790403 | lrm: 1.00 | dt: 
644.80ms | tok/sec: 813,103 | mfu: 50.82 | epoch: 1 | total time: 38.26m | eta: 141.7m +step 03560/16704 (21.31%) | loss: 2.784593 | lrm: 1.00 | dt: 641.86ms | tok/sec: 816,824 | mfu: 51.05 | epoch: 1 | total time: 38.27m | eta: 141.7m +step 03561/16704 (21.32%) | loss: 2.786243 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,053 | mfu: 50.88 | epoch: 1 | total time: 38.28m | eta: 141.7m +step 03562/16704 (21.32%) | loss: 2.787108 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 1 | total time: 38.29m | eta: 141.7m +step 03563/16704 (21.33%) | loss: 2.791007 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,403 | mfu: 50.96 | epoch: 1 | total time: 38.30m | eta: 141.7m +step 03564/16704 (21.34%) | loss: 2.787655 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,749 | mfu: 50.80 | epoch: 1 | total time: 38.31m | eta: 141.6m +step 03565/16704 (21.34%) | loss: 2.796980 | lrm: 1.00 | dt: 641.89ms | tok/sec: 816,787 | mfu: 51.05 | epoch: 1 | total time: 38.32m | eta: 141.6m +step 03566/16704 (21.35%) | loss: 2.805592 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,296 | mfu: 51.02 | epoch: 1 | total time: 38.33m | eta: 141.6m +step 03567/16704 (21.35%) | loss: 2.817473 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,061 | mfu: 50.94 | epoch: 1 | total time: 38.34m | eta: 141.6m +step 03568/16704 (21.36%) | loss: 2.812373 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,330 | mfu: 50.90 | epoch: 1 | total time: 38.35m | eta: 141.6m +step 03569/16704 (21.37%) | loss: 2.816079 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,845 | mfu: 50.93 | epoch: 1 | total time: 38.36m | eta: 141.6m +step 03570/16704 (21.37%) | loss: 2.808902 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 1 | total time: 38.37m | eta: 141.6m +step 03571/16704 (21.38%) | loss: 2.811100 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,972 | mfu: 50.69 | epoch: 1 | total time: 38.38m | eta: 141.6m +step 03572/16704 (21.38%) | loss: 2.807536 | lrm: 1.00 | dt: 641.13ms | tok/sec: 817,750 | mfu: 51.11 | epoch: 1 | total 
time: 38.40m | eta: 141.6m +step 03573/16704 (21.39%) | loss: 2.804492 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,216 | mfu: 50.89 | epoch: 1 | total time: 38.41m | eta: 141.5m +step 03574/16704 (21.40%) | loss: 2.801223 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,523 | mfu: 50.97 | epoch: 1 | total time: 38.42m | eta: 141.5m +step 03575/16704 (21.40%) | loss: 2.794442 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,513 | mfu: 50.78 | epoch: 1 | total time: 38.43m | eta: 141.5m +step 03576/16704 (21.41%) | loss: 2.796449 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,893 | mfu: 50.87 | epoch: 1 | total time: 38.44m | eta: 141.5m +step 03577/16704 (21.41%) | loss: 2.802659 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 1 | total time: 38.45m | eta: 141.5m +step 03578/16704 (21.42%) | loss: 2.792238 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,612 | mfu: 50.91 | epoch: 1 | total time: 38.46m | eta: 141.5m +step 03579/16704 (21.43%) | loss: 2.802060 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,580 | mfu: 50.97 | epoch: 1 | total time: 38.47m | eta: 141.5m +step 03580/16704 (21.43%) | loss: 2.813052 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,435 | mfu: 50.97 | epoch: 1 | total time: 38.48m | eta: 141.5m +step 03581/16704 (21.44%) | loss: 2.814456 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,916 | mfu: 50.75 | epoch: 1 | total time: 38.49m | eta: 141.5m +step 03582/16704 (21.44%) | loss: 2.818865 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,722 | mfu: 50.98 | epoch: 1 | total time: 38.50m | eta: 141.4m +step 03583/16704 (21.45%) | loss: 2.822533 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,469 | mfu: 50.84 | epoch: 1 | total time: 38.51m | eta: 141.4m +step 03584/16704 (21.46%) | loss: 2.809368 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,123 | mfu: 51.01 | epoch: 1 | total time: 38.52m | eta: 141.4m +step 03585/16704 (21.46%) | loss: 2.812182 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 1 | total time: 38.53m | eta: 141.4m +step 03586/16704 (21.47%) | loss: 
2.809915 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,835 | mfu: 50.93 | epoch: 1 | total time: 38.55m | eta: 141.4m +step 03587/16704 (21.47%) | loss: 2.811097 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,816 | mfu: 50.80 | epoch: 1 | total time: 38.56m | eta: 141.4m +step 03588/16704 (21.48%) | loss: 2.809633 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 1 | total time: 38.57m | eta: 141.4m +step 03589/16704 (21.49%) | loss: 2.822670 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,031 | mfu: 50.82 | epoch: 1 | total time: 38.58m | eta: 141.4m +step 03590/16704 (21.49%) | loss: 2.824371 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,980 | mfu: 50.81 | epoch: 1 | total time: 38.59m | eta: 141.4m +step 03591/16704 (21.50%) | loss: 2.823156 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 1 | total time: 38.60m | eta: 141.3m +step 03592/16704 (21.50%) | loss: 2.824073 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,767 | mfu: 50.86 | epoch: 1 | total time: 38.61m | eta: 141.3m +step 03593/16704 (21.51%) | loss: 2.839062 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,003 | mfu: 50.81 | epoch: 1 | total time: 38.62m | eta: 141.3m +step 03594/16704 (21.52%) | loss: 2.842312 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,608 | mfu: 50.85 | epoch: 1 | total time: 38.63m | eta: 141.3m +step 03595/16704 (21.52%) | loss: 2.856891 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,678 | mfu: 50.79 | epoch: 1 | total time: 38.64m | eta: 141.3m +step 03596/16704 (21.53%) | loss: 2.852056 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 1 | total time: 38.65m | eta: 141.3m +step 03597/16704 (21.53%) | loss: 2.838283 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 1 | total time: 38.66m | eta: 141.3m +step 03598/16704 (21.54%) | loss: 2.826281 | lrm: 1.00 | dt: 642.15ms | tok/sec: 816,452 | mfu: 51.03 | epoch: 1 | total time: 38.67m | eta: 141.3m +step 03599/16704 (21.55%) | loss: 2.829023 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,912 | mfu: 
50.87 | epoch: 1 | total time: 38.69m | eta: 141.3m +step 03600/16704 (21.55%) | loss: 2.831366 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,010 | mfu: 50.88 | epoch: 1 | total time: 38.70m | eta: 141.2m +step 03601/16704 (21.56%) | loss: 2.830611 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,774 | mfu: 50.86 | epoch: 1 | total time: 38.71m | eta: 141.2m +step 03602/16704 (21.56%) | loss: 2.821164 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,449 | mfu: 50.65 | epoch: 1 | total time: 38.72m | eta: 141.2m +step 03603/16704 (21.57%) | loss: 2.823536 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,579 | mfu: 50.97 | epoch: 1 | total time: 38.73m | eta: 141.2m +step 03604/16704 (21.58%) | loss: 2.826774 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,855 | mfu: 50.99 | epoch: 1 | total time: 38.74m | eta: 141.2m +step 03605/16704 (21.58%) | loss: 2.828569 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,904 | mfu: 50.81 | epoch: 1 | total time: 38.75m | eta: 141.2m +step 03606/16704 (21.59%) | loss: 2.829567 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,935 | mfu: 50.75 | epoch: 1 | total time: 38.76m | eta: 141.2m +step 03607/16704 (21.59%) | loss: 2.814802 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 1 | total time: 38.77m | eta: 141.2m +step 03608/16704 (21.60%) | loss: 2.824273 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,449 | mfu: 50.84 | epoch: 1 | total time: 38.78m | eta: 141.2m +step 03609/16704 (21.61%) | loss: 2.826317 | lrm: 1.00 | dt: 643.30ms | tok/sec: 814,999 | mfu: 50.94 | epoch: 1 | total time: 38.79m | eta: 141.1m +step 03610/16704 (21.61%) | loss: 2.838226 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 38.80m | eta: 141.1m +step 03611/16704 (21.62%) | loss: 2.841165 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,021 | mfu: 50.75 | epoch: 1 | total time: 38.81m | eta: 141.1m +step 03612/16704 (21.62%) | loss: 2.852175 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,782 | mfu: 50.93 | epoch: 1 | total time: 38.82m | eta: 141.1m +step 
03613/16704 (21.63%) | loss: 2.851088 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,801 | mfu: 50.80 | epoch: 1 | total time: 38.84m | eta: 141.1m +step 03614/16704 (21.64%) | loss: 2.852925 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,867 | mfu: 51.06 | epoch: 1 | total time: 38.85m | eta: 141.1m +step 03615/16704 (21.64%) | loss: 2.861230 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 1 | total time: 38.86m | eta: 141.1m +step 03616/16704 (21.65%) | loss: 2.861742 | lrm: 1.00 | dt: 647.69ms | tok/sec: 809,472 | mfu: 50.59 | epoch: 1 | total time: 38.87m | eta: 141.1m +step 03617/16704 (21.65%) | loss: 2.859125 | lrm: 1.00 | dt: 642.46ms | tok/sec: 816,069 | mfu: 51.01 | epoch: 1 | total time: 38.88m | eta: 141.1m +step 03618/16704 (21.66%) | loss: 2.860700 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,580 | mfu: 50.91 | epoch: 1 | total time: 38.89m | eta: 141.0m +step 03619/16704 (21.67%) | loss: 2.849642 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 1 | total time: 38.90m | eta: 141.0m +step 03620/16704 (21.67%) | loss: 2.858887 | lrm: 1.00 | dt: 641.45ms | tok/sec: 817,347 | mfu: 51.09 | epoch: 1 | total time: 38.91m | eta: 141.0m +step 03621/16704 (21.68%) | loss: 2.853105 | lrm: 1.00 | dt: 648.12ms | tok/sec: 808,942 | mfu: 50.56 | epoch: 1 | total time: 38.92m | eta: 141.0m +step 03622/16704 (21.68%) | loss: 2.854781 | lrm: 1.00 | dt: 641.88ms | tok/sec: 816,801 | mfu: 51.05 | epoch: 1 | total time: 38.93m | eta: 141.0m +step 03623/16704 (21.69%) | loss: 2.846561 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,803 | mfu: 50.93 | epoch: 1 | total time: 38.94m | eta: 141.0m +step 03624/16704 (21.70%) | loss: 2.839042 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,744 | mfu: 50.86 | epoch: 1 | total time: 38.95m | eta: 141.0m +step 03625/16704 (21.70%) | loss: 2.843694 | lrm: 1.00 | dt: 639.67ms | tok/sec: 819,623 | mfu: 51.23 | epoch: 1 | total time: 38.96m | eta: 141.0m +step 03626/16704 (21.71%) | loss: 2.835291 | lrm: 1.00 | dt: 
646.96ms | tok/sec: 810,391 | mfu: 50.65 | epoch: 1 | total time: 38.98m | eta: 141.0m +step 03627/16704 (21.71%) | loss: 2.851122 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,600 | mfu: 50.73 | epoch: 1 | total time: 38.99m | eta: 141.0m +step 03628/16704 (21.72%) | loss: 2.857609 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,289 | mfu: 51.08 | epoch: 1 | total time: 39.00m | eta: 140.9m +step 03629/16704 (21.73%) | loss: 2.851270 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,876 | mfu: 50.74 | epoch: 1 | total time: 39.01m | eta: 140.9m +step 03630/16704 (21.73%) | loss: 2.850974 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,816 | mfu: 50.86 | epoch: 1 | total time: 39.02m | eta: 140.9m +step 03631/16704 (21.74%) | loss: 2.836009 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,312 | mfu: 50.83 | epoch: 1 | total time: 39.03m | eta: 140.9m +step 03632/16704 (21.74%) | loss: 2.845472 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 1 | total time: 39.04m | eta: 140.9m +step 03633/16704 (21.75%) | loss: 2.845776 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,905 | mfu: 51.00 | epoch: 1 | total time: 39.05m | eta: 140.9m +step 03634/16704 (21.76%) | loss: 2.842726 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,626 | mfu: 50.92 | epoch: 1 | total time: 39.06m | eta: 140.9m +step 03635/16704 (21.76%) | loss: 2.839452 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,590 | mfu: 50.98 | epoch: 1 | total time: 39.07m | eta: 140.9m +step 03636/16704 (21.77%) | loss: 2.842811 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,056 | mfu: 50.88 | epoch: 1 | total time: 39.08m | eta: 140.9m +step 03637/16704 (21.77%) | loss: 2.833046 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,553 | mfu: 50.91 | epoch: 1 | total time: 39.09m | eta: 140.8m +step 03638/16704 (21.78%) | loss: 2.831883 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 1 | total time: 39.10m | eta: 140.8m +step 03639/16704 (21.79%) | loss: 2.840395 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,680 | mfu: 50.86 | epoch: 1 | total 
time: 39.11m | eta: 140.8m +step 03640/16704 (21.79%) | loss: 2.838843 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 1 | total time: 39.13m | eta: 140.8m +step 03641/16704 (21.80%) | loss: 2.843611 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,741 | mfu: 50.86 | epoch: 1 | total time: 39.14m | eta: 140.8m +step 03642/16704 (21.80%) | loss: 2.849145 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,516 | mfu: 50.91 | epoch: 1 | total time: 39.15m | eta: 140.8m +step 03643/16704 (21.81%) | loss: 2.852131 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,128 | mfu: 50.88 | epoch: 1 | total time: 39.16m | eta: 140.8m +step 03644/16704 (21.82%) | loss: 2.844416 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,160 | mfu: 51.01 | epoch: 1 | total time: 39.17m | eta: 140.8m +step 03645/16704 (21.82%) | loss: 2.833059 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,899 | mfu: 50.93 | epoch: 1 | total time: 39.18m | eta: 140.8m +step 03646/16704 (21.83%) | loss: 2.829218 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,571 | mfu: 50.85 | epoch: 1 | total time: 39.19m | eta: 140.7m +step 03647/16704 (21.83%) | loss: 2.823131 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,006 | mfu: 50.88 | epoch: 1 | total time: 39.20m | eta: 140.7m +step 03648/16704 (21.84%) | loss: 2.828638 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,941 | mfu: 51.00 | epoch: 1 | total time: 39.21m | eta: 140.7m +step 03649/16704 (21.85%) | loss: 2.840022 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,518 | mfu: 50.97 | epoch: 1 | total time: 39.22m | eta: 140.7m +step 03650/16704 (21.85%) | loss: 2.815632 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,936 | mfu: 51.00 | epoch: 1 | total time: 39.23m | eta: 140.7m +step 03651/16704 (21.86%) | loss: 2.822099 | lrm: 1.00 | dt: 642.54ms | tok/sec: 815,958 | mfu: 51.00 | epoch: 1 | total time: 39.24m | eta: 140.7m +step 03652/16704 (21.86%) | loss: 2.810724 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,265 | mfu: 50.83 | epoch: 1 | total time: 39.25m | eta: 140.7m +step 03653/16704 (21.87%) | loss: 
2.803987 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,231 | mfu: 50.95 | epoch: 1 | total time: 39.26m | eta: 140.7m +step 03654/16704 (21.88%) | loss: 2.820096 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,559 | mfu: 50.72 | epoch: 1 | total time: 39.28m | eta: 140.7m +step 03655/16704 (21.88%) | loss: 2.823373 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,522 | mfu: 50.85 | epoch: 1 | total time: 39.29m | eta: 140.6m +step 03656/16704 (21.89%) | loss: 2.836714 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,193 | mfu: 50.83 | epoch: 1 | total time: 39.30m | eta: 140.6m +step 03657/16704 (21.89%) | loss: 2.848865 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,964 | mfu: 50.69 | epoch: 1 | total time: 39.31m | eta: 140.6m +step 03658/16704 (21.90%) | loss: 2.840908 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,289 | mfu: 50.64 | epoch: 1 | total time: 39.32m | eta: 140.6m +step 03659/16704 (21.90%) | loss: 2.834300 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,370 | mfu: 50.90 | epoch: 1 | total time: 39.33m | eta: 140.6m +step 03660/16704 (21.91%) | loss: 2.830168 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,039 | mfu: 50.88 | epoch: 1 | total time: 39.34m | eta: 140.6m +step 03661/16704 (21.92%) | loss: 2.823125 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,621 | mfu: 50.92 | epoch: 1 | total time: 39.35m | eta: 140.6m +step 03662/16704 (21.92%) | loss: 2.817334 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,036 | mfu: 50.82 | epoch: 1 | total time: 39.36m | eta: 140.6m +step 03663/16704 (21.93%) | loss: 2.822201 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,519 | mfu: 50.85 | epoch: 1 | total time: 39.37m | eta: 140.6m +step 03664/16704 (21.93%) | loss: 2.817385 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,816 | mfu: 50.80 | epoch: 1 | total time: 39.38m | eta: 140.5m +step 03665/16704 (21.94%) | loss: 2.818759 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,558 | mfu: 50.72 | epoch: 1 | total time: 39.39m | eta: 140.5m +step 03666/16704 (21.95%) | loss: 2.832650 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,814 | mfu: 
50.74 | epoch: 1 | total time: 39.40m | eta: 140.5m +step 03667/16704 (21.95%) | loss: 2.835070 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,199 | mfu: 50.83 | epoch: 1 | total time: 39.42m | eta: 140.5m +step 03668/16704 (21.96%) | loss: 2.831249 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,898 | mfu: 50.68 | epoch: 1 | total time: 39.43m | eta: 140.5m +step 03669/16704 (21.96%) | loss: 2.820179 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,757 | mfu: 50.99 | epoch: 1 | total time: 39.44m | eta: 140.5m +step 03670/16704 (21.97%) | loss: 2.812376 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,273 | mfu: 50.89 | epoch: 1 | total time: 39.45m | eta: 140.5m +step 03671/16704 (21.98%) | loss: 2.820427 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,995 | mfu: 50.88 | epoch: 1 | total time: 39.46m | eta: 140.5m +step 03672/16704 (21.98%) | loss: 2.821795 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,695 | mfu: 50.92 | epoch: 1 | total time: 39.47m | eta: 140.5m +step 03673/16704 (21.99%) | loss: 2.822075 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,101 | mfu: 50.76 | epoch: 1 | total time: 39.48m | eta: 140.4m +step 03674/16704 (21.99%) | loss: 2.813625 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,792 | mfu: 50.74 | epoch: 1 | total time: 39.49m | eta: 140.4m +step 03675/16704 (22.00%) | loss: 2.806961 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,853 | mfu: 50.80 | epoch: 1 | total time: 39.50m | eta: 140.4m +step 03676/16704 (22.01%) | loss: 2.806133 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,654 | mfu: 50.79 | epoch: 1 | total time: 39.51m | eta: 140.4m +step 03677/16704 (22.01%) | loss: 2.801666 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,230 | mfu: 50.77 | epoch: 1 | total time: 39.52m | eta: 140.4m +step 03678/16704 (22.02%) | loss: 2.795544 | lrm: 1.00 | dt: 642.24ms | tok/sec: 816,345 | mfu: 51.02 | epoch: 1 | total time: 39.53m | eta: 140.4m +step 03679/16704 (22.02%) | loss: 2.782053 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,703 | mfu: 50.61 | epoch: 1 | total time: 39.54m | eta: 140.4m +step 
03680/16704 (22.03%) | loss: 2.785045 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,985 | mfu: 50.88 | epoch: 1 | total time: 39.55m | eta: 140.4m +step 03681/16704 (22.04%) | loss: 2.794657 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,239 | mfu: 50.70 | epoch: 1 | total time: 39.57m | eta: 140.4m +step 03682/16704 (22.04%) | loss: 2.789249 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,138 | mfu: 50.88 | epoch: 1 | total time: 39.58m | eta: 140.3m +step 03683/16704 (22.05%) | loss: 2.794505 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,655 | mfu: 50.79 | epoch: 1 | total time: 39.59m | eta: 140.3m +step 03684/16704 (22.05%) | loss: 2.782652 | lrm: 1.00 | dt: 642.77ms | tok/sec: 815,666 | mfu: 50.98 | epoch: 1 | total time: 39.60m | eta: 140.3m +step 03685/16704 (22.06%) | loss: 2.780873 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,528 | mfu: 50.97 | epoch: 1 | total time: 39.61m | eta: 140.3m +step 03686/16704 (22.07%) | loss: 2.782041 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,364 | mfu: 50.90 | epoch: 1 | total time: 39.62m | eta: 140.3m +step 03687/16704 (22.07%) | loss: 2.779434 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,249 | mfu: 50.83 | epoch: 1 | total time: 39.63m | eta: 140.3m +step 03688/16704 (22.08%) | loss: 2.763792 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,767 | mfu: 50.80 | epoch: 1 | total time: 39.64m | eta: 140.3m +step 03689/16704 (22.08%) | loss: 2.770885 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,691 | mfu: 50.86 | epoch: 1 | total time: 39.65m | eta: 140.3m +step 03690/16704 (22.09%) | loss: 2.783308 | lrm: 1.00 | dt: 649.14ms | tok/sec: 807,662 | mfu: 50.48 | epoch: 1 | total time: 39.66m | eta: 140.3m +step 03691/16704 (22.10%) | loss: 2.781421 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,872 | mfu: 50.81 | epoch: 1 | total time: 39.67m | eta: 140.3m +step 03692/16704 (22.10%) | loss: 2.799384 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,443 | mfu: 50.65 | epoch: 1 | total time: 39.68m | eta: 140.2m +step 03693/16704 (22.11%) | loss: 2.811724 | lrm: 1.00 | dt: 
645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 1 | total time: 39.69m | eta: 140.2m +step 03694/16704 (22.11%) | loss: 2.811226 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,962 | mfu: 50.81 | epoch: 1 | total time: 39.71m | eta: 140.2m +step 03695/16704 (22.12%) | loss: 2.817155 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,430 | mfu: 50.90 | epoch: 1 | total time: 39.72m | eta: 140.2m +step 03696/16704 (22.13%) | loss: 2.820937 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,206 | mfu: 50.64 | epoch: 1 | total time: 39.73m | eta: 140.2m +step 03697/16704 (22.13%) | loss: 2.819250 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,469 | mfu: 50.78 | epoch: 1 | total time: 39.74m | eta: 140.2m +step 03698/16704 (22.14%) | loss: 2.818187 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,063 | mfu: 50.94 | epoch: 1 | total time: 39.75m | eta: 140.2m +step 03699/16704 (22.14%) | loss: 2.821856 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,528 | mfu: 50.72 | epoch: 1 | total time: 39.76m | eta: 140.2m +step 03700/16704 (22.15%) | loss: 2.817367 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,490 | mfu: 50.84 | epoch: 1 | total time: 39.77m | eta: 140.2m +step 03701/16704 (22.16%) | loss: 2.818095 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,000 | mfu: 50.75 | epoch: 1 | total time: 39.78m | eta: 140.1m +step 03702/16704 (22.16%) | loss: 2.811024 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,570 | mfu: 50.85 | epoch: 1 | total time: 39.79m | eta: 140.1m +step 03703/16704 (22.17%) | loss: 2.814192 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,487 | mfu: 50.72 | epoch: 1 | total time: 39.80m | eta: 140.1m +step 03704/16704 (22.17%) | loss: 2.808458 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,256 | mfu: 50.77 | epoch: 1 | total time: 39.81m | eta: 140.1m +step 03705/16704 (22.18%) | loss: 2.816028 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,689 | mfu: 50.86 | epoch: 1 | total time: 39.82m | eta: 140.1m +step 03706/16704 (22.19%) | loss: 2.811214 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,947 | mfu: 50.81 | epoch: 1 | total 
time: 39.83m | eta: 140.1m +step 03707/16704 (22.19%) | loss: 2.819819 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,215 | mfu: 50.64 | epoch: 1 | total time: 39.85m | eta: 140.1m +step 03708/16704 (22.20%) | loss: 2.816046 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,760 | mfu: 50.61 | epoch: 1 | total time: 39.86m | eta: 140.1m +step 03709/16704 (22.20%) | loss: 2.820589 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,986 | mfu: 50.88 | epoch: 1 | total time: 39.87m | eta: 140.1m +step 03710/16704 (22.21%) | loss: 2.815636 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 1 | total time: 39.88m | eta: 140.0m +step 03711/16704 (22.22%) | loss: 2.818926 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,735 | mfu: 50.92 | epoch: 1 | total time: 39.89m | eta: 140.0m +step 03712/16704 (22.22%) | loss: 2.810590 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,282 | mfu: 50.58 | epoch: 1 | total time: 39.90m | eta: 140.0m +step 03713/16704 (22.23%) | loss: 2.792655 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,263 | mfu: 50.89 | epoch: 1 | total time: 39.91m | eta: 140.0m +step 03714/16704 (22.23%) | loss: 2.797759 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,899 | mfu: 50.62 | epoch: 1 | total time: 39.92m | eta: 140.0m +step 03715/16704 (22.24%) | loss: 2.798732 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,350 | mfu: 50.71 | epoch: 1 | total time: 39.93m | eta: 140.0m +step 03716/16704 (22.25%) | loss: 2.810378 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,890 | mfu: 50.99 | epoch: 1 | total time: 39.94m | eta: 140.0m +step 03717/16704 (22.25%) | loss: 2.797641 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,354 | mfu: 50.77 | epoch: 1 | total time: 39.95m | eta: 140.0m +step 03718/16704 (22.26%) | loss: 2.807446 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 1 | total time: 39.96m | eta: 140.0m +step 03719/16704 (22.26%) | loss: 2.801692 | lrm: 1.00 | dt: 648.57ms | tok/sec: 808,379 | mfu: 50.52 | epoch: 1 | total time: 39.97m | eta: 139.9m +step 03720/16704 (22.27%) | loss: 
2.799086 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,662 | mfu: 50.67 | epoch: 1 | total time: 39.99m | eta: 139.9m +step 03721/16704 (22.28%) | loss: 2.811752 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,112 | mfu: 50.88 | epoch: 1 | total time: 40.00m | eta: 139.9m +step 03722/16704 (22.28%) | loss: 2.819069 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,114 | mfu: 50.76 | epoch: 1 | total time: 40.01m | eta: 139.9m +step 03723/16704 (22.29%) | loss: 2.821986 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,354 | mfu: 50.90 | epoch: 1 | total time: 40.02m | eta: 139.9m +step 03724/16704 (22.29%) | loss: 2.817863 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,211 | mfu: 50.76 | epoch: 1 | total time: 40.03m | eta: 139.9m +step 03725/16704 (22.30%) | loss: 2.809066 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 1 | total time: 40.04m | eta: 139.9m +step 03726/16704 (22.31%) | loss: 2.804517 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,831 | mfu: 50.68 | epoch: 1 | total time: 40.05m | eta: 139.9m +step 03727/16704 (22.31%) | loss: 2.811833 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,116 | mfu: 50.63 | epoch: 1 | total time: 40.06m | eta: 139.9m +step 03728/16704 (22.32%) | loss: 2.813580 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,950 | mfu: 50.75 | epoch: 1 | total time: 40.07m | eta: 139.9m +step 03729/16704 (22.32%) | loss: 2.799269 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,588 | mfu: 50.79 | epoch: 1 | total time: 40.08m | eta: 139.8m +step 03730/16704 (22.33%) | loss: 2.795805 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,481 | mfu: 50.78 | epoch: 1 | total time: 40.09m | eta: 139.8m +step 03731/16704 (22.34%) | loss: 2.799175 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,782 | mfu: 50.93 | epoch: 1 | total time: 40.10m | eta: 139.8m +step 03732/16704 (22.34%) | loss: 2.800665 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,687 | mfu: 50.73 | epoch: 1 | total time: 40.11m | eta: 139.8m +step 03733/16704 (22.35%) | loss: 2.814827 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,778 | mfu: 
50.74 | epoch: 1 | total time: 40.13m | eta: 139.8m +step 03734/16704 (22.35%) | loss: 2.813599 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 1 | total time: 40.14m | eta: 139.8m +step 03735/16704 (22.36%) | loss: 2.805883 | lrm: 1.00 | dt: 647.14ms | tok/sec: 810,158 | mfu: 50.64 | epoch: 1 | total time: 40.15m | eta: 139.8m +step 03736/16704 (22.37%) | loss: 2.808468 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,064 | mfu: 50.76 | epoch: 1 | total time: 40.16m | eta: 139.8m +step 03737/16704 (22.37%) | loss: 2.807946 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,206 | mfu: 50.83 | epoch: 1 | total time: 40.17m | eta: 139.8m +step 03738/16704 (22.38%) | loss: 2.803226 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,261 | mfu: 50.70 | epoch: 1 | total time: 40.18m | eta: 139.7m +step 03739/16704 (22.38%) | loss: 2.804007 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,882 | mfu: 50.81 | epoch: 1 | total time: 40.19m | eta: 139.7m +step 03740/16704 (22.39%) | loss: 2.809333 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,734 | mfu: 50.73 | epoch: 1 | total time: 40.20m | eta: 139.7m +step 03741/16704 (22.40%) | loss: 2.800826 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,420 | mfu: 50.84 | epoch: 1 | total time: 40.21m | eta: 139.7m +step 03742/16704 (22.40%) | loss: 2.802553 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,579 | mfu: 50.85 | epoch: 1 | total time: 40.22m | eta: 139.7m +step 03743/16704 (22.41%) | loss: 2.816443 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,132 | mfu: 50.70 | epoch: 1 | total time: 40.23m | eta: 139.7m +step 03744/16704 (22.41%) | loss: 2.816339 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,989 | mfu: 50.88 | epoch: 1 | total time: 40.24m | eta: 139.7m +step 03745/16704 (22.42%) | loss: 2.809967 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,680 | mfu: 50.67 | epoch: 1 | total time: 40.25m | eta: 139.7m +step 03746/16704 (22.43%) | loss: 2.808379 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,792 | mfu: 50.80 | epoch: 1 | total time: 40.26m | eta: 139.7m +step 
03747/16704 (22.43%) | loss: 2.811698 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 1 | total time: 40.28m | eta: 139.6m +step 03748/16704 (22.44%) | loss: 2.810117 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,933 | mfu: 50.81 | epoch: 1 | total time: 40.29m | eta: 139.6m +step 03749/16704 (22.44%) | loss: 2.812477 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,942 | mfu: 50.69 | epoch: 1 | total time: 40.30m | eta: 139.6m +Step 03750 | Validation bpb: 0.853041 +step 03750/16704 (22.45%) | loss: 2.813855 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,728 | mfu: 50.73 | epoch: 1 | total time: 40.31m | eta: 139.6m +step 03751/16704 (22.46%) | loss: 2.806135 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 40.32m | eta: 139.6m +step 03752/16704 (22.46%) | loss: 2.807310 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,244 | mfu: 50.77 | epoch: 1 | total time: 40.33m | eta: 139.6m +step 03753/16704 (22.47%) | loss: 2.813063 | lrm: 1.00 | dt: 639.13ms | tok/sec: 820,320 | mfu: 51.27 | epoch: 1 | total time: 40.34m | eta: 139.6m +step 03754/16704 (22.47%) | loss: 2.804531 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,662 | mfu: 50.61 | epoch: 1 | total time: 40.35m | eta: 139.6m +step 03755/16704 (22.48%) | loss: 2.790552 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,283 | mfu: 51.08 | epoch: 1 | total time: 40.36m | eta: 139.6m +step 03756/16704 (22.49%) | loss: 2.808945 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,503 | mfu: 50.85 | epoch: 1 | total time: 40.37m | eta: 139.5m +step 03757/16704 (22.49%) | loss: 2.807865 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,461 | mfu: 50.84 | epoch: 1 | total time: 40.38m | eta: 139.5m +step 03758/16704 (22.50%) | loss: 2.805918 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,973 | mfu: 51.00 | epoch: 1 | total time: 40.39m | eta: 139.5m +step 03759/16704 (22.50%) | loss: 2.806190 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,807 | mfu: 50.93 | epoch: 1 | total time: 40.40m | eta: 139.5m +step 03760/16704 (22.51%) | 
loss: 2.812225 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,019 | mfu: 50.94 | epoch: 1 | total time: 40.42m | eta: 139.5m +step 03761/16704 (22.52%) | loss: 2.806180 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,081 | mfu: 50.82 | epoch: 1 | total time: 40.43m | eta: 139.5m +step 03762/16704 (22.52%) | loss: 2.807222 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 1 | total time: 40.44m | eta: 139.5m +step 03763/16704 (22.53%) | loss: 2.821489 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 1 | total time: 40.45m | eta: 139.5m +step 03764/16704 (22.53%) | loss: 2.820882 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,264 | mfu: 50.96 | epoch: 1 | total time: 40.46m | eta: 139.5m +step 03765/16704 (22.54%) | loss: 2.822389 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,168 | mfu: 50.76 | epoch: 1 | total time: 40.47m | eta: 139.4m +step 03766/16704 (22.55%) | loss: 2.832585 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,381 | mfu: 50.71 | epoch: 1 | total time: 40.48m | eta: 139.4m +step 03767/16704 (22.55%) | loss: 2.825542 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,322 | mfu: 50.77 | epoch: 1 | total time: 40.49m | eta: 139.4m +step 03768/16704 (22.56%) | loss: 2.820129 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,328 | mfu: 50.77 | epoch: 1 | total time: 40.50m | eta: 139.4m +step 03769/16704 (22.56%) | loss: 2.802826 | lrm: 1.00 | dt: 642.00ms | tok/sec: 816,654 | mfu: 51.04 | epoch: 1 | total time: 40.51m | eta: 139.4m +step 03770/16704 (22.57%) | loss: 2.797639 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,686 | mfu: 50.86 | epoch: 1 | total time: 40.52m | eta: 139.4m +step 03771/16704 (22.58%) | loss: 2.804754 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,468 | mfu: 50.91 | epoch: 1 | total time: 40.53m | eta: 139.4m +step 03772/16704 (22.58%) | loss: 2.799834 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 1 | total time: 40.54m | eta: 139.4m +step 03773/16704 (22.59%) | loss: 2.820201 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,248 | 
mfu: 50.83 | epoch: 1 | total time: 40.55m | eta: 139.4m +step 03774/16704 (22.59%) | loss: 2.819090 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,421 | mfu: 50.90 | epoch: 1 | total time: 40.57m | eta: 139.3m +step 03775/16704 (22.60%) | loss: 2.810606 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,720 | mfu: 50.80 | epoch: 1 | total time: 40.58m | eta: 139.3m +step 03776/16704 (22.61%) | loss: 2.812345 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 1 | total time: 40.59m | eta: 139.3m +step 03777/16704 (22.61%) | loss: 2.812669 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,934 | mfu: 50.75 | epoch: 1 | total time: 40.60m | eta: 139.3m +step 03778/16704 (22.62%) | loss: 2.810439 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,025 | mfu: 51.00 | epoch: 1 | total time: 40.61m | eta: 139.3m +step 03779/16704 (22.62%) | loss: 2.827331 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,688 | mfu: 50.79 | epoch: 1 | total time: 40.62m | eta: 139.3m +step 03780/16704 (22.63%) | loss: 2.827751 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,204 | mfu: 50.89 | epoch: 1 | total time: 40.63m | eta: 139.3m +step 03781/16704 (22.64%) | loss: 2.823050 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,645 | mfu: 50.85 | epoch: 1 | total time: 40.64m | eta: 139.3m +step 03782/16704 (22.64%) | loss: 2.832774 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,923 | mfu: 50.93 | epoch: 1 | total time: 40.65m | eta: 139.3m +step 03783/16704 (22.65%) | loss: 2.829417 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 1 | total time: 40.66m | eta: 139.3m +step 03784/16704 (22.65%) | loss: 2.829871 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,407 | mfu: 50.78 | epoch: 1 | total time: 40.67m | eta: 139.2m +step 03785/16704 (22.66%) | loss: 2.838294 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 1 | total time: 40.68m | eta: 139.2m +step 03786/16704 (22.67%) | loss: 2.834603 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 40.69m | eta: 139.2m +step 
03787/16704 (22.67%) | loss: 2.832547 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,959 | mfu: 50.94 | epoch: 1 | total time: 40.71m | eta: 139.2m +step 03788/16704 (22.68%) | loss: 2.842084 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,970 | mfu: 50.81 | epoch: 1 | total time: 40.72m | eta: 139.2m +step 03789/16704 (22.68%) | loss: 2.837847 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,691 | mfu: 50.79 | epoch: 1 | total time: 40.73m | eta: 139.2m +step 03790/16704 (22.69%) | loss: 2.817259 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,261 | mfu: 50.89 | epoch: 1 | total time: 40.74m | eta: 139.2m +step 03791/16704 (22.70%) | loss: 2.816076 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 1 | total time: 40.75m | eta: 139.2m +step 03792/16704 (22.70%) | loss: 2.807700 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,072 | mfu: 50.82 | epoch: 1 | total time: 40.76m | eta: 139.2m +step 03793/16704 (22.71%) | loss: 2.814428 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,116 | mfu: 50.76 | epoch: 1 | total time: 40.77m | eta: 139.1m +step 03794/16704 (22.71%) | loss: 2.824890 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,770 | mfu: 50.92 | epoch: 1 | total time: 40.78m | eta: 139.1m +step 03795/16704 (22.72%) | loss: 2.826380 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,315 | mfu: 50.83 | epoch: 1 | total time: 40.79m | eta: 139.1m +step 03796/16704 (22.73%) | loss: 2.825612 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,990 | mfu: 50.88 | epoch: 1 | total time: 40.80m | eta: 139.1m +step 03797/16704 (22.73%) | loss: 2.822864 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,367 | mfu: 50.77 | epoch: 1 | total time: 40.81m | eta: 139.1m +step 03798/16704 (22.74%) | loss: 2.803228 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,662 | mfu: 50.98 | epoch: 1 | total time: 40.82m | eta: 139.1m +step 03799/16704 (22.74%) | loss: 2.808838 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,789 | mfu: 50.93 | epoch: 1 | total time: 40.83m | eta: 139.1m +step 03800/16704 (22.75%) | loss: 2.796521 | lrm: 1.00 | dt: 
644.80ms | tok/sec: 813,100 | mfu: 50.82 | epoch: 1 | total time: 40.84m | eta: 139.1m +step 03801/16704 (22.76%) | loss: 2.808948 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,697 | mfu: 50.86 | epoch: 1 | total time: 40.86m | eta: 139.1m +step 03802/16704 (22.76%) | loss: 2.805274 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 1 | total time: 40.87m | eta: 139.0m +step 03803/16704 (22.77%) | loss: 2.799674 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,648 | mfu: 50.73 | epoch: 1 | total time: 40.88m | eta: 139.0m +step 03804/16704 (22.77%) | loss: 2.792877 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,440 | mfu: 50.84 | epoch: 1 | total time: 40.89m | eta: 139.0m +step 03805/16704 (22.78%) | loss: 2.793118 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,361 | mfu: 50.96 | epoch: 1 | total time: 40.90m | eta: 139.0m +step 03806/16704 (22.78%) | loss: 2.794917 | lrm: 1.00 | dt: 648.19ms | tok/sec: 808,848 | mfu: 50.55 | epoch: 1 | total time: 40.91m | eta: 139.0m +step 03807/16704 (22.79%) | loss: 2.779540 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 1 | total time: 40.92m | eta: 139.0m +step 03808/16704 (22.80%) | loss: 2.781297 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,010 | mfu: 50.88 | epoch: 1 | total time: 40.93m | eta: 139.0m +step 03809/16704 (22.80%) | loss: 2.784762 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,837 | mfu: 50.87 | epoch: 1 | total time: 40.94m | eta: 139.0m +step 03810/16704 (22.81%) | loss: 2.795564 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,174 | mfu: 50.76 | epoch: 1 | total time: 40.95m | eta: 139.0m +step 03811/16704 (22.81%) | loss: 2.809018 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,984 | mfu: 50.88 | epoch: 1 | total time: 40.96m | eta: 138.9m +step 03812/16704 (22.82%) | loss: 2.816276 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,736 | mfu: 50.92 | epoch: 1 | total time: 40.97m | eta: 138.9m +step 03813/16704 (22.83%) | loss: 2.817788 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,117 | mfu: 50.82 | epoch: 1 | total 
time: 40.98m | eta: 138.9m +step 03814/16704 (22.83%) | loss: 2.811104 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,082 | mfu: 50.82 | epoch: 1 | total time: 41.00m | eta: 138.9m +step 03815/16704 (22.84%) | loss: 2.816846 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,848 | mfu: 50.74 | epoch: 1 | total time: 41.01m | eta: 138.9m +step 03816/16704 (22.84%) | loss: 2.810442 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,154 | mfu: 50.76 | epoch: 1 | total time: 41.02m | eta: 138.9m +step 03817/16704 (22.85%) | loss: 2.818402 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,750 | mfu: 50.92 | epoch: 1 | total time: 41.03m | eta: 138.9m +step 03818/16704 (22.86%) | loss: 2.806024 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,529 | mfu: 50.78 | epoch: 1 | total time: 41.04m | eta: 138.9m +step 03819/16704 (22.86%) | loss: 2.796840 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,779 | mfu: 50.92 | epoch: 1 | total time: 41.05m | eta: 138.9m +step 03820/16704 (22.87%) | loss: 2.798170 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,730 | mfu: 50.86 | epoch: 1 | total time: 41.06m | eta: 138.8m +step 03821/16704 (22.87%) | loss: 2.804474 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,537 | mfu: 50.72 | epoch: 1 | total time: 41.07m | eta: 138.8m +step 03822/16704 (22.88%) | loss: 2.814146 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,871 | mfu: 50.93 | epoch: 1 | total time: 41.08m | eta: 138.8m +step 03823/16704 (22.89%) | loss: 2.819778 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,095 | mfu: 50.88 | epoch: 1 | total time: 41.09m | eta: 138.8m +step 03824/16704 (22.89%) | loss: 2.829524 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 1 | total time: 41.10m | eta: 138.8m +step 03825/16704 (22.90%) | loss: 2.836027 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,837 | mfu: 50.99 | epoch: 1 | total time: 41.11m | eta: 138.8m +step 03826/16704 (22.90%) | loss: 2.842048 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,234 | mfu: 50.70 | epoch: 1 | total time: 41.12m | eta: 138.8m +step 03827/16704 (22.91%) | loss: 
2.834896 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,711 | mfu: 50.86 | epoch: 1 | total time: 41.13m | eta: 138.8m +step 03828/16704 (22.92%) | loss: 2.836108 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,180 | mfu: 50.95 | epoch: 1 | total time: 41.15m | eta: 138.8m +step 03829/16704 (22.92%) | loss: 2.834563 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,658 | mfu: 50.92 | epoch: 1 | total time: 41.16m | eta: 138.8m +step 03830/16704 (22.93%) | loss: 2.833548 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,239 | mfu: 50.83 | epoch: 1 | total time: 41.17m | eta: 138.7m +step 03831/16704 (22.93%) | loss: 2.827741 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 1 | total time: 41.18m | eta: 138.7m +step 03832/16704 (22.94%) | loss: 2.809812 | lrm: 1.00 | dt: 642.80ms | tok/sec: 815,636 | mfu: 50.98 | epoch: 1 | total time: 41.19m | eta: 138.7m +step 03833/16704 (22.95%) | loss: 2.801979 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,665 | mfu: 50.86 | epoch: 1 | total time: 41.20m | eta: 138.7m +step 03834/16704 (22.95%) | loss: 2.790708 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,225 | mfu: 50.70 | epoch: 1 | total time: 41.21m | eta: 138.7m +step 03835/16704 (22.96%) | loss: 2.789353 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,427 | mfu: 50.84 | epoch: 1 | total time: 41.22m | eta: 138.7m +step 03836/16704 (22.96%) | loss: 2.789364 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,620 | mfu: 50.98 | epoch: 1 | total time: 41.23m | eta: 138.7m +step 03837/16704 (22.97%) | loss: 2.792485 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,371 | mfu: 50.96 | epoch: 1 | total time: 41.24m | eta: 138.7m +step 03838/16704 (22.98%) | loss: 2.784644 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 1 | total time: 41.25m | eta: 138.7m +step 03839/16704 (22.98%) | loss: 2.786969 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 1 | total time: 41.26m | eta: 138.6m +step 03840/16704 (22.99%) | loss: 2.786758 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,749 | mfu: 
50.80 | epoch: 1 | total time: 41.27m | eta: 138.6m +step 03841/16704 (22.99%) | loss: 2.785189 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,512 | mfu: 50.78 | epoch: 1 | total time: 41.29m | eta: 138.6m +step 03842/16704 (23.00%) | loss: 2.774105 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,898 | mfu: 50.87 | epoch: 1 | total time: 41.30m | eta: 138.6m +step 03843/16704 (23.01%) | loss: 2.780901 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,535 | mfu: 50.66 | epoch: 1 | total time: 41.31m | eta: 138.6m +step 03844/16704 (23.01%) | loss: 2.773484 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,930 | mfu: 50.93 | epoch: 1 | total time: 41.32m | eta: 138.6m +step 03845/16704 (23.02%) | loss: 2.769558 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,405 | mfu: 50.65 | epoch: 1 | total time: 41.33m | eta: 138.6m +step 03846/16704 (23.02%) | loss: 2.779976 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,942 | mfu: 50.81 | epoch: 1 | total time: 41.34m | eta: 138.6m +step 03847/16704 (23.03%) | loss: 2.780460 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,038 | mfu: 50.94 | epoch: 1 | total time: 41.35m | eta: 138.6m +step 03848/16704 (23.04%) | loss: 2.787859 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,256 | mfu: 50.70 | epoch: 1 | total time: 41.36m | eta: 138.5m +step 03849/16704 (23.04%) | loss: 2.788880 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,535 | mfu: 50.60 | epoch: 1 | total time: 41.37m | eta: 138.5m +step 03850/16704 (23.05%) | loss: 2.789622 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,976 | mfu: 50.94 | epoch: 1 | total time: 41.38m | eta: 138.5m +step 03851/16704 (23.05%) | loss: 2.776058 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,754 | mfu: 50.74 | epoch: 1 | total time: 41.39m | eta: 138.5m +step 03852/16704 (23.06%) | loss: 2.780233 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,784 | mfu: 50.80 | epoch: 1 | total time: 41.40m | eta: 138.5m +step 03853/16704 (23.07%) | loss: 2.786382 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 1 | total time: 41.41m | eta: 138.5m +step 
03854/16704 (23.07%) | loss: 2.781909 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,311 | mfu: 51.02 | epoch: 1 | total time: 41.43m | eta: 138.5m +step 03855/16704 (23.08%) | loss: 2.800651 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,014 | mfu: 50.75 | epoch: 1 | total time: 41.44m | eta: 138.5m +step 03856/16704 (23.08%) | loss: 2.803557 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,244 | mfu: 50.70 | epoch: 1 | total time: 41.45m | eta: 138.5m +step 03857/16704 (23.09%) | loss: 2.805407 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,096 | mfu: 50.69 | epoch: 1 | total time: 41.46m | eta: 138.4m +step 03858/16704 (23.10%) | loss: 2.797940 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 1 | total time: 41.47m | eta: 138.4m +step 03859/16704 (23.10%) | loss: 2.802513 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,605 | mfu: 50.91 | epoch: 1 | total time: 41.48m | eta: 138.4m +step 03860/16704 (23.11%) | loss: 2.783717 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,288 | mfu: 50.89 | epoch: 1 | total time: 41.49m | eta: 138.4m +step 03861/16704 (23.11%) | loss: 2.767573 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,607 | mfu: 50.79 | epoch: 1 | total time: 41.50m | eta: 138.4m +step 03862/16704 (23.12%) | loss: 2.769866 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,064 | mfu: 50.94 | epoch: 1 | total time: 41.51m | eta: 138.4m +step 03863/16704 (23.13%) | loss: 2.769603 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,129 | mfu: 50.88 | epoch: 1 | total time: 41.52m | eta: 138.4m +step 03864/16704 (23.13%) | loss: 2.781980 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,169 | mfu: 50.76 | epoch: 1 | total time: 41.53m | eta: 138.4m +step 03865/16704 (23.14%) | loss: 2.768334 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,874 | mfu: 50.87 | epoch: 1 | total time: 41.54m | eta: 138.4m +step 03866/16704 (23.14%) | loss: 2.771479 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,836 | mfu: 50.93 | epoch: 1 | total time: 41.55m | eta: 138.3m +step 03867/16704 (23.15%) | loss: 2.788222 | lrm: 1.00 | dt: 
644.83ms | tok/sec: 813,068 | mfu: 50.82 | epoch: 1 | total time: 41.56m | eta: 138.3m +step 03868/16704 (23.16%) | loss: 2.788665 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,982 | mfu: 50.75 | epoch: 1 | total time: 41.58m | eta: 138.3m +step 03869/16704 (23.16%) | loss: 2.801839 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,240 | mfu: 50.70 | epoch: 1 | total time: 41.59m | eta: 138.3m +step 03870/16704 (23.17%) | loss: 2.808224 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,266 | mfu: 50.89 | epoch: 1 | total time: 41.60m | eta: 138.3m +step 03871/16704 (23.17%) | loss: 2.814892 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 1 | total time: 41.61m | eta: 138.3m +step 03872/16704 (23.18%) | loss: 2.812995 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,247 | mfu: 50.64 | epoch: 1 | total time: 41.62m | eta: 138.3m +step 03873/16704 (23.19%) | loss: 2.819349 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,821 | mfu: 50.80 | epoch: 1 | total time: 41.63m | eta: 138.3m +step 03874/16704 (23.19%) | loss: 2.821744 | lrm: 1.00 | dt: 642.13ms | tok/sec: 816,482 | mfu: 51.03 | epoch: 1 | total time: 41.64m | eta: 138.3m +step 03875/16704 (23.20%) | loss: 2.818499 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,154 | mfu: 50.76 | epoch: 1 | total time: 41.65m | eta: 138.3m +step 03876/16704 (23.20%) | loss: 2.824483 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 1 | total time: 41.66m | eta: 138.2m +step 03877/16704 (23.21%) | loss: 2.820888 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,990 | mfu: 50.81 | epoch: 1 | total time: 41.67m | eta: 138.2m +step 03878/16704 (23.22%) | loss: 2.808634 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,371 | mfu: 50.71 | epoch: 1 | total time: 41.68m | eta: 138.2m +step 03879/16704 (23.22%) | loss: 2.790256 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,861 | mfu: 50.93 | epoch: 1 | total time: 41.69m | eta: 138.2m +step 03880/16704 (23.23%) | loss: 2.791391 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,960 | mfu: 50.75 | epoch: 1 | total 
time: 41.70m | eta: 138.2m +step 03881/16704 (23.23%) | loss: 2.792958 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,176 | mfu: 50.57 | epoch: 1 | total time: 41.72m | eta: 138.2m +step 03882/16704 (23.24%) | loss: 2.795490 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 1 | total time: 41.73m | eta: 138.2m +step 03883/16704 (23.25%) | loss: 2.789230 | lrm: 1.00 | dt: 646.38ms | tok/sec: 811,117 | mfu: 50.70 | epoch: 1 | total time: 41.74m | eta: 138.2m +step 03884/16704 (23.25%) | loss: 2.790364 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,093 | mfu: 50.76 | epoch: 1 | total time: 41.75m | eta: 138.2m +step 03885/16704 (23.26%) | loss: 2.771859 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,281 | mfu: 50.96 | epoch: 1 | total time: 41.76m | eta: 138.1m +step 03886/16704 (23.26%) | loss: 2.782906 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,404 | mfu: 50.78 | epoch: 1 | total time: 41.77m | eta: 138.1m +step 03887/16704 (23.27%) | loss: 2.787106 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,730 | mfu: 50.92 | epoch: 1 | total time: 41.78m | eta: 138.1m +step 03888/16704 (23.28%) | loss: 2.797262 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,563 | mfu: 50.79 | epoch: 1 | total time: 41.79m | eta: 138.1m +step 03889/16704 (23.28%) | loss: 2.791931 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,694 | mfu: 50.86 | epoch: 1 | total time: 41.80m | eta: 138.1m +step 03890/16704 (23.29%) | loss: 2.787375 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,210 | mfu: 50.76 | epoch: 1 | total time: 41.81m | eta: 138.1m +step 03891/16704 (23.29%) | loss: 2.788460 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,358 | mfu: 50.96 | epoch: 1 | total time: 41.82m | eta: 138.1m +step 03892/16704 (23.30%) | loss: 2.787993 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,040 | mfu: 50.82 | epoch: 1 | total time: 41.83m | eta: 138.1m +step 03893/16704 (23.31%) | loss: 2.787917 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,321 | mfu: 50.96 | epoch: 1 | total time: 41.84m | eta: 138.1m +step 03894/16704 (23.31%) | loss: 
2.786666 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,549 | mfu: 50.79 | epoch: 1 | total time: 41.85m | eta: 138.0m +step 03895/16704 (23.32%) | loss: 2.781233 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,194 | mfu: 50.95 | epoch: 1 | total time: 41.87m | eta: 138.0m +step 03896/16704 (23.32%) | loss: 2.779985 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,580 | mfu: 50.79 | epoch: 1 | total time: 41.88m | eta: 138.0m +step 03897/16704 (23.33%) | loss: 2.776589 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,242 | mfu: 50.95 | epoch: 1 | total time: 41.89m | eta: 138.0m +step 03898/16704 (23.34%) | loss: 2.784958 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,045 | mfu: 50.69 | epoch: 1 | total time: 41.90m | eta: 138.0m +step 03899/16704 (23.34%) | loss: 2.785856 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 1 | total time: 41.91m | eta: 138.0m +step 03900/16704 (23.35%) | loss: 2.785838 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,213 | mfu: 50.89 | epoch: 1 | total time: 41.92m | eta: 138.0m +step 03901/16704 (23.35%) | loss: 2.784434 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,727 | mfu: 50.80 | epoch: 1 | total time: 41.93m | eta: 138.0m +step 03902/16704 (23.36%) | loss: 2.780733 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,535 | mfu: 50.91 | epoch: 1 | total time: 41.94m | eta: 138.0m +step 03903/16704 (23.37%) | loss: 2.776770 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,440 | mfu: 50.72 | epoch: 1 | total time: 41.95m | eta: 137.9m +step 03904/16704 (23.37%) | loss: 2.763297 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,012 | mfu: 50.75 | epoch: 1 | total time: 41.96m | eta: 137.9m +step 03905/16704 (23.38%) | loss: 2.766117 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,508 | mfu: 50.85 | epoch: 1 | total time: 41.97m | eta: 137.9m +step 03906/16704 (23.38%) | loss: 2.766204 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,878 | mfu: 50.68 | epoch: 1 | total time: 41.98m | eta: 137.9m +step 03907/16704 (23.39%) | loss: 2.768974 | lrm: 1.00 | dt: 642.44ms | tok/sec: 816,083 | mfu: 
51.01 | epoch: 1 | total time: 41.99m | eta: 137.9m +step 03908/16704 (23.40%) | loss: 2.774482 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 1 | total time: 42.01m | eta: 137.9m +step 03909/16704 (23.40%) | loss: 2.779651 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,547 | mfu: 50.85 | epoch: 1 | total time: 42.02m | eta: 137.9m +step 03910/16704 (23.41%) | loss: 2.781499 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,660 | mfu: 50.85 | epoch: 1 | total time: 42.03m | eta: 137.9m +step 03911/16704 (23.41%) | loss: 2.782304 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,363 | mfu: 50.90 | epoch: 1 | total time: 42.04m | eta: 137.9m +step 03912/16704 (23.42%) | loss: 2.792455 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,687 | mfu: 50.92 | epoch: 1 | total time: 42.05m | eta: 137.8m +step 03913/16704 (23.43%) | loss: 2.793370 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,599 | mfu: 50.79 | epoch: 1 | total time: 42.06m | eta: 137.8m +step 03914/16704 (23.43%) | loss: 2.791092 | lrm: 1.00 | dt: 642.47ms | tok/sec: 816,051 | mfu: 51.00 | epoch: 1 | total time: 42.07m | eta: 137.8m +step 03915/16704 (23.44%) | loss: 2.783200 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,624 | mfu: 50.67 | epoch: 1 | total time: 42.08m | eta: 137.8m +step 03916/16704 (23.44%) | loss: 2.791880 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,961 | mfu: 50.87 | epoch: 1 | total time: 42.09m | eta: 137.8m +step 03917/16704 (23.45%) | loss: 2.782884 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,763 | mfu: 50.80 | epoch: 1 | total time: 42.10m | eta: 137.8m +step 03918/16704 (23.46%) | loss: 2.787148 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,026 | mfu: 50.69 | epoch: 1 | total time: 42.11m | eta: 137.8m +step 03919/16704 (23.46%) | loss: 2.795403 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,607 | mfu: 50.91 | epoch: 1 | total time: 42.12m | eta: 137.8m +step 03920/16704 (23.47%) | loss: 2.787495 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 1 | total time: 42.13m | eta: 137.8m +step 
03921/16704 (23.47%) | loss: 2.792616 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,884 | mfu: 50.87 | epoch: 1 | total time: 42.15m | eta: 137.7m +step 03922/16704 (23.48%) | loss: 2.796559 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,799 | mfu: 50.80 | epoch: 1 | total time: 42.16m | eta: 137.7m +step 03923/16704 (23.49%) | loss: 2.792917 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,815 | mfu: 50.86 | epoch: 1 | total time: 42.17m | eta: 137.7m +step 03924/16704 (23.49%) | loss: 2.794723 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,606 | mfu: 50.85 | epoch: 1 | total time: 42.18m | eta: 137.7m +step 03925/16704 (23.50%) | loss: 2.794142 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,128 | mfu: 50.82 | epoch: 1 | total time: 42.19m | eta: 137.7m +step 03926/16704 (23.50%) | loss: 2.801575 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 1 | total time: 42.20m | eta: 137.7m +step 03927/16704 (23.51%) | loss: 2.788380 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,154 | mfu: 50.95 | epoch: 1 | total time: 42.21m | eta: 137.7m +step 03928/16704 (23.52%) | loss: 2.781619 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,841 | mfu: 50.74 | epoch: 1 | total time: 42.22m | eta: 137.7m +step 03929/16704 (23.52%) | loss: 2.774349 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,316 | mfu: 50.96 | epoch: 1 | total time: 42.23m | eta: 137.7m +step 03930/16704 (23.53%) | loss: 2.766197 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,970 | mfu: 50.87 | epoch: 1 | total time: 42.24m | eta: 137.7m +step 03931/16704 (23.53%) | loss: 2.769101 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,741 | mfu: 50.86 | epoch: 1 | total time: 42.25m | eta: 137.6m +step 03932/16704 (23.54%) | loss: 2.781739 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,026 | mfu: 50.82 | epoch: 1 | total time: 42.26m | eta: 137.6m +step 03933/16704 (23.55%) | loss: 2.791117 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,008 | mfu: 50.75 | epoch: 1 | total time: 42.27m | eta: 137.6m +step 03934/16704 (23.55%) | loss: 2.780753 | lrm: 1.00 | dt: 
644.65ms | tok/sec: 813,292 | mfu: 50.83 | epoch: 1 | total time: 42.28m | eta: 137.6m +step 03935/16704 (23.56%) | loss: 2.787543 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,172 | mfu: 50.89 | epoch: 1 | total time: 42.30m | eta: 137.6m +step 03936/16704 (23.56%) | loss: 2.788212 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,754 | mfu: 50.67 | epoch: 1 | total time: 42.31m | eta: 137.6m +step 03937/16704 (23.57%) | loss: 2.781744 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,860 | mfu: 50.68 | epoch: 1 | total time: 42.32m | eta: 137.6m +step 03938/16704 (23.58%) | loss: 2.778840 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 1 | total time: 42.33m | eta: 137.6m +step 03939/16704 (23.58%) | loss: 2.765743 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 1 | total time: 42.34m | eta: 137.6m +step 03940/16704 (23.59%) | loss: 2.775073 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,551 | mfu: 50.91 | epoch: 1 | total time: 42.35m | eta: 137.5m +step 03941/16704 (23.59%) | loss: 2.779719 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 1 | total time: 42.36m | eta: 137.5m +step 03942/16704 (23.60%) | loss: 2.774808 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,316 | mfu: 50.71 | epoch: 1 | total time: 42.37m | eta: 137.5m +step 03943/16704 (23.61%) | loss: 2.767941 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,856 | mfu: 50.93 | epoch: 1 | total time: 42.38m | eta: 137.5m +step 03944/16704 (23.61%) | loss: 2.774824 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 1 | total time: 42.39m | eta: 137.5m +step 03945/16704 (23.62%) | loss: 2.783204 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,435 | mfu: 50.53 | epoch: 1 | total time: 42.40m | eta: 137.5m +step 03946/16704 (23.62%) | loss: 2.794710 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,312 | mfu: 50.96 | epoch: 1 | total time: 42.41m | eta: 137.5m +step 03947/16704 (23.63%) | loss: 2.796932 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,354 | mfu: 50.71 | epoch: 1 | total 
time: 42.42m | eta: 137.5m +step 03948/16704 (23.64%) | loss: 2.812802 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,105 | mfu: 50.88 | epoch: 1 | total time: 42.44m | eta: 137.5m +step 03949/16704 (23.64%) | loss: 2.804147 | lrm: 1.00 | dt: 643.30ms | tok/sec: 815,000 | mfu: 50.94 | epoch: 1 | total time: 42.45m | eta: 137.4m +step 03950/16704 (23.65%) | loss: 2.807296 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,363 | mfu: 50.84 | epoch: 1 | total time: 42.46m | eta: 137.4m +step 03951/16704 (23.65%) | loss: 2.810412 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,043 | mfu: 50.82 | epoch: 1 | total time: 42.47m | eta: 137.4m +step 03952/16704 (23.66%) | loss: 2.808782 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,205 | mfu: 50.95 | epoch: 1 | total time: 42.48m | eta: 137.4m +step 03953/16704 (23.66%) | loss: 2.805027 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,169 | mfu: 50.70 | epoch: 1 | total time: 42.49m | eta: 137.4m +step 03954/16704 (23.67%) | loss: 2.784844 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,658 | mfu: 50.85 | epoch: 1 | total time: 42.50m | eta: 137.4m +step 03955/16704 (23.68%) | loss: 2.781104 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,518 | mfu: 50.78 | epoch: 1 | total time: 42.51m | eta: 137.4m +step 03956/16704 (23.68%) | loss: 2.786849 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 1 | total time: 42.52m | eta: 137.4m +step 03957/16704 (23.69%) | loss: 2.784053 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,795 | mfu: 50.86 | epoch: 1 | total time: 42.53m | eta: 137.4m +step 03958/16704 (23.69%) | loss: 2.780674 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,391 | mfu: 50.90 | epoch: 1 | total time: 42.54m | eta: 137.3m +step 03959/16704 (23.70%) | loss: 2.779191 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,539 | mfu: 50.97 | epoch: 1 | total time: 42.55m | eta: 137.3m +step 03960/16704 (23.71%) | loss: 2.779933 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,169 | mfu: 50.76 | epoch: 1 | total time: 42.56m | eta: 137.3m +step 03961/16704 (23.71%) | loss: 
2.784978 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,550 | mfu: 50.85 | epoch: 1 | total time: 42.57m | eta: 137.3m +step 03962/16704 (23.72%) | loss: 2.775060 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,687 | mfu: 50.86 | epoch: 1 | total time: 42.59m | eta: 137.3m +step 03963/16704 (23.72%) | loss: 2.769496 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,716 | mfu: 50.73 | epoch: 1 | total time: 42.60m | eta: 137.3m +step 03964/16704 (23.73%) | loss: 2.790041 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,580 | mfu: 50.66 | epoch: 1 | total time: 42.61m | eta: 137.3m +step 03965/16704 (23.74%) | loss: 2.778022 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,064 | mfu: 50.94 | epoch: 1 | total time: 42.62m | eta: 137.3m +step 03966/16704 (23.74%) | loss: 2.774931 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,802 | mfu: 50.93 | epoch: 1 | total time: 42.63m | eta: 137.3m +step 03967/16704 (23.75%) | loss: 2.762128 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,859 | mfu: 50.80 | epoch: 1 | total time: 42.64m | eta: 137.2m +step 03968/16704 (23.75%) | loss: 2.763220 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 1 | total time: 42.65m | eta: 137.2m +step 03969/16704 (23.76%) | loss: 2.763336 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,200 | mfu: 50.89 | epoch: 1 | total time: 42.66m | eta: 137.2m +step 03970/16704 (23.77%) | loss: 2.779279 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,655 | mfu: 50.98 | epoch: 1 | total time: 42.67m | eta: 137.2m +step 03971/16704 (23.77%) | loss: 2.781354 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,431 | mfu: 50.72 | epoch: 1 | total time: 42.68m | eta: 137.2m +step 03972/16704 (23.78%) | loss: 2.781285 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 1 | total time: 42.69m | eta: 137.2m +step 03973/16704 (23.78%) | loss: 2.773037 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,461 | mfu: 50.72 | epoch: 1 | total time: 42.70m | eta: 137.2m +step 03974/16704 (23.79%) | loss: 2.782108 | lrm: 1.00 | dt: 641.68ms | tok/sec: 817,060 | mfu: 
51.07 | epoch: 1 | total time: 42.71m | eta: 137.2m +step 03975/16704 (23.80%) | loss: 2.775417 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,682 | mfu: 50.86 | epoch: 1 | total time: 42.73m | eta: 137.2m +step 03976/16704 (23.80%) | loss: 2.785591 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,557 | mfu: 50.91 | epoch: 1 | total time: 42.74m | eta: 137.2m +step 03977/16704 (23.81%) | loss: 2.784749 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,165 | mfu: 50.76 | epoch: 1 | total time: 42.75m | eta: 137.1m +step 03978/16704 (23.81%) | loss: 2.796405 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,577 | mfu: 50.91 | epoch: 1 | total time: 42.76m | eta: 137.1m +step 03979/16704 (23.82%) | loss: 2.801390 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,259 | mfu: 50.77 | epoch: 1 | total time: 42.77m | eta: 137.1m +step 03980/16704 (23.83%) | loss: 2.805779 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,743 | mfu: 50.99 | epoch: 1 | total time: 42.78m | eta: 137.1m +step 03981/16704 (23.83%) | loss: 2.796807 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,157 | mfu: 50.76 | epoch: 1 | total time: 42.79m | eta: 137.1m +step 03982/16704 (23.84%) | loss: 2.789052 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,616 | mfu: 50.73 | epoch: 1 | total time: 42.80m | eta: 137.1m +step 03983/16704 (23.84%) | loss: 2.785685 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,729 | mfu: 51.05 | epoch: 1 | total time: 42.81m | eta: 137.1m +step 03984/16704 (23.85%) | loss: 2.800321 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,261 | mfu: 50.83 | epoch: 1 | total time: 42.82m | eta: 137.1m +step 03985/16704 (23.86%) | loss: 2.799890 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 1 | total time: 42.83m | eta: 137.1m +step 03986/16704 (23.86%) | loss: 2.816023 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,986 | mfu: 50.88 | epoch: 1 | total time: 42.84m | eta: 137.0m +step 03987/16704 (23.87%) | loss: 2.803900 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,485 | mfu: 50.91 | epoch: 1 | total time: 42.85m | eta: 137.0m +step 
03988/16704 (23.87%) | loss: 2.792364 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,952 | mfu: 50.87 | epoch: 1 | total time: 42.86m | eta: 137.0m +step 03989/16704 (23.88%) | loss: 2.801956 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,426 | mfu: 50.65 | epoch: 1 | total time: 42.88m | eta: 137.0m +step 03990/16704 (23.89%) | loss: 2.803926 | lrm: 1.00 | dt: 647.32ms | tok/sec: 809,932 | mfu: 50.62 | epoch: 1 | total time: 42.89m | eta: 137.0m +step 03991/16704 (23.89%) | loss: 2.808765 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,971 | mfu: 51.00 | epoch: 1 | total time: 42.90m | eta: 137.0m +step 03992/16704 (23.90%) | loss: 2.817370 | lrm: 1.00 | dt: 649.06ms | tok/sec: 807,767 | mfu: 50.49 | epoch: 1 | total time: 42.91m | eta: 137.0m +step 03993/16704 (23.90%) | loss: 2.817107 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,343 | mfu: 50.96 | epoch: 1 | total time: 42.92m | eta: 137.0m +step 03994/16704 (23.91%) | loss: 2.817922 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,927 | mfu: 50.93 | epoch: 1 | total time: 42.93m | eta: 137.0m +step 03995/16704 (23.92%) | loss: 2.818104 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,105 | mfu: 50.88 | epoch: 1 | total time: 42.94m | eta: 136.9m +step 03996/16704 (23.92%) | loss: 2.817508 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,076 | mfu: 50.69 | epoch: 1 | total time: 42.95m | eta: 136.9m +step 03997/16704 (23.93%) | loss: 2.813128 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,062 | mfu: 50.94 | epoch: 1 | total time: 42.96m | eta: 136.9m +step 03998/16704 (23.93%) | loss: 2.814283 | lrm: 1.00 | dt: 648.15ms | tok/sec: 808,903 | mfu: 50.56 | epoch: 1 | total time: 42.97m | eta: 136.9m +step 03999/16704 (23.94%) | loss: 2.814171 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,490 | mfu: 50.91 | epoch: 1 | total time: 42.98m | eta: 136.9m +[GC rank7] gen2: 116.7ms collected 90976 objects +[GC rank0] gen2: 121.0ms collected 91112 objects +[GC rank2] gen2: 124.9ms collected 91056 objects +[GC rank6] gen2: 155.3ms collected 90984 objects +[GC rank3] gen2: 
155.4ms collected 91048 objects +[GC rank1] gen2: 157.2ms collected 91088 objects +[GC rank5] gen2: 206.4ms collected 91008 objects +[GC rank4] gen2: 218.6ms collected 91024 objects +Step 04000 | Validation bpb: 0.849925 +step 04000/16704 (23.95%) | loss: 2.802016 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 1 | total time: 42.99m | eta: 136.9m +step 04001/16704 (23.95%) | loss: 2.803112 | lrm: 1.00 | dt: 647.95ms | tok/sec: 809,143 | mfu: 50.57 | epoch: 1 | total time: 43.00m | eta: 136.9m +step 04002/16704 (23.96%) | loss: 2.802132 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,167 | mfu: 50.76 | epoch: 1 | total time: 43.02m | eta: 136.9m +step 04003/16704 (23.96%) | loss: 2.787658 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,429 | mfu: 50.90 | epoch: 1 | total time: 43.03m | eta: 136.9m +step 04004/16704 (23.97%) | loss: 2.791473 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,936 | mfu: 50.50 | epoch: 1 | total time: 43.04m | eta: 136.8m +step 04005/16704 (23.98%) | loss: 2.789486 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,433 | mfu: 50.90 | epoch: 1 | total time: 43.05m | eta: 136.8m +step 04006/16704 (23.98%) | loss: 2.789396 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,826 | mfu: 50.93 | epoch: 1 | total time: 43.06m | eta: 136.8m +step 04007/16704 (23.99%) | loss: 2.794654 | lrm: 1.00 | dt: 648.58ms | tok/sec: 808,366 | mfu: 50.52 | epoch: 1 | total time: 43.07m | eta: 136.8m +step 04008/16704 (23.99%) | loss: 2.785253 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,646 | mfu: 50.92 | epoch: 1 | total time: 43.08m | eta: 136.8m +step 04009/16704 (24.00%) | loss: 2.774581 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,552 | mfu: 50.72 | epoch: 1 | total time: 43.09m | eta: 136.8m +step 04010/16704 (24.01%) | loss: 2.769972 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,721 | mfu: 50.80 | epoch: 1 | total time: 43.10m | eta: 136.8m +step 04011/16704 (24.01%) | loss: 2.769576 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,122 | mfu: 50.95 | epoch: 1 | total time: 43.11m | 
eta: 136.8m +step 04012/16704 (24.02%) | loss: 2.772576 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,186 | mfu: 50.70 | epoch: 1 | total time: 43.12m | eta: 136.8m +step 04013/16704 (24.02%) | loss: 2.764656 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,702 | mfu: 50.86 | epoch: 1 | total time: 43.13m | eta: 136.7m +step 04014/16704 (24.03%) | loss: 2.771328 | lrm: 1.00 | dt: 648.24ms | tok/sec: 808,788 | mfu: 50.55 | epoch: 1 | total time: 43.14m | eta: 136.7m +step 04015/16704 (24.04%) | loss: 2.788301 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,478 | mfu: 50.91 | epoch: 1 | total time: 43.16m | eta: 136.7m +step 04016/16704 (24.04%) | loss: 2.794299 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 1 | total time: 43.17m | eta: 136.7m +step 04017/16704 (24.05%) | loss: 2.784435 | lrm: 1.00 | dt: 647.56ms | tok/sec: 809,639 | mfu: 50.60 | epoch: 1 | total time: 43.18m | eta: 136.7m +step 04018/16704 (24.05%) | loss: 2.792184 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,327 | mfu: 50.96 | epoch: 1 | total time: 43.19m | eta: 136.7m +step 04019/16704 (24.06%) | loss: 2.788286 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,318 | mfu: 50.96 | epoch: 1 | total time: 43.20m | eta: 136.7m +step 04020/16704 (24.07%) | loss: 2.783142 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,159 | mfu: 50.76 | epoch: 1 | total time: 43.21m | eta: 136.7m +step 04021/16704 (24.07%) | loss: 2.779144 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,082 | mfu: 50.88 | epoch: 1 | total time: 43.22m | eta: 136.7m +step 04022/16704 (24.08%) | loss: 2.769308 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,726 | mfu: 50.73 | epoch: 1 | total time: 43.23m | eta: 136.7m +step 04023/16704 (24.08%) | loss: 2.776088 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,435 | mfu: 50.90 | epoch: 1 | total time: 43.24m | eta: 136.6m +step 04024/16704 (24.09%) | loss: 2.770520 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,231 | mfu: 50.95 | epoch: 1 | total time: 43.25m | eta: 136.6m +step 04025/16704 (24.10%) | loss: 2.773458 | lrm: 
1.00 | dt: 642.84ms | tok/sec: 815,575 | mfu: 50.97 | epoch: 1 | total time: 43.26m | eta: 136.6m +step 04026/16704 (24.10%) | loss: 2.785382 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,110 | mfu: 50.76 | epoch: 1 | total time: 43.27m | eta: 136.6m +step 04027/16704 (24.11%) | loss: 2.784398 | lrm: 1.00 | dt: 647.72ms | tok/sec: 809,433 | mfu: 50.59 | epoch: 1 | total time: 43.28m | eta: 136.6m +step 04028/16704 (24.11%) | loss: 2.779327 | lrm: 1.00 | dt: 641.82ms | tok/sec: 816,875 | mfu: 51.06 | epoch: 1 | total time: 43.29m | eta: 136.6m +step 04029/16704 (24.12%) | loss: 2.769998 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,886 | mfu: 50.74 | epoch: 1 | total time: 43.31m | eta: 136.6m +step 04030/16704 (24.13%) | loss: 2.775575 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 1 | total time: 43.32m | eta: 136.6m +step 04031/16704 (24.13%) | loss: 2.781269 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,342 | mfu: 50.71 | epoch: 1 | total time: 43.33m | eta: 136.6m +step 04032/16704 (24.14%) | loss: 2.792218 | lrm: 1.00 | dt: 648.66ms | tok/sec: 808,265 | mfu: 50.52 | epoch: 1 | total time: 43.34m | eta: 136.5m +step 04033/16704 (24.14%) | loss: 2.792730 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,762 | mfu: 50.92 | epoch: 1 | total time: 43.35m | eta: 136.5m +step 04034/16704 (24.15%) | loss: 2.788270 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,350 | mfu: 50.84 | epoch: 1 | total time: 43.36m | eta: 136.5m +step 04035/16704 (24.16%) | loss: 2.785130 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,591 | mfu: 50.66 | epoch: 1 | total time: 43.37m | eta: 136.5m +step 04036/16704 (24.16%) | loss: 2.779084 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,237 | mfu: 50.89 | epoch: 1 | total time: 43.38m | eta: 136.5m +step 04037/16704 (24.17%) | loss: 2.780765 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,398 | mfu: 50.78 | epoch: 1 | total time: 43.39m | eta: 136.5m +step 04038/16704 (24.17%) | loss: 2.794796 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 1 
| total time: 43.40m | eta: 136.5m +step 04039/16704 (24.18%) | loss: 2.795702 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,343 | mfu: 50.84 | epoch: 1 | total time: 43.41m | eta: 136.5m +step 04040/16704 (24.19%) | loss: 2.775293 | lrm: 1.00 | dt: 649.90ms | tok/sec: 806,721 | mfu: 50.42 | epoch: 1 | total time: 43.42m | eta: 136.5m +step 04041/16704 (24.19%) | loss: 2.782590 | lrm: 1.00 | dt: 641.71ms | tok/sec: 817,012 | mfu: 51.06 | epoch: 1 | total time: 43.43m | eta: 136.4m +step 04042/16704 (24.20%) | loss: 2.782820 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,220 | mfu: 50.64 | epoch: 1 | total time: 43.45m | eta: 136.4m +step 04043/16704 (24.20%) | loss: 2.775765 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,751 | mfu: 50.61 | epoch: 1 | total time: 43.46m | eta: 136.4m +step 04044/16704 (24.21%) | loss: 2.781372 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,380 | mfu: 50.84 | epoch: 1 | total time: 43.47m | eta: 136.4m +step 04045/16704 (24.22%) | loss: 2.801416 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,170 | mfu: 50.57 | epoch: 1 | total time: 43.48m | eta: 136.4m +step 04046/16704 (24.22%) | loss: 2.799054 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,027 | mfu: 51.00 | epoch: 1 | total time: 43.49m | eta: 136.4m +step 04047/16704 (24.23%) | loss: 2.801895 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 1 | total time: 43.50m | eta: 136.4m +step 04048/16704 (24.23%) | loss: 2.811064 | lrm: 1.00 | dt: 646.88ms | tok/sec: 810,489 | mfu: 50.66 | epoch: 1 | total time: 43.51m | eta: 136.4m +step 04049/16704 (24.24%) | loss: 2.803473 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,641 | mfu: 50.85 | epoch: 1 | total time: 43.52m | eta: 136.4m +step 04050/16704 (24.25%) | loss: 2.790326 | lrm: 1.00 | dt: 648.98ms | tok/sec: 807,869 | mfu: 50.49 | epoch: 1 | total time: 43.53m | eta: 136.3m +step 04051/16704 (24.25%) | loss: 2.786618 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,218 | mfu: 50.70 | epoch: 1 | total time: 43.54m | eta: 136.3m +step 04052/16704 (24.26%) 
| loss: 2.786234 | lrm: 1.00 | dt: 647.97ms | tok/sec: 809,121 | mfu: 50.57 | epoch: 1 | total time: 43.55m | eta: 136.3m +step 04053/16704 (24.26%) | loss: 2.798126 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,926 | mfu: 50.68 | epoch: 1 | total time: 43.56m | eta: 136.3m +step 04054/16704 (24.27%) | loss: 2.796697 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,747 | mfu: 50.80 | epoch: 1 | total time: 43.57m | eta: 136.3m +step 04055/16704 (24.28%) | loss: 2.798770 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,939 | mfu: 50.68 | epoch: 1 | total time: 43.59m | eta: 136.3m +step 04056/16704 (24.28%) | loss: 2.804424 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,894 | mfu: 50.68 | epoch: 1 | total time: 43.60m | eta: 136.3m +step 04057/16704 (24.29%) | loss: 2.799396 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,482 | mfu: 50.78 | epoch: 1 | total time: 43.61m | eta: 136.3m +step 04058/16704 (24.29%) | loss: 2.787720 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,547 | mfu: 50.91 | epoch: 1 | total time: 43.62m | eta: 136.3m +step 04059/16704 (24.30%) | loss: 2.773687 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 1 | total time: 43.63m | eta: 136.3m +step 04060/16704 (24.31%) | loss: 2.777238 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,276 | mfu: 50.71 | epoch: 1 | total time: 43.64m | eta: 136.2m +step 04061/16704 (24.31%) | loss: 2.777167 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 1 | total time: 43.65m | eta: 136.2m +step 04062/16704 (24.32%) | loss: 2.775800 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,137 | mfu: 50.82 | epoch: 1 | total time: 43.66m | eta: 136.2m +step 04063/16704 (24.32%) | loss: 2.760534 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,082 | mfu: 50.94 | epoch: 1 | total time: 43.67m | eta: 136.2m +step 04064/16704 (24.33%) | loss: 2.759289 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,542 | mfu: 50.72 | epoch: 1 | total time: 43.68m | eta: 136.2m +step 04065/16704 (24.34%) | loss: 2.767494 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,152 
| mfu: 50.82 | epoch: 1 | total time: 43.69m | eta: 136.2m +step 04066/16704 (24.34%) | loss: 2.773790 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,694 | mfu: 50.92 | epoch: 1 | total time: 43.70m | eta: 136.2m +step 04067/16704 (24.35%) | loss: 2.781106 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,347 | mfu: 50.84 | epoch: 1 | total time: 43.71m | eta: 136.2m +step 04068/16704 (24.35%) | loss: 2.785329 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,830 | mfu: 50.87 | epoch: 1 | total time: 43.73m | eta: 136.2m +step 04069/16704 (24.36%) | loss: 2.784251 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,008 | mfu: 50.69 | epoch: 1 | total time: 43.74m | eta: 136.1m +step 04070/16704 (24.37%) | loss: 2.789133 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,163 | mfu: 50.89 | epoch: 1 | total time: 43.75m | eta: 136.1m +step 04071/16704 (24.37%) | loss: 2.791703 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,744 | mfu: 50.67 | epoch: 1 | total time: 43.76m | eta: 136.1m +step 04072/16704 (24.38%) | loss: 2.776505 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,937 | mfu: 50.81 | epoch: 1 | total time: 43.77m | eta: 136.1m +step 04073/16704 (24.38%) | loss: 2.800077 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,500 | mfu: 50.72 | epoch: 1 | total time: 43.78m | eta: 136.1m +step 04074/16704 (24.39%) | loss: 2.787356 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,039 | mfu: 50.75 | epoch: 1 | total time: 43.79m | eta: 136.1m +step 04075/16704 (24.40%) | loss: 2.783521 | lrm: 1.00 | dt: 647.17ms | tok/sec: 810,129 | mfu: 50.63 | epoch: 1 | total time: 43.80m | eta: 136.1m +step 04076/16704 (24.40%) | loss: 2.782623 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,644 | mfu: 50.79 | epoch: 1 | total time: 43.81m | eta: 136.1m +step 04077/16704 (24.41%) | loss: 2.801338 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,887 | mfu: 50.81 | epoch: 1 | total time: 43.82m | eta: 136.1m +step 04078/16704 (24.41%) | loss: 2.782925 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,950 | mfu: 50.81 | epoch: 1 | total time: 43.83m | eta: 136.0m 
+step 04079/16704 (24.42%) | loss: 2.795714 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,281 | mfu: 50.77 | epoch: 1 | total time: 43.84m | eta: 136.0m +step 04080/16704 (24.43%) | loss: 2.799735 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,074 | mfu: 50.76 | epoch: 1 | total time: 43.85m | eta: 136.0m +step 04081/16704 (24.43%) | loss: 2.798063 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,078 | mfu: 50.82 | epoch: 1 | total time: 43.87m | eta: 136.0m +step 04082/16704 (24.44%) | loss: 2.805609 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,817 | mfu: 50.86 | epoch: 1 | total time: 43.88m | eta: 136.0m +step 04083/16704 (24.44%) | loss: 2.800643 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,698 | mfu: 50.79 | epoch: 1 | total time: 43.89m | eta: 136.0m +step 04084/16704 (24.45%) | loss: 2.801718 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,538 | mfu: 50.72 | epoch: 1 | total time: 43.90m | eta: 136.0m +step 04085/16704 (24.46%) | loss: 2.811936 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 43.91m | eta: 136.0m +step 04086/16704 (24.46%) | loss: 2.809966 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 1 | total time: 43.92m | eta: 136.0m +step 04087/16704 (24.47%) | loss: 2.807007 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,264 | mfu: 50.83 | epoch: 1 | total time: 43.93m | eta: 135.9m +step 04088/16704 (24.47%) | loss: 2.810640 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,431 | mfu: 50.65 | epoch: 1 | total time: 43.94m | eta: 135.9m +step 04089/16704 (24.48%) | loss: 2.807172 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,375 | mfu: 50.90 | epoch: 1 | total time: 43.95m | eta: 135.9m +step 04090/16704 (24.49%) | loss: 2.809506 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 1 | total time: 43.96m | eta: 135.9m +step 04091/16704 (24.49%) | loss: 2.814008 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,670 | mfu: 50.79 | epoch: 1 | total time: 43.97m | eta: 135.9m +step 04092/16704 (24.50%) | loss: 2.815981 | lrm: 1.00 | dt: 
643.94ms | tok/sec: 814,182 | mfu: 50.89 | epoch: 1 | total time: 43.98m | eta: 135.9m +step 04093/16704 (24.50%) | loss: 2.827647 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,672 | mfu: 50.86 | epoch: 1 | total time: 43.99m | eta: 135.9m +step 04094/16704 (24.51%) | loss: 2.823858 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,585 | mfu: 50.85 | epoch: 1 | total time: 44.00m | eta: 135.9m +step 04095/16704 (24.52%) | loss: 2.823778 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,599 | mfu: 50.85 | epoch: 1 | total time: 44.02m | eta: 135.9m +step 04096/16704 (24.52%) | loss: 2.818753 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,987 | mfu: 50.81 | epoch: 1 | total time: 44.03m | eta: 135.8m +step 04097/16704 (24.53%) | loss: 2.807391 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 1 | total time: 44.04m | eta: 135.8m +step 04098/16704 (24.53%) | loss: 2.809825 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 1 | total time: 44.05m | eta: 135.8m +step 04099/16704 (24.54%) | loss: 2.807569 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,136 | mfu: 50.70 | epoch: 1 | total time: 44.06m | eta: 135.8m +step 04100/16704 (24.55%) | loss: 2.829143 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,323 | mfu: 50.58 | epoch: 1 | total time: 44.07m | eta: 135.8m +step 04101/16704 (24.55%) | loss: 2.822816 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,163 | mfu: 51.01 | epoch: 1 | total time: 44.08m | eta: 135.8m +step 04102/16704 (24.56%) | loss: 2.820027 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,789 | mfu: 50.74 | epoch: 1 | total time: 44.09m | eta: 135.8m +step 04103/16704 (24.56%) | loss: 2.820676 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,518 | mfu: 50.72 | epoch: 1 | total time: 44.10m | eta: 135.8m +step 04104/16704 (24.57%) | loss: 2.814135 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,768 | mfu: 50.86 | epoch: 1 | total time: 44.11m | eta: 135.8m +step 04105/16704 (24.57%) | loss: 2.800877 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,622 | mfu: 50.73 | epoch: 1 | total 
time: 44.12m | eta: 135.8m +step 04106/16704 (24.58%) | loss: 2.811450 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,768 | mfu: 50.67 | epoch: 1 | total time: 44.13m | eta: 135.7m +step 04107/16704 (24.59%) | loss: 2.815238 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,328 | mfu: 50.83 | epoch: 1 | total time: 44.14m | eta: 135.7m +step 04108/16704 (24.59%) | loss: 2.820474 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,242 | mfu: 50.70 | epoch: 1 | total time: 44.16m | eta: 135.7m +step 04109/16704 (24.60%) | loss: 2.806778 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,543 | mfu: 50.85 | epoch: 1 | total time: 44.17m | eta: 135.7m +step 04110/16704 (24.60%) | loss: 2.813434 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,961 | mfu: 50.87 | epoch: 1 | total time: 44.18m | eta: 135.7m +step 04111/16704 (24.61%) | loss: 2.809506 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,557 | mfu: 50.85 | epoch: 1 | total time: 44.19m | eta: 135.7m +step 04112/16704 (24.62%) | loss: 2.797383 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,637 | mfu: 50.79 | epoch: 1 | total time: 44.20m | eta: 135.7m +step 04113/16704 (24.62%) | loss: 2.785308 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,702 | mfu: 50.92 | epoch: 1 | total time: 44.21m | eta: 135.7m +step 04114/16704 (24.63%) | loss: 2.793873 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 1 | total time: 44.22m | eta: 135.7m +step 04115/16704 (24.63%) | loss: 2.787068 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,551 | mfu: 50.85 | epoch: 1 | total time: 44.23m | eta: 135.6m +step 04116/16704 (24.64%) | loss: 2.785006 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,759 | mfu: 50.92 | epoch: 1 | total time: 44.24m | eta: 135.6m +step 04117/16704 (24.65%) | loss: 2.791053 | lrm: 1.00 | dt: 647.20ms | tok/sec: 810,090 | mfu: 50.63 | epoch: 1 | total time: 44.25m | eta: 135.6m +step 04118/16704 (24.65%) | loss: 2.808615 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,205 | mfu: 50.95 | epoch: 1 | total time: 44.26m | eta: 135.6m +step 04119/16704 (24.66%) | loss: 
2.807919 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,449 | mfu: 50.84 | epoch: 1 | total time: 44.27m | eta: 135.6m +step 04120/16704 (24.66%) | loss: 2.811499 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,185 | mfu: 50.95 | epoch: 1 | total time: 44.28m | eta: 135.6m +step 04121/16704 (24.67%) | loss: 2.803044 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,190 | mfu: 50.89 | epoch: 1 | total time: 44.29m | eta: 135.6m +step 04122/16704 (24.68%) | loss: 2.806242 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,977 | mfu: 50.62 | epoch: 1 | total time: 44.31m | eta: 135.6m +step 04123/16704 (24.68%) | loss: 2.812308 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,188 | mfu: 50.76 | epoch: 1 | total time: 44.32m | eta: 135.6m +step 04124/16704 (24.69%) | loss: 2.811591 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 1 | total time: 44.33m | eta: 135.5m +step 04125/16704 (24.69%) | loss: 2.811839 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,446 | mfu: 50.84 | epoch: 1 | total time: 44.34m | eta: 135.5m +step 04126/16704 (24.70%) | loss: 2.818393 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 1 | total time: 44.35m | eta: 135.5m +step 04127/16704 (24.71%) | loss: 2.838852 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,357 | mfu: 50.71 | epoch: 1 | total time: 44.36m | eta: 135.5m +step 04128/16704 (24.71%) | loss: 2.835355 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,151 | mfu: 50.95 | epoch: 1 | total time: 44.37m | eta: 135.5m +step 04129/16704 (24.72%) | loss: 2.821208 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,392 | mfu: 50.90 | epoch: 1 | total time: 44.38m | eta: 135.5m +step 04130/16704 (24.72%) | loss: 2.818318 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,247 | mfu: 50.89 | epoch: 1 | total time: 44.39m | eta: 135.5m +step 04131/16704 (24.73%) | loss: 2.805304 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,520 | mfu: 50.78 | epoch: 1 | total time: 44.40m | eta: 135.5m +step 04132/16704 (24.74%) | loss: 2.815822 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,728 | mfu: 
50.80 | epoch: 1 | total time: 44.41m | eta: 135.5m +step 04133/16704 (24.74%) | loss: 2.820726 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,184 | mfu: 50.89 | epoch: 1 | total time: 44.42m | eta: 135.4m +step 04134/16704 (24.75%) | loss: 2.816681 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,932 | mfu: 50.75 | epoch: 1 | total time: 44.43m | eta: 135.4m +step 04135/16704 (24.75%) | loss: 2.815313 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,445 | mfu: 50.84 | epoch: 1 | total time: 44.45m | eta: 135.4m +step 04136/16704 (24.76%) | loss: 2.808904 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,800 | mfu: 50.86 | epoch: 1 | total time: 44.46m | eta: 135.4m +step 04137/16704 (24.77%) | loss: 2.821029 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,486 | mfu: 50.78 | epoch: 1 | total time: 44.47m | eta: 135.4m +step 04138/16704 (24.77%) | loss: 2.814057 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,321 | mfu: 50.90 | epoch: 1 | total time: 44.48m | eta: 135.4m +step 04139/16704 (24.78%) | loss: 2.807036 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 1 | total time: 44.49m | eta: 135.4m +step 04140/16704 (24.78%) | loss: 2.781400 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,232 | mfu: 50.77 | epoch: 1 | total time: 44.50m | eta: 135.4m +step 04141/16704 (24.79%) | loss: 2.779486 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,364 | mfu: 50.77 | epoch: 1 | total time: 44.51m | eta: 135.4m +step 04142/16704 (24.80%) | loss: 2.782942 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,211 | mfu: 50.76 | epoch: 1 | total time: 44.52m | eta: 135.4m +step 04143/16704 (24.80%) | loss: 2.789698 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,794 | mfu: 50.86 | epoch: 1 | total time: 44.53m | eta: 135.3m +step 04144/16704 (24.81%) | loss: 2.782089 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,609 | mfu: 50.79 | epoch: 1 | total time: 44.54m | eta: 135.3m +step 04145/16704 (24.81%) | loss: 2.779691 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,114 | mfu: 50.76 | epoch: 1 | total time: 44.55m | eta: 135.3m +step 
04146/16704 (24.82%) | loss: 2.785362 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,520 | mfu: 50.91 | epoch: 1 | total time: 44.56m | eta: 135.3m +step 04147/16704 (24.83%) | loss: 2.786495 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,477 | mfu: 50.72 | epoch: 1 | total time: 44.57m | eta: 135.3m +step 04148/16704 (24.83%) | loss: 2.789282 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,202 | mfu: 50.83 | epoch: 1 | total time: 44.59m | eta: 135.3m +step 04149/16704 (24.84%) | loss: 2.791851 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,539 | mfu: 50.91 | epoch: 1 | total time: 44.60m | eta: 135.3m +step 04150/16704 (24.84%) | loss: 2.788690 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,218 | mfu: 50.70 | epoch: 1 | total time: 44.61m | eta: 135.3m +step 04151/16704 (24.85%) | loss: 2.801913 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,158 | mfu: 50.82 | epoch: 1 | total time: 44.62m | eta: 135.3m +step 04152/16704 (24.86%) | loss: 2.790283 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,427 | mfu: 50.78 | epoch: 1 | total time: 44.63m | eta: 135.2m +step 04153/16704 (24.86%) | loss: 2.789169 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,590 | mfu: 50.79 | epoch: 1 | total time: 44.64m | eta: 135.2m +step 04154/16704 (24.87%) | loss: 2.786199 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,968 | mfu: 50.81 | epoch: 1 | total time: 44.65m | eta: 135.2m +step 04155/16704 (24.87%) | loss: 2.787964 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 1 | total time: 44.66m | eta: 135.2m +step 04156/16704 (24.88%) | loss: 2.784588 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,159 | mfu: 50.76 | epoch: 1 | total time: 44.67m | eta: 135.2m +step 04157/16704 (24.89%) | loss: 2.800884 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,430 | mfu: 50.90 | epoch: 1 | total time: 44.68m | eta: 135.2m +step 04158/16704 (24.89%) | loss: 2.795394 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 1 | total time: 44.69m | eta: 135.2m +step 04159/16704 (24.90%) | loss: 2.796617 | lrm: 1.00 | dt: 
646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 1 | total time: 44.70m | eta: 135.2m +step 04160/16704 (24.90%) | loss: 2.787506 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,830 | mfu: 50.87 | epoch: 1 | total time: 44.71m | eta: 135.2m +step 04161/16704 (24.91%) | loss: 2.793903 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,498 | mfu: 50.84 | epoch: 1 | total time: 44.72m | eta: 135.1m +step 04162/16704 (24.92%) | loss: 2.794964 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,004 | mfu: 50.88 | epoch: 1 | total time: 44.74m | eta: 135.1m +step 04163/16704 (24.92%) | loss: 2.798987 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 1 | total time: 44.75m | eta: 135.1m +step 04164/16704 (24.93%) | loss: 2.814900 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,446 | mfu: 50.72 | epoch: 1 | total time: 44.76m | eta: 135.1m +step 04165/16704 (24.93%) | loss: 2.810093 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,577 | mfu: 50.79 | epoch: 1 | total time: 44.77m | eta: 135.1m +step 04166/16704 (24.94%) | loss: 2.815690 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,669 | mfu: 50.86 | epoch: 1 | total time: 44.78m | eta: 135.1m +step 04167/16704 (24.95%) | loss: 2.808273 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,607 | mfu: 50.66 | epoch: 1 | total time: 44.79m | eta: 135.1m +step 04168/16704 (24.95%) | loss: 2.803684 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,843 | mfu: 50.74 | epoch: 1 | total time: 44.80m | eta: 135.1m +step 04169/16704 (24.96%) | loss: 2.799603 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,848 | mfu: 50.87 | epoch: 1 | total time: 44.81m | eta: 135.1m +step 04170/16704 (24.96%) | loss: 2.800835 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,115 | mfu: 50.82 | epoch: 1 | total time: 44.82m | eta: 135.0m +step 04171/16704 (24.97%) | loss: 2.799009 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,475 | mfu: 50.78 | epoch: 1 | total time: 44.83m | eta: 135.0m +step 04172/16704 (24.98%) | loss: 2.796145 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 1 | total 
time: 44.84m | eta: 135.0m +step 04173/16704 (24.98%) | loss: 2.796219 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,630 | mfu: 50.85 | epoch: 1 | total time: 44.85m | eta: 135.0m +step 04174/16704 (24.99%) | loss: 2.791918 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 1 | total time: 44.86m | eta: 135.0m +step 04175/16704 (24.99%) | loss: 2.794002 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,933 | mfu: 50.87 | epoch: 1 | total time: 44.88m | eta: 135.0m +step 04176/16704 (25.00%) | loss: 2.800956 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,787 | mfu: 50.80 | epoch: 1 | total time: 44.89m | eta: 135.0m +step 04177/16704 (25.01%) | loss: 2.787770 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,671 | mfu: 50.92 | epoch: 1 | total time: 44.90m | eta: 135.0m +step 04178/16704 (25.01%) | loss: 2.782437 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,623 | mfu: 50.67 | epoch: 1 | total time: 44.91m | eta: 135.0m +step 04179/16704 (25.02%) | loss: 2.780951 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,547 | mfu: 50.72 | epoch: 1 | total time: 44.92m | eta: 134.9m +step 04180/16704 (25.02%) | loss: 2.781105 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,414 | mfu: 50.78 | epoch: 1 | total time: 44.93m | eta: 134.9m +step 04181/16704 (25.03%) | loss: 2.779697 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,572 | mfu: 50.79 | epoch: 1 | total time: 44.94m | eta: 134.9m +step 04182/16704 (25.04%) | loss: 2.776712 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,099 | mfu: 50.76 | epoch: 1 | total time: 44.95m | eta: 134.9m +step 04183/16704 (25.04%) | loss: 2.774349 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,862 | mfu: 50.74 | epoch: 1 | total time: 44.96m | eta: 134.9m +step 04184/16704 (25.05%) | loss: 2.777694 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,460 | mfu: 50.84 | epoch: 1 | total time: 44.97m | eta: 134.9m +step 04185/16704 (25.05%) | loss: 2.780924 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,536 | mfu: 50.66 | epoch: 1 | total time: 44.98m | eta: 134.9m +step 04186/16704 (25.06%) | loss: 
2.784165 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,888 | mfu: 50.62 | epoch: 1 | total time: 44.99m | eta: 134.9m +step 04187/16704 (25.07%) | loss: 2.783052 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,465 | mfu: 50.72 | epoch: 1 | total time: 45.00m | eta: 134.9m +step 04188/16704 (25.07%) | loss: 2.805068 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 1 | total time: 45.02m | eta: 134.9m +step 04189/16704 (25.08%) | loss: 2.803255 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,781 | mfu: 50.86 | epoch: 1 | total time: 45.03m | eta: 134.8m +step 04190/16704 (25.08%) | loss: 2.800066 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,193 | mfu: 50.76 | epoch: 1 | total time: 45.04m | eta: 134.8m +step 04191/16704 (25.09%) | loss: 2.807401 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,581 | mfu: 50.85 | epoch: 1 | total time: 45.05m | eta: 134.8m +step 04192/16704 (25.10%) | loss: 2.794897 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,791 | mfu: 50.86 | epoch: 1 | total time: 45.06m | eta: 134.8m +step 04193/16704 (25.10%) | loss: 2.788567 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,206 | mfu: 50.83 | epoch: 1 | total time: 45.07m | eta: 134.8m +step 04194/16704 (25.11%) | loss: 2.801402 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,101 | mfu: 50.82 | epoch: 1 | total time: 45.08m | eta: 134.8m +step 04195/16704 (25.11%) | loss: 2.792746 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,076 | mfu: 50.69 | epoch: 1 | total time: 45.09m | eta: 134.8m +step 04196/16704 (25.12%) | loss: 2.787626 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,334 | mfu: 50.77 | epoch: 1 | total time: 45.10m | eta: 134.8m +step 04197/16704 (25.13%) | loss: 2.791839 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,933 | mfu: 50.50 | epoch: 1 | total time: 45.11m | eta: 134.8m +step 04198/16704 (25.13%) | loss: 2.792000 | lrm: 1.00 | dt: 642.54ms | tok/sec: 815,965 | mfu: 51.00 | epoch: 1 | total time: 45.12m | eta: 134.7m +step 04199/16704 (25.14%) | loss: 2.803959 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,727 | mfu: 
50.73 | epoch: 1 | total time: 45.13m | eta: 134.7m +step 04200/16704 (25.14%) | loss: 2.787240 | lrm: 1.00 | dt: 647.78ms | tok/sec: 809,359 | mfu: 50.59 | epoch: 1 | total time: 45.14m | eta: 134.7m +step 04201/16704 (25.15%) | loss: 2.779863 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,760 | mfu: 50.92 | epoch: 1 | total time: 45.16m | eta: 134.7m +step 04202/16704 (25.16%) | loss: 2.776339 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 1 | total time: 45.17m | eta: 134.7m +step 04203/16704 (25.16%) | loss: 2.794302 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 1 | total time: 45.18m | eta: 134.7m +step 04204/16704 (25.17%) | loss: 2.792118 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,306 | mfu: 50.58 | epoch: 1 | total time: 45.19m | eta: 134.7m +step 04205/16704 (25.17%) | loss: 2.789450 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,826 | mfu: 50.87 | epoch: 1 | total time: 45.20m | eta: 134.7m +step 04206/16704 (25.18%) | loss: 2.777200 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 1 | total time: 45.21m | eta: 134.7m +step 04207/16704 (25.19%) | loss: 2.770330 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 45.22m | eta: 134.6m +step 04208/16704 (25.19%) | loss: 2.780680 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,287 | mfu: 50.83 | epoch: 1 | total time: 45.23m | eta: 134.6m +step 04209/16704 (25.20%) | loss: 2.782947 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,947 | mfu: 50.62 | epoch: 1 | total time: 45.24m | eta: 134.6m +step 04210/16704 (25.20%) | loss: 2.782657 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,823 | mfu: 50.80 | epoch: 1 | total time: 45.25m | eta: 134.6m +step 04211/16704 (25.21%) | loss: 2.795744 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,318 | mfu: 50.77 | epoch: 1 | total time: 45.26m | eta: 134.6m +step 04212/16704 (25.22%) | loss: 2.799010 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,625 | mfu: 50.67 | epoch: 1 | total time: 45.27m | eta: 134.6m +step 
04213/16704 (25.22%) | loss: 2.788589 | lrm: 1.00 | dt: 648.04ms | tok/sec: 809,032 | mfu: 50.57 | epoch: 1 | total time: 45.28m | eta: 134.6m +step 04214/16704 (25.23%) | loss: 2.782580 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,378 | mfu: 50.84 | epoch: 1 | total time: 45.29m | eta: 134.6m +step 04215/16704 (25.23%) | loss: 2.781403 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,395 | mfu: 50.71 | epoch: 1 | total time: 45.31m | eta: 134.6m +step 04216/16704 (25.24%) | loss: 2.784350 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,364 | mfu: 50.71 | epoch: 1 | total time: 45.32m | eta: 134.5m +step 04217/16704 (25.25%) | loss: 2.785983 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,262 | mfu: 50.77 | epoch: 1 | total time: 45.33m | eta: 134.5m +step 04218/16704 (25.25%) | loss: 2.796086 | lrm: 1.00 | dt: 647.95ms | tok/sec: 809,145 | mfu: 50.57 | epoch: 1 | total time: 45.34m | eta: 134.5m +step 04219/16704 (25.26%) | loss: 2.781612 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,393 | mfu: 50.96 | epoch: 1 | total time: 45.35m | eta: 134.5m +step 04220/16704 (25.26%) | loss: 2.797437 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,396 | mfu: 50.84 | epoch: 1 | total time: 45.36m | eta: 134.5m +step 04221/16704 (25.27%) | loss: 2.807874 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,312 | mfu: 50.71 | epoch: 1 | total time: 45.37m | eta: 134.5m +step 04222/16704 (25.28%) | loss: 2.803834 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,481 | mfu: 50.59 | epoch: 1 | total time: 45.38m | eta: 134.5m +step 04223/16704 (25.28%) | loss: 2.806563 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,454 | mfu: 50.84 | epoch: 1 | total time: 45.39m | eta: 134.5m +step 04224/16704 (25.29%) | loss: 2.815066 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,529 | mfu: 50.78 | epoch: 1 | total time: 45.40m | eta: 134.5m +step 04225/16704 (25.29%) | loss: 2.813380 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 1 | total time: 45.41m | eta: 134.5m +step 04226/16704 (25.30%) | loss: 2.808705 | lrm: 1.00 | dt: 
644.09ms | tok/sec: 814,002 | mfu: 50.88 | epoch: 1 | total time: 45.42m | eta: 134.4m +step 04227/16704 (25.31%) | loss: 2.790329 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,044 | mfu: 50.69 | epoch: 1 | total time: 45.43m | eta: 134.4m +step 04228/16704 (25.31%) | loss: 2.791128 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,630 | mfu: 50.79 | epoch: 1 | total time: 45.45m | eta: 134.4m +step 04229/16704 (25.32%) | loss: 2.791093 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,861 | mfu: 50.80 | epoch: 1 | total time: 45.46m | eta: 134.4m +step 04230/16704 (25.32%) | loss: 2.794007 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,289 | mfu: 50.71 | epoch: 1 | total time: 45.47m | eta: 134.4m +step 04231/16704 (25.33%) | loss: 2.811506 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 1 | total time: 45.48m | eta: 134.4m +step 04232/16704 (25.34%) | loss: 2.791002 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,395 | mfu: 50.71 | epoch: 1 | total time: 45.49m | eta: 134.4m +step 04233/16704 (25.34%) | loss: 2.794939 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,647 | mfu: 50.79 | epoch: 1 | total time: 45.50m | eta: 134.4m +step 04234/16704 (25.35%) | loss: 2.794403 | lrm: 1.00 | dt: 649.39ms | tok/sec: 807,356 | mfu: 50.46 | epoch: 1 | total time: 45.51m | eta: 134.4m +step 04235/16704 (25.35%) | loss: 2.795439 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,488 | mfu: 50.59 | epoch: 1 | total time: 45.52m | eta: 134.3m +step 04236/16704 (25.36%) | loss: 2.808485 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 1 | total time: 45.53m | eta: 134.3m +step 04237/16704 (25.37%) | loss: 2.797167 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,459 | mfu: 50.65 | epoch: 1 | total time: 45.54m | eta: 134.3m +step 04238/16704 (25.37%) | loss: 2.795335 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,981 | mfu: 50.75 | epoch: 1 | total time: 45.55m | eta: 134.3m +step 04239/16704 (25.38%) | loss: 2.795457 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,155 | mfu: 50.82 | epoch: 1 | total 
time: 45.56m | eta: 134.3m +step 04240/16704 (25.38%) | loss: 2.800432 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,446 | mfu: 50.84 | epoch: 1 | total time: 45.57m | eta: 134.3m +step 04241/16704 (25.39%) | loss: 2.809350 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,474 | mfu: 50.72 | epoch: 1 | total time: 45.59m | eta: 134.3m +step 04242/16704 (25.40%) | loss: 2.799553 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,918 | mfu: 51.00 | epoch: 1 | total time: 45.60m | eta: 134.3m +step 04243/16704 (25.40%) | loss: 2.781991 | lrm: 1.00 | dt: 644.88ms | tok/sec: 812,995 | mfu: 50.81 | epoch: 1 | total time: 45.61m | eta: 134.3m +step 04244/16704 (25.41%) | loss: 2.788967 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,525 | mfu: 50.72 | epoch: 1 | total time: 45.62m | eta: 134.2m +step 04245/16704 (25.41%) | loss: 2.778160 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,245 | mfu: 50.83 | epoch: 1 | total time: 45.63m | eta: 134.2m +step 04246/16704 (25.42%) | loss: 2.775767 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,869 | mfu: 50.68 | epoch: 1 | total time: 45.64m | eta: 134.2m +step 04247/16704 (25.43%) | loss: 2.776369 | lrm: 1.00 | dt: 646.38ms | tok/sec: 811,113 | mfu: 50.70 | epoch: 1 | total time: 45.65m | eta: 134.2m +step 04248/16704 (25.43%) | loss: 2.782372 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,139 | mfu: 50.82 | epoch: 1 | total time: 45.66m | eta: 134.2m +step 04249/16704 (25.44%) | loss: 2.787182 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,022 | mfu: 50.69 | epoch: 1 | total time: 45.67m | eta: 134.2m +Step 04250 | Validation bpb: 0.847930 +step 04250/16704 (25.44%) | loss: 2.793925 | lrm: 1.00 | dt: 648.59ms | tok/sec: 808,352 | mfu: 50.52 | epoch: 1 | total time: 45.68m | eta: 134.2m +step 04251/16704 (25.45%) | loss: 2.796870 | lrm: 1.00 | dt: 648.51ms | tok/sec: 808,448 | mfu: 50.53 | epoch: 1 | total time: 45.69m | eta: 134.2m +step 04252/16704 (25.45%) | loss: 2.807306 | lrm: 1.00 | dt: 651.53ms | tok/sec: 804,698 | mfu: 50.29 | epoch: 1 | total time: 45.70m | eta: 
134.2m +step 04253/16704 (25.46%) | loss: 2.809747 | lrm: 1.00 | dt: 641.78ms | tok/sec: 816,928 | mfu: 51.06 | epoch: 1 | total time: 45.71m | eta: 134.1m +step 04254/16704 (25.47%) | loss: 2.795783 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,534 | mfu: 50.60 | epoch: 1 | total time: 45.73m | eta: 134.1m +step 04255/16704 (25.47%) | loss: 2.794710 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,996 | mfu: 50.75 | epoch: 1 | total time: 45.74m | eta: 134.1m +step 04256/16704 (25.48%) | loss: 2.807257 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,624 | mfu: 50.98 | epoch: 1 | total time: 45.75m | eta: 134.1m +step 04257/16704 (25.48%) | loss: 2.804698 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,707 | mfu: 50.80 | epoch: 1 | total time: 45.76m | eta: 134.1m +step 04258/16704 (25.49%) | loss: 2.785016 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 1 | total time: 45.77m | eta: 134.1m +step 04259/16704 (25.50%) | loss: 2.788322 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,450 | mfu: 50.65 | epoch: 1 | total time: 45.78m | eta: 134.1m +step 04260/16704 (25.50%) | loss: 2.784819 | lrm: 1.00 | dt: 647.72ms | tok/sec: 809,433 | mfu: 50.59 | epoch: 1 | total time: 45.79m | eta: 134.1m +step 04261/16704 (25.51%) | loss: 2.779996 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,990 | mfu: 50.81 | epoch: 1 | total time: 45.80m | eta: 134.1m +step 04262/16704 (25.51%) | loss: 2.784148 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,921 | mfu: 50.68 | epoch: 1 | total time: 45.81m | eta: 134.1m +step 04263/16704 (25.52%) | loss: 2.774118 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,563 | mfu: 50.91 | epoch: 1 | total time: 45.82m | eta: 134.0m +step 04264/16704 (25.53%) | loss: 2.779203 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,204 | mfu: 50.83 | epoch: 1 | total time: 45.83m | eta: 134.0m +step 04265/16704 (25.53%) | loss: 2.780513 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,881 | mfu: 50.68 | epoch: 1 | total time: 45.84m | eta: 134.0m +step 04266/16704 (25.54%) | loss: 2.777662 | lrm: 1.00 
| dt: 648.11ms | tok/sec: 808,942 | mfu: 50.56 | epoch: 1 | total time: 45.85m | eta: 134.0m +step 04267/16704 (25.54%) | loss: 2.788155 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,162 | mfu: 50.82 | epoch: 1 | total time: 45.87m | eta: 134.0m +step 04268/16704 (25.55%) | loss: 2.781268 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,361 | mfu: 50.84 | epoch: 1 | total time: 45.88m | eta: 134.0m +step 04269/16704 (25.56%) | loss: 2.789226 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,672 | mfu: 50.86 | epoch: 1 | total time: 45.89m | eta: 134.0m +step 04270/16704 (25.56%) | loss: 2.800569 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,352 | mfu: 50.84 | epoch: 1 | total time: 45.90m | eta: 134.0m +step 04271/16704 (25.57%) | loss: 2.787196 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,878 | mfu: 50.74 | epoch: 1 | total time: 45.91m | eta: 134.0m +step 04272/16704 (25.57%) | loss: 2.779721 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,573 | mfu: 50.79 | epoch: 1 | total time: 45.92m | eta: 133.9m +step 04273/16704 (25.58%) | loss: 2.782852 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,055 | mfu: 50.88 | epoch: 1 | total time: 45.93m | eta: 133.9m +step 04274/16704 (25.59%) | loss: 2.783024 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,060 | mfu: 50.63 | epoch: 1 | total time: 45.94m | eta: 133.9m +step 04275/16704 (25.59%) | loss: 2.769842 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,788 | mfu: 50.93 | epoch: 1 | total time: 45.95m | eta: 133.9m +step 04276/16704 (25.60%) | loss: 2.762651 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,332 | mfu: 50.65 | epoch: 1 | total time: 45.96m | eta: 133.9m +step 04277/16704 (25.60%) | loss: 2.769173 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,265 | mfu: 50.71 | epoch: 1 | total time: 45.97m | eta: 133.9m +step 04278/16704 (25.61%) | loss: 2.775145 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,838 | mfu: 50.80 | epoch: 1 | total time: 45.98m | eta: 133.9m +step 04279/16704 (25.62%) | loss: 2.777658 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,976 | mfu: 50.75 | epoch: 1 | 
total time: 45.99m | eta: 133.9m +step 04280/16704 (25.62%) | loss: 2.778140 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,924 | mfu: 50.75 | epoch: 1 | total time: 46.01m | eta: 133.9m +step 04281/16704 (25.63%) | loss: 2.782153 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,166 | mfu: 50.89 | epoch: 1 | total time: 46.02m | eta: 133.8m +step 04282/16704 (25.63%) | loss: 2.784765 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 1 | total time: 46.03m | eta: 133.8m +step 04283/16704 (25.64%) | loss: 2.793002 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,961 | mfu: 50.87 | epoch: 1 | total time: 46.04m | eta: 133.8m +step 04284/16704 (25.65%) | loss: 2.788190 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,823 | mfu: 50.99 | epoch: 1 | total time: 46.05m | eta: 133.8m +step 04285/16704 (25.65%) | loss: 2.791927 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,414 | mfu: 50.65 | epoch: 1 | total time: 46.06m | eta: 133.8m +step 04286/16704 (25.66%) | loss: 2.792095 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,225 | mfu: 50.89 | epoch: 1 | total time: 46.07m | eta: 133.8m +step 04287/16704 (25.66%) | loss: 2.792220 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,914 | mfu: 50.87 | epoch: 1 | total time: 46.08m | eta: 133.8m +step 04288/16704 (25.67%) | loss: 2.790819 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,526 | mfu: 50.85 | epoch: 1 | total time: 46.09m | eta: 133.8m +step 04289/16704 (25.68%) | loss: 2.781142 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,690 | mfu: 50.86 | epoch: 1 | total time: 46.10m | eta: 133.8m +step 04290/16704 (25.68%) | loss: 2.776358 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,688 | mfu: 50.73 | epoch: 1 | total time: 46.11m | eta: 133.7m +step 04291/16704 (25.69%) | loss: 2.782010 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,718 | mfu: 50.98 | epoch: 1 | total time: 46.12m | eta: 133.7m +step 04292/16704 (25.69%) | loss: 2.787489 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,188 | mfu: 50.83 | epoch: 1 | total time: 46.13m | eta: 133.7m +step 04293/16704 (25.70%) | 
loss: 2.787688 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,825 | mfu: 50.74 | epoch: 1 | total time: 46.14m | eta: 133.7m +step 04294/16704 (25.71%) | loss: 2.786005 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,076 | mfu: 50.88 | epoch: 1 | total time: 46.16m | eta: 133.7m +step 04295/16704 (25.71%) | loss: 2.781118 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 1 | total time: 46.17m | eta: 133.7m +step 04296/16704 (25.72%) | loss: 2.784805 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,820 | mfu: 50.74 | epoch: 1 | total time: 46.18m | eta: 133.7m +step 04297/16704 (25.72%) | loss: 2.783819 | lrm: 1.00 | dt: 646.97ms | tok/sec: 810,372 | mfu: 50.65 | epoch: 1 | total time: 46.19m | eta: 133.7m +step 04298/16704 (25.73%) | loss: 2.759947 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 1 | total time: 46.20m | eta: 133.7m +step 04299/16704 (25.74%) | loss: 2.771180 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,269 | mfu: 50.71 | epoch: 1 | total time: 46.21m | eta: 133.7m +step 04300/16704 (25.74%) | loss: 2.759383 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,105 | mfu: 50.82 | epoch: 1 | total time: 46.22m | eta: 133.6m +step 04301/16704 (25.75%) | loss: 2.768087 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,233 | mfu: 50.89 | epoch: 1 | total time: 46.23m | eta: 133.6m +step 04302/16704 (25.75%) | loss: 2.767876 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,463 | mfu: 50.72 | epoch: 1 | total time: 46.24m | eta: 133.6m +step 04303/16704 (25.76%) | loss: 2.775389 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,196 | mfu: 50.76 | epoch: 1 | total time: 46.25m | eta: 133.6m +step 04304/16704 (25.77%) | loss: 2.775551 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,522 | mfu: 50.72 | epoch: 1 | total time: 46.26m | eta: 133.6m +step 04305/16704 (25.77%) | loss: 2.777957 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,552 | mfu: 50.85 | epoch: 1 | total time: 46.27m | eta: 133.6m +step 04306/16704 (25.78%) | loss: 2.780588 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,640 | 
mfu: 50.73 | epoch: 1 | total time: 46.28m | eta: 133.6m +step 04307/16704 (25.78%) | loss: 2.787055 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,644 | mfu: 50.73 | epoch: 1 | total time: 46.30m | eta: 133.6m +step 04308/16704 (25.79%) | loss: 2.790944 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,740 | mfu: 50.73 | epoch: 1 | total time: 46.31m | eta: 133.6m +step 04309/16704 (25.80%) | loss: 2.792819 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,584 | mfu: 50.73 | epoch: 1 | total time: 46.32m | eta: 133.5m +step 04310/16704 (25.80%) | loss: 2.786588 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,272 | mfu: 50.83 | epoch: 1 | total time: 46.33m | eta: 133.5m +step 04311/16704 (25.81%) | loss: 2.790247 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,289 | mfu: 50.77 | epoch: 1 | total time: 46.34m | eta: 133.5m +step 04312/16704 (25.81%) | loss: 2.789231 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,421 | mfu: 50.90 | epoch: 1 | total time: 46.35m | eta: 133.5m +step 04313/16704 (25.82%) | loss: 2.796333 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,790 | mfu: 50.74 | epoch: 1 | total time: 46.36m | eta: 133.5m +step 04314/16704 (25.83%) | loss: 2.788827 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,380 | mfu: 50.71 | epoch: 1 | total time: 46.37m | eta: 133.5m +step 04315/16704 (25.83%) | loss: 2.796550 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,118 | mfu: 50.82 | epoch: 1 | total time: 46.38m | eta: 133.5m +step 04316/16704 (25.84%) | loss: 2.796345 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 1 | total time: 46.39m | eta: 133.5m +step 04317/16704 (25.84%) | loss: 2.794095 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,578 | mfu: 50.91 | epoch: 1 | total time: 46.40m | eta: 133.5m +step 04318/16704 (25.85%) | loss: 2.795465 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 50.75 | epoch: 1 | total time: 46.41m | eta: 133.4m +step 04319/16704 (25.86%) | loss: 2.796697 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,234 | mfu: 50.77 | epoch: 1 | total time: 46.42m | eta: 133.4m +step 
04320/16704 (25.86%) | loss: 2.794864 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 1 | total time: 46.44m | eta: 133.4m +step 04321/16704 (25.87%) | loss: 2.782104 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,934 | mfu: 50.68 | epoch: 1 | total time: 46.45m | eta: 133.4m +step 04322/16704 (25.87%) | loss: 2.781634 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,894 | mfu: 50.93 | epoch: 1 | total time: 46.46m | eta: 133.4m +step 04323/16704 (25.88%) | loss: 2.782898 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,447 | mfu: 50.84 | epoch: 1 | total time: 46.47m | eta: 133.4m +step 04324/16704 (25.89%) | loss: 2.795568 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,409 | mfu: 50.71 | epoch: 1 | total time: 46.48m | eta: 133.4m +step 04325/16704 (25.89%) | loss: 2.785251 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,300 | mfu: 50.83 | epoch: 1 | total time: 46.49m | eta: 133.4m +step 04326/16704 (25.90%) | loss: 2.787335 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 1 | total time: 46.50m | eta: 133.4m +step 04327/16704 (25.90%) | loss: 2.799203 | lrm: 1.00 | dt: 649.08ms | tok/sec: 807,741 | mfu: 50.48 | epoch: 1 | total time: 46.51m | eta: 133.3m +step 04328/16704 (25.91%) | loss: 2.791786 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 1 | total time: 46.52m | eta: 133.3m +step 04329/16704 (25.92%) | loss: 2.801233 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 1 | total time: 46.53m | eta: 133.3m +step 04330/16704 (25.92%) | loss: 2.801928 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,905 | mfu: 50.81 | epoch: 1 | total time: 46.54m | eta: 133.3m +step 04331/16704 (25.93%) | loss: 2.797683 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,709 | mfu: 50.92 | epoch: 1 | total time: 46.55m | eta: 133.3m +step 04332/16704 (25.93%) | loss: 2.776925 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,243 | mfu: 50.64 | epoch: 1 | total time: 46.56m | eta: 133.3m +step 04333/16704 (25.94%) | loss: 2.786819 | lrm: 1.00 | dt: 
644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 1 | total time: 46.58m | eta: 133.3m +step 04334/16704 (25.95%) | loss: 2.773567 | lrm: 1.00 | dt: 646.00ms | tok/sec: 811,587 | mfu: 50.73 | epoch: 1 | total time: 46.59m | eta: 133.3m +step 04335/16704 (25.95%) | loss: 2.781962 | lrm: 1.00 | dt: 649.08ms | tok/sec: 807,735 | mfu: 50.48 | epoch: 1 | total time: 46.60m | eta: 133.3m +step 04336/16704 (25.96%) | loss: 2.791283 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 1 | total time: 46.61m | eta: 133.3m +step 04337/16704 (25.96%) | loss: 2.794683 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,218 | mfu: 50.95 | epoch: 1 | total time: 46.62m | eta: 133.2m +step 04338/16704 (25.97%) | loss: 2.803381 | lrm: 1.00 | dt: 648.76ms | tok/sec: 808,144 | mfu: 50.51 | epoch: 1 | total time: 46.63m | eta: 133.2m +step 04339/16704 (25.98%) | loss: 2.799469 | lrm: 1.00 | dt: 640.67ms | tok/sec: 818,338 | mfu: 51.15 | epoch: 1 | total time: 46.64m | eta: 133.2m +step 04340/16704 (25.98%) | loss: 2.784798 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,576 | mfu: 50.60 | epoch: 1 | total time: 46.65m | eta: 133.2m +step 04341/16704 (25.99%) | loss: 2.781281 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,810 | mfu: 50.61 | epoch: 1 | total time: 46.66m | eta: 133.2m +step 04342/16704 (25.99%) | loss: 2.782349 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,182 | mfu: 50.83 | epoch: 1 | total time: 46.67m | eta: 133.2m +step 04343/16704 (26.00%) | loss: 2.798009 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,872 | mfu: 50.87 | epoch: 1 | total time: 46.68m | eta: 133.2m +step 04344/16704 (26.01%) | loss: 2.786415 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,624 | mfu: 50.73 | epoch: 1 | total time: 46.69m | eta: 133.2m +step 04345/16704 (26.01%) | loss: 2.791846 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,382 | mfu: 50.90 | epoch: 1 | total time: 46.70m | eta: 133.2m +step 04346/16704 (26.02%) | loss: 2.794300 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,675 | mfu: 50.73 | epoch: 1 | total 
time: 46.71m | eta: 133.1m +step 04347/16704 (26.02%) | loss: 2.793799 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,358 | mfu: 50.84 | epoch: 1 | total time: 46.73m | eta: 133.1m +step 04348/16704 (26.03%) | loss: 2.789033 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,477 | mfu: 50.78 | epoch: 1 | total time: 46.74m | eta: 133.1m +step 04349/16704 (26.04%) | loss: 2.798446 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,966 | mfu: 50.75 | epoch: 1 | total time: 46.75m | eta: 133.1m +step 04350/16704 (26.04%) | loss: 2.806747 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,291 | mfu: 50.71 | epoch: 1 | total time: 46.76m | eta: 133.1m +step 04351/16704 (26.05%) | loss: 2.808441 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,558 | mfu: 50.79 | epoch: 1 | total time: 46.77m | eta: 133.1m +step 04352/16704 (26.05%) | loss: 2.791347 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,104 | mfu: 50.88 | epoch: 1 | total time: 46.78m | eta: 133.1m +step 04353/16704 (26.06%) | loss: 2.788474 | lrm: 1.00 | dt: 649.03ms | tok/sec: 807,799 | mfu: 50.49 | epoch: 1 | total time: 46.79m | eta: 133.1m +step 04354/16704 (26.07%) | loss: 2.784713 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,978 | mfu: 50.69 | epoch: 1 | total time: 46.80m | eta: 133.1m +step 04355/16704 (26.07%) | loss: 2.774260 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,420 | mfu: 50.90 | epoch: 1 | total time: 46.81m | eta: 133.0m +step 04356/16704 (26.08%) | loss: 2.762847 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,025 | mfu: 50.82 | epoch: 1 | total time: 46.82m | eta: 133.0m +step 04357/16704 (26.08%) | loss: 2.767125 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,241 | mfu: 50.64 | epoch: 1 | total time: 46.83m | eta: 133.0m +step 04358/16704 (26.09%) | loss: 2.765784 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,418 | mfu: 50.78 | epoch: 1 | total time: 46.84m | eta: 133.0m +step 04359/16704 (26.10%) | loss: 2.784184 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,679 | mfu: 50.86 | epoch: 1 | total time: 46.85m | eta: 133.0m +step 04360/16704 (26.10%) | loss: 
2.784071 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,448 | mfu: 50.78 | epoch: 1 | total time: 46.87m | eta: 133.0m +step 04361/16704 (26.11%) | loss: 2.789118 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,405 | mfu: 50.96 | epoch: 1 | total time: 46.88m | eta: 133.0m +step 04362/16704 (26.11%) | loss: 2.789821 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,746 | mfu: 50.67 | epoch: 1 | total time: 46.89m | eta: 133.0m +step 04363/16704 (26.12%) | loss: 2.780606 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 46.90m | eta: 133.0m +step 04364/16704 (26.13%) | loss: 2.776866 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,774 | mfu: 50.80 | epoch: 1 | total time: 46.91m | eta: 132.9m +step 04365/16704 (26.13%) | loss: 2.774132 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,356 | mfu: 50.77 | epoch: 1 | total time: 46.92m | eta: 132.9m +step 04366/16704 (26.14%) | loss: 2.791249 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,736 | mfu: 50.73 | epoch: 1 | total time: 46.93m | eta: 132.9m +step 04367/16704 (26.14%) | loss: 2.787281 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,038 | mfu: 50.63 | epoch: 1 | total time: 46.94m | eta: 132.9m +step 04368/16704 (26.15%) | loss: 2.776694 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 1 | total time: 46.95m | eta: 132.9m +step 04369/16704 (26.16%) | loss: 2.785904 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,240 | mfu: 50.83 | epoch: 1 | total time: 46.96m | eta: 132.9m +step 04370/16704 (26.16%) | loss: 2.782707 | lrm: 1.00 | dt: 648.34ms | tok/sec: 808,656 | mfu: 50.54 | epoch: 1 | total time: 46.97m | eta: 132.9m +step 04371/16704 (26.17%) | loss: 2.778924 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,301 | mfu: 50.77 | epoch: 1 | total time: 46.98m | eta: 132.9m +step 04372/16704 (26.17%) | loss: 2.787465 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,257 | mfu: 50.95 | epoch: 1 | total time: 46.99m | eta: 132.9m +step 04373/16704 (26.18%) | loss: 2.789743 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,666 | mfu: 
50.61 | epoch: 1 | total time: 47.01m | eta: 132.8m +step 04374/16704 (26.19%) | loss: 2.784251 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,179 | mfu: 50.89 | epoch: 1 | total time: 47.02m | eta: 132.8m +step 04375/16704 (26.19%) | loss: 2.780047 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,727 | mfu: 50.86 | epoch: 1 | total time: 47.03m | eta: 132.8m +step 04376/16704 (26.20%) | loss: 2.788666 | lrm: 1.00 | dt: 648.16ms | tok/sec: 808,888 | mfu: 50.56 | epoch: 1 | total time: 47.04m | eta: 132.8m +step 04377/16704 (26.20%) | loss: 2.784796 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,764 | mfu: 50.74 | epoch: 1 | total time: 47.05m | eta: 132.8m +step 04378/16704 (26.21%) | loss: 2.784010 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,291 | mfu: 51.02 | epoch: 1 | total time: 47.06m | eta: 132.8m +step 04379/16704 (26.22%) | loss: 2.772850 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,702 | mfu: 50.73 | epoch: 1 | total time: 47.07m | eta: 132.8m +step 04380/16704 (26.22%) | loss: 2.769277 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,447 | mfu: 50.97 | epoch: 1 | total time: 47.08m | eta: 132.8m +step 04381/16704 (26.23%) | loss: 2.777020 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,974 | mfu: 50.69 | epoch: 1 | total time: 47.09m | eta: 132.8m +step 04382/16704 (26.23%) | loss: 2.777889 | lrm: 1.00 | dt: 648.63ms | tok/sec: 808,294 | mfu: 50.52 | epoch: 1 | total time: 47.10m | eta: 132.8m +step 04383/16704 (26.24%) | loss: 2.783040 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,900 | mfu: 50.81 | epoch: 1 | total time: 47.11m | eta: 132.7m +step 04384/16704 (26.25%) | loss: 2.781511 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,870 | mfu: 50.74 | epoch: 1 | total time: 47.12m | eta: 132.7m +step 04385/16704 (26.25%) | loss: 2.780795 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,517 | mfu: 50.78 | epoch: 1 | total time: 47.13m | eta: 132.7m +step 04386/16704 (26.26%) | loss: 2.792132 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | epoch: 1 | total time: 47.15m | eta: 132.7m +step 
04387/16704 (26.26%) | loss: 2.804601 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,752 | mfu: 50.92 | epoch: 1 | total time: 47.16m | eta: 132.7m +step 04388/16704 (26.27%) | loss: 2.798853 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,500 | mfu: 50.84 | epoch: 1 | total time: 47.17m | eta: 132.7m +step 04389/16704 (26.28%) | loss: 2.796555 | lrm: 1.00 | dt: 650.01ms | tok/sec: 806,588 | mfu: 50.41 | epoch: 1 | total time: 47.18m | eta: 132.7m +step 04390/16704 (26.28%) | loss: 2.785987 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,497 | mfu: 50.97 | epoch: 1 | total time: 47.19m | eta: 132.7m +step 04391/16704 (26.29%) | loss: 2.777044 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,743 | mfu: 50.80 | epoch: 1 | total time: 47.20m | eta: 132.7m +step 04392/16704 (26.29%) | loss: 2.786702 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,035 | mfu: 50.69 | epoch: 1 | total time: 47.21m | eta: 132.6m +step 04393/16704 (26.30%) | loss: 2.782966 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,492 | mfu: 50.91 | epoch: 1 | total time: 47.22m | eta: 132.6m +step 04394/16704 (26.31%) | loss: 2.784140 | lrm: 1.00 | dt: 648.45ms | tok/sec: 808,519 | mfu: 50.53 | epoch: 1 | total time: 47.23m | eta: 132.6m +step 04395/16704 (26.31%) | loss: 2.790878 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 1 | total time: 47.24m | eta: 132.6m +step 04396/16704 (26.32%) | loss: 2.783663 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,382 | mfu: 50.78 | epoch: 1 | total time: 47.25m | eta: 132.6m +step 04397/16704 (26.32%) | loss: 2.760414 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 1 | total time: 47.26m | eta: 132.6m +step 04398/16704 (26.33%) | loss: 2.750199 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,222 | mfu: 50.70 | epoch: 1 | total time: 47.27m | eta: 132.6m +step 04399/16704 (26.34%) | loss: 2.760473 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,382 | mfu: 50.96 | epoch: 1 | total time: 47.29m | eta: 132.6m +step 04400/16704 (26.34%) | loss: 2.773253 | lrm: 1.00 | dt: 
644.96ms | tok/sec: 812,902 | mfu: 50.81 | epoch: 1 | total time: 47.30m | eta: 132.6m +step 04401/16704 (26.35%) | loss: 2.765436 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,836 | mfu: 50.68 | epoch: 1 | total time: 47.31m | eta: 132.5m +step 04402/16704 (26.35%) | loss: 2.764808 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,586 | mfu: 50.79 | epoch: 1 | total time: 47.32m | eta: 132.5m +step 04403/16704 (26.36%) | loss: 2.764254 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,168 | mfu: 50.57 | epoch: 1 | total time: 47.33m | eta: 132.5m +step 04404/16704 (26.36%) | loss: 2.769151 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,078 | mfu: 50.82 | epoch: 1 | total time: 47.34m | eta: 132.5m +step 04405/16704 (26.37%) | loss: 2.773196 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,056 | mfu: 50.94 | epoch: 1 | total time: 47.35m | eta: 132.5m +step 04406/16704 (26.38%) | loss: 2.760515 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,126 | mfu: 50.70 | epoch: 1 | total time: 47.36m | eta: 132.5m +step 04407/16704 (26.38%) | loss: 2.758551 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,294 | mfu: 50.77 | epoch: 1 | total time: 47.37m | eta: 132.5m +step 04408/16704 (26.39%) | loss: 2.768631 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,806 | mfu: 50.80 | epoch: 1 | total time: 47.38m | eta: 132.5m +step 04409/16704 (26.39%) | loss: 2.774064 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 1 | total time: 47.39m | eta: 132.5m +step 04410/16704 (26.40%) | loss: 2.782132 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,507 | mfu: 50.97 | epoch: 1 | total time: 47.40m | eta: 132.4m +step 04411/16704 (26.41%) | loss: 2.774723 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,198 | mfu: 50.70 | epoch: 1 | total time: 47.41m | eta: 132.4m +step 04412/16704 (26.41%) | loss: 2.783778 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,338 | mfu: 50.83 | epoch: 1 | total time: 47.42m | eta: 132.4m +step 04413/16704 (26.42%) | loss: 2.785465 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,534 | mfu: 50.85 | epoch: 1 | total 
time: 47.44m | eta: 132.4m +step 04414/16704 (26.42%) | loss: 2.788261 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,943 | mfu: 50.75 | epoch: 1 | total time: 47.45m | eta: 132.4m +step 04415/16704 (26.43%) | loss: 2.788944 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,905 | mfu: 50.87 | epoch: 1 | total time: 47.46m | eta: 132.4m +step 04416/16704 (26.44%) | loss: 2.795586 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,343 | mfu: 50.90 | epoch: 1 | total time: 47.47m | eta: 132.4m +step 04417/16704 (26.44%) | loss: 2.797231 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,522 | mfu: 50.78 | epoch: 1 | total time: 47.48m | eta: 132.4m +step 04418/16704 (26.45%) | loss: 2.805200 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,702 | mfu: 50.67 | epoch: 1 | total time: 47.49m | eta: 132.4m +step 04419/16704 (26.45%) | loss: 2.799994 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,001 | mfu: 50.81 | epoch: 1 | total time: 47.50m | eta: 132.4m +step 04420/16704 (26.46%) | loss: 2.785963 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,286 | mfu: 50.89 | epoch: 1 | total time: 47.51m | eta: 132.3m +step 04421/16704 (26.47%) | loss: 2.782720 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,621 | mfu: 50.66 | epoch: 1 | total time: 47.52m | eta: 132.3m +step 04422/16704 (26.47%) | loss: 2.777577 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,280 | mfu: 50.83 | epoch: 1 | total time: 47.53m | eta: 132.3m +step 04423/16704 (26.48%) | loss: 2.783688 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 1 | total time: 47.54m | eta: 132.3m +step 04424/16704 (26.48%) | loss: 2.780191 | lrm: 1.00 | dt: 649.22ms | tok/sec: 807,560 | mfu: 50.47 | epoch: 1 | total time: 47.55m | eta: 132.3m +step 04425/16704 (26.49%) | loss: 2.772889 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,035 | mfu: 50.94 | epoch: 1 | total time: 47.56m | eta: 132.3m +step 04426/16704 (26.50%) | loss: 2.754459 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,624 | mfu: 50.73 | epoch: 1 | total time: 47.58m | eta: 132.3m +step 04427/16704 (26.50%) | loss: 
2.757892 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,290 | mfu: 50.71 | epoch: 1 | total time: 47.59m | eta: 132.3m +step 04428/16704 (26.51%) | loss: 2.764420 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,272 | mfu: 50.96 | epoch: 1 | total time: 47.60m | eta: 132.3m +step 04429/16704 (26.51%) | loss: 2.751964 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,560 | mfu: 50.66 | epoch: 1 | total time: 47.61m | eta: 132.2m +step 04430/16704 (26.52%) | loss: 2.764873 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,366 | mfu: 50.84 | epoch: 1 | total time: 47.62m | eta: 132.2m +step 04431/16704 (26.53%) | loss: 2.782135 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,360 | mfu: 50.71 | epoch: 1 | total time: 47.63m | eta: 132.2m +step 04432/16704 (26.53%) | loss: 2.770695 | lrm: 1.00 | dt: 647.45ms | tok/sec: 809,778 | mfu: 50.61 | epoch: 1 | total time: 47.64m | eta: 132.2m +step 04433/16704 (26.54%) | loss: 2.775660 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,452 | mfu: 50.97 | epoch: 1 | total time: 47.65m | eta: 132.2m +step 04434/16704 (26.54%) | loss: 2.773202 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,391 | mfu: 50.71 | epoch: 1 | total time: 47.66m | eta: 132.2m +step 04435/16704 (26.55%) | loss: 2.773746 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,733 | mfu: 50.86 | epoch: 1 | total time: 47.67m | eta: 132.2m +step 04436/16704 (26.56%) | loss: 2.776190 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,201 | mfu: 50.83 | epoch: 1 | total time: 47.68m | eta: 132.2m +step 04437/16704 (26.56%) | loss: 2.786038 | lrm: 1.00 | dt: 647.16ms | tok/sec: 810,141 | mfu: 50.63 | epoch: 1 | total time: 47.69m | eta: 132.2m +step 04438/16704 (26.57%) | loss: 2.788565 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,354 | mfu: 50.84 | epoch: 1 | total time: 47.70m | eta: 132.1m +step 04439/16704 (26.57%) | loss: 2.786281 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 1 | total time: 47.72m | eta: 132.1m +step 04440/16704 (26.58%) | loss: 2.791476 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,228 | mfu: 
50.58 | epoch: 1 | total time: 47.73m | eta: 132.1m +step 04441/16704 (26.59%) | loss: 2.787292 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,248 | mfu: 50.83 | epoch: 1 | total time: 47.74m | eta: 132.1m +step 04442/16704 (26.59%) | loss: 2.795272 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,961 | mfu: 50.69 | epoch: 1 | total time: 47.75m | eta: 132.1m +step 04443/16704 (26.60%) | loss: 2.798393 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,448 | mfu: 50.72 | epoch: 1 | total time: 47.76m | eta: 132.1m +step 04444/16704 (26.60%) | loss: 2.800729 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,400 | mfu: 50.78 | epoch: 1 | total time: 47.77m | eta: 132.1m +step 04445/16704 (26.61%) | loss: 2.794767 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,617 | mfu: 50.66 | epoch: 1 | total time: 47.78m | eta: 132.1m +step 04446/16704 (26.62%) | loss: 2.806013 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,502 | mfu: 50.72 | epoch: 1 | total time: 47.79m | eta: 132.1m +step 04447/16704 (26.62%) | loss: 2.795047 | lrm: 1.00 | dt: 647.92ms | tok/sec: 809,184 | mfu: 50.58 | epoch: 1 | total time: 47.80m | eta: 132.0m +step 04448/16704 (26.63%) | loss: 2.778490 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,625 | mfu: 50.73 | epoch: 1 | total time: 47.81m | eta: 132.0m +step 04449/16704 (26.63%) | loss: 2.784724 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,762 | mfu: 50.80 | epoch: 1 | total time: 47.82m | eta: 132.0m +step 04450/16704 (26.64%) | loss: 2.774973 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,295 | mfu: 50.77 | epoch: 1 | total time: 47.83m | eta: 132.0m +step 04451/16704 (26.65%) | loss: 2.772887 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,662 | mfu: 50.79 | epoch: 1 | total time: 47.84m | eta: 132.0m +step 04452/16704 (26.65%) | loss: 2.793191 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,691 | mfu: 50.79 | epoch: 1 | total time: 47.86m | eta: 132.0m +step 04453/16704 (26.66%) | loss: 2.776586 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,786 | mfu: 50.68 | epoch: 1 | total time: 47.87m | eta: 132.0m +step 
04454/16704 (26.66%) | loss: 2.781259 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,714 | mfu: 50.86 | epoch: 1 | total time: 47.88m | eta: 132.0m +step 04455/16704 (26.67%) | loss: 2.784398 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,821 | mfu: 50.74 | epoch: 1 | total time: 47.89m | eta: 132.0m +step 04456/16704 (26.68%) | loss: 2.777730 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,076 | mfu: 50.88 | epoch: 1 | total time: 47.90m | eta: 132.0m +step 04457/16704 (26.68%) | loss: 2.776618 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,061 | mfu: 50.69 | epoch: 1 | total time: 47.91m | eta: 131.9m +step 04458/16704 (26.69%) | loss: 2.758441 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,214 | mfu: 50.64 | epoch: 1 | total time: 47.92m | eta: 131.9m +step 04459/16704 (26.69%) | loss: 2.762708 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,904 | mfu: 50.87 | epoch: 1 | total time: 47.93m | eta: 131.9m +step 04460/16704 (26.70%) | loss: 2.777022 | lrm: 1.00 | dt: 647.05ms | tok/sec: 810,277 | mfu: 50.64 | epoch: 1 | total time: 47.94m | eta: 131.9m +step 04461/16704 (26.71%) | loss: 2.773083 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,348 | mfu: 50.84 | epoch: 1 | total time: 47.95m | eta: 131.9m +step 04462/16704 (26.71%) | loss: 2.771198 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,588 | mfu: 50.79 | epoch: 1 | total time: 47.96m | eta: 131.9m +step 04463/16704 (26.72%) | loss: 2.768220 | lrm: 1.00 | dt: 648.10ms | tok/sec: 808,959 | mfu: 50.56 | epoch: 1 | total time: 47.97m | eta: 131.9m +step 04464/16704 (26.72%) | loss: 2.765945 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,330 | mfu: 50.83 | epoch: 1 | total time: 47.98m | eta: 131.9m +step 04465/16704 (26.73%) | loss: 2.772139 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,074 | mfu: 50.76 | epoch: 1 | total time: 48.00m | eta: 131.9m +step 04466/16704 (26.74%) | loss: 2.775256 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,748 | mfu: 50.92 | epoch: 1 | total time: 48.01m | eta: 131.8m +step 04467/16704 (26.74%) | loss: 2.785625 | lrm: 1.00 | dt: 
644.42ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 1 | total time: 48.02m | eta: 131.8m +step 04468/16704 (26.75%) | loss: 2.780660 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,893 | mfu: 50.87 | epoch: 1 | total time: 48.03m | eta: 131.8m +step 04469/16704 (26.75%) | loss: 2.779945 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,311 | mfu: 50.71 | epoch: 1 | total time: 48.04m | eta: 131.8m +step 04470/16704 (26.76%) | loss: 2.779290 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,234 | mfu: 50.83 | epoch: 1 | total time: 48.05m | eta: 131.8m +step 04471/16704 (26.77%) | loss: 2.777467 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 1 | total time: 48.06m | eta: 131.8m +step 04472/16704 (26.77%) | loss: 2.769392 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,530 | mfu: 50.66 | epoch: 1 | total time: 48.07m | eta: 131.8m +step 04473/16704 (26.78%) | loss: 2.768719 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,706 | mfu: 50.80 | epoch: 1 | total time: 48.08m | eta: 131.8m +step 04474/16704 (26.78%) | loss: 2.771603 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,503 | mfu: 50.78 | epoch: 1 | total time: 48.09m | eta: 131.8m +step 04475/16704 (26.79%) | loss: 2.787892 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,466 | mfu: 50.66 | epoch: 1 | total time: 48.10m | eta: 131.7m +step 04476/16704 (26.80%) | loss: 2.786763 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,320 | mfu: 50.83 | epoch: 1 | total time: 48.11m | eta: 131.7m +step 04477/16704 (26.80%) | loss: 2.780450 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,323 | mfu: 50.83 | epoch: 1 | total time: 48.12m | eta: 131.7m +step 04478/16704 (26.81%) | loss: 2.783299 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 1 | total time: 48.13m | eta: 131.7m +step 04479/16704 (26.81%) | loss: 2.789387 | lrm: 1.00 | dt: 647.37ms | tok/sec: 809,868 | mfu: 50.62 | epoch: 1 | total time: 48.15m | eta: 131.7m +step 04480/16704 (26.82%) | loss: 2.775731 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,838 | mfu: 50.74 | epoch: 1 | total 
time: 48.16m | eta: 131.7m +step 04481/16704 (26.83%) | loss: 2.778256 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,776 | mfu: 50.92 | epoch: 1 | total time: 48.17m | eta: 131.7m +step 04482/16704 (26.83%) | loss: 2.783179 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,912 | mfu: 50.68 | epoch: 1 | total time: 48.18m | eta: 131.7m +step 04483/16704 (26.84%) | loss: 2.795494 | lrm: 1.00 | dt: 646.76ms | tok/sec: 810,643 | mfu: 50.67 | epoch: 1 | total time: 48.19m | eta: 131.7m +step 04484/16704 (26.84%) | loss: 2.793742 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 1 | total time: 48.20m | eta: 131.6m +step 04485/16704 (26.85%) | loss: 2.794106 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,019 | mfu: 50.69 | epoch: 1 | total time: 48.21m | eta: 131.6m +step 04486/16704 (26.86%) | loss: 2.805399 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,362 | mfu: 50.90 | epoch: 1 | total time: 48.22m | eta: 131.6m +step 04487/16704 (26.86%) | loss: 2.805292 | lrm: 1.00 | dt: 647.30ms | tok/sec: 809,960 | mfu: 50.62 | epoch: 1 | total time: 48.23m | eta: 131.6m +step 04488/16704 (26.87%) | loss: 2.812684 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,267 | mfu: 50.77 | epoch: 1 | total time: 48.24m | eta: 131.6m +step 04489/16704 (26.87%) | loss: 2.810850 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,404 | mfu: 50.78 | epoch: 1 | total time: 48.25m | eta: 131.6m +step 04490/16704 (26.88%) | loss: 2.808238 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,192 | mfu: 50.70 | epoch: 1 | total time: 48.26m | eta: 131.6m +step 04491/16704 (26.89%) | loss: 2.796316 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,894 | mfu: 50.74 | epoch: 1 | total time: 48.27m | eta: 131.6m +step 04492/16704 (26.89%) | loss: 2.804208 | lrm: 1.00 | dt: 647.63ms | tok/sec: 809,549 | mfu: 50.60 | epoch: 1 | total time: 48.29m | eta: 131.6m +step 04493/16704 (26.90%) | loss: 2.801735 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,097 | mfu: 50.69 | epoch: 1 | total time: 48.30m | eta: 131.6m +step 04494/16704 (26.90%) | loss: 
2.806514 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,720 | mfu: 50.73 | epoch: 1 | total time: 48.31m | eta: 131.5m +step 04495/16704 (26.91%) | loss: 2.801623 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,049 | mfu: 50.69 | epoch: 1 | total time: 48.32m | eta: 131.5m +step 04496/16704 (26.92%) | loss: 2.807502 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,645 | mfu: 50.79 | epoch: 1 | total time: 48.33m | eta: 131.5m +step 04497/16704 (26.92%) | loss: 2.806862 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,666 | mfu: 50.79 | epoch: 1 | total time: 48.34m | eta: 131.5m +step 04498/16704 (26.93%) | loss: 2.813882 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,684 | mfu: 50.79 | epoch: 1 | total time: 48.35m | eta: 131.5m +step 04499/16704 (26.93%) | loss: 2.806934 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,036 | mfu: 50.75 | epoch: 1 | total time: 48.36m | eta: 131.5m +Step 04500 | Validation bpb: 0.845256 +step 04500/16704 (26.94%) | loss: 2.807591 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,365 | mfu: 50.90 | epoch: 1 | total time: 48.37m | eta: 131.5m +step 04501/16704 (26.95%) | loss: 2.807621 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,806 | mfu: 50.61 | epoch: 1 | total time: 48.38m | eta: 131.5m +step 04502/16704 (26.95%) | loss: 2.808140 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,939 | mfu: 50.75 | epoch: 1 | total time: 48.39m | eta: 131.5m +step 04503/16704 (26.96%) | loss: 2.815194 | lrm: 1.00 | dt: 639.63ms | tok/sec: 819,673 | mfu: 51.23 | epoch: 1 | total time: 48.40m | eta: 131.4m +step 04504/16704 (26.96%) | loss: 2.828006 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,620 | mfu: 50.79 | epoch: 1 | total time: 48.41m | eta: 131.4m +step 04505/16704 (26.97%) | loss: 2.827097 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 1 | total time: 48.43m | eta: 131.4m +step 04506/16704 (26.98%) | loss: 2.813498 | lrm: 1.00 | dt: 639.57ms | tok/sec: 819,748 | mfu: 51.24 | epoch: 1 | total time: 48.44m | eta: 131.4m +step 04507/16704 (26.98%) | loss: 2.813979 | lrm: 1.00 | 
dt: 645.51ms | tok/sec: 812,208 | mfu: 50.76 | epoch: 1 | total time: 48.45m | eta: 131.4m +step 04508/16704 (26.99%) | loss: 2.804617 | lrm: 1.00 | dt: 642.19ms | tok/sec: 816,407 | mfu: 51.03 | epoch: 1 | total time: 48.46m | eta: 131.4m +step 04509/16704 (26.99%) | loss: 2.798071 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,008 | mfu: 50.69 | epoch: 1 | total time: 48.47m | eta: 131.4m +step 04510/16704 (27.00%) | loss: 2.793903 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,080 | mfu: 50.94 | epoch: 1 | total time: 48.48m | eta: 131.4m +step 04511/16704 (27.01%) | loss: 2.791399 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 1 | total time: 48.49m | eta: 131.4m +step 04512/16704 (27.01%) | loss: 2.800050 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,487 | mfu: 50.91 | epoch: 1 | total time: 48.50m | eta: 131.3m +step 04513/16704 (27.02%) | loss: 2.807685 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,665 | mfu: 50.92 | epoch: 1 | total time: 48.51m | eta: 131.3m +step 04514/16704 (27.02%) | loss: 2.811301 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,658 | mfu: 50.92 | epoch: 1 | total time: 48.52m | eta: 131.3m +step 04515/16704 (27.03%) | loss: 2.818198 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,711 | mfu: 50.92 | epoch: 1 | total time: 48.53m | eta: 131.3m +step 04516/16704 (27.04%) | loss: 2.813423 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,512 | mfu: 50.85 | epoch: 1 | total time: 48.54m | eta: 131.3m +step 04517/16704 (27.04%) | loss: 2.785629 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,038 | mfu: 50.88 | epoch: 1 | total time: 48.55m | eta: 131.3m +step 04518/16704 (27.05%) | loss: 2.776890 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,352 | mfu: 50.84 | epoch: 1 | total time: 48.56m | eta: 131.3m +step 04519/16704 (27.05%) | loss: 2.785005 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,338 | mfu: 50.90 | epoch: 1 | total time: 48.58m | eta: 131.3m +step 04520/16704 (27.06%) | loss: 2.780131 | lrm: 1.00 | dt: 641.78ms | tok/sec: 816,928 | mfu: 51.06 | epoch: 1 | 
total time: 48.59m | eta: 131.3m +step 04521/16704 (27.07%) | loss: 2.773915 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,073 | mfu: 50.82 | epoch: 1 | total time: 48.60m | eta: 131.2m +step 04522/16704 (27.07%) | loss: 2.777452 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,522 | mfu: 50.85 | epoch: 1 | total time: 48.61m | eta: 131.2m +step 04523/16704 (27.08%) | loss: 2.793184 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,816 | mfu: 50.99 | epoch: 1 | total time: 48.62m | eta: 131.2m +step 04524/16704 (27.08%) | loss: 2.796571 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,253 | mfu: 51.02 | epoch: 1 | total time: 48.63m | eta: 131.2m +step 04525/16704 (27.09%) | loss: 2.787781 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,832 | mfu: 50.93 | epoch: 1 | total time: 48.64m | eta: 131.2m +step 04526/16704 (27.10%) | loss: 2.780147 | lrm: 1.00 | dt: 642.10ms | tok/sec: 816,518 | mfu: 51.03 | epoch: 1 | total time: 48.65m | eta: 131.2m +step 04527/16704 (27.10%) | loss: 2.790050 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,259 | mfu: 50.83 | epoch: 1 | total time: 48.66m | eta: 131.2m +step 04528/16704 (27.11%) | loss: 2.788988 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,714 | mfu: 50.98 | epoch: 1 | total time: 48.67m | eta: 131.2m +step 04529/16704 (27.11%) | loss: 2.800490 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,198 | mfu: 50.64 | epoch: 1 | total time: 48.68m | eta: 131.2m +step 04530/16704 (27.12%) | loss: 2.798951 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,457 | mfu: 50.97 | epoch: 1 | total time: 48.69m | eta: 131.1m +step 04531/16704 (27.13%) | loss: 2.791198 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,349 | mfu: 50.84 | epoch: 1 | total time: 48.70m | eta: 131.1m +step 04532/16704 (27.13%) | loss: 2.778551 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,523 | mfu: 50.91 | epoch: 1 | total time: 48.71m | eta: 131.1m +step 04533/16704 (27.14%) | loss: 2.777089 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,389 | mfu: 50.96 | epoch: 1 | total time: 48.73m | eta: 131.1m +step 04534/16704 (27.14%) | 
loss: 2.763956 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,165 | mfu: 50.89 | epoch: 1 | total time: 48.74m | eta: 131.1m +step 04535/16704 (27.15%) | loss: 2.759891 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,656 | mfu: 50.98 | epoch: 1 | total time: 48.75m | eta: 131.1m +step 04536/16704 (27.16%) | loss: 2.769385 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,120 | mfu: 50.95 | epoch: 1 | total time: 48.76m | eta: 131.1m +step 04537/16704 (27.16%) | loss: 2.771969 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,534 | mfu: 50.91 | epoch: 1 | total time: 48.77m | eta: 131.1m +step 04538/16704 (27.17%) | loss: 2.784734 | lrm: 1.00 | dt: 642.46ms | tok/sec: 816,058 | mfu: 51.00 | epoch: 1 | total time: 48.78m | eta: 131.1m +step 04539/16704 (27.17%) | loss: 2.789642 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,106 | mfu: 50.88 | epoch: 1 | total time: 48.79m | eta: 131.1m +step 04540/16704 (27.18%) | loss: 2.782678 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,679 | mfu: 50.86 | epoch: 1 | total time: 48.80m | eta: 131.0m +step 04541/16704 (27.19%) | loss: 2.793492 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,187 | mfu: 50.89 | epoch: 1 | total time: 48.81m | eta: 131.0m +step 04542/16704 (27.19%) | loss: 2.777237 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,915 | mfu: 50.81 | epoch: 1 | total time: 48.82m | eta: 131.0m +step 04543/16704 (27.20%) | loss: 2.787606 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,122 | mfu: 50.95 | epoch: 1 | total time: 48.83m | eta: 131.0m +step 04544/16704 (27.20%) | loss: 2.781116 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,412 | mfu: 50.65 | epoch: 1 | total time: 48.84m | eta: 131.0m +step 04545/16704 (27.21%) | loss: 2.772709 | lrm: 1.00 | dt: 640.85ms | tok/sec: 818,109 | mfu: 51.13 | epoch: 1 | total time: 48.85m | eta: 131.0m +step 04546/16704 (27.22%) | loss: 2.785141 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,365 | mfu: 50.77 | epoch: 1 | total time: 48.87m | eta: 131.0m +step 04547/16704 (27.22%) | loss: 2.781681 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,457 | 
mfu: 50.72 | epoch: 1 | total time: 48.88m | eta: 131.0m +step 04548/16704 (27.23%) | loss: 2.770120 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,282 | mfu: 50.96 | epoch: 1 | total time: 48.89m | eta: 131.0m +step 04549/16704 (27.23%) | loss: 2.760975 | lrm: 1.00 | dt: 647.77ms | tok/sec: 809,375 | mfu: 50.59 | epoch: 1 | total time: 48.90m | eta: 130.9m +step 04550/16704 (27.24%) | loss: 2.755748 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,429 | mfu: 50.84 | epoch: 1 | total time: 48.91m | eta: 130.9m +step 04551/16704 (27.24%) | loss: 2.752667 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,146 | mfu: 50.89 | epoch: 1 | total time: 48.92m | eta: 130.9m +step 04552/16704 (27.25%) | loss: 2.756519 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,769 | mfu: 50.92 | epoch: 1 | total time: 48.93m | eta: 130.9m +step 04553/16704 (27.26%) | loss: 2.761417 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,013 | mfu: 50.88 | epoch: 1 | total time: 48.94m | eta: 130.9m +step 04554/16704 (27.26%) | loss: 2.758881 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,945 | mfu: 50.87 | epoch: 1 | total time: 48.95m | eta: 130.9m +step 04555/16704 (27.27%) | loss: 2.768119 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,185 | mfu: 50.89 | epoch: 1 | total time: 48.96m | eta: 130.9m +step 04556/16704 (27.27%) | loss: 2.779549 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,280 | mfu: 50.89 | epoch: 1 | total time: 48.97m | eta: 130.9m +step 04557/16704 (27.28%) | loss: 2.781700 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 1 | total time: 48.98m | eta: 130.9m +step 04558/16704 (27.29%) | loss: 2.782632 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,210 | mfu: 50.89 | epoch: 1 | total time: 48.99m | eta: 130.8m +step 04559/16704 (27.29%) | loss: 2.785907 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,896 | mfu: 50.99 | epoch: 1 | total time: 49.00m | eta: 130.8m +step 04560/16704 (27.30%) | loss: 2.799081 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,641 | mfu: 50.73 | epoch: 1 | total time: 49.02m | eta: 130.8m +step 
04561/16704 (27.30%) | loss: 2.769468 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,436 | mfu: 51.03 | epoch: 1 | total time: 49.03m | eta: 130.8m +step 04562/16704 (27.31%) | loss: 2.742756 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,337 | mfu: 50.77 | epoch: 1 | total time: 49.04m | eta: 130.8m +step 04563/16704 (27.32%) | loss: 2.746764 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,947 | mfu: 50.87 | epoch: 1 | total time: 49.05m | eta: 130.8m +step 04564/16704 (27.32%) | loss: 2.748019 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,164 | mfu: 50.89 | epoch: 1 | total time: 49.06m | eta: 130.8m +step 04565/16704 (27.33%) | loss: 2.770196 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,910 | mfu: 50.75 | epoch: 1 | total time: 49.07m | eta: 130.8m +step 04566/16704 (27.33%) | loss: 2.766854 | lrm: 1.00 | dt: 648.26ms | tok/sec: 808,759 | mfu: 50.55 | epoch: 1 | total time: 49.08m | eta: 130.8m +step 04567/16704 (27.34%) | loss: 2.774088 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,083 | mfu: 50.94 | epoch: 1 | total time: 49.09m | eta: 130.7m +step 04568/16704 (27.35%) | loss: 2.785247 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,842 | mfu: 50.80 | epoch: 1 | total time: 49.10m | eta: 130.7m +step 04569/16704 (27.35%) | loss: 2.778592 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,958 | mfu: 50.81 | epoch: 1 | total time: 49.11m | eta: 130.7m +step 04570/16704 (27.36%) | loss: 2.768508 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,506 | mfu: 50.78 | epoch: 1 | total time: 49.12m | eta: 130.7m +step 04571/16704 (27.36%) | loss: 2.768398 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 1 | total time: 49.13m | eta: 130.7m +step 04572/16704 (27.37%) | loss: 2.750500 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,433 | mfu: 50.84 | epoch: 1 | total time: 49.14m | eta: 130.7m +step 04573/16704 (27.38%) | loss: 2.763233 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,841 | mfu: 50.87 | epoch: 1 | total time: 49.16m | eta: 130.7m +step 04574/16704 (27.38%) | loss: 2.762590 | lrm: 1.00 | dt: 
644.05ms | tok/sec: 814,050 | mfu: 50.88 | epoch: 1 | total time: 49.17m | eta: 130.7m +step 04575/16704 (27.39%) | loss: 2.771641 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 49.18m | eta: 130.7m +step 04576/16704 (27.39%) | loss: 2.782857 | lrm: 1.00 | dt: 642.37ms | tok/sec: 816,172 | mfu: 51.01 | epoch: 1 | total time: 49.19m | eta: 130.6m +step 04577/16704 (27.40%) | loss: 2.785397 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,357 | mfu: 50.77 | epoch: 1 | total time: 49.20m | eta: 130.6m +step 04578/16704 (27.41%) | loss: 2.776353 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,303 | mfu: 50.83 | epoch: 1 | total time: 49.21m | eta: 130.6m +step 04579/16704 (27.41%) | loss: 2.777925 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,121 | mfu: 50.88 | epoch: 1 | total time: 49.22m | eta: 130.6m +step 04580/16704 (27.42%) | loss: 2.777015 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,840 | mfu: 50.87 | epoch: 1 | total time: 49.23m | eta: 130.6m +step 04581/16704 (27.42%) | loss: 2.775381 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,172 | mfu: 50.76 | epoch: 1 | total time: 49.24m | eta: 130.6m +step 04582/16704 (27.43%) | loss: 2.794364 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,802 | mfu: 50.80 | epoch: 1 | total time: 49.25m | eta: 130.6m +step 04583/16704 (27.44%) | loss: 2.792196 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,128 | mfu: 50.82 | epoch: 1 | total time: 49.26m | eta: 130.6m +step 04584/16704 (27.44%) | loss: 2.784138 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,975 | mfu: 50.94 | epoch: 1 | total time: 49.27m | eta: 130.6m +step 04585/16704 (27.45%) | loss: 2.779577 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,723 | mfu: 51.05 | epoch: 1 | total time: 49.28m | eta: 130.6m +step 04586/16704 (27.45%) | loss: 2.766292 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,694 | mfu: 50.79 | epoch: 1 | total time: 49.29m | eta: 130.5m +step 04587/16704 (27.46%) | loss: 2.764615 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,112 | mfu: 50.95 | epoch: 1 | total 
time: 49.31m | eta: 130.5m +step 04588/16704 (27.47%) | loss: 2.754870 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,048 | mfu: 50.88 | epoch: 1 | total time: 49.32m | eta: 130.5m +step 04589/16704 (27.47%) | loss: 2.744015 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,819 | mfu: 50.93 | epoch: 1 | total time: 49.33m | eta: 130.5m +step 04590/16704 (27.48%) | loss: 2.745425 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,344 | mfu: 50.77 | epoch: 1 | total time: 49.34m | eta: 130.5m +step 04591/16704 (27.48%) | loss: 2.766618 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,257 | mfu: 50.89 | epoch: 1 | total time: 49.35m | eta: 130.5m +step 04592/16704 (27.49%) | loss: 2.764809 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 1 | total time: 49.36m | eta: 130.5m +step 04593/16704 (27.50%) | loss: 2.771624 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,367 | mfu: 50.96 | epoch: 1 | total time: 49.37m | eta: 130.5m +step 04594/16704 (27.50%) | loss: 2.765814 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 1 | total time: 49.38m | eta: 130.5m +step 04595/16704 (27.51%) | loss: 2.763872 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,658 | mfu: 50.79 | epoch: 1 | total time: 49.39m | eta: 130.4m +step 04596/16704 (27.51%) | loss: 2.784292 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,813 | mfu: 50.80 | epoch: 1 | total time: 49.40m | eta: 130.4m +step 04597/16704 (27.52%) | loss: 2.774824 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,860 | mfu: 50.87 | epoch: 1 | total time: 49.41m | eta: 130.4m +step 04598/16704 (27.53%) | loss: 2.770253 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,546 | mfu: 50.85 | epoch: 1 | total time: 49.42m | eta: 130.4m +step 04599/16704 (27.53%) | loss: 2.754651 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,590 | mfu: 50.85 | epoch: 1 | total time: 49.43m | eta: 130.4m +step 04600/16704 (27.54%) | loss: 2.748727 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,295 | mfu: 50.89 | epoch: 1 | total time: 49.45m | eta: 130.4m +step 04601/16704 (27.54%) | loss: 
2.768837 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,281 | mfu: 50.89 | epoch: 1 | total time: 49.46m | eta: 130.4m +step 04602/16704 (27.55%) | loss: 2.776475 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,463 | mfu: 50.66 | epoch: 1 | total time: 49.47m | eta: 130.4m +step 04603/16704 (27.56%) | loss: 2.781071 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,551 | mfu: 51.04 | epoch: 1 | total time: 49.48m | eta: 130.4m +step 04604/16704 (27.56%) | loss: 2.779627 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,272 | mfu: 50.77 | epoch: 1 | total time: 49.49m | eta: 130.3m +step 04605/16704 (27.57%) | loss: 2.780371 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,219 | mfu: 50.89 | epoch: 1 | total time: 49.50m | eta: 130.3m +step 04606/16704 (27.57%) | loss: 2.776140 | lrm: 1.00 | dt: 642.33ms | tok/sec: 816,226 | mfu: 51.02 | epoch: 1 | total time: 49.51m | eta: 130.3m +step 04607/16704 (27.58%) | loss: 2.774094 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,190 | mfu: 50.95 | epoch: 1 | total time: 49.52m | eta: 130.3m +step 04608/16704 (27.59%) | loss: 2.774403 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,117 | mfu: 50.76 | epoch: 1 | total time: 49.53m | eta: 130.3m +step 04609/16704 (27.59%) | loss: 2.771488 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,186 | mfu: 50.83 | epoch: 1 | total time: 49.54m | eta: 130.3m +step 04610/16704 (27.60%) | loss: 2.762842 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,470 | mfu: 50.91 | epoch: 1 | total time: 49.55m | eta: 130.3m +step 04611/16704 (27.60%) | loss: 2.771100 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,683 | mfu: 50.79 | epoch: 1 | total time: 49.56m | eta: 130.3m +step 04612/16704 (27.61%) | loss: 2.769890 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,479 | mfu: 50.84 | epoch: 1 | total time: 49.57m | eta: 130.3m +step 04613/16704 (27.62%) | loss: 2.762610 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,388 | mfu: 50.96 | epoch: 1 | total time: 49.58m | eta: 130.2m +step 04614/16704 (27.62%) | loss: 2.767298 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,627 | mfu: 
50.85 | epoch: 1 | total time: 49.60m | eta: 130.2m +step 04615/16704 (27.63%) | loss: 2.771046 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,397 | mfu: 50.71 | epoch: 1 | total time: 49.61m | eta: 130.2m +step 04616/16704 (27.63%) | loss: 2.767162 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,770 | mfu: 50.86 | epoch: 1 | total time: 49.62m | eta: 130.2m +step 04617/16704 (27.64%) | loss: 2.756877 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,677 | mfu: 50.92 | epoch: 1 | total time: 49.63m | eta: 130.2m +step 04618/16704 (27.65%) | loss: 2.738579 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,273 | mfu: 50.83 | epoch: 1 | total time: 49.64m | eta: 130.2m +step 04619/16704 (27.65%) | loss: 2.739710 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,107 | mfu: 50.95 | epoch: 1 | total time: 49.65m | eta: 130.2m +step 04620/16704 (27.66%) | loss: 2.743343 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 1 | total time: 49.66m | eta: 130.2m +step 04621/16704 (27.66%) | loss: 2.743440 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,104 | mfu: 50.76 | epoch: 1 | total time: 49.67m | eta: 130.2m +step 04622/16704 (27.67%) | loss: 2.730162 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,309 | mfu: 50.83 | epoch: 1 | total time: 49.68m | eta: 130.1m +step 04623/16704 (27.68%) | loss: 2.733443 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | total time: 49.69m | eta: 130.1m +step 04624/16704 (27.68%) | loss: 2.724207 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,513 | mfu: 50.91 | epoch: 1 | total time: 49.70m | eta: 130.1m +step 04625/16704 (27.69%) | loss: 2.718789 | lrm: 1.00 | dt: 641.42ms | tok/sec: 817,386 | mfu: 51.09 | epoch: 1 | total time: 49.71m | eta: 130.1m +step 04626/16704 (27.69%) | loss: 2.736593 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,739 | mfu: 50.92 | epoch: 1 | total time: 49.72m | eta: 130.1m +step 04627/16704 (27.70%) | loss: 2.744689 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,661 | mfu: 50.92 | epoch: 1 | total time: 49.74m | eta: 130.1m +step 
04628/16704 (27.71%) | loss: 2.738148 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,277 | mfu: 50.96 | epoch: 1 | total time: 49.75m | eta: 130.1m +step 04629/16704 (27.71%) | loss: 2.738806 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,964 | mfu: 50.87 | epoch: 1 | total time: 49.76m | eta: 130.1m +step 04630/16704 (27.72%) | loss: 2.740940 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,971 | mfu: 50.87 | epoch: 1 | total time: 49.77m | eta: 130.1m +step 04631/16704 (27.72%) | loss: 2.735982 | lrm: 1.00 | dt: 641.87ms | tok/sec: 816,817 | mfu: 51.05 | epoch: 1 | total time: 49.78m | eta: 130.1m +step 04632/16704 (27.73%) | loss: 2.739503 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,825 | mfu: 50.87 | epoch: 1 | total time: 49.79m | eta: 130.0m +step 04633/16704 (27.74%) | loss: 2.726394 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,472 | mfu: 50.97 | epoch: 1 | total time: 49.80m | eta: 130.0m +step 04634/16704 (27.74%) | loss: 2.736077 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,407 | mfu: 50.96 | epoch: 1 | total time: 49.81m | eta: 130.0m +step 04635/16704 (27.75%) | loss: 2.753831 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,590 | mfu: 50.85 | epoch: 1 | total time: 49.82m | eta: 130.0m +step 04636/16704 (27.75%) | loss: 2.741133 | lrm: 1.00 | dt: 641.93ms | tok/sec: 816,734 | mfu: 51.05 | epoch: 1 | total time: 49.83m | eta: 130.0m +step 04637/16704 (27.76%) | loss: 2.750555 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 1 | total time: 49.84m | eta: 130.0m +step 04638/16704 (27.77%) | loss: 2.775990 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,218 | mfu: 50.95 | epoch: 1 | total time: 49.85m | eta: 130.0m +step 04639/16704 (27.77%) | loss: 2.764131 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,606 | mfu: 50.73 | epoch: 1 | total time: 49.86m | eta: 130.0m +step 04640/16704 (27.78%) | loss: 2.768580 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,168 | mfu: 50.95 | epoch: 1 | total time: 49.87m | eta: 130.0m +step 04641/16704 (27.78%) | loss: 2.766227 | lrm: 1.00 | dt: 
643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 1 | total time: 49.89m | eta: 129.9m +step 04642/16704 (27.79%) | loss: 2.749713 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,638 | mfu: 50.79 | epoch: 1 | total time: 49.90m | eta: 129.9m +step 04643/16704 (27.80%) | loss: 2.765637 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 1 | total time: 49.91m | eta: 129.9m +step 04644/16704 (27.80%) | loss: 2.770408 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,958 | mfu: 50.87 | epoch: 1 | total time: 49.92m | eta: 129.9m +step 04645/16704 (27.81%) | loss: 2.765404 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,789 | mfu: 50.80 | epoch: 1 | total time: 49.93m | eta: 129.9m +step 04646/16704 (27.81%) | loss: 2.775454 | lrm: 1.00 | dt: 642.09ms | tok/sec: 816,532 | mfu: 51.03 | epoch: 1 | total time: 49.94m | eta: 129.9m +step 04647/16704 (27.82%) | loss: 2.789212 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,270 | mfu: 50.89 | epoch: 1 | total time: 49.95m | eta: 129.9m +step 04648/16704 (27.83%) | loss: 2.794205 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 1 | total time: 49.96m | eta: 129.9m +step 04649/16704 (27.83%) | loss: 2.800073 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,539 | mfu: 50.85 | epoch: 1 | total time: 49.97m | eta: 129.9m +step 04650/16704 (27.84%) | loss: 2.802527 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,759 | mfu: 50.92 | epoch: 1 | total time: 49.98m | eta: 129.8m +step 04651/16704 (27.84%) | loss: 2.801830 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 1 | total time: 49.99m | eta: 129.8m +step 04652/16704 (27.85%) | loss: 2.798693 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,465 | mfu: 50.78 | epoch: 1 | total time: 50.00m | eta: 129.8m +step 04653/16704 (27.86%) | loss: 2.793367 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,689 | mfu: 50.79 | epoch: 1 | total time: 50.01m | eta: 129.8m +step 04654/16704 (27.86%) | loss: 2.787590 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,868 | mfu: 50.87 | epoch: 1 | total 
time: 50.02m | eta: 129.8m +step 04655/16704 (27.87%) | loss: 2.778849 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 1 | total time: 50.04m | eta: 129.8m +step 04656/16704 (27.87%) | loss: 2.777052 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 1 | total time: 50.05m | eta: 129.8m +step 04657/16704 (27.88%) | loss: 2.778214 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,056 | mfu: 50.75 | epoch: 1 | total time: 50.06m | eta: 129.8m +step 04658/16704 (27.89%) | loss: 2.768749 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,988 | mfu: 50.81 | epoch: 1 | total time: 50.07m | eta: 129.8m +step 04659/16704 (27.89%) | loss: 2.781011 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 1 | total time: 50.08m | eta: 129.7m +step 04660/16704 (27.90%) | loss: 2.774290 | lrm: 1.00 | dt: 642.02ms | tok/sec: 816,625 | mfu: 51.04 | epoch: 1 | total time: 50.09m | eta: 129.7m +step 04661/16704 (27.90%) | loss: 2.776219 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,066 | mfu: 50.94 | epoch: 1 | total time: 50.10m | eta: 129.7m +step 04662/16704 (27.91%) | loss: 2.793813 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,380 | mfu: 50.84 | epoch: 1 | total time: 50.11m | eta: 129.7m +step 04663/16704 (27.92%) | loss: 2.789217 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,652 | mfu: 50.98 | epoch: 1 | total time: 50.12m | eta: 129.7m +step 04664/16704 (27.92%) | loss: 2.787383 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,941 | mfu: 50.81 | epoch: 1 | total time: 50.13m | eta: 129.7m +step 04665/16704 (27.93%) | loss: 2.803420 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,573 | mfu: 50.91 | epoch: 1 | total time: 50.14m | eta: 129.7m +step 04666/16704 (27.93%) | loss: 2.793910 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 1 | total time: 50.15m | eta: 129.7m +step 04667/16704 (27.94%) | loss: 2.779847 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,951 | mfu: 50.75 | epoch: 1 | total time: 50.16m | eta: 129.7m +step 04668/16704 (27.95%) | loss: 
2.767807 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,122 | mfu: 50.95 | epoch: 1 | total time: 50.18m | eta: 129.6m +step 04669/16704 (27.95%) | loss: 2.788982 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,026 | mfu: 50.82 | epoch: 1 | total time: 50.19m | eta: 129.6m +step 04670/16704 (27.96%) | loss: 2.786675 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,416 | mfu: 50.96 | epoch: 1 | total time: 50.20m | eta: 129.6m +step 04671/16704 (27.96%) | loss: 2.786130 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,033 | mfu: 50.88 | epoch: 1 | total time: 50.21m | eta: 129.6m +step 04672/16704 (27.97%) | loss: 2.790079 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 1 | total time: 50.22m | eta: 129.6m +step 04673/16704 (27.98%) | loss: 2.795797 | lrm: 1.00 | dt: 642.42ms | tok/sec: 816,109 | mfu: 51.01 | epoch: 1 | total time: 50.23m | eta: 129.6m +step 04674/16704 (27.98%) | loss: 2.803019 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,746 | mfu: 50.86 | epoch: 1 | total time: 50.24m | eta: 129.6m +step 04675/16704 (27.99%) | loss: 2.802546 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,429 | mfu: 50.97 | epoch: 1 | total time: 50.25m | eta: 129.6m +step 04676/16704 (27.99%) | loss: 2.805268 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,037 | mfu: 50.94 | epoch: 1 | total time: 50.26m | eta: 129.6m +step 04677/16704 (28.00%) | loss: 2.809260 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,197 | mfu: 50.89 | epoch: 1 | total time: 50.27m | eta: 129.6m +step 04678/16704 (28.01%) | loss: 2.809269 | lrm: 1.00 | dt: 642.27ms | tok/sec: 816,309 | mfu: 51.02 | epoch: 1 | total time: 50.28m | eta: 129.5m +step 04679/16704 (28.01%) | loss: 2.808756 | lrm: 1.00 | dt: 642.48ms | tok/sec: 816,036 | mfu: 51.00 | epoch: 1 | total time: 50.29m | eta: 129.5m +step 04680/16704 (28.02%) | loss: 2.808363 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,830 | mfu: 50.99 | epoch: 1 | total time: 50.30m | eta: 129.5m +step 04681/16704 (28.02%) | loss: 2.812101 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,482 | mfu: 
50.78 | epoch: 1 | total time: 50.31m | eta: 129.5m +step 04682/16704 (28.03%) | loss: 2.818914 | lrm: 1.00 | dt: 642.43ms | tok/sec: 816,101 | mfu: 51.01 | epoch: 1 | total time: 50.33m | eta: 129.5m +step 04683/16704 (28.04%) | loss: 2.811134 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,227 | mfu: 50.77 | epoch: 1 | total time: 50.34m | eta: 129.5m +step 04684/16704 (28.04%) | loss: 2.817164 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,136 | mfu: 50.82 | epoch: 1 | total time: 50.35m | eta: 129.5m +step 04685/16704 (28.05%) | loss: 2.815498 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,167 | mfu: 51.01 | epoch: 1 | total time: 50.36m | eta: 129.5m +step 04686/16704 (28.05%) | loss: 2.806594 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,978 | mfu: 50.81 | epoch: 1 | total time: 50.37m | eta: 129.5m +step 04687/16704 (28.06%) | loss: 2.799750 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,006 | mfu: 50.94 | epoch: 1 | total time: 50.38m | eta: 129.4m +step 04688/16704 (28.07%) | loss: 2.788315 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 1 | total time: 50.39m | eta: 129.4m +step 04689/16704 (28.07%) | loss: 2.789033 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,137 | mfu: 50.88 | epoch: 1 | total time: 50.40m | eta: 129.4m +step 04690/16704 (28.08%) | loss: 2.784953 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,597 | mfu: 50.98 | epoch: 1 | total time: 50.41m | eta: 129.4m +step 04691/16704 (28.08%) | loss: 2.779663 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,005 | mfu: 50.88 | epoch: 1 | total time: 50.42m | eta: 129.4m +step 04692/16704 (28.09%) | loss: 2.777602 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,034 | mfu: 50.75 | epoch: 1 | total time: 50.43m | eta: 129.4m +step 04693/16704 (28.10%) | loss: 2.772101 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,419 | mfu: 50.90 | epoch: 1 | total time: 50.44m | eta: 129.4m +step 04694/16704 (28.10%) | loss: 2.781560 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,209 | mfu: 50.76 | epoch: 1 | total time: 50.45m | eta: 129.4m +step 
04695/16704 (28.11%) | loss: 2.787614 | lrm: 1.00 | dt: 642.34ms | tok/sec: 816,217 | mfu: 51.01 | epoch: 1 | total time: 50.46m | eta: 129.4m +step 04696/16704 (28.11%) | loss: 2.782736 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,741 | mfu: 50.73 | epoch: 1 | total time: 50.48m | eta: 129.3m +step 04697/16704 (28.12%) | loss: 2.790156 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,245 | mfu: 50.70 | epoch: 1 | total time: 50.49m | eta: 129.3m +step 04698/16704 (28.12%) | loss: 2.776219 | lrm: 1.00 | dt: 641.23ms | tok/sec: 817,625 | mfu: 51.10 | epoch: 1 | total time: 50.50m | eta: 129.3m +step 04699/16704 (28.13%) | loss: 2.779696 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,307 | mfu: 50.77 | epoch: 1 | total time: 50.51m | eta: 129.3m +step 04700/16704 (28.14%) | loss: 2.781408 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,268 | mfu: 50.89 | epoch: 1 | total time: 50.52m | eta: 129.3m +step 04701/16704 (28.14%) | loss: 2.778861 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 1 | total time: 50.53m | eta: 129.3m +step 04702/16704 (28.15%) | loss: 2.787002 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 1 | total time: 50.54m | eta: 129.3m +step 04703/16704 (28.15%) | loss: 2.789499 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,750 | mfu: 50.86 | epoch: 1 | total time: 50.55m | eta: 129.3m +step 04704/16704 (28.16%) | loss: 2.790348 | lrm: 1.00 | dt: 641.66ms | tok/sec: 817,084 | mfu: 51.07 | epoch: 1 | total time: 50.56m | eta: 129.3m +step 04705/16704 (28.17%) | loss: 2.792120 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,503 | mfu: 50.85 | epoch: 1 | total time: 50.57m | eta: 129.2m +step 04706/16704 (28.17%) | loss: 2.787877 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,039 | mfu: 50.94 | epoch: 1 | total time: 50.58m | eta: 129.2m +step 04707/16704 (28.18%) | loss: 2.784286 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,968 | mfu: 50.94 | epoch: 1 | total time: 50.59m | eta: 129.2m +step 04708/16704 (28.18%) | loss: 2.801781 | lrm: 1.00 | dt: 
643.33ms | tok/sec: 814,955 | mfu: 50.94 | epoch: 1 | total time: 50.60m | eta: 129.2m +step 04709/16704 (28.19%) | loss: 2.801173 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,427 | mfu: 50.78 | epoch: 1 | total time: 50.62m | eta: 129.2m +step 04710/16704 (28.20%) | loss: 2.800806 | lrm: 1.00 | dt: 642.80ms | tok/sec: 815,625 | mfu: 50.98 | epoch: 1 | total time: 50.63m | eta: 129.2m +step 04711/16704 (28.20%) | loss: 2.795775 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,538 | mfu: 50.85 | epoch: 1 | total time: 50.64m | eta: 129.2m +step 04712/16704 (28.21%) | loss: 2.800580 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,728 | mfu: 50.73 | epoch: 1 | total time: 50.65m | eta: 129.2m +step 04713/16704 (28.21%) | loss: 2.786529 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,348 | mfu: 50.84 | epoch: 1 | total time: 50.66m | eta: 129.2m +step 04714/16704 (28.22%) | loss: 2.786237 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,617 | mfu: 50.91 | epoch: 1 | total time: 50.67m | eta: 129.1m +step 04715/16704 (28.23%) | loss: 2.792366 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,276 | mfu: 50.89 | epoch: 1 | total time: 50.68m | eta: 129.1m +step 04716/16704 (28.23%) | loss: 2.788421 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,198 | mfu: 50.83 | epoch: 1 | total time: 50.69m | eta: 129.1m +step 04717/16704 (28.24%) | loss: 2.785949 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,768 | mfu: 50.86 | epoch: 1 | total time: 50.70m | eta: 129.1m +step 04718/16704 (28.24%) | loss: 2.782613 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,580 | mfu: 50.85 | epoch: 1 | total time: 50.71m | eta: 129.1m +step 04719/16704 (28.25%) | loss: 2.791121 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,322 | mfu: 50.96 | epoch: 1 | total time: 50.72m | eta: 129.1m +step 04720/16704 (28.26%) | loss: 2.793745 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,014 | mfu: 51.00 | epoch: 1 | total time: 50.73m | eta: 129.1m +step 04721/16704 (28.26%) | loss: 2.792626 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,351 | mfu: 50.84 | epoch: 1 | total 
time: 50.74m | eta: 129.1m +step 04722/16704 (28.27%) | loss: 2.788234 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,012 | mfu: 50.75 | epoch: 1 | total time: 50.75m | eta: 129.1m +step 04723/16704 (28.27%) | loss: 2.793963 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,184 | mfu: 50.83 | epoch: 1 | total time: 50.77m | eta: 129.1m +step 04724/16704 (28.28%) | loss: 2.802248 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,478 | mfu: 50.84 | epoch: 1 | total time: 50.78m | eta: 129.0m +step 04725/16704 (28.29%) | loss: 2.815521 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,595 | mfu: 50.91 | epoch: 1 | total time: 50.79m | eta: 129.0m +step 04726/16704 (28.29%) | loss: 2.809143 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,938 | mfu: 50.93 | epoch: 1 | total time: 50.80m | eta: 129.0m +step 04727/16704 (28.30%) | loss: 2.795944 | lrm: 1.00 | dt: 641.19ms | tok/sec: 817,679 | mfu: 51.11 | epoch: 1 | total time: 50.81m | eta: 129.0m +step 04728/16704 (28.30%) | loss: 2.794598 | lrm: 1.00 | dt: 641.44ms | tok/sec: 817,361 | mfu: 51.09 | epoch: 1 | total time: 50.82m | eta: 129.0m +step 04729/16704 (28.31%) | loss: 2.799214 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,727 | mfu: 50.80 | epoch: 1 | total time: 50.83m | eta: 129.0m +step 04730/16704 (28.32%) | loss: 2.791900 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,495 | mfu: 51.03 | epoch: 1 | total time: 50.84m | eta: 129.0m +step 04731/16704 (28.32%) | loss: 2.797968 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,583 | mfu: 50.79 | epoch: 1 | total time: 50.85m | eta: 129.0m +step 04732/16704 (28.33%) | loss: 2.800761 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,269 | mfu: 51.02 | epoch: 1 | total time: 50.86m | eta: 129.0m +step 04733/16704 (28.33%) | loss: 2.789460 | lrm: 1.00 | dt: 647.76ms | tok/sec: 809,386 | mfu: 50.59 | epoch: 1 | total time: 50.87m | eta: 128.9m +step 04734/16704 (28.34%) | loss: 2.785895 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,100 | mfu: 50.57 | epoch: 1 | total time: 50.88m | eta: 128.9m +step 04735/16704 (28.35%) | loss: 
2.778051 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,920 | mfu: 50.93 | epoch: 1 | total time: 50.89m | eta: 128.9m +step 04736/16704 (28.35%) | loss: 2.788668 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,425 | mfu: 50.78 | epoch: 1 | total time: 50.90m | eta: 128.9m +step 04737/16704 (28.36%) | loss: 2.807761 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,165 | mfu: 51.01 | epoch: 1 | total time: 50.92m | eta: 128.9m +step 04738/16704 (28.36%) | loss: 2.813966 | lrm: 1.00 | dt: 642.51ms | tok/sec: 815,998 | mfu: 51.00 | epoch: 1 | total time: 50.93m | eta: 128.9m +step 04739/16704 (28.37%) | loss: 2.820633 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,105 | mfu: 50.95 | epoch: 1 | total time: 50.94m | eta: 128.9m +step 04740/16704 (28.38%) | loss: 2.815555 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,783 | mfu: 50.86 | epoch: 1 | total time: 50.95m | eta: 128.9m +step 04741/16704 (28.38%) | loss: 2.792631 | lrm: 1.00 | dt: 647.87ms | tok/sec: 809,244 | mfu: 50.58 | epoch: 1 | total time: 50.96m | eta: 128.9m +step 04742/16704 (28.39%) | loss: 2.783955 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,498 | mfu: 50.97 | epoch: 1 | total time: 50.97m | eta: 128.8m +step 04743/16704 (28.39%) | loss: 2.788055 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,210 | mfu: 50.76 | epoch: 1 | total time: 50.98m | eta: 128.8m +step 04744/16704 (28.40%) | loss: 2.787834 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,780 | mfu: 50.92 | epoch: 1 | total time: 50.99m | eta: 128.8m +step 04745/16704 (28.41%) | loss: 2.789561 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,006 | mfu: 50.88 | epoch: 1 | total time: 51.00m | eta: 128.8m +step 04746/16704 (28.41%) | loss: 2.789294 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,109 | mfu: 50.76 | epoch: 1 | total time: 51.01m | eta: 128.8m +step 04747/16704 (28.42%) | loss: 2.787328 | lrm: 1.00 | dt: 642.80ms | tok/sec: 815,628 | mfu: 50.98 | epoch: 1 | total time: 51.02m | eta: 128.8m +step 04748/16704 (28.42%) | loss: 2.771617 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,878 | mfu: 
50.87 | epoch: 1 | total time: 51.03m | eta: 128.8m +step 04749/16704 (28.43%) | loss: 2.773444 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,354 | mfu: 50.90 | epoch: 1 | total time: 51.04m | eta: 128.8m +Step 04750 | Validation bpb: 0.843324 +step 04750/16704 (28.44%) | loss: 2.775141 | lrm: 1.00 | dt: 647.27ms | tok/sec: 809,994 | mfu: 50.63 | epoch: 1 | total time: 51.06m | eta: 128.8m +step 04751/16704 (28.44%) | loss: 2.782650 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,510 | mfu: 50.66 | epoch: 1 | total time: 51.07m | eta: 128.7m +step 04752/16704 (28.45%) | loss: 2.790995 | lrm: 1.00 | dt: 648.82ms | tok/sec: 808,064 | mfu: 50.51 | epoch: 1 | total time: 51.08m | eta: 128.7m +step 04753/16704 (28.45%) | loss: 2.809765 | lrm: 1.00 | dt: 638.89ms | tok/sec: 820,617 | mfu: 51.29 | epoch: 1 | total time: 51.09m | eta: 128.7m +step 04754/16704 (28.46%) | loss: 2.801148 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,173 | mfu: 50.64 | epoch: 1 | total time: 51.10m | eta: 128.7m +step 04755/16704 (28.47%) | loss: 2.802434 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,770 | mfu: 50.74 | epoch: 1 | total time: 51.11m | eta: 128.7m +step 04756/16704 (28.47%) | loss: 2.799340 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,511 | mfu: 50.72 | epoch: 1 | total time: 51.12m | eta: 128.7m +step 04757/16704 (28.48%) | loss: 2.794731 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,847 | mfu: 50.93 | epoch: 1 | total time: 51.13m | eta: 128.7m +step 04758/16704 (28.48%) | loss: 2.781258 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,764 | mfu: 50.80 | epoch: 1 | total time: 51.14m | eta: 128.7m +step 04759/16704 (28.49%) | loss: 2.796483 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,922 | mfu: 51.00 | epoch: 1 | total time: 51.15m | eta: 128.7m +step 04760/16704 (28.50%) | loss: 2.795060 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,856 | mfu: 50.87 | epoch: 1 | total time: 51.16m | eta: 128.7m +step 04761/16704 (28.50%) | loss: 2.776785 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,453 | mfu: 50.90 | epoch: 1 | 
total time: 51.17m | eta: 128.6m +step 04762/16704 (28.51%) | loss: 2.773399 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,671 | mfu: 50.86 | epoch: 1 | total time: 51.18m | eta: 128.6m +step 04763/16704 (28.51%) | loss: 2.773180 | lrm: 1.00 | dt: 642.34ms | tok/sec: 816,220 | mfu: 51.01 | epoch: 1 | total time: 51.19m | eta: 128.6m +step 04764/16704 (28.52%) | loss: 2.777922 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,777 | mfu: 50.74 | epoch: 1 | total time: 51.21m | eta: 128.6m +step 04765/16704 (28.53%) | loss: 2.773340 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,968 | mfu: 50.87 | epoch: 1 | total time: 51.22m | eta: 128.6m +step 04766/16704 (28.53%) | loss: 2.779808 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,715 | mfu: 50.86 | epoch: 1 | total time: 51.23m | eta: 128.6m +step 04767/16704 (28.54%) | loss: 2.787243 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.84 | epoch: 1 | total time: 51.24m | eta: 128.6m +step 04768/16704 (28.54%) | loss: 2.794780 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,773 | mfu: 50.99 | epoch: 1 | total time: 51.25m | eta: 128.6m +step 04769/16704 (28.55%) | loss: 2.783667 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,758 | mfu: 50.74 | epoch: 1 | total time: 51.26m | eta: 128.6m +step 04770/16704 (28.56%) | loss: 2.801255 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,368 | mfu: 50.90 | epoch: 1 | total time: 51.27m | eta: 128.5m +step 04771/16704 (28.56%) | loss: 2.797164 | lrm: 1.00 | dt: 642.06ms | tok/sec: 816,569 | mfu: 51.04 | epoch: 1 | total time: 51.28m | eta: 128.5m +step 04772/16704 (28.57%) | loss: 2.785800 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,242 | mfu: 50.95 | epoch: 1 | total time: 51.29m | eta: 128.5m +step 04773/16704 (28.57%) | loss: 2.794652 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,357 | mfu: 50.96 | epoch: 1 | total time: 51.30m | eta: 128.5m +step 04774/16704 (28.58%) | loss: 2.796495 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,111 | mfu: 50.88 | epoch: 1 | total time: 51.31m | eta: 128.5m +step 04775/16704 (28.59%) | 
loss: 2.801283 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,470 | mfu: 50.84 | epoch: 1 | total time: 51.32m | eta: 128.5m +step 04776/16704 (28.59%) | loss: 2.788747 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,755 | mfu: 50.80 | epoch: 1 | total time: 51.33m | eta: 128.5m +step 04777/16704 (28.60%) | loss: 2.802019 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,010 | mfu: 50.81 | epoch: 1 | total time: 51.35m | eta: 128.5m +step 04778/16704 (28.60%) | loss: 2.794044 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,974 | mfu: 50.81 | epoch: 1 | total time: 51.36m | eta: 128.5m +step 04779/16704 (28.61%) | loss: 2.806323 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,912 | mfu: 50.68 | epoch: 1 | total time: 51.37m | eta: 128.4m +step 04780/16704 (28.62%) | loss: 2.819790 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,138 | mfu: 51.01 | epoch: 1 | total time: 51.38m | eta: 128.4m +step 04781/16704 (28.62%) | loss: 2.812532 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,494 | mfu: 50.72 | epoch: 1 | total time: 51.39m | eta: 128.4m +step 04782/16704 (28.63%) | loss: 2.801379 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,936 | mfu: 51.00 | epoch: 1 | total time: 51.40m | eta: 128.4m +step 04783/16704 (28.63%) | loss: 2.800299 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,437 | mfu: 50.78 | epoch: 1 | total time: 51.41m | eta: 128.4m +step 04784/16704 (28.64%) | loss: 2.789617 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,376 | mfu: 50.77 | epoch: 1 | total time: 51.42m | eta: 128.4m +step 04785/16704 (28.65%) | loss: 2.785553 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 1 | total time: 51.43m | eta: 128.4m +step 04786/16704 (28.65%) | loss: 2.780616 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,718 | mfu: 50.73 | epoch: 1 | total time: 51.44m | eta: 128.4m +step 04787/16704 (28.66%) | loss: 2.785188 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,322 | mfu: 50.58 | epoch: 1 | total time: 51.45m | eta: 128.4m +step 04788/16704 (28.66%) | loss: 2.786035 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,941 | 
mfu: 50.87 | epoch: 1 | total time: 51.46m | eta: 128.3m +step 04789/16704 (28.67%) | loss: 2.774424 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,318 | mfu: 50.83 | epoch: 1 | total time: 51.47m | eta: 128.3m +step 04790/16704 (28.68%) | loss: 2.774133 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,049 | mfu: 50.82 | epoch: 1 | total time: 51.48m | eta: 128.3m +step 04791/16704 (28.68%) | loss: 2.763276 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,531 | mfu: 50.91 | epoch: 1 | total time: 51.50m | eta: 128.3m +step 04792/16704 (28.69%) | loss: 2.778698 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,639 | mfu: 50.79 | epoch: 1 | total time: 51.51m | eta: 128.3m +step 04793/16704 (28.69%) | loss: 2.770679 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,352 | mfu: 50.77 | epoch: 1 | total time: 51.52m | eta: 128.3m +step 04794/16704 (28.70%) | loss: 2.775498 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,023 | mfu: 50.94 | epoch: 1 | total time: 51.53m | eta: 128.3m +step 04795/16704 (28.71%) | loss: 2.768694 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,351 | mfu: 50.90 | epoch: 1 | total time: 51.54m | eta: 128.3m +step 04796/16704 (28.71%) | loss: 2.760745 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,041 | mfu: 50.88 | epoch: 1 | total time: 51.55m | eta: 128.3m +step 04797/16704 (28.72%) | loss: 2.756440 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,008 | mfu: 50.88 | epoch: 1 | total time: 51.56m | eta: 128.2m +step 04798/16704 (28.72%) | loss: 2.764715 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,754 | mfu: 50.92 | epoch: 1 | total time: 51.57m | eta: 128.2m +step 04799/16704 (28.73%) | loss: 2.777290 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,907 | mfu: 50.75 | epoch: 1 | total time: 51.58m | eta: 128.2m +step 04800/16704 (28.74%) | loss: 2.781686 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 1 | total time: 51.59m | eta: 128.2m +step 04801/16704 (28.74%) | loss: 2.770238 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,623 | mfu: 50.92 | epoch: 1 | total time: 51.60m | eta: 128.2m +step 
04802/16704 (28.75%) | loss: 2.770323 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,453 | mfu: 50.72 | epoch: 1 | total time: 51.61m | eta: 128.2m +step 04803/16704 (28.75%) | loss: 2.771687 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,354 | mfu: 50.77 | epoch: 1 | total time: 51.62m | eta: 128.2m +step 04804/16704 (28.76%) | loss: 2.770959 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,121 | mfu: 50.82 | epoch: 1 | total time: 51.64m | eta: 128.2m +step 04805/16704 (28.77%) | loss: 2.769206 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,468 | mfu: 50.72 | epoch: 1 | total time: 51.65m | eta: 128.2m +step 04806/16704 (28.77%) | loss: 2.751591 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,472 | mfu: 50.97 | epoch: 1 | total time: 51.66m | eta: 128.2m +step 04807/16704 (28.78%) | loss: 2.747427 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,284 | mfu: 50.89 | epoch: 1 | total time: 51.67m | eta: 128.1m +step 04808/16704 (28.78%) | loss: 2.748658 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,257 | mfu: 50.77 | epoch: 1 | total time: 51.68m | eta: 128.1m +step 04809/16704 (28.79%) | loss: 2.745313 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,805 | mfu: 50.86 | epoch: 1 | total time: 51.69m | eta: 128.1m +step 04810/16704 (28.80%) | loss: 2.752435 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,962 | mfu: 51.06 | epoch: 1 | total time: 51.70m | eta: 128.1m +step 04811/16704 (28.80%) | loss: 2.760700 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 1 | total time: 51.71m | eta: 128.1m +step 04812/16704 (28.81%) | loss: 2.756333 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,337 | mfu: 50.77 | epoch: 1 | total time: 51.72m | eta: 128.1m +step 04813/16704 (28.81%) | loss: 2.764758 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 1 | total time: 51.73m | eta: 128.1m +step 04814/16704 (28.82%) | loss: 2.774681 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,392 | mfu: 50.78 | epoch: 1 | total time: 51.74m | eta: 128.1m +step 04815/16704 (28.83%) | loss: 2.764507 | lrm: 1.00 | dt: 
644.34ms | tok/sec: 813,676 | mfu: 50.86 | epoch: 1 | total time: 51.75m | eta: 128.1m +step 04816/16704 (28.83%) | loss: 2.768298 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 1 | total time: 51.76m | eta: 128.0m +step 04817/16704 (28.84%) | loss: 2.777156 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,284 | mfu: 50.77 | epoch: 1 | total time: 51.78m | eta: 128.0m +step 04818/16704 (28.84%) | loss: 2.777386 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,922 | mfu: 50.87 | epoch: 1 | total time: 51.79m | eta: 128.0m +step 04819/16704 (28.85%) | loss: 2.783537 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,699 | mfu: 50.86 | epoch: 1 | total time: 51.80m | eta: 128.0m +step 04820/16704 (28.86%) | loss: 2.783070 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,416 | mfu: 50.84 | epoch: 1 | total time: 51.81m | eta: 128.0m +step 04821/16704 (28.86%) | loss: 2.783438 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 1 | total time: 51.82m | eta: 128.0m +step 04822/16704 (28.87%) | loss: 2.783676 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,831 | mfu: 50.80 | epoch: 1 | total time: 51.83m | eta: 128.0m +step 04823/16704 (28.87%) | loss: 2.767701 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,837 | mfu: 50.93 | epoch: 1 | total time: 51.84m | eta: 128.0m +step 04824/16704 (28.88%) | loss: 2.776747 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,442 | mfu: 50.72 | epoch: 1 | total time: 51.85m | eta: 128.0m +step 04825/16704 (28.89%) | loss: 2.782295 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,131 | mfu: 50.70 | epoch: 1 | total time: 51.86m | eta: 127.9m +step 04826/16704 (28.89%) | loss: 2.788043 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,248 | mfu: 50.89 | epoch: 1 | total time: 51.87m | eta: 127.9m +step 04827/16704 (28.90%) | loss: 2.775064 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,957 | mfu: 50.81 | epoch: 1 | total time: 51.88m | eta: 127.9m +step 04828/16704 (28.90%) | loss: 2.788730 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,592 | mfu: 50.79 | epoch: 1 | total 
time: 51.89m | eta: 127.9m +step 04829/16704 (28.91%) | loss: 2.788780 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,409 | mfu: 50.84 | epoch: 1 | total time: 51.90m | eta: 127.9m +step 04830/16704 (28.92%) | loss: 2.773860 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 1 | total time: 51.91m | eta: 127.9m +step 04831/16704 (28.92%) | loss: 2.787948 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,184 | mfu: 50.95 | epoch: 1 | total time: 51.93m | eta: 127.9m +step 04832/16704 (28.93%) | loss: 2.797174 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,531 | mfu: 50.78 | epoch: 1 | total time: 51.94m | eta: 127.9m +step 04833/16704 (28.93%) | loss: 2.796163 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,945 | mfu: 50.69 | epoch: 1 | total time: 51.95m | eta: 127.9m +step 04834/16704 (28.94%) | loss: 2.801518 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,304 | mfu: 50.90 | epoch: 1 | total time: 51.96m | eta: 127.8m +step 04835/16704 (28.95%) | loss: 2.794852 | lrm: 1.00 | dt: 648.92ms | tok/sec: 807,941 | mfu: 50.50 | epoch: 1 | total time: 51.97m | eta: 127.8m +step 04836/16704 (28.95%) | loss: 2.789617 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,798 | mfu: 50.93 | epoch: 1 | total time: 51.98m | eta: 127.8m +step 04837/16704 (28.96%) | loss: 2.789088 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,200 | mfu: 50.70 | epoch: 1 | total time: 51.99m | eta: 127.8m +step 04838/16704 (28.96%) | loss: 2.771642 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,412 | mfu: 50.78 | epoch: 1 | total time: 52.00m | eta: 127.8m +step 04839/16704 (28.97%) | loss: 2.781713 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | epoch: 1 | total time: 52.01m | eta: 127.8m +step 04840/16704 (28.98%) | loss: 2.777694 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,794 | mfu: 50.68 | epoch: 1 | total time: 52.02m | eta: 127.8m +step 04841/16704 (28.98%) | loss: 2.770654 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,928 | mfu: 50.81 | epoch: 1 | total time: 52.03m | eta: 127.8m +step 04842/16704 (28.99%) | loss: 
2.760643 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,402 | mfu: 50.84 | epoch: 1 | total time: 52.04m | eta: 127.8m +step 04843/16704 (28.99%) | loss: 2.774808 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,951 | mfu: 50.87 | epoch: 1 | total time: 52.05m | eta: 127.8m +step 04844/16704 (29.00%) | loss: 2.759787 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,450 | mfu: 50.84 | epoch: 1 | total time: 52.07m | eta: 127.7m +step 04845/16704 (29.01%) | loss: 2.767788 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 1 | total time: 52.08m | eta: 127.7m +step 04846/16704 (29.01%) | loss: 2.768381 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,623 | mfu: 50.85 | epoch: 1 | total time: 52.09m | eta: 127.7m +step 04847/16704 (29.02%) | loss: 2.772040 | lrm: 1.00 | dt: 642.54ms | tok/sec: 815,967 | mfu: 51.00 | epoch: 1 | total time: 52.10m | eta: 127.7m +step 04848/16704 (29.02%) | loss: 2.762905 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,316 | mfu: 51.02 | epoch: 1 | total time: 52.11m | eta: 127.7m +step 04849/16704 (29.03%) | loss: 2.758485 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,826 | mfu: 50.87 | epoch: 1 | total time: 52.12m | eta: 127.7m +step 04850/16704 (29.03%) | loss: 2.767910 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,977 | mfu: 50.81 | epoch: 1 | total time: 52.13m | eta: 127.7m +step 04851/16704 (29.04%) | loss: 2.780728 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,239 | mfu: 50.89 | epoch: 1 | total time: 52.14m | eta: 127.7m +step 04852/16704 (29.05%) | loss: 2.772157 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,084 | mfu: 50.94 | epoch: 1 | total time: 52.15m | eta: 127.7m +step 04853/16704 (29.05%) | loss: 2.772303 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,476 | mfu: 50.66 | epoch: 1 | total time: 52.16m | eta: 127.6m +step 04854/16704 (29.06%) | loss: 2.768400 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,864 | mfu: 50.87 | epoch: 1 | total time: 52.17m | eta: 127.6m +step 04855/16704 (29.06%) | loss: 2.769892 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 
50.75 | epoch: 1 | total time: 52.18m | eta: 127.6m +step 04856/16704 (29.07%) | loss: 2.779006 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,228 | mfu: 50.95 | epoch: 1 | total time: 52.19m | eta: 127.6m +step 04857/16704 (29.08%) | loss: 2.779172 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 52.20m | eta: 127.6m +step 04858/16704 (29.08%) | loss: 2.778170 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,683 | mfu: 50.92 | epoch: 1 | total time: 52.22m | eta: 127.6m +step 04859/16704 (29.09%) | loss: 2.786176 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,304 | mfu: 50.96 | epoch: 1 | total time: 52.23m | eta: 127.6m +step 04860/16704 (29.09%) | loss: 2.786143 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,942 | mfu: 50.94 | epoch: 1 | total time: 52.24m | eta: 127.6m +step 04861/16704 (29.10%) | loss: 2.767156 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,074 | mfu: 50.88 | epoch: 1 | total time: 52.25m | eta: 127.6m +step 04862/16704 (29.11%) | loss: 2.767563 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,528 | mfu: 50.72 | epoch: 1 | total time: 52.26m | eta: 127.5m +step 04863/16704 (29.11%) | loss: 2.767281 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,291 | mfu: 50.89 | epoch: 1 | total time: 52.27m | eta: 127.5m +step 04864/16704 (29.12%) | loss: 2.765571 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,293 | mfu: 50.96 | epoch: 1 | total time: 52.28m | eta: 127.5m +step 04865/16704 (29.12%) | loss: 2.768768 | lrm: 1.00 | dt: 642.61ms | tok/sec: 815,877 | mfu: 50.99 | epoch: 1 | total time: 52.29m | eta: 127.5m +step 04866/16704 (29.13%) | loss: 2.765218 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,237 | mfu: 50.77 | epoch: 1 | total time: 52.30m | eta: 127.5m +step 04867/16704 (29.14%) | loss: 2.769192 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 1 | total time: 52.31m | eta: 127.5m +step 04868/16704 (29.14%) | loss: 2.756683 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,435 | mfu: 50.78 | epoch: 1 | total time: 52.32m | eta: 127.5m +step 
04869/16704 (29.15%) | loss: 2.750700 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,689 | mfu: 50.54 | epoch: 1 | total time: 52.33m | eta: 127.5m +step 04870/16704 (29.15%) | loss: 2.750834 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,719 | mfu: 50.92 | epoch: 1 | total time: 52.34m | eta: 127.5m +step 04871/16704 (29.16%) | loss: 2.765921 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,875 | mfu: 50.81 | epoch: 1 | total time: 52.36m | eta: 127.4m +step 04872/16704 (29.17%) | loss: 2.764743 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,461 | mfu: 50.84 | epoch: 1 | total time: 52.37m | eta: 127.4m +step 04873/16704 (29.17%) | loss: 2.761539 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,788 | mfu: 50.74 | epoch: 1 | total time: 52.38m | eta: 127.4m +step 04874/16704 (29.18%) | loss: 2.747566 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,091 | mfu: 50.69 | epoch: 1 | total time: 52.39m | eta: 127.4m +step 04875/16704 (29.18%) | loss: 2.754716 | lrm: 1.00 | dt: 642.23ms | tok/sec: 816,358 | mfu: 51.02 | epoch: 1 | total time: 52.40m | eta: 127.4m +step 04876/16704 (29.19%) | loss: 2.751790 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,445 | mfu: 50.78 | epoch: 1 | total time: 52.41m | eta: 127.4m +step 04877/16704 (29.20%) | loss: 2.745916 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,632 | mfu: 50.79 | epoch: 1 | total time: 52.42m | eta: 127.4m +step 04878/16704 (29.20%) | loss: 2.756688 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,696 | mfu: 50.92 | epoch: 1 | total time: 52.43m | eta: 127.4m +step 04879/16704 (29.21%) | loss: 2.769671 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 1 | total time: 52.44m | eta: 127.4m +step 04880/16704 (29.21%) | loss: 2.767245 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 1 | total time: 52.45m | eta: 127.3m +step 04881/16704 (29.22%) | loss: 2.776456 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,468 | mfu: 50.78 | epoch: 1 | total time: 52.46m | eta: 127.3m +step 04882/16704 (29.23%) | loss: 2.779082 | lrm: 1.00 | dt: 
645.70ms | tok/sec: 811,967 | mfu: 50.75 | epoch: 1 | total time: 52.47m | eta: 127.3m +step 04883/16704 (29.23%) | loss: 2.792560 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,170 | mfu: 50.89 | epoch: 1 | total time: 52.48m | eta: 127.3m +step 04884/16704 (29.24%) | loss: 2.777832 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,160 | mfu: 50.82 | epoch: 1 | total time: 52.49m | eta: 127.3m +step 04885/16704 (29.24%) | loss: 2.767585 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,967 | mfu: 50.69 | epoch: 1 | total time: 52.51m | eta: 127.3m +step 04886/16704 (29.25%) | loss: 2.768697 | lrm: 1.00 | dt: 648.39ms | tok/sec: 808,595 | mfu: 50.54 | epoch: 1 | total time: 52.52m | eta: 127.3m +step 04887/16704 (29.26%) | loss: 2.760123 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,712 | mfu: 50.92 | epoch: 1 | total time: 52.53m | eta: 127.3m +step 04888/16704 (29.26%) | loss: 2.766452 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,409 | mfu: 50.65 | epoch: 1 | total time: 52.54m | eta: 127.3m +step 04889/16704 (29.27%) | loss: 2.770505 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,971 | mfu: 50.81 | epoch: 1 | total time: 52.55m | eta: 127.3m +step 04890/16704 (29.27%) | loss: 2.773377 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,847 | mfu: 50.68 | epoch: 1 | total time: 52.56m | eta: 127.2m +step 04891/16704 (29.28%) | loss: 2.765377 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,845 | mfu: 50.68 | epoch: 1 | total time: 52.57m | eta: 127.2m +step 04892/16704 (29.29%) | loss: 2.771941 | lrm: 1.00 | dt: 649.45ms | tok/sec: 807,273 | mfu: 50.46 | epoch: 1 | total time: 52.58m | eta: 127.2m +step 04893/16704 (29.29%) | loss: 2.788687 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,404 | mfu: 50.84 | epoch: 1 | total time: 52.59m | eta: 127.2m +step 04894/16704 (29.30%) | loss: 2.783677 | lrm: 1.00 | dt: 650.73ms | tok/sec: 805,697 | mfu: 50.36 | epoch: 1 | total time: 52.60m | eta: 127.2m +step 04895/16704 (29.30%) | loss: 2.779331 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,818 | mfu: 50.55 | epoch: 1 | total 
time: 52.61m | eta: 127.2m +step 04896/16704 (29.31%) | loss: 2.769711 | lrm: 1.00 | dt: 648.32ms | tok/sec: 808,689 | mfu: 50.54 | epoch: 1 | total time: 52.62m | eta: 127.2m +step 04897/16704 (29.32%) | loss: 2.766941 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,174 | mfu: 50.57 | epoch: 1 | total time: 52.64m | eta: 127.2m +step 04898/16704 (29.32%) | loss: 2.764981 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,116 | mfu: 50.82 | epoch: 1 | total time: 52.65m | eta: 127.2m +step 04899/16704 (29.33%) | loss: 2.765845 | lrm: 1.00 | dt: 649.15ms | tok/sec: 807,657 | mfu: 50.48 | epoch: 1 | total time: 52.66m | eta: 127.1m +step 04900/16704 (29.33%) | loss: 2.766072 | lrm: 1.00 | dt: 649.03ms | tok/sec: 807,807 | mfu: 50.49 | epoch: 1 | total time: 52.67m | eta: 127.1m +step 04901/16704 (29.34%) | loss: 2.780854 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,966 | mfu: 50.69 | epoch: 1 | total time: 52.68m | eta: 127.1m +step 04902/16704 (29.35%) | loss: 2.788750 | lrm: 1.00 | dt: 647.65ms | tok/sec: 809,529 | mfu: 50.60 | epoch: 1 | total time: 52.69m | eta: 127.1m +step 04903/16704 (29.35%) | loss: 2.777862 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,733 | mfu: 50.73 | epoch: 1 | total time: 52.70m | eta: 127.1m +step 04904/16704 (29.36%) | loss: 2.758313 | lrm: 1.00 | dt: 648.72ms | tok/sec: 808,194 | mfu: 50.51 | epoch: 1 | total time: 52.71m | eta: 127.1m +step 04905/16704 (29.36%) | loss: 2.776375 | lrm: 1.00 | dt: 648.76ms | tok/sec: 808,141 | mfu: 50.51 | epoch: 1 | total time: 52.72m | eta: 127.1m +step 04906/16704 (29.37%) | loss: 2.769119 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,497 | mfu: 50.84 | epoch: 1 | total time: 52.73m | eta: 127.1m +step 04907/16704 (29.38%) | loss: 2.766683 | lrm: 1.00 | dt: 650.16ms | tok/sec: 806,399 | mfu: 50.40 | epoch: 1 | total time: 52.74m | eta: 127.1m +step 04908/16704 (29.38%) | loss: 2.775669 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 1 | total time: 52.75m | eta: 127.0m +step 04909/16704 (29.39%) | loss: 
2.775500 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,514 | mfu: 50.72 | epoch: 1 | total time: 52.76m | eta: 127.0m +step 04910/16704 (29.39%) | loss: 2.776189 | lrm: 1.00 | dt: 647.69ms | tok/sec: 809,478 | mfu: 50.59 | epoch: 1 | total time: 52.78m | eta: 127.0m +step 04911/16704 (29.40%) | loss: 2.772542 | lrm: 1.00 | dt: 649.85ms | tok/sec: 806,783 | mfu: 50.43 | epoch: 1 | total time: 52.79m | eta: 127.0m +step 04912/16704 (29.41%) | loss: 2.773495 | lrm: 1.00 | dt: 649.64ms | tok/sec: 807,038 | mfu: 50.44 | epoch: 1 | total time: 52.80m | eta: 127.0m +step 04913/16704 (29.41%) | loss: 2.771699 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,904 | mfu: 50.62 | epoch: 1 | total time: 52.81m | eta: 127.0m +step 04914/16704 (29.42%) | loss: 2.774398 | lrm: 1.00 | dt: 648.96ms | tok/sec: 807,886 | mfu: 50.49 | epoch: 1 | total time: 52.82m | eta: 127.0m +step 04915/16704 (29.42%) | loss: 2.763280 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,221 | mfu: 50.64 | epoch: 1 | total time: 52.83m | eta: 127.0m +step 04916/16704 (29.43%) | loss: 2.756390 | lrm: 1.00 | dt: 649.10ms | tok/sec: 807,709 | mfu: 50.48 | epoch: 1 | total time: 52.84m | eta: 127.0m +step 04917/16704 (29.44%) | loss: 2.747247 | lrm: 1.00 | dt: 648.78ms | tok/sec: 808,108 | mfu: 50.51 | epoch: 1 | total time: 52.85m | eta: 127.0m +step 04918/16704 (29.44%) | loss: 2.746141 | lrm: 1.00 | dt: 651.31ms | tok/sec: 804,978 | mfu: 50.31 | epoch: 1 | total time: 52.86m | eta: 126.9m +step 04919/16704 (29.45%) | loss: 2.741550 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,866 | mfu: 50.81 | epoch: 1 | total time: 52.87m | eta: 126.9m +step 04920/16704 (29.45%) | loss: 2.740018 | lrm: 1.00 | dt: 648.98ms | tok/sec: 807,867 | mfu: 50.49 | epoch: 1 | total time: 52.88m | eta: 126.9m +step 04921/16704 (29.46%) | loss: 2.737666 | lrm: 1.00 | dt: 650.06ms | tok/sec: 806,523 | mfu: 50.41 | epoch: 1 | total time: 52.89m | eta: 126.9m +step 04922/16704 (29.47%) | loss: 2.748914 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,098 | mfu: 
50.57 | epoch: 1 | total time: 52.91m | eta: 126.9m +step 04923/16704 (29.47%) | loss: 2.755054 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,280 | mfu: 50.58 | epoch: 1 | total time: 52.92m | eta: 126.9m +step 04924/16704 (29.48%) | loss: 2.758426 | lrm: 1.00 | dt: 649.61ms | tok/sec: 807,076 | mfu: 50.44 | epoch: 1 | total time: 52.93m | eta: 126.9m +step 04925/16704 (29.48%) | loss: 2.764058 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,732 | mfu: 50.67 | epoch: 1 | total time: 52.94m | eta: 126.9m +step 04926/16704 (29.49%) | loss: 2.768180 | lrm: 1.00 | dt: 652.10ms | tok/sec: 804,001 | mfu: 50.25 | epoch: 1 | total time: 52.95m | eta: 126.9m +step 04927/16704 (29.50%) | loss: 2.758014 | lrm: 1.00 | dt: 647.71ms | tok/sec: 809,447 | mfu: 50.59 | epoch: 1 | total time: 52.96m | eta: 126.8m +step 04928/16704 (29.50%) | loss: 2.764827 | lrm: 1.00 | dt: 649.06ms | tok/sec: 807,759 | mfu: 50.49 | epoch: 1 | total time: 52.97m | eta: 126.8m +step 04929/16704 (29.51%) | loss: 2.764432 | lrm: 1.00 | dt: 649.02ms | tok/sec: 807,816 | mfu: 50.49 | epoch: 1 | total time: 52.98m | eta: 126.8m +step 04930/16704 (29.51%) | loss: 2.759710 | lrm: 1.00 | dt: 650.04ms | tok/sec: 806,547 | mfu: 50.41 | epoch: 1 | total time: 52.99m | eta: 126.8m +step 04931/16704 (29.52%) | loss: 2.765477 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,535 | mfu: 50.60 | epoch: 1 | total time: 53.00m | eta: 126.8m +step 04932/16704 (29.53%) | loss: 2.770395 | lrm: 1.00 | dt: 647.44ms | tok/sec: 809,787 | mfu: 50.61 | epoch: 1 | total time: 53.01m | eta: 126.8m +step 04933/16704 (29.53%) | loss: 2.780728 | lrm: 1.00 | dt: 649.11ms | tok/sec: 807,706 | mfu: 50.48 | epoch: 1 | total time: 53.02m | eta: 126.8m +step 04934/16704 (29.54%) | loss: 2.786215 | lrm: 1.00 | dt: 647.29ms | tok/sec: 809,968 | mfu: 50.62 | epoch: 1 | total time: 53.03m | eta: 126.8m +step 04935/16704 (29.54%) | loss: 2.792221 | lrm: 1.00 | dt: 650.23ms | tok/sec: 806,315 | mfu: 50.40 | epoch: 1 | total time: 53.05m | eta: 126.8m +step 
04936/16704 (29.55%) | loss: 2.799409 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,464 | mfu: 50.66 | epoch: 1 | total time: 53.06m | eta: 126.7m +step 04937/16704 (29.56%) | loss: 2.777120 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,095 | mfu: 50.76 | epoch: 1 | total time: 53.07m | eta: 126.7m +step 04938/16704 (29.56%) | loss: 2.781260 | lrm: 1.00 | dt: 649.83ms | tok/sec: 806,809 | mfu: 50.43 | epoch: 1 | total time: 53.08m | eta: 126.7m +step 04939/16704 (29.57%) | loss: 2.784249 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,399 | mfu: 50.78 | epoch: 1 | total time: 53.09m | eta: 126.7m +step 04940/16704 (29.57%) | loss: 2.782979 | lrm: 1.00 | dt: 650.11ms | tok/sec: 806,463 | mfu: 50.41 | epoch: 1 | total time: 53.10m | eta: 126.7m +step 04941/16704 (29.58%) | loss: 2.785223 | lrm: 1.00 | dt: 650.18ms | tok/sec: 806,376 | mfu: 50.40 | epoch: 1 | total time: 53.11m | eta: 126.7m +step 04942/16704 (29.59%) | loss: 2.787976 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,820 | mfu: 50.68 | epoch: 1 | total time: 53.12m | eta: 126.7m +step 04943/16704 (29.59%) | loss: 2.795146 | lrm: 1.00 | dt: 649.32ms | tok/sec: 807,435 | mfu: 50.47 | epoch: 1 | total time: 53.13m | eta: 126.7m +step 04944/16704 (29.60%) | loss: 2.804583 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,978 | mfu: 50.81 | epoch: 1 | total time: 53.14m | eta: 126.7m +step 04945/16704 (29.60%) | loss: 2.805575 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,462 | mfu: 50.59 | epoch: 1 | total time: 53.15m | eta: 126.7m +step 04946/16704 (29.61%) | loss: 2.798984 | lrm: 1.00 | dt: 650.55ms | tok/sec: 805,911 | mfu: 50.37 | epoch: 1 | total time: 53.16m | eta: 126.6m +step 04947/16704 (29.62%) | loss: 2.802059 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 1 | total time: 53.18m | eta: 126.6m +step 04948/16704 (29.62%) | loss: 2.808801 | lrm: 1.00 | dt: 651.71ms | tok/sec: 804,484 | mfu: 50.28 | epoch: 1 | total time: 53.19m | eta: 126.6m +step 04949/16704 (29.63%) | loss: 2.792810 | lrm: 1.00 | dt: 
644.64ms | tok/sec: 813,309 | mfu: 50.83 | epoch: 1 | total time: 53.20m | eta: 126.6m +step 04950/16704 (29.63%) | loss: 2.782937 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,361 | mfu: 50.77 | epoch: 1 | total time: 53.21m | eta: 126.6m +step 04951/16704 (29.64%) | loss: 2.782293 | lrm: 1.00 | dt: 652.82ms | tok/sec: 803,115 | mfu: 50.20 | epoch: 1 | total time: 53.22m | eta: 126.6m +step 04952/16704 (29.65%) | loss: 2.788751 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,020 | mfu: 50.69 | epoch: 1 | total time: 53.23m | eta: 126.6m +step 04953/16704 (29.65%) | loss: 2.795196 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,731 | mfu: 50.73 | epoch: 1 | total time: 53.24m | eta: 126.6m +step 04954/16704 (29.66%) | loss: 2.790452 | lrm: 1.00 | dt: 648.02ms | tok/sec: 809,055 | mfu: 50.57 | epoch: 1 | total time: 53.25m | eta: 126.6m +step 04955/16704 (29.66%) | loss: 2.800448 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,501 | mfu: 50.66 | epoch: 1 | total time: 53.26m | eta: 126.5m +step 04956/16704 (29.67%) | loss: 2.784233 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,093 | mfu: 50.57 | epoch: 1 | total time: 53.27m | eta: 126.5m +step 04957/16704 (29.68%) | loss: 2.778396 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 1 | total time: 53.28m | eta: 126.5m +step 04958/16704 (29.68%) | loss: 2.781929 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,233 | mfu: 50.70 | epoch: 1 | total time: 53.29m | eta: 126.5m +step 04959/16704 (29.69%) | loss: 2.774385 | lrm: 1.00 | dt: 647.40ms | tok/sec: 809,838 | mfu: 50.62 | epoch: 1 | total time: 53.30m | eta: 126.5m +step 04960/16704 (29.69%) | loss: 2.758989 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,031 | mfu: 50.69 | epoch: 1 | total time: 53.32m | eta: 126.5m +step 04961/16704 (29.70%) | loss: 2.758858 | lrm: 1.00 | dt: 649.07ms | tok/sec: 807,758 | mfu: 50.49 | epoch: 1 | total time: 53.33m | eta: 126.5m +step 04962/16704 (29.71%) | loss: 2.769407 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,346 | mfu: 50.71 | epoch: 1 | total 
time: 53.34m | eta: 126.5m +step 04963/16704 (29.71%) | loss: 2.763342 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 1 | total time: 53.35m | eta: 126.5m +step 04964/16704 (29.72%) | loss: 2.763228 | lrm: 1.00 | dt: 649.70ms | tok/sec: 806,971 | mfu: 50.44 | epoch: 1 | total time: 53.36m | eta: 126.4m +step 04965/16704 (29.72%) | loss: 2.755563 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,549 | mfu: 50.72 | epoch: 1 | total time: 53.37m | eta: 126.4m +step 04966/16704 (29.73%) | loss: 2.775335 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,625 | mfu: 50.67 | epoch: 1 | total time: 53.38m | eta: 126.4m +step 04967/16704 (29.74%) | loss: 2.774720 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,556 | mfu: 50.66 | epoch: 1 | total time: 53.39m | eta: 126.4m +step 04968/16704 (29.74%) | loss: 2.769863 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,163 | mfu: 50.76 | epoch: 1 | total time: 53.40m | eta: 126.4m +step 04969/16704 (29.75%) | loss: 2.774471 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,642 | mfu: 50.79 | epoch: 1 | total time: 53.41m | eta: 126.4m +step 04970/16704 (29.75%) | loss: 2.777914 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,662 | mfu: 50.73 | epoch: 1 | total time: 53.42m | eta: 126.4m +step 04971/16704 (29.76%) | loss: 2.778757 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,446 | mfu: 50.84 | epoch: 1 | total time: 53.43m | eta: 126.4m +step 04972/16704 (29.77%) | loss: 2.782654 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,626 | mfu: 50.79 | epoch: 1 | total time: 53.44m | eta: 126.4m +step 04973/16704 (29.77%) | loss: 2.776834 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,041 | mfu: 50.63 | epoch: 1 | total time: 53.46m | eta: 126.4m +step 04974/16704 (29.78%) | loss: 2.780306 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,542 | mfu: 50.91 | epoch: 1 | total time: 53.47m | eta: 126.3m +step 04975/16704 (29.78%) | loss: 2.787124 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,752 | mfu: 50.61 | epoch: 1 | total time: 53.48m | eta: 126.3m +step 04976/16704 (29.79%) | loss: 
2.772487 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,288 | mfu: 50.83 | epoch: 1 | total time: 53.49m | eta: 126.3m +step 04977/16704 (29.80%) | loss: 2.771674 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,094 | mfu: 50.94 | epoch: 1 | total time: 53.50m | eta: 126.3m +step 04978/16704 (29.80%) | loss: 2.779526 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 1 | total time: 53.51m | eta: 126.3m +step 04979/16704 (29.81%) | loss: 2.768309 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,215 | mfu: 50.64 | epoch: 1 | total time: 53.52m | eta: 126.3m +step 04980/16704 (29.81%) | loss: 2.756495 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 1 | total time: 53.53m | eta: 126.3m +step 04981/16704 (29.82%) | loss: 2.758584 | lrm: 1.00 | dt: 647.51ms | tok/sec: 809,704 | mfu: 50.61 | epoch: 1 | total time: 53.54m | eta: 126.3m +step 04982/16704 (29.83%) | loss: 2.767512 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,258 | mfu: 50.83 | epoch: 1 | total time: 53.55m | eta: 126.3m +step 04983/16704 (29.83%) | loss: 2.768114 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,555 | mfu: 50.85 | epoch: 1 | total time: 53.56m | eta: 126.2m +step 04984/16704 (29.84%) | loss: 2.758851 | lrm: 1.00 | dt: 647.15ms | tok/sec: 810,149 | mfu: 50.64 | epoch: 1 | total time: 53.57m | eta: 126.2m +step 04985/16704 (29.84%) | loss: 2.756771 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,537 | mfu: 50.72 | epoch: 1 | total time: 53.58m | eta: 126.2m +step 04986/16704 (29.85%) | loss: 2.749375 | lrm: 1.00 | dt: 642.16ms | tok/sec: 816,443 | mfu: 51.03 | epoch: 1 | total time: 53.60m | eta: 126.2m +step 04987/16704 (29.86%) | loss: 2.748628 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,110 | mfu: 50.82 | epoch: 1 | total time: 53.61m | eta: 126.2m +step 04988/16704 (29.86%) | loss: 2.755291 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,695 | mfu: 50.86 | epoch: 1 | total time: 53.62m | eta: 126.2m +step 04989/16704 (29.87%) | loss: 2.757436 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,029 | mfu: 
50.69 | epoch: 1 | total time: 53.63m | eta: 126.2m +step 04990/16704 (29.87%) | loss: 2.766047 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,026 | mfu: 50.94 | epoch: 1 | total time: 53.64m | eta: 126.2m +step 04991/16704 (29.88%) | loss: 2.772054 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 1 | total time: 53.65m | eta: 126.2m +step 04992/16704 (29.89%) | loss: 2.771114 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,979 | mfu: 50.94 | epoch: 1 | total time: 53.66m | eta: 126.1m +step 04993/16704 (29.89%) | loss: 2.773560 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,444 | mfu: 50.78 | epoch: 1 | total time: 53.67m | eta: 126.1m +step 04994/16704 (29.90%) | loss: 2.773194 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,853 | mfu: 50.80 | epoch: 1 | total time: 53.68m | eta: 126.1m +step 04995/16704 (29.90%) | loss: 2.764914 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,832 | mfu: 50.74 | epoch: 1 | total time: 53.69m | eta: 126.1m +step 04996/16704 (29.91%) | loss: 2.755746 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,288 | mfu: 50.89 | epoch: 1 | total time: 53.70m | eta: 126.1m +step 04997/16704 (29.91%) | loss: 2.749952 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 1 | total time: 53.71m | eta: 126.1m +step 04998/16704 (29.92%) | loss: 2.741178 | lrm: 1.00 | dt: 648.60ms | tok/sec: 808,338 | mfu: 50.52 | epoch: 1 | total time: 53.72m | eta: 126.1m +step 04999/16704 (29.93%) | loss: 2.739693 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,907 | mfu: 51.00 | epoch: 1 | total time: 53.73m | eta: 126.1m +Step 05000 | Validation bpb: 0.841265 +step 05000/16704 (29.93%) | loss: 2.748041 | lrm: 1.00 | dt: 641.28ms | tok/sec: 817,564 | mfu: 51.10 | epoch: 1 | total time: 53.75m | eta: 126.1m +step 05001/16704 (29.94%) | loss: 2.742224 | lrm: 1.00 | dt: 647.73ms | tok/sec: 809,421 | mfu: 50.59 | epoch: 1 | total time: 53.76m | eta: 126.0m +step 05002/16704 (29.94%) | loss: 2.741807 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,119 | mfu: 50.76 | epoch: 1 | 
total time: 53.77m | eta: 126.0m +step 05003/16704 (29.95%) | loss: 2.756103 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,986 | mfu: 50.94 | epoch: 1 | total time: 53.78m | eta: 126.0m +step 05004/16704 (29.96%) | loss: 2.748436 | lrm: 1.00 | dt: 647.22ms | tok/sec: 810,064 | mfu: 50.63 | epoch: 1 | total time: 53.79m | eta: 126.0m +step 05005/16704 (29.96%) | loss: 2.754387 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,878 | mfu: 50.93 | epoch: 1 | total time: 53.80m | eta: 126.0m +step 05006/16704 (29.97%) | loss: 2.762780 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,152 | mfu: 50.89 | epoch: 1 | total time: 53.81m | eta: 126.0m +step 05007/16704 (29.97%) | loss: 2.777557 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 1 | total time: 53.82m | eta: 126.0m +step 05008/16704 (29.98%) | loss: 2.777768 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,011 | mfu: 50.81 | epoch: 1 | total time: 53.83m | eta: 126.0m +step 05009/16704 (29.99%) | loss: 2.767482 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,832 | mfu: 50.74 | epoch: 1 | total time: 53.84m | eta: 126.0m +step 05010/16704 (29.99%) | loss: 2.771513 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,819 | mfu: 50.68 | epoch: 1 | total time: 53.85m | eta: 126.0m +step 05011/16704 (30.00%) | loss: 2.765545 | lrm: 1.00 | dt: 642.13ms | tok/sec: 816,486 | mfu: 51.03 | epoch: 1 | total time: 53.86m | eta: 125.9m +step 05012/16704 (30.00%) | loss: 2.754458 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,245 | mfu: 50.83 | epoch: 1 | total time: 53.87m | eta: 125.9m +step 05013/16704 (30.01%) | loss: 2.743931 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,593 | mfu: 50.85 | epoch: 1 | total time: 53.89m | eta: 125.9m +step 05014/16704 (30.02%) | loss: 2.731555 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,596 | mfu: 50.79 | epoch: 1 | total time: 53.90m | eta: 125.9m +step 05015/16704 (30.02%) | loss: 2.735562 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,411 | mfu: 50.65 | epoch: 1 | total time: 53.91m | eta: 125.9m +step 05016/16704 (30.03%) | 
loss: 2.740969 | lrm: 1.00 | dt: 641.91ms | tok/sec: 816,764 | mfu: 51.05 | epoch: 1 | total time: 53.92m | eta: 125.9m +step 05017/16704 (30.03%) | loss: 2.749381 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,567 | mfu: 50.91 | epoch: 1 | total time: 53.93m | eta: 125.9m +step 05018/16704 (30.04%) | loss: 2.763147 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 1 | total time: 53.94m | eta: 125.9m +step 05019/16704 (30.05%) | loss: 2.750955 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,433 | mfu: 50.90 | epoch: 1 | total time: 53.95m | eta: 125.9m +step 05020/16704 (30.05%) | loss: 2.750574 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,291 | mfu: 50.83 | epoch: 1 | total time: 53.96m | eta: 125.8m +step 05021/16704 (30.06%) | loss: 2.751507 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,313 | mfu: 51.02 | epoch: 1 | total time: 53.97m | eta: 125.8m +step 05022/16704 (30.06%) | loss: 2.754813 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,044 | mfu: 50.88 | epoch: 1 | total time: 53.98m | eta: 125.8m +step 05023/16704 (30.07%) | loss: 2.769619 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,497 | mfu: 50.66 | epoch: 1 | total time: 53.99m | eta: 125.8m +step 05024/16704 (30.08%) | loss: 2.780596 | lrm: 1.00 | dt: 643.30ms | tok/sec: 814,993 | mfu: 50.94 | epoch: 1 | total time: 54.00m | eta: 125.8m +step 05025/16704 (30.08%) | loss: 2.786109 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,249 | mfu: 50.83 | epoch: 1 | total time: 54.01m | eta: 125.8m +step 05026/16704 (30.09%) | loss: 2.774907 | lrm: 1.00 | dt: 644.09ms | tok/sec: 814,001 | mfu: 50.88 | epoch: 1 | total time: 54.03m | eta: 125.8m +step 05027/16704 (30.09%) | loss: 2.778769 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,598 | mfu: 50.79 | epoch: 1 | total time: 54.04m | eta: 125.8m +step 05028/16704 (30.10%) | loss: 2.768942 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,499 | mfu: 50.97 | epoch: 1 | total time: 54.05m | eta: 125.8m +step 05029/16704 (30.11%) | loss: 2.759021 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,670 | 
mfu: 50.79 | epoch: 1 | total time: 54.06m | eta: 125.7m +step 05030/16704 (30.11%) | loss: 2.757397 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,772 | mfu: 50.92 | epoch: 1 | total time: 54.07m | eta: 125.7m +step 05031/16704 (30.12%) | loss: 2.750638 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,349 | mfu: 50.96 | epoch: 1 | total time: 54.08m | eta: 125.7m +step 05032/16704 (30.12%) | loss: 2.764257 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,357 | mfu: 50.90 | epoch: 1 | total time: 54.09m | eta: 125.7m +step 05033/16704 (30.13%) | loss: 2.763665 | lrm: 1.00 | dt: 646.98ms | tok/sec: 810,359 | mfu: 50.65 | epoch: 1 | total time: 54.10m | eta: 125.7m +step 05034/16704 (30.14%) | loss: 2.758205 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 1 | total time: 54.11m | eta: 125.7m +step 05035/16704 (30.14%) | loss: 2.755664 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 1 | total time: 54.12m | eta: 125.7m +step 05036/16704 (30.15%) | loss: 2.760894 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,449 | mfu: 50.72 | epoch: 1 | total time: 54.13m | eta: 125.7m +step 05037/16704 (30.15%) | loss: 2.754797 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,377 | mfu: 50.90 | epoch: 1 | total time: 54.14m | eta: 125.7m +step 05038/16704 (30.16%) | loss: 2.756101 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,165 | mfu: 50.76 | epoch: 1 | total time: 54.15m | eta: 125.6m +step 05039/16704 (30.17%) | loss: 2.742922 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,551 | mfu: 50.79 | epoch: 1 | total time: 54.16m | eta: 125.6m +step 05040/16704 (30.17%) | loss: 2.755548 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 1 | total time: 54.18m | eta: 125.6m +step 05041/16704 (30.18%) | loss: 2.763740 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,188 | mfu: 50.89 | epoch: 1 | total time: 54.19m | eta: 125.6m +step 05042/16704 (30.18%) | loss: 2.764545 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,542 | mfu: 50.91 | epoch: 1 | total time: 54.20m | eta: 125.6m +step 
05043/16704 (30.19%) | loss: 2.774385 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,211 | mfu: 50.70 | epoch: 1 | total time: 54.21m | eta: 125.6m +step 05044/16704 (30.20%) | loss: 2.776830 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,098 | mfu: 50.76 | epoch: 1 | total time: 54.22m | eta: 125.6m +step 05045/16704 (30.20%) | loss: 2.773998 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 54.23m | eta: 125.6m +step 05046/16704 (30.21%) | loss: 2.782653 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,962 | mfu: 50.94 | epoch: 1 | total time: 54.24m | eta: 125.6m +step 05047/16704 (30.21%) | loss: 2.787495 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,452 | mfu: 50.78 | epoch: 1 | total time: 54.25m | eta: 125.6m +step 05048/16704 (30.22%) | loss: 2.779309 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,511 | mfu: 50.85 | epoch: 1 | total time: 54.26m | eta: 125.5m +step 05049/16704 (30.23%) | loss: 2.776722 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,303 | mfu: 50.90 | epoch: 1 | total time: 54.27m | eta: 125.5m +step 05050/16704 (30.23%) | loss: 2.770263 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 1 | total time: 54.28m | eta: 125.5m +step 05051/16704 (30.24%) | loss: 2.771875 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,250 | mfu: 50.89 | epoch: 1 | total time: 54.29m | eta: 125.5m +step 05052/16704 (30.24%) | loss: 2.770943 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,551 | mfu: 50.72 | epoch: 1 | total time: 54.30m | eta: 125.5m +step 05053/16704 (30.25%) | loss: 2.777720 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,440 | mfu: 50.78 | epoch: 1 | total time: 54.32m | eta: 125.5m +step 05054/16704 (30.26%) | loss: 2.776911 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 1 | total time: 54.33m | eta: 125.5m +step 05055/16704 (30.26%) | loss: 2.784251 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,461 | mfu: 50.91 | epoch: 1 | total time: 54.34m | eta: 125.5m +step 05056/16704 (30.27%) | loss: 2.774719 | lrm: 1.00 | dt: 
645.18ms | tok/sec: 812,620 | mfu: 50.79 | epoch: 1 | total time: 54.35m | eta: 125.5m +step 05057/16704 (30.27%) | loss: 2.763629 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,865 | mfu: 50.74 | epoch: 1 | total time: 54.36m | eta: 125.4m +step 05058/16704 (30.28%) | loss: 2.769592 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,548 | mfu: 51.04 | epoch: 1 | total time: 54.37m | eta: 125.4m +step 05059/16704 (30.29%) | loss: 2.767108 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,646 | mfu: 50.67 | epoch: 1 | total time: 54.38m | eta: 125.4m +step 05060/16704 (30.29%) | loss: 2.758246 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,133 | mfu: 50.76 | epoch: 1 | total time: 54.39m | eta: 125.4m +step 05061/16704 (30.30%) | loss: 2.764076 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,769 | mfu: 50.74 | epoch: 1 | total time: 54.40m | eta: 125.4m +step 05062/16704 (30.30%) | loss: 2.758887 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 1 | total time: 54.41m | eta: 125.4m +step 05063/16704 (30.31%) | loss: 2.764006 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,095 | mfu: 50.88 | epoch: 1 | total time: 54.42m | eta: 125.4m +step 05064/16704 (30.32%) | loss: 2.757951 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,173 | mfu: 50.70 | epoch: 1 | total time: 54.43m | eta: 125.4m +step 05065/16704 (30.32%) | loss: 2.752046 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,177 | mfu: 50.82 | epoch: 1 | total time: 54.44m | eta: 125.4m +step 05066/16704 (30.33%) | loss: 2.757828 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,251 | mfu: 50.64 | epoch: 1 | total time: 54.45m | eta: 125.3m +step 05067/16704 (30.33%) | loss: 2.757122 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,197 | mfu: 50.95 | epoch: 1 | total time: 54.47m | eta: 125.3m +step 05068/16704 (30.34%) | loss: 2.747704 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,027 | mfu: 50.75 | epoch: 1 | total time: 54.48m | eta: 125.3m +step 05069/16704 (30.35%) | loss: 2.752708 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,828 | mfu: 50.68 | epoch: 1 | total 
time: 54.49m | eta: 125.3m +step 05070/16704 (30.35%) | loss: 2.751381 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,913 | mfu: 50.68 | epoch: 1 | total time: 54.50m | eta: 125.3m +step 05071/16704 (30.36%) | loss: 2.737553 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,818 | mfu: 50.74 | epoch: 1 | total time: 54.51m | eta: 125.3m +step 05072/16704 (30.36%) | loss: 2.727144 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,450 | mfu: 50.65 | epoch: 1 | total time: 54.52m | eta: 125.3m +step 05073/16704 (30.37%) | loss: 2.731621 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,584 | mfu: 50.85 | epoch: 1 | total time: 54.53m | eta: 125.3m +step 05074/16704 (30.38%) | loss: 2.724594 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,169 | mfu: 50.76 | epoch: 1 | total time: 54.54m | eta: 125.3m +step 05075/16704 (30.38%) | loss: 2.724395 | lrm: 1.00 | dt: 647.52ms | tok/sec: 809,689 | mfu: 50.61 | epoch: 1 | total time: 54.55m | eta: 125.2m +step 05076/16704 (30.39%) | loss: 2.730210 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,269 | mfu: 50.89 | epoch: 1 | total time: 54.56m | eta: 125.2m +step 05077/16704 (30.39%) | loss: 2.729234 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,944 | mfu: 50.87 | epoch: 1 | total time: 54.57m | eta: 125.2m +step 05078/16704 (30.40%) | loss: 2.729915 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,369 | mfu: 50.90 | epoch: 1 | total time: 54.58m | eta: 125.2m +step 05079/16704 (30.41%) | loss: 2.730819 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,242 | mfu: 50.70 | epoch: 1 | total time: 54.59m | eta: 125.2m +step 05080/16704 (30.41%) | loss: 2.741568 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,083 | mfu: 50.82 | epoch: 1 | total time: 54.61m | eta: 125.2m +step 05081/16704 (30.42%) | loss: 2.750119 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,142 | mfu: 50.89 | epoch: 1 | total time: 54.62m | eta: 125.2m +step 05082/16704 (30.42%) | loss: 2.749914 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,375 | mfu: 50.96 | epoch: 1 | total time: 54.63m | eta: 125.2m +step 05083/16704 (30.43%) | loss: 
2.762009 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,202 | mfu: 50.58 | epoch: 1 | total time: 54.64m | eta: 125.2m +step 05084/16704 (30.44%) | loss: 2.763782 | lrm: 1.00 | dt: 646.33ms | tok/sec: 811,174 | mfu: 50.70 | epoch: 1 | total time: 54.65m | eta: 125.2m +step 05085/16704 (30.44%) | loss: 2.768812 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,825 | mfu: 50.87 | epoch: 1 | total time: 54.66m | eta: 125.1m +step 05086/16704 (30.45%) | loss: 2.760131 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,250 | mfu: 50.70 | epoch: 1 | total time: 54.67m | eta: 125.1m +step 05087/16704 (30.45%) | loss: 2.762433 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,415 | mfu: 50.84 | epoch: 1 | total time: 54.68m | eta: 125.1m +step 05088/16704 (30.46%) | loss: 2.755019 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,261 | mfu: 50.71 | epoch: 1 | total time: 54.69m | eta: 125.1m +step 05089/16704 (30.47%) | loss: 2.765466 | lrm: 1.00 | dt: 642.20ms | tok/sec: 816,399 | mfu: 51.03 | epoch: 1 | total time: 54.70m | eta: 125.1m +step 05090/16704 (30.47%) | loss: 2.754478 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,178 | mfu: 50.95 | epoch: 1 | total time: 54.71m | eta: 125.1m +step 05091/16704 (30.48%) | loss: 2.753905 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,355 | mfu: 50.77 | epoch: 1 | total time: 54.72m | eta: 125.1m +step 05092/16704 (30.48%) | loss: 2.758456 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,929 | mfu: 50.75 | epoch: 1 | total time: 54.73m | eta: 125.1m +step 05093/16704 (30.49%) | loss: 2.770173 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,738 | mfu: 50.80 | epoch: 1 | total time: 54.75m | eta: 125.1m +step 05094/16704 (30.50%) | loss: 2.771323 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,589 | mfu: 50.66 | epoch: 1 | total time: 54.76m | eta: 125.0m +step 05095/16704 (30.50%) | loss: 2.760764 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,209 | mfu: 50.76 | epoch: 1 | total time: 54.77m | eta: 125.0m +step 05096/16704 (30.51%) | loss: 2.779929 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,887 | mfu: 
50.87 | epoch: 1 | total time: 54.78m | eta: 125.0m +step 05097/16704 (30.51%) | loss: 2.771494 | lrm: 1.00 | dt: 647.89ms | tok/sec: 809,228 | mfu: 50.58 | epoch: 1 | total time: 54.79m | eta: 125.0m +step 05098/16704 (30.52%) | loss: 2.768190 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,157 | mfu: 50.70 | epoch: 1 | total time: 54.80m | eta: 125.0m +step 05099/16704 (30.53%) | loss: 2.746526 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,735 | mfu: 50.86 | epoch: 1 | total time: 54.81m | eta: 125.0m +step 05100/16704 (30.53%) | loss: 2.746857 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,325 | mfu: 50.90 | epoch: 1 | total time: 54.82m | eta: 125.0m +step 05101/16704 (30.54%) | loss: 2.742555 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,624 | mfu: 50.85 | epoch: 1 | total time: 54.83m | eta: 125.0m +step 05102/16704 (30.54%) | loss: 2.746902 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,509 | mfu: 50.66 | epoch: 1 | total time: 54.84m | eta: 125.0m +step 05103/16704 (30.55%) | loss: 2.750152 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,873 | mfu: 50.81 | epoch: 1 | total time: 54.85m | eta: 124.9m +step 05104/16704 (30.56%) | loss: 2.741213 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,314 | mfu: 50.77 | epoch: 1 | total time: 54.86m | eta: 124.9m +step 05105/16704 (30.56%) | loss: 2.739025 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,354 | mfu: 50.71 | epoch: 1 | total time: 54.87m | eta: 124.9m +step 05106/16704 (30.57%) | loss: 2.749336 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,425 | mfu: 50.72 | epoch: 1 | total time: 54.89m | eta: 124.9m +step 05107/16704 (30.57%) | loss: 2.752490 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 1 | total time: 54.90m | eta: 124.9m +step 05108/16704 (30.58%) | loss: 2.784163 | lrm: 1.00 | dt: 641.87ms | tok/sec: 816,816 | mfu: 51.05 | epoch: 1 | total time: 54.91m | eta: 124.9m +step 05109/16704 (30.59%) | loss: 2.780754 | lrm: 1.00 | dt: 647.98ms | tok/sec: 809,111 | mfu: 50.57 | epoch: 1 | total time: 54.92m | eta: 124.9m +step 
05110/16704 (30.59%) | loss: 2.785555 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,426 | mfu: 51.03 | epoch: 1 | total time: 54.93m | eta: 124.9m +step 05111/16704 (30.60%) | loss: 2.782875 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,697 | mfu: 50.73 | epoch: 1 | total time: 54.94m | eta: 124.9m +step 05112/16704 (30.60%) | loss: 2.780402 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,893 | mfu: 50.87 | epoch: 1 | total time: 54.95m | eta: 124.8m +step 05113/16704 (30.61%) | loss: 2.786564 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,946 | mfu: 50.87 | epoch: 1 | total time: 54.96m | eta: 124.8m +step 05114/16704 (30.62%) | loss: 2.788722 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,555 | mfu: 50.72 | epoch: 1 | total time: 54.97m | eta: 124.8m +step 05115/16704 (30.62%) | loss: 2.785117 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,482 | mfu: 50.78 | epoch: 1 | total time: 54.98m | eta: 124.8m +step 05116/16704 (30.63%) | loss: 2.777942 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,505 | mfu: 50.72 | epoch: 1 | total time: 54.99m | eta: 124.8m +step 05117/16704 (30.63%) | loss: 2.771516 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,476 | mfu: 50.78 | epoch: 1 | total time: 55.00m | eta: 124.8m +step 05118/16704 (30.64%) | loss: 2.763483 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,849 | mfu: 50.80 | epoch: 1 | total time: 55.01m | eta: 124.8m +step 05119/16704 (30.65%) | loss: 2.769054 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,297 | mfu: 50.83 | epoch: 1 | total time: 55.02m | eta: 124.8m +step 05120/16704 (30.65%) | loss: 2.759971 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,438 | mfu: 50.72 | epoch: 1 | total time: 55.04m | eta: 124.8m +step 05121/16704 (30.66%) | loss: 2.763262 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,240 | mfu: 50.77 | epoch: 1 | total time: 55.05m | eta: 124.8m +step 05122/16704 (30.66%) | loss: 2.770544 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,127 | mfu: 50.76 | epoch: 1 | total time: 55.06m | eta: 124.7m +step 05123/16704 (30.67%) | loss: 2.770314 | lrm: 1.00 | dt: 
645.34ms | tok/sec: 812,418 | mfu: 50.78 | epoch: 1 | total time: 55.07m | eta: 124.7m +step 05124/16704 (30.68%) | loss: 2.767091 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,856 | mfu: 50.74 | epoch: 1 | total time: 55.08m | eta: 124.7m +step 05125/16704 (30.68%) | loss: 2.769707 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,419 | mfu: 50.78 | epoch: 1 | total time: 55.09m | eta: 124.7m +step 05126/16704 (30.69%) | loss: 2.766610 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,020 | mfu: 50.69 | epoch: 1 | total time: 55.10m | eta: 124.7m +step 05127/16704 (30.69%) | loss: 2.757359 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,881 | mfu: 50.68 | epoch: 1 | total time: 55.11m | eta: 124.7m +step 05128/16704 (30.70%) | loss: 2.755032 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,669 | mfu: 50.67 | epoch: 1 | total time: 55.12m | eta: 124.7m +step 05129/16704 (30.71%) | loss: 2.745404 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,159 | mfu: 50.89 | epoch: 1 | total time: 55.13m | eta: 124.7m +step 05130/16704 (30.71%) | loss: 2.750133 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,390 | mfu: 50.84 | epoch: 1 | total time: 55.14m | eta: 124.7m +step 05131/16704 (30.72%) | loss: 2.750074 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,543 | mfu: 50.85 | epoch: 1 | total time: 55.15m | eta: 124.6m +step 05132/16704 (30.72%) | loss: 2.747846 | lrm: 1.00 | dt: 645.95ms | tok/sec: 811,654 | mfu: 50.73 | epoch: 1 | total time: 55.16m | eta: 124.6m +step 05133/16704 (30.73%) | loss: 2.743571 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,975 | mfu: 50.75 | epoch: 1 | total time: 55.18m | eta: 124.6m +step 05134/16704 (30.74%) | loss: 2.750795 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,198 | mfu: 50.83 | epoch: 1 | total time: 55.19m | eta: 124.6m +step 05135/16704 (30.74%) | loss: 2.743005 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,296 | mfu: 50.71 | epoch: 1 | total time: 55.20m | eta: 124.6m +step 05136/16704 (30.75%) | loss: 2.726674 | lrm: 1.00 | dt: 642.54ms | tok/sec: 815,967 | mfu: 51.00 | epoch: 1 | total 
time: 55.21m | eta: 124.6m +step 05137/16704 (30.75%) | loss: 2.729822 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,271 | mfu: 50.83 | epoch: 1 | total time: 55.22m | eta: 124.6m +step 05138/16704 (30.76%) | loss: 2.733365 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,852 | mfu: 50.74 | epoch: 1 | total time: 55.23m | eta: 124.6m +step 05139/16704 (30.77%) | loss: 2.738586 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,312 | mfu: 50.65 | epoch: 1 | total time: 55.24m | eta: 124.6m +step 05140/16704 (30.77%) | loss: 2.733324 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,314 | mfu: 50.96 | epoch: 1 | total time: 55.25m | eta: 124.5m +step 05141/16704 (30.78%) | loss: 2.734333 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,758 | mfu: 50.92 | epoch: 1 | total time: 55.26m | eta: 124.5m +step 05142/16704 (30.78%) | loss: 2.738438 | lrm: 1.00 | dt: 648.06ms | tok/sec: 809,010 | mfu: 50.56 | epoch: 1 | total time: 55.27m | eta: 124.5m +step 05143/16704 (30.79%) | loss: 2.728409 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,451 | mfu: 50.90 | epoch: 1 | total time: 55.28m | eta: 124.5m +step 05144/16704 (30.80%) | loss: 2.732382 | lrm: 1.00 | dt: 647.47ms | tok/sec: 809,746 | mfu: 50.61 | epoch: 1 | total time: 55.29m | eta: 124.5m +step 05145/16704 (30.80%) | loss: 2.723619 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,858 | mfu: 50.93 | epoch: 1 | total time: 55.30m | eta: 124.5m +step 05146/16704 (30.81%) | loss: 2.738733 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,914 | mfu: 50.81 | epoch: 1 | total time: 55.32m | eta: 124.5m +step 05147/16704 (30.81%) | loss: 2.743593 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,977 | mfu: 50.69 | epoch: 1 | total time: 55.33m | eta: 124.5m +step 05148/16704 (30.82%) | loss: 2.735565 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,840 | mfu: 50.87 | epoch: 1 | total time: 55.34m | eta: 124.5m +step 05149/16704 (30.82%) | loss: 2.733024 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,367 | mfu: 50.84 | epoch: 1 | total time: 55.35m | eta: 124.4m +step 05150/16704 (30.83%) | loss: 
2.736839 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,567 | mfu: 50.91 | epoch: 1 | total time: 55.36m | eta: 124.4m +step 05151/16704 (30.84%) | loss: 2.734318 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 1 | total time: 55.37m | eta: 124.4m +step 05152/16704 (30.84%) | loss: 2.739360 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,915 | mfu: 50.68 | epoch: 1 | total time: 55.38m | eta: 124.4m +step 05153/16704 (30.85%) | loss: 2.734260 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,629 | mfu: 50.79 | epoch: 1 | total time: 55.39m | eta: 124.4m +step 05154/16704 (30.85%) | loss: 2.733802 | lrm: 1.00 | dt: 646.88ms | tok/sec: 810,481 | mfu: 50.66 | epoch: 1 | total time: 55.40m | eta: 124.4m +step 05155/16704 (30.86%) | loss: 2.746207 | lrm: 1.00 | dt: 641.33ms | tok/sec: 817,502 | mfu: 51.10 | epoch: 1 | total time: 55.41m | eta: 124.4m +step 05156/16704 (30.87%) | loss: 2.765814 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,459 | mfu: 50.90 | epoch: 1 | total time: 55.42m | eta: 124.4m +step 05157/16704 (30.87%) | loss: 2.766013 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,677 | mfu: 50.73 | epoch: 1 | total time: 55.43m | eta: 124.4m +step 05158/16704 (30.88%) | loss: 2.742540 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,068 | mfu: 50.88 | epoch: 1 | total time: 55.44m | eta: 124.4m +step 05159/16704 (30.88%) | loss: 2.750344 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,474 | mfu: 50.78 | epoch: 1 | total time: 55.45m | eta: 124.3m +step 05160/16704 (30.89%) | loss: 2.753328 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,604 | mfu: 50.91 | epoch: 1 | total time: 55.47m | eta: 124.3m +step 05161/16704 (30.90%) | loss: 2.762459 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 1 | total time: 55.48m | eta: 124.3m +step 05162/16704 (30.90%) | loss: 2.768812 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 1 | total time: 55.49m | eta: 124.3m +step 05163/16704 (30.91%) | loss: 2.769461 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,430 | mfu: 
50.84 | epoch: 1 | total time: 55.50m | eta: 124.3m +step 05164/16704 (30.91%) | loss: 2.769648 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,957 | mfu: 50.75 | epoch: 1 | total time: 55.51m | eta: 124.3m +step 05165/16704 (30.92%) | loss: 2.764826 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,146 | mfu: 50.76 | epoch: 1 | total time: 55.52m | eta: 124.3m +step 05166/16704 (30.93%) | loss: 2.764007 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,522 | mfu: 50.66 | epoch: 1 | total time: 55.53m | eta: 124.3m +step 05167/16704 (30.93%) | loss: 2.754637 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,102 | mfu: 50.76 | epoch: 1 | total time: 55.54m | eta: 124.3m +step 05168/16704 (30.94%) | loss: 2.753828 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,672 | mfu: 50.73 | epoch: 1 | total time: 55.55m | eta: 124.2m +step 05169/16704 (30.94%) | loss: 2.770311 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,528 | mfu: 50.72 | epoch: 1 | total time: 55.56m | eta: 124.2m +step 05170/16704 (30.95%) | loss: 2.769152 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,534 | mfu: 50.85 | epoch: 1 | total time: 55.57m | eta: 124.2m +step 05171/16704 (30.96%) | loss: 2.773129 | lrm: 1.00 | dt: 648.62ms | tok/sec: 808,318 | mfu: 50.52 | epoch: 1 | total time: 55.58m | eta: 124.2m +step 05172/16704 (30.96%) | loss: 2.774745 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,862 | mfu: 50.81 | epoch: 1 | total time: 55.59m | eta: 124.2m +step 05173/16704 (30.97%) | loss: 2.767628 | lrm: 1.00 | dt: 645.26ms | tok/sec: 812,527 | mfu: 50.78 | epoch: 1 | total time: 55.61m | eta: 124.2m +step 05174/16704 (30.97%) | loss: 2.749819 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,365 | mfu: 50.71 | epoch: 1 | total time: 55.62m | eta: 124.2m +step 05175/16704 (30.98%) | loss: 2.746840 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 1 | total time: 55.63m | eta: 124.2m +step 05176/16704 (30.99%) | loss: 2.755697 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,701 | mfu: 50.92 | epoch: 1 | total time: 55.64m | eta: 124.2m +step 
05177/16704 (30.99%) | loss: 2.758014 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,313 | mfu: 50.96 | epoch: 1 | total time: 55.65m | eta: 124.1m +step 05178/16704 (31.00%) | loss: 2.753822 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,444 | mfu: 50.90 | epoch: 1 | total time: 55.66m | eta: 124.1m +step 05179/16704 (31.00%) | loss: 2.747320 | lrm: 1.00 | dt: 647.02ms | tok/sec: 810,311 | mfu: 50.65 | epoch: 1 | total time: 55.67m | eta: 124.1m +step 05180/16704 (31.01%) | loss: 2.739476 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,659 | mfu: 50.79 | epoch: 1 | total time: 55.68m | eta: 124.1m +step 05181/16704 (31.02%) | loss: 2.749117 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 1 | total time: 55.69m | eta: 124.1m +step 05182/16704 (31.02%) | loss: 2.756749 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,395 | mfu: 50.90 | epoch: 1 | total time: 55.70m | eta: 124.1m +step 05183/16704 (31.03%) | loss: 2.763460 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,836 | mfu: 50.87 | epoch: 1 | total time: 55.71m | eta: 124.1m +step 05184/16704 (31.03%) | loss: 2.771396 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,464 | mfu: 50.78 | epoch: 1 | total time: 55.72m | eta: 124.1m +step 05185/16704 (31.04%) | loss: 2.766702 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,049 | mfu: 50.88 | epoch: 1 | total time: 55.73m | eta: 124.1m +step 05186/16704 (31.05%) | loss: 2.772800 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,765 | mfu: 50.61 | epoch: 1 | total time: 55.75m | eta: 124.0m +step 05187/16704 (31.05%) | loss: 2.784661 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,398 | mfu: 50.90 | epoch: 1 | total time: 55.76m | eta: 124.0m +step 05188/16704 (31.06%) | loss: 2.786936 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,011 | mfu: 50.94 | epoch: 1 | total time: 55.77m | eta: 124.0m +step 05189/16704 (31.06%) | loss: 2.789216 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,975 | mfu: 50.81 | epoch: 1 | total time: 55.78m | eta: 124.0m +step 05190/16704 (31.07%) | loss: 2.784414 | lrm: 1.00 | dt: 
644.11ms | tok/sec: 813,974 | mfu: 50.87 | epoch: 1 | total time: 55.79m | eta: 124.0m +step 05191/16704 (31.08%) | loss: 2.770254 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,497 | mfu: 50.97 | epoch: 1 | total time: 55.80m | eta: 124.0m +step 05192/16704 (31.08%) | loss: 2.771824 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,287 | mfu: 50.71 | epoch: 1 | total time: 55.81m | eta: 124.0m +step 05193/16704 (31.09%) | loss: 2.773192 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,867 | mfu: 50.68 | epoch: 1 | total time: 55.82m | eta: 124.0m +step 05194/16704 (31.09%) | loss: 2.761888 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,429 | mfu: 50.84 | epoch: 1 | total time: 55.83m | eta: 124.0m +step 05195/16704 (31.10%) | loss: 2.760335 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,144 | mfu: 50.76 | epoch: 1 | total time: 55.84m | eta: 124.0m +step 05196/16704 (31.11%) | loss: 2.744871 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 1 | total time: 55.85m | eta: 123.9m +step 05197/16704 (31.11%) | loss: 2.749968 | lrm: 1.00 | dt: 648.52ms | tok/sec: 808,434 | mfu: 50.53 | epoch: 1 | total time: 55.86m | eta: 123.9m +step 05198/16704 (31.12%) | loss: 2.730139 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,409 | mfu: 50.90 | epoch: 1 | total time: 55.87m | eta: 123.9m +step 05199/16704 (31.12%) | loss: 2.734003 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,471 | mfu: 50.78 | epoch: 1 | total time: 55.89m | eta: 123.9m +step 05200/16704 (31.13%) | loss: 2.735384 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,967 | mfu: 50.69 | epoch: 1 | total time: 55.90m | eta: 123.9m +step 05201/16704 (31.14%) | loss: 2.732263 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 1 | total time: 55.91m | eta: 123.9m +step 05202/16704 (31.14%) | loss: 2.734496 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,076 | mfu: 50.82 | epoch: 1 | total time: 55.92m | eta: 123.9m +step 05203/16704 (31.15%) | loss: 2.729783 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,305 | mfu: 50.77 | epoch: 1 | total 
time: 55.93m | eta: 123.9m +step 05204/16704 (31.15%) | loss: 2.732855 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 1 | total time: 55.94m | eta: 123.9m +step 05205/16704 (31.16%) | loss: 2.752018 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,874 | mfu: 50.87 | epoch: 1 | total time: 55.95m | eta: 123.8m +step 05206/16704 (31.17%) | loss: 2.748450 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,499 | mfu: 50.78 | epoch: 1 | total time: 55.96m | eta: 123.8m +step 05207/16704 (31.17%) | loss: 2.751284 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,458 | mfu: 50.78 | epoch: 1 | total time: 55.97m | eta: 123.8m +step 05208/16704 (31.18%) | loss: 2.755242 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,072 | mfu: 50.76 | epoch: 1 | total time: 55.98m | eta: 123.8m +step 05209/16704 (31.18%) | loss: 2.762805 | lrm: 1.00 | dt: 647.74ms | tok/sec: 809,405 | mfu: 50.59 | epoch: 1 | total time: 55.99m | eta: 123.8m +step 05210/16704 (31.19%) | loss: 2.751681 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,891 | mfu: 50.99 | epoch: 1 | total time: 56.00m | eta: 123.8m +step 05211/16704 (31.20%) | loss: 2.746522 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,587 | mfu: 50.79 | epoch: 1 | total time: 56.01m | eta: 123.8m +step 05212/16704 (31.20%) | loss: 2.756476 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,887 | mfu: 50.74 | epoch: 1 | total time: 56.02m | eta: 123.8m +step 05213/16704 (31.21%) | loss: 2.762764 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,092 | mfu: 50.76 | epoch: 1 | total time: 56.04m | eta: 123.8m +step 05214/16704 (31.21%) | loss: 2.761419 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,291 | mfu: 50.83 | epoch: 1 | total time: 56.05m | eta: 123.7m +step 05215/16704 (31.22%) | loss: 2.759121 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,671 | mfu: 50.79 | epoch: 1 | total time: 56.06m | eta: 123.7m +step 05216/16704 (31.23%) | loss: 2.759774 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 1 | total time: 56.07m | eta: 123.7m +step 05217/16704 (31.23%) | loss: 
2.756415 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,431 | mfu: 50.84 | epoch: 1 | total time: 56.08m | eta: 123.7m +step 05218/16704 (31.24%) | loss: 2.753541 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,812 | mfu: 50.74 | epoch: 1 | total time: 56.09m | eta: 123.7m +step 05219/16704 (31.24%) | loss: 2.765747 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,174 | mfu: 50.89 | epoch: 1 | total time: 56.10m | eta: 123.7m +step 05220/16704 (31.25%) | loss: 2.759798 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,330 | mfu: 50.71 | epoch: 1 | total time: 56.11m | eta: 123.7m +step 05221/16704 (31.26%) | loss: 2.751059 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,958 | mfu: 50.94 | epoch: 1 | total time: 56.12m | eta: 123.7m +step 05222/16704 (31.26%) | loss: 2.744826 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 1 | total time: 56.13m | eta: 123.7m +step 05223/16704 (31.27%) | loss: 2.744201 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,291 | mfu: 50.77 | epoch: 1 | total time: 56.14m | eta: 123.6m +step 05224/16704 (31.27%) | loss: 2.759194 | lrm: 1.00 | dt: 646.95ms | tok/sec: 810,404 | mfu: 50.65 | epoch: 1 | total time: 56.15m | eta: 123.6m +step 05225/16704 (31.28%) | loss: 2.763057 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,738 | mfu: 50.73 | epoch: 1 | total time: 56.16m | eta: 123.6m +step 05226/16704 (31.29%) | loss: 2.761244 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,461 | mfu: 50.78 | epoch: 1 | total time: 56.18m | eta: 123.6m +step 05227/16704 (31.29%) | loss: 2.753533 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 1 | total time: 56.19m | eta: 123.6m +step 05228/16704 (31.30%) | loss: 2.752196 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,879 | mfu: 50.68 | epoch: 1 | total time: 56.20m | eta: 123.6m +step 05229/16704 (31.30%) | loss: 2.752786 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,744 | mfu: 50.92 | epoch: 1 | total time: 56.21m | eta: 123.6m +step 05230/16704 (31.31%) | loss: 2.746929 | lrm: 1.00 | dt: 648.01ms | tok/sec: 809,069 | mfu: 
50.57 | epoch: 1 | total time: 56.22m | eta: 123.6m +step 05231/16704 (31.32%) | loss: 2.750945 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,720 | mfu: 50.80 | epoch: 1 | total time: 56.23m | eta: 123.6m +step 05232/16704 (31.32%) | loss: 2.754666 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,522 | mfu: 50.91 | epoch: 1 | total time: 56.24m | eta: 123.6m +step 05233/16704 (31.33%) | loss: 2.773769 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,334 | mfu: 50.83 | epoch: 1 | total time: 56.25m | eta: 123.5m +step 05234/16704 (31.33%) | loss: 2.770492 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,853 | mfu: 50.68 | epoch: 1 | total time: 56.26m | eta: 123.5m +step 05235/16704 (31.34%) | loss: 2.764951 | lrm: 1.00 | dt: 646.59ms | tok/sec: 810,853 | mfu: 50.68 | epoch: 1 | total time: 56.27m | eta: 123.5m +step 05236/16704 (31.35%) | loss: 2.761792 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,177 | mfu: 50.95 | epoch: 1 | total time: 56.28m | eta: 123.5m +step 05237/16704 (31.35%) | loss: 2.761741 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,490 | mfu: 50.78 | epoch: 1 | total time: 56.29m | eta: 123.5m +step 05238/16704 (31.36%) | loss: 2.742668 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,576 | mfu: 50.85 | epoch: 1 | total time: 56.30m | eta: 123.5m +step 05239/16704 (31.36%) | loss: 2.757758 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,017 | mfu: 50.81 | epoch: 1 | total time: 56.32m | eta: 123.5m +step 05240/16704 (31.37%) | loss: 2.766786 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,947 | mfu: 50.81 | epoch: 1 | total time: 56.33m | eta: 123.5m +step 05241/16704 (31.38%) | loss: 2.762689 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,749 | mfu: 50.92 | epoch: 1 | total time: 56.34m | eta: 123.5m +step 05242/16704 (31.38%) | loss: 2.759291 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,403 | mfu: 50.71 | epoch: 1 | total time: 56.35m | eta: 123.4m +step 05243/16704 (31.39%) | loss: 2.751920 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,529 | mfu: 50.85 | epoch: 1 | total time: 56.36m | eta: 123.4m +step 
05244/16704 (31.39%) | loss: 2.756931 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,988 | mfu: 50.88 | epoch: 1 | total time: 56.37m | eta: 123.4m +step 05245/16704 (31.40%) | loss: 2.757491 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,184 | mfu: 50.70 | epoch: 1 | total time: 56.38m | eta: 123.4m +step 05246/16704 (31.41%) | loss: 2.763585 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,168 | mfu: 50.95 | epoch: 1 | total time: 56.39m | eta: 123.4m +step 05247/16704 (31.41%) | loss: 2.765127 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,571 | mfu: 50.66 | epoch: 1 | total time: 56.40m | eta: 123.4m +step 05248/16704 (31.42%) | loss: 2.763591 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,740 | mfu: 50.98 | epoch: 1 | total time: 56.41m | eta: 123.4m +step 05249/16704 (31.42%) | loss: 2.787758 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,869 | mfu: 50.87 | epoch: 1 | total time: 56.42m | eta: 123.4m +Step 05250 | Validation bpb: 0.839379 +step 05250/16704 (31.43%) | loss: 2.795107 | lrm: 1.00 | dt: 649.34ms | tok/sec: 807,416 | mfu: 50.46 | epoch: 1 | total time: 56.43m | eta: 123.4m +step 05251/16704 (31.44%) | loss: 2.781428 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,322 | mfu: 50.90 | epoch: 1 | total time: 56.44m | eta: 123.3m +step 05252/16704 (31.44%) | loss: 2.769444 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,876 | mfu: 50.68 | epoch: 1 | total time: 56.45m | eta: 123.3m +step 05253/16704 (31.45%) | loss: 2.767225 | lrm: 1.00 | dt: 641.74ms | tok/sec: 816,976 | mfu: 51.06 | epoch: 1 | total time: 56.47m | eta: 123.3m +step 05254/16704 (31.45%) | loss: 2.766272 | lrm: 1.00 | dt: 648.57ms | tok/sec: 808,371 | mfu: 50.52 | epoch: 1 | total time: 56.48m | eta: 123.3m +step 05255/16704 (31.46%) | loss: 2.776445 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,737 | mfu: 50.86 | epoch: 1 | total time: 56.49m | eta: 123.3m +step 05256/16704 (31.47%) | loss: 2.776845 | lrm: 1.00 | dt: 641.48ms | tok/sec: 817,309 | mfu: 51.08 | epoch: 1 | total time: 56.50m | eta: 123.3m +step 05257/16704 (31.47%) | 
loss: 2.780854 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 1 | total time: 56.51m | eta: 123.3m +step 05258/16704 (31.48%) | loss: 2.770827 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,347 | mfu: 50.90 | epoch: 1 | total time: 56.52m | eta: 123.3m +step 05259/16704 (31.48%) | loss: 2.786066 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,378 | mfu: 50.84 | epoch: 1 | total time: 56.53m | eta: 123.3m +step 05260/16704 (31.49%) | loss: 2.775363 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,549 | mfu: 50.72 | epoch: 1 | total time: 56.54m | eta: 123.2m +step 05261/16704 (31.50%) | loss: 2.779126 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,417 | mfu: 50.90 | epoch: 1 | total time: 56.55m | eta: 123.2m +step 05262/16704 (31.50%) | loss: 2.777770 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,265 | mfu: 50.89 | epoch: 1 | total time: 56.56m | eta: 123.2m +step 05263/16704 (31.51%) | loss: 2.768210 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,933 | mfu: 50.75 | epoch: 1 | total time: 56.57m | eta: 123.2m +step 05264/16704 (31.51%) | loss: 2.767234 | lrm: 1.00 | dt: 642.11ms | tok/sec: 816,510 | mfu: 51.03 | epoch: 1 | total time: 56.58m | eta: 123.2m +step 05265/16704 (31.52%) | loss: 2.757556 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,476 | mfu: 50.78 | epoch: 1 | total time: 56.59m | eta: 123.2m +step 05266/16704 (31.53%) | loss: 2.762890 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,357 | mfu: 50.77 | epoch: 1 | total time: 56.61m | eta: 123.2m +step 05267/16704 (31.53%) | loss: 2.758479 | lrm: 1.00 | dt: 641.69ms | tok/sec: 817,048 | mfu: 51.07 | epoch: 1 | total time: 56.62m | eta: 123.2m +step 05268/16704 (31.54%) | loss: 2.776518 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,459 | mfu: 50.72 | epoch: 1 | total time: 56.63m | eta: 123.2m +step 05269/16704 (31.54%) | loss: 2.775710 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,636 | mfu: 50.92 | epoch: 1 | total time: 56.64m | eta: 123.2m +step 05270/16704 (31.55%) | loss: 2.761692 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,738 | 
mfu: 50.80 | epoch: 1 | total time: 56.65m | eta: 123.1m +step 05271/16704 (31.56%) | loss: 2.764973 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,421 | mfu: 50.97 | epoch: 1 | total time: 56.66m | eta: 123.1m +step 05272/16704 (31.56%) | loss: 2.765648 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,914 | mfu: 51.00 | epoch: 1 | total time: 56.67m | eta: 123.1m +step 05273/16704 (31.57%) | loss: 2.773439 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,448 | mfu: 50.78 | epoch: 1 | total time: 56.68m | eta: 123.1m +step 05274/16704 (31.57%) | loss: 2.776962 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,604 | mfu: 50.85 | epoch: 1 | total time: 56.69m | eta: 123.1m +step 05275/16704 (31.58%) | loss: 2.784613 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,589 | mfu: 50.91 | epoch: 1 | total time: 56.70m | eta: 123.1m +step 05276/16704 (31.59%) | loss: 2.777810 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,746 | mfu: 50.92 | epoch: 1 | total time: 56.71m | eta: 123.1m +step 05277/16704 (31.59%) | loss: 2.788653 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,794 | mfu: 50.80 | epoch: 1 | total time: 56.72m | eta: 123.1m +step 05278/16704 (31.60%) | loss: 2.785435 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,431 | mfu: 50.72 | epoch: 1 | total time: 56.73m | eta: 123.1m +step 05279/16704 (31.60%) | loss: 2.785881 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,957 | mfu: 50.94 | epoch: 1 | total time: 56.74m | eta: 123.0m +step 05280/16704 (31.61%) | loss: 2.780788 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 1 | total time: 56.76m | eta: 123.0m +step 05281/16704 (31.62%) | loss: 2.762912 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,864 | mfu: 50.99 | epoch: 1 | total time: 56.77m | eta: 123.0m +step 05282/16704 (31.62%) | loss: 2.754537 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,769 | mfu: 50.80 | epoch: 1 | total time: 56.78m | eta: 123.0m +step 05283/16704 (31.63%) | loss: 2.768047 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,550 | mfu: 50.72 | epoch: 1 | total time: 56.79m | eta: 123.0m +step 
05284/16704 (31.63%) | loss: 2.757507 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,598 | mfu: 50.98 | epoch: 1 | total time: 56.80m | eta: 123.0m +step 05285/16704 (31.64%) | loss: 2.748286 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,740 | mfu: 50.67 | epoch: 1 | total time: 56.81m | eta: 123.0m +step 05286/16704 (31.65%) | loss: 2.757595 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,007 | mfu: 50.94 | epoch: 1 | total time: 56.82m | eta: 123.0m +step 05287/16704 (31.65%) | loss: 2.768573 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,626 | mfu: 50.85 | epoch: 1 | total time: 56.83m | eta: 123.0m +step 05288/16704 (31.66%) | loss: 2.765622 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 1 | total time: 56.84m | eta: 122.9m +step 05289/16704 (31.66%) | loss: 2.768897 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 1 | total time: 56.85m | eta: 122.9m +step 05290/16704 (31.67%) | loss: 2.768950 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,565 | mfu: 50.79 | epoch: 1 | total time: 56.86m | eta: 122.9m +step 05291/16704 (31.68%) | loss: 2.767872 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,964 | mfu: 50.81 | epoch: 1 | total time: 56.87m | eta: 122.9m +step 05292/16704 (31.68%) | loss: 2.784502 | lrm: 1.00 | dt: 641.70ms | tok/sec: 817,025 | mfu: 51.07 | epoch: 1 | total time: 56.88m | eta: 122.9m +step 05293/16704 (31.69%) | loss: 2.787858 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,329 | mfu: 50.90 | epoch: 1 | total time: 56.90m | eta: 122.9m +step 05294/16704 (31.69%) | loss: 2.775050 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,425 | mfu: 50.84 | epoch: 1 | total time: 56.91m | eta: 122.9m +step 05295/16704 (31.70%) | loss: 2.785148 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,935 | mfu: 50.87 | epoch: 1 | total time: 56.92m | eta: 122.9m +step 05296/16704 (31.70%) | loss: 2.774645 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,723 | mfu: 50.86 | epoch: 1 | total time: 56.93m | eta: 122.9m +step 05297/16704 (31.71%) | loss: 2.778249 | lrm: 1.00 | dt: 
646.31ms | tok/sec: 811,199 | mfu: 50.70 | epoch: 1 | total time: 56.94m | eta: 122.8m +step 05298/16704 (31.72%) | loss: 2.757149 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,027 | mfu: 50.94 | epoch: 1 | total time: 56.95m | eta: 122.8m +step 05299/16704 (31.72%) | loss: 2.750112 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,362 | mfu: 50.84 | epoch: 1 | total time: 56.96m | eta: 122.8m +step 05300/16704 (31.73%) | loss: 2.742062 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,605 | mfu: 50.91 | epoch: 1 | total time: 56.97m | eta: 122.8m +step 05301/16704 (31.73%) | loss: 2.748672 | lrm: 1.00 | dt: 640.77ms | tok/sec: 818,211 | mfu: 51.14 | epoch: 1 | total time: 56.98m | eta: 122.8m +step 05302/16704 (31.74%) | loss: 2.746745 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,820 | mfu: 50.68 | epoch: 1 | total time: 56.99m | eta: 122.8m +step 05303/16704 (31.75%) | loss: 2.749616 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,339 | mfu: 50.90 | epoch: 1 | total time: 57.00m | eta: 122.8m +step 05304/16704 (31.75%) | loss: 2.751994 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,889 | mfu: 50.68 | epoch: 1 | total time: 57.01m | eta: 122.8m +step 05305/16704 (31.76%) | loss: 2.762008 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,918 | mfu: 50.81 | epoch: 1 | total time: 57.02m | eta: 122.8m +step 05306/16704 (31.76%) | loss: 2.765177 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,246 | mfu: 51.02 | epoch: 1 | total time: 57.03m | eta: 122.7m +step 05307/16704 (31.77%) | loss: 2.773218 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 1 | total time: 57.05m | eta: 122.7m +step 05308/16704 (31.78%) | loss: 2.775861 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,153 | mfu: 50.82 | epoch: 1 | total time: 57.06m | eta: 122.7m +step 05309/16704 (31.78%) | loss: 2.762143 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 1 | total time: 57.07m | eta: 122.7m +step 05310/16704 (31.79%) | loss: 2.755659 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,633 | mfu: 50.92 | epoch: 1 | total 
time: 57.08m | eta: 122.7m +step 05311/16704 (31.79%) | loss: 2.765571 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 1 | total time: 57.09m | eta: 122.7m +step 05312/16704 (31.80%) | loss: 2.762410 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,233 | mfu: 50.70 | epoch: 1 | total time: 57.10m | eta: 122.7m +step 05313/16704 (31.81%) | loss: 2.771668 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,892 | mfu: 50.68 | epoch: 1 | total time: 57.11m | eta: 122.7m +step 05314/16704 (31.81%) | loss: 2.764177 | lrm: 1.00 | dt: 641.18ms | tok/sec: 817,694 | mfu: 51.11 | epoch: 1 | total time: 57.12m | eta: 122.7m +step 05315/16704 (31.82%) | loss: 2.768732 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,323 | mfu: 50.71 | epoch: 1 | total time: 57.13m | eta: 122.7m +step 05316/16704 (31.82%) | loss: 2.767836 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,219 | mfu: 50.89 | epoch: 1 | total time: 57.14m | eta: 122.6m +step 05317/16704 (31.83%) | loss: 2.774932 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,722 | mfu: 50.80 | epoch: 1 | total time: 57.15m | eta: 122.6m +step 05318/16704 (31.84%) | loss: 2.769749 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,911 | mfu: 50.87 | epoch: 1 | total time: 57.16m | eta: 122.6m +step 05319/16704 (31.84%) | loss: 2.773685 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,292 | mfu: 50.89 | epoch: 1 | total time: 57.17m | eta: 122.6m +step 05320/16704 (31.85%) | loss: 2.785936 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,663 | mfu: 50.86 | epoch: 1 | total time: 57.19m | eta: 122.6m +step 05321/16704 (31.85%) | loss: 2.781105 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,892 | mfu: 50.87 | epoch: 1 | total time: 57.20m | eta: 122.6m +step 05322/16704 (31.86%) | loss: 2.769761 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,877 | mfu: 50.87 | epoch: 1 | total time: 57.21m | eta: 122.6m +step 05323/16704 (31.87%) | loss: 2.776896 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,744 | mfu: 50.67 | epoch: 1 | total time: 57.22m | eta: 122.6m +step 05324/16704 (31.87%) | loss: 
2.773269 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,394 | mfu: 50.90 | epoch: 1 | total time: 57.23m | eta: 122.6m +step 05325/16704 (31.88%) | loss: 2.772582 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,154 | mfu: 50.89 | epoch: 1 | total time: 57.24m | eta: 122.5m +step 05326/16704 (31.88%) | loss: 2.761038 | lrm: 1.00 | dt: 647.26ms | tok/sec: 810,015 | mfu: 50.63 | epoch: 1 | total time: 57.25m | eta: 122.5m +step 05327/16704 (31.89%) | loss: 2.774662 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,730 | mfu: 51.05 | epoch: 1 | total time: 57.26m | eta: 122.5m +step 05328/16704 (31.90%) | loss: 2.761910 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,857 | mfu: 50.93 | epoch: 1 | total time: 57.27m | eta: 122.5m +step 05329/16704 (31.90%) | loss: 2.762742 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,819 | mfu: 50.86 | epoch: 1 | total time: 57.28m | eta: 122.5m +step 05330/16704 (31.91%) | loss: 2.762321 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,803 | mfu: 50.80 | epoch: 1 | total time: 57.29m | eta: 122.5m +step 05331/16704 (31.91%) | loss: 2.750008 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 1 | total time: 57.30m | eta: 122.5m +step 05332/16704 (31.92%) | loss: 2.744744 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 1 | total time: 57.31m | eta: 122.5m +step 05333/16704 (31.93%) | loss: 2.756237 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,411 | mfu: 50.90 | epoch: 1 | total time: 57.32m | eta: 122.5m +step 05334/16704 (31.93%) | loss: 2.753648 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,464 | mfu: 50.84 | epoch: 1 | total time: 57.34m | eta: 122.4m +step 05335/16704 (31.94%) | loss: 2.765130 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,607 | mfu: 50.79 | epoch: 1 | total time: 57.35m | eta: 122.4m +step 05336/16704 (31.94%) | loss: 2.748594 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,911 | mfu: 51.00 | epoch: 1 | total time: 57.36m | eta: 122.4m +step 05337/16704 (31.95%) | loss: 2.756233 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,421 | mfu: 
50.71 | epoch: 1 | total time: 57.37m | eta: 122.4m +step 05338/16704 (31.96%) | loss: 2.762791 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,158 | mfu: 50.95 | epoch: 1 | total time: 57.38m | eta: 122.4m +step 05339/16704 (31.96%) | loss: 2.770277 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 1 | total time: 57.39m | eta: 122.4m +step 05340/16704 (31.97%) | loss: 2.775182 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.85 | epoch: 1 | total time: 57.40m | eta: 122.4m +step 05341/16704 (31.97%) | loss: 2.773096 | lrm: 1.00 | dt: 641.51ms | tok/sec: 817,272 | mfu: 51.08 | epoch: 1 | total time: 57.41m | eta: 122.4m +step 05342/16704 (31.98%) | loss: 2.769826 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,206 | mfu: 50.89 | epoch: 1 | total time: 57.42m | eta: 122.4m +step 05343/16704 (31.99%) | loss: 2.779385 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,364 | mfu: 50.77 | epoch: 1 | total time: 57.43m | eta: 122.3m +step 05344/16704 (31.99%) | loss: 2.772815 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,320 | mfu: 50.90 | epoch: 1 | total time: 57.44m | eta: 122.3m +step 05345/16704 (32.00%) | loss: 2.776499 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,022 | mfu: 50.82 | epoch: 1 | total time: 57.45m | eta: 122.3m +step 05346/16704 (32.00%) | loss: 2.782999 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,539 | mfu: 50.97 | epoch: 1 | total time: 57.46m | eta: 122.3m +step 05347/16704 (32.01%) | loss: 2.784679 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,466 | mfu: 50.91 | epoch: 1 | total time: 57.48m | eta: 122.3m +step 05348/16704 (32.02%) | loss: 2.786049 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 1 | total time: 57.49m | eta: 122.3m +step 05349/16704 (32.02%) | loss: 2.784366 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 1 | total time: 57.50m | eta: 122.3m +step 05350/16704 (32.03%) | loss: 2.773344 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,627 | mfu: 50.73 | epoch: 1 | total time: 57.51m | eta: 122.3m +step 
05351/16704 (32.03%) | loss: 2.778881 | lrm: 1.00 | dt: 642.19ms | tok/sec: 816,406 | mfu: 51.03 | epoch: 1 | total time: 57.52m | eta: 122.3m +step 05352/16704 (32.04%) | loss: 2.769896 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,772 | mfu: 50.99 | epoch: 1 | total time: 57.53m | eta: 122.3m +step 05353/16704 (32.05%) | loss: 2.775896 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,477 | mfu: 50.78 | epoch: 1 | total time: 57.54m | eta: 122.2m +step 05354/16704 (32.05%) | loss: 2.783623 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,487 | mfu: 50.84 | epoch: 1 | total time: 57.55m | eta: 122.2m +step 05355/16704 (32.06%) | loss: 2.785056 | lrm: 1.00 | dt: 649.73ms | tok/sec: 806,927 | mfu: 50.43 | epoch: 1 | total time: 57.56m | eta: 122.2m +step 05356/16704 (32.06%) | loss: 2.785697 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,953 | mfu: 51.00 | epoch: 1 | total time: 57.57m | eta: 122.2m +step 05357/16704 (32.07%) | loss: 2.778694 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,773 | mfu: 50.74 | epoch: 1 | total time: 57.58m | eta: 122.2m +step 05358/16704 (32.08%) | loss: 2.785287 | lrm: 1.00 | dt: 642.74ms | tok/sec: 815,703 | mfu: 50.98 | epoch: 1 | total time: 57.59m | eta: 122.2m +step 05359/16704 (32.08%) | loss: 2.777754 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 1 | total time: 57.60m | eta: 122.2m +step 05360/16704 (32.09%) | loss: 2.782117 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,665 | mfu: 50.79 | epoch: 1 | total time: 57.61m | eta: 122.2m +step 05361/16704 (32.09%) | loss: 2.790976 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,877 | mfu: 50.87 | epoch: 1 | total time: 57.63m | eta: 122.2m +step 05362/16704 (32.10%) | loss: 2.777278 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 1 | total time: 57.64m | eta: 122.1m +step 05363/16704 (32.11%) | loss: 2.767190 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,615 | mfu: 51.04 | epoch: 1 | total time: 57.65m | eta: 122.1m +step 05364/16704 (32.11%) | loss: 2.769944 | lrm: 1.00 | dt: 
644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 57.66m | eta: 122.1m +step 05365/16704 (32.12%) | loss: 2.748142 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,766 | mfu: 50.80 | epoch: 1 | total time: 57.67m | eta: 122.1m +step 05366/16704 (32.12%) | loss: 2.742486 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,801 | mfu: 50.86 | epoch: 1 | total time: 57.68m | eta: 122.1m +step 05367/16704 (32.13%) | loss: 2.749990 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,741 | mfu: 50.80 | epoch: 1 | total time: 57.69m | eta: 122.1m +step 05368/16704 (32.14%) | loss: 2.756615 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,667 | mfu: 50.73 | epoch: 1 | total time: 57.70m | eta: 122.1m +step 05369/16704 (32.14%) | loss: 2.763335 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,040 | mfu: 50.94 | epoch: 1 | total time: 57.71m | eta: 122.1m +step 05370/16704 (32.15%) | loss: 2.757870 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,070 | mfu: 50.88 | epoch: 1 | total time: 57.72m | eta: 122.1m +step 05371/16704 (32.15%) | loss: 2.763233 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,511 | mfu: 50.66 | epoch: 1 | total time: 57.73m | eta: 122.0m +step 05372/16704 (32.16%) | loss: 2.755555 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 1 | total time: 57.74m | eta: 122.0m +step 05373/16704 (32.17%) | loss: 2.731169 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 1 | total time: 57.75m | eta: 122.0m +step 05374/16704 (32.17%) | loss: 2.728333 | lrm: 1.00 | dt: 642.52ms | tok/sec: 815,993 | mfu: 51.00 | epoch: 1 | total time: 57.77m | eta: 122.0m +step 05375/16704 (32.18%) | loss: 2.730304 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,304 | mfu: 50.71 | epoch: 1 | total time: 57.78m | eta: 122.0m +step 05376/16704 (32.18%) | loss: 2.736909 | lrm: 1.00 | dt: 642.72ms | tok/sec: 815,732 | mfu: 50.98 | epoch: 1 | total time: 57.79m | eta: 122.0m +step 05377/16704 (32.19%) | loss: 2.747268 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,613 | mfu: 50.98 | epoch: 1 | total 
time: 57.80m | eta: 122.0m +step 05378/16704 (32.20%) | loss: 2.753242 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,094 | mfu: 50.94 | epoch: 1 | total time: 57.81m | eta: 122.0m +step 05379/16704 (32.20%) | loss: 2.758031 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,230 | mfu: 50.83 | epoch: 1 | total time: 57.82m | eta: 122.0m +step 05380/16704 (32.21%) | loss: 2.761036 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,788 | mfu: 50.93 | epoch: 1 | total time: 57.83m | eta: 121.9m +step 05381/16704 (32.21%) | loss: 2.752493 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,406 | mfu: 50.65 | epoch: 1 | total time: 57.84m | eta: 121.9m +step 05382/16704 (32.22%) | loss: 2.765296 | lrm: 1.00 | dt: 642.33ms | tok/sec: 816,228 | mfu: 51.02 | epoch: 1 | total time: 57.85m | eta: 121.9m +step 05383/16704 (32.23%) | loss: 2.774870 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,067 | mfu: 50.88 | epoch: 1 | total time: 57.86m | eta: 121.9m +step 05384/16704 (32.23%) | loss: 2.772460 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,331 | mfu: 50.77 | epoch: 1 | total time: 57.87m | eta: 121.9m +step 05385/16704 (32.24%) | loss: 2.761853 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,164 | mfu: 50.89 | epoch: 1 | total time: 57.88m | eta: 121.9m +step 05386/16704 (32.24%) | loss: 2.761457 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,499 | mfu: 50.66 | epoch: 1 | total time: 57.89m | eta: 121.9m +step 05387/16704 (32.25%) | loss: 2.753352 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,236 | mfu: 51.02 | epoch: 1 | total time: 57.90m | eta: 121.9m +step 05388/16704 (32.26%) | loss: 2.751801 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,447 | mfu: 50.90 | epoch: 1 | total time: 57.92m | eta: 121.9m +step 05389/16704 (32.26%) | loss: 2.745787 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,132 | mfu: 50.82 | epoch: 1 | total time: 57.93m | eta: 121.9m +step 05390/16704 (32.27%) | loss: 2.757621 | lrm: 1.00 | dt: 641.19ms | tok/sec: 817,681 | mfu: 51.11 | epoch: 1 | total time: 57.94m | eta: 121.8m +step 05391/16704 (32.27%) | loss: 
2.750098 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,794 | mfu: 50.80 | epoch: 1 | total time: 57.95m | eta: 121.8m +step 05392/16704 (32.28%) | loss: 2.758432 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,976 | mfu: 50.87 | epoch: 1 | total time: 57.96m | eta: 121.8m +step 05393/16704 (32.29%) | loss: 2.759766 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,836 | mfu: 50.87 | epoch: 1 | total time: 57.97m | eta: 121.8m +step 05394/16704 (32.29%) | loss: 2.754114 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,751 | mfu: 50.80 | epoch: 1 | total time: 57.98m | eta: 121.8m +step 05395/16704 (32.30%) | loss: 2.767744 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,593 | mfu: 50.91 | epoch: 1 | total time: 57.99m | eta: 121.8m +step 05396/16704 (32.30%) | loss: 2.766314 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,878 | mfu: 50.81 | epoch: 1 | total time: 58.00m | eta: 121.8m +step 05397/16704 (32.31%) | loss: 2.762954 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,281 | mfu: 50.89 | epoch: 1 | total time: 58.01m | eta: 121.8m +step 05398/16704 (32.32%) | loss: 2.765157 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 1 | total time: 58.02m | eta: 121.8m +step 05399/16704 (32.32%) | loss: 2.774969 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,974 | mfu: 50.75 | epoch: 1 | total time: 58.03m | eta: 121.7m +step 05400/16704 (32.33%) | loss: 2.766119 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,726 | mfu: 50.98 | epoch: 1 | total time: 58.04m | eta: 121.7m +step 05401/16704 (32.33%) | loss: 2.763078 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 1 | total time: 58.05m | eta: 121.7m +step 05402/16704 (32.34%) | loss: 2.770228 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 1 | total time: 58.07m | eta: 121.7m +step 05403/16704 (32.35%) | loss: 2.770341 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 1 | total time: 58.08m | eta: 121.7m +step 05404/16704 (32.35%) | loss: 2.764489 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,146 | mfu: 
50.95 | epoch: 1 | total time: 58.09m | eta: 121.7m +step 05405/16704 (32.36%) | loss: 2.767425 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,554 | mfu: 50.85 | epoch: 1 | total time: 58.10m | eta: 121.7m +step 05406/16704 (32.36%) | loss: 2.776395 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,299 | mfu: 50.89 | epoch: 1 | total time: 58.11m | eta: 121.7m +step 05407/16704 (32.37%) | loss: 2.774027 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,380 | mfu: 50.90 | epoch: 1 | total time: 58.12m | eta: 121.7m +step 05408/16704 (32.38%) | loss: 2.772210 | lrm: 1.00 | dt: 647.82ms | tok/sec: 809,306 | mfu: 50.58 | epoch: 1 | total time: 58.13m | eta: 121.6m +step 05409/16704 (32.38%) | loss: 2.779055 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,320 | mfu: 50.90 | epoch: 1 | total time: 58.14m | eta: 121.6m +step 05410/16704 (32.39%) | loss: 2.776430 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,896 | mfu: 50.93 | epoch: 1 | total time: 58.15m | eta: 121.6m +step 05411/16704 (32.39%) | loss: 2.773814 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,664 | mfu: 50.73 | epoch: 1 | total time: 58.16m | eta: 121.6m +step 05412/16704 (32.40%) | loss: 2.769654 | lrm: 1.00 | dt: 641.90ms | tok/sec: 816,773 | mfu: 51.05 | epoch: 1 | total time: 58.17m | eta: 121.6m +step 05413/16704 (32.41%) | loss: 2.749274 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 1 | total time: 58.18m | eta: 121.6m +step 05414/16704 (32.41%) | loss: 2.763866 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,840 | mfu: 50.80 | epoch: 1 | total time: 58.19m | eta: 121.6m +step 05415/16704 (32.42%) | loss: 2.755708 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,314 | mfu: 50.83 | epoch: 1 | total time: 58.21m | eta: 121.6m +step 05416/16704 (32.42%) | loss: 2.753550 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 1 | total time: 58.22m | eta: 121.6m +step 05417/16704 (32.43%) | loss: 2.765174 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 1 | total time: 58.23m | eta: 121.5m +step 
05418/16704 (32.44%) | loss: 2.764713 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,449 | mfu: 50.78 | epoch: 1 | total time: 58.24m | eta: 121.5m +step 05419/16704 (32.44%) | loss: 2.773516 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,503 | mfu: 50.85 | epoch: 1 | total time: 58.25m | eta: 121.5m +step 05420/16704 (32.45%) | loss: 2.772408 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,546 | mfu: 50.85 | epoch: 1 | total time: 58.26m | eta: 121.5m +step 05421/16704 (32.45%) | loss: 2.760588 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,365 | mfu: 50.90 | epoch: 1 | total time: 58.27m | eta: 121.5m +step 05422/16704 (32.46%) | loss: 2.758773 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,987 | mfu: 50.94 | epoch: 1 | total time: 58.28m | eta: 121.5m +step 05423/16704 (32.47%) | loss: 2.758406 | lrm: 1.00 | dt: 647.36ms | tok/sec: 809,881 | mfu: 50.62 | epoch: 1 | total time: 58.29m | eta: 121.5m +step 05424/16704 (32.47%) | loss: 2.756194 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,823 | mfu: 50.93 | epoch: 1 | total time: 58.30m | eta: 121.5m +step 05425/16704 (32.48%) | loss: 2.758116 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,936 | mfu: 50.87 | epoch: 1 | total time: 58.31m | eta: 121.5m +step 05426/16704 (32.48%) | loss: 2.765252 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,933 | mfu: 50.87 | epoch: 1 | total time: 58.32m | eta: 121.4m +step 05427/16704 (32.49%) | loss: 2.765943 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,823 | mfu: 50.80 | epoch: 1 | total time: 58.33m | eta: 121.4m +step 05428/16704 (32.50%) | loss: 2.753980 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 1 | total time: 58.35m | eta: 121.4m +step 05429/16704 (32.50%) | loss: 2.753257 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,038 | mfu: 50.69 | epoch: 1 | total time: 58.36m | eta: 121.4m +step 05430/16704 (32.51%) | loss: 2.756866 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,628 | mfu: 50.92 | epoch: 1 | total time: 58.37m | eta: 121.4m +step 05431/16704 (32.51%) | loss: 2.749418 | lrm: 1.00 | dt: 
644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 1 | total time: 58.38m | eta: 121.4m +step 05432/16704 (32.52%) | loss: 2.741763 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,896 | mfu: 50.87 | epoch: 1 | total time: 58.39m | eta: 121.4m +step 05433/16704 (32.53%) | loss: 2.737766 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,445 | mfu: 50.78 | epoch: 1 | total time: 58.40m | eta: 121.4m +step 05434/16704 (32.53%) | loss: 2.740483 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,275 | mfu: 50.83 | epoch: 1 | total time: 58.41m | eta: 121.4m +step 05435/16704 (32.54%) | loss: 2.730114 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,099 | mfu: 50.88 | epoch: 1 | total time: 58.42m | eta: 121.4m +step 05436/16704 (32.54%) | loss: 2.730518 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 1 | total time: 58.43m | eta: 121.3m +step 05437/16704 (32.55%) | loss: 2.732698 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,868 | mfu: 50.68 | epoch: 1 | total time: 58.44m | eta: 121.3m +step 05438/16704 (32.56%) | loss: 2.743533 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,735 | mfu: 50.92 | epoch: 1 | total time: 58.45m | eta: 121.3m +step 05439/16704 (32.56%) | loss: 2.759105 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,041 | mfu: 50.63 | epoch: 1 | total time: 58.46m | eta: 121.3m +step 05440/16704 (32.57%) | loss: 2.759400 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,849 | mfu: 50.74 | epoch: 1 | total time: 58.47m | eta: 121.3m +step 05441/16704 (32.57%) | loss: 2.752954 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,058 | mfu: 50.94 | epoch: 1 | total time: 58.48m | eta: 121.3m +step 05442/16704 (32.58%) | loss: 2.745477 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,584 | mfu: 50.91 | epoch: 1 | total time: 58.50m | eta: 121.3m +step 05443/16704 (32.59%) | loss: 2.752813 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 1 | total time: 58.51m | eta: 121.3m +step 05444/16704 (32.59%) | loss: 2.759511 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,968 | mfu: 50.87 | epoch: 1 | total 
time: 58.52m | eta: 121.3m +step 05445/16704 (32.60%) | loss: 2.759334 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,520 | mfu: 50.66 | epoch: 1 | total time: 58.53m | eta: 121.2m +step 05446/16704 (32.60%) | loss: 2.753527 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,399 | mfu: 50.71 | epoch: 1 | total time: 58.54m | eta: 121.2m +step 05447/16704 (32.61%) | loss: 2.764367 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,540 | mfu: 50.85 | epoch: 1 | total time: 58.55m | eta: 121.2m +step 05448/16704 (32.61%) | loss: 2.749373 | lrm: 1.00 | dt: 651.95ms | tok/sec: 804,181 | mfu: 50.26 | epoch: 1 | total time: 58.56m | eta: 121.2m +step 05449/16704 (32.62%) | loss: 2.752233 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,665 | mfu: 50.79 | epoch: 1 | total time: 58.57m | eta: 121.2m +step 05450/16704 (32.63%) | loss: 2.759370 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,846 | mfu: 50.99 | epoch: 1 | total time: 58.58m | eta: 121.2m +step 05451/16704 (32.63%) | loss: 2.757368 | lrm: 1.00 | dt: 648.34ms | tok/sec: 808,660 | mfu: 50.54 | epoch: 1 | total time: 58.59m | eta: 121.2m +step 05452/16704 (32.64%) | loss: 2.760823 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,924 | mfu: 50.87 | epoch: 1 | total time: 58.60m | eta: 121.2m +step 05453/16704 (32.64%) | loss: 2.762037 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,827 | mfu: 50.93 | epoch: 1 | total time: 58.61m | eta: 121.2m +step 05454/16704 (32.65%) | loss: 2.748699 | lrm: 1.00 | dt: 642.04ms | tok/sec: 816,601 | mfu: 51.04 | epoch: 1 | total time: 58.62m | eta: 121.1m +step 05455/16704 (32.66%) | loss: 2.736171 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,185 | mfu: 50.89 | epoch: 1 | total time: 58.64m | eta: 121.1m +step 05456/16704 (32.66%) | loss: 2.741907 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,051 | mfu: 50.69 | epoch: 1 | total time: 58.65m | eta: 121.1m +step 05457/16704 (32.67%) | loss: 2.747512 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,152 | mfu: 50.95 | epoch: 1 | total time: 58.66m | eta: 121.1m +step 05458/16704 (32.67%) | loss: 
2.756772 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 1 | total time: 58.67m | eta: 121.1m +step 05459/16704 (32.68%) | loss: 2.770028 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,352 | mfu: 50.96 | epoch: 1 | total time: 58.68m | eta: 121.1m +step 05460/16704 (32.69%) | loss: 2.765046 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,637 | mfu: 50.85 | epoch: 1 | total time: 58.69m | eta: 121.1m +step 05461/16704 (32.69%) | loss: 2.761087 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,262 | mfu: 50.89 | epoch: 1 | total time: 58.70m | eta: 121.1m +step 05462/16704 (32.70%) | loss: 2.754525 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,597 | mfu: 50.66 | epoch: 1 | total time: 58.71m | eta: 121.1m +step 05463/16704 (32.70%) | loss: 2.768266 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,018 | mfu: 51.00 | epoch: 1 | total time: 58.72m | eta: 121.0m +step 05464/16704 (32.71%) | loss: 2.769269 | lrm: 1.00 | dt: 644.09ms | tok/sec: 814,002 | mfu: 50.88 | epoch: 1 | total time: 58.73m | eta: 121.0m +step 05465/16704 (32.72%) | loss: 2.767440 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 1 | total time: 58.74m | eta: 121.0m +step 05466/16704 (32.72%) | loss: 2.773411 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,072 | mfu: 51.01 | epoch: 1 | total time: 58.75m | eta: 121.0m +step 05467/16704 (32.73%) | loss: 2.768603 | lrm: 1.00 | dt: 647.59ms | tok/sec: 809,598 | mfu: 50.60 | epoch: 1 | total time: 58.76m | eta: 121.0m +step 05468/16704 (32.73%) | loss: 2.752091 | lrm: 1.00 | dt: 641.77ms | tok/sec: 816,944 | mfu: 51.06 | epoch: 1 | total time: 58.77m | eta: 121.0m +step 05469/16704 (32.74%) | loss: 2.757056 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,474 | mfu: 50.84 | epoch: 1 | total time: 58.79m | eta: 121.0m +step 05470/16704 (32.75%) | loss: 2.767753 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,104 | mfu: 50.88 | epoch: 1 | total time: 58.80m | eta: 121.0m +step 05471/16704 (32.75%) | loss: 2.762936 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,214 | mfu: 
50.76 | epoch: 1 | total time: 58.81m | eta: 121.0m +step 05472/16704 (32.76%) | loss: 2.749562 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,279 | mfu: 50.89 | epoch: 1 | total time: 58.82m | eta: 121.0m +step 05473/16704 (32.76%) | loss: 2.745247 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 1 | total time: 58.83m | eta: 120.9m +step 05474/16704 (32.77%) | loss: 2.740410 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,179 | mfu: 50.89 | epoch: 1 | total time: 58.84m | eta: 120.9m +step 05475/16704 (32.78%) | loss: 2.738787 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,493 | mfu: 50.97 | epoch: 1 | total time: 58.85m | eta: 120.9m +step 05476/16704 (32.78%) | loss: 2.739614 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 1 | total time: 58.86m | eta: 120.9m +step 05477/16704 (32.79%) | loss: 2.753983 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,918 | mfu: 50.87 | epoch: 1 | total time: 58.87m | eta: 120.9m +step 05478/16704 (32.79%) | loss: 2.750278 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,750 | mfu: 50.92 | epoch: 1 | total time: 58.88m | eta: 120.9m +step 05479/16704 (32.80%) | loss: 2.741659 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,586 | mfu: 50.85 | epoch: 1 | total time: 58.89m | eta: 120.9m +step 05480/16704 (32.81%) | loss: 2.731406 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,553 | mfu: 50.72 | epoch: 1 | total time: 58.90m | eta: 120.9m +step 05481/16704 (32.81%) | loss: 2.735507 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,075 | mfu: 51.01 | epoch: 1 | total time: 58.91m | eta: 120.9m +step 05482/16704 (32.82%) | loss: 2.734406 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,805 | mfu: 50.86 | epoch: 1 | total time: 58.93m | eta: 120.8m +step 05483/16704 (32.82%) | loss: 2.738850 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,354 | mfu: 50.90 | epoch: 1 | total time: 58.94m | eta: 120.8m +step 05484/16704 (32.83%) | loss: 2.743920 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,665 | mfu: 51.04 | epoch: 1 | total time: 58.95m | eta: 120.8m +step 
05485/16704 (32.84%) | loss: 2.742900 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,184 | mfu: 50.70 | epoch: 1 | total time: 58.96m | eta: 120.8m +step 05486/16704 (32.84%) | loss: 2.750081 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,607 | mfu: 50.85 | epoch: 1 | total time: 58.97m | eta: 120.8m +step 05487/16704 (32.85%) | loss: 2.748592 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,631 | mfu: 50.79 | epoch: 1 | total time: 58.98m | eta: 120.8m +step 05488/16704 (32.85%) | loss: 2.751369 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,733 | mfu: 50.80 | epoch: 1 | total time: 58.99m | eta: 120.8m +step 05489/16704 (32.86%) | loss: 2.750635 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,232 | mfu: 50.89 | epoch: 1 | total time: 59.00m | eta: 120.8m +step 05490/16704 (32.87%) | loss: 2.742701 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,016 | mfu: 50.75 | epoch: 1 | total time: 59.01m | eta: 120.8m +step 05491/16704 (32.87%) | loss: 2.750516 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,077 | mfu: 50.94 | epoch: 1 | total time: 59.02m | eta: 120.7m +step 05492/16704 (32.88%) | loss: 2.750285 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,809 | mfu: 50.80 | epoch: 1 | total time: 59.03m | eta: 120.7m +step 05493/16704 (32.88%) | loss: 2.757190 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 1 | total time: 59.04m | eta: 120.7m +step 05494/16704 (32.89%) | loss: 2.769664 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,291 | mfu: 50.58 | epoch: 1 | total time: 59.05m | eta: 120.7m +step 05495/16704 (32.90%) | loss: 2.768307 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,621 | mfu: 50.98 | epoch: 1 | total time: 59.06m | eta: 120.7m +step 05496/16704 (32.90%) | loss: 2.761443 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,724 | mfu: 50.80 | epoch: 1 | total time: 59.08m | eta: 120.7m +step 05497/16704 (32.91%) | loss: 2.758406 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,915 | mfu: 50.81 | epoch: 1 | total time: 59.09m | eta: 120.7m +step 05498/16704 (32.91%) | loss: 2.753502 | lrm: 1.00 | dt: 
640.41ms | tok/sec: 818,673 | mfu: 51.17 | epoch: 1 | total time: 59.10m | eta: 120.7m +step 05499/16704 (32.92%) | loss: 2.741614 | lrm: 1.00 | dt: 649.91ms | tok/sec: 806,712 | mfu: 50.42 | epoch: 1 | total time: 59.11m | eta: 120.7m +Step 05500 | Validation bpb: 0.837162 +step 05500/16704 (32.93%) | loss: 2.755563 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,575 | mfu: 50.97 | epoch: 1 | total time: 59.12m | eta: 120.6m +step 05501/16704 (32.93%) | loss: 2.748626 | lrm: 1.00 | dt: 650.62ms | tok/sec: 805,825 | mfu: 50.37 | epoch: 1 | total time: 59.13m | eta: 120.6m +step 05502/16704 (32.94%) | loss: 2.763015 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 1 | total time: 59.14m | eta: 120.6m +step 05503/16704 (32.94%) | loss: 2.762390 | lrm: 1.00 | dt: 640.64ms | tok/sec: 818,386 | mfu: 51.15 | epoch: 1 | total time: 59.15m | eta: 120.6m +step 05504/16704 (32.95%) | loss: 2.764307 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,950 | mfu: 50.62 | epoch: 1 | total time: 59.16m | eta: 120.6m +step 05505/16704 (32.96%) | loss: 2.755501 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 1 | total time: 59.17m | eta: 120.6m +step 05506/16704 (32.96%) | loss: 2.772455 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,817 | mfu: 50.74 | epoch: 1 | total time: 59.18m | eta: 120.6m +step 05507/16704 (32.97%) | loss: 2.755386 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,844 | mfu: 50.87 | epoch: 1 | total time: 59.19m | eta: 120.6m +step 05508/16704 (32.97%) | loss: 2.755654 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,410 | mfu: 50.84 | epoch: 1 | total time: 59.20m | eta: 120.6m +step 05509/16704 (32.98%) | loss: 2.755994 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 1 | total time: 59.22m | eta: 120.6m +step 05510/16704 (32.99%) | loss: 2.762520 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 1 | total time: 59.23m | eta: 120.5m +step 05511/16704 (32.99%) | loss: 2.769296 | lrm: 1.00 | dt: 645.43ms | tok/sec: 
812,310 | mfu: 50.77 | epoch: 1 | total time: 59.24m | eta: 120.5m +step 05512/16704 (33.00%) | loss: 2.759925 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,871 | mfu: 50.93 | epoch: 1 | total time: 59.25m | eta: 120.5m +step 05513/16704 (33.00%) | loss: 2.763882 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,480 | mfu: 50.91 | epoch: 1 | total time: 59.26m | eta: 120.5m +step 05514/16704 (33.01%) | loss: 2.765738 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,049 | mfu: 50.82 | epoch: 1 | total time: 59.27m | eta: 120.5m +step 05515/16704 (33.02%) | loss: 2.765096 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,774 | mfu: 50.67 | epoch: 1 | total time: 59.28m | eta: 120.5m +step 05516/16704 (33.02%) | loss: 2.768042 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,425 | mfu: 50.97 | epoch: 1 | total time: 59.29m | eta: 120.5m +step 05517/16704 (33.03%) | loss: 2.773555 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,751 | mfu: 50.86 | epoch: 1 | total time: 59.30m | eta: 120.5m +step 05518/16704 (33.03%) | loss: 2.762140 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,958 | mfu: 50.87 | epoch: 1 | total time: 59.31m | eta: 120.5m +step 05519/16704 (33.04%) | loss: 2.770551 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,163 | mfu: 50.76 | epoch: 1 | total time: 59.32m | eta: 120.4m +step 05520/16704 (33.05%) | loss: 2.776016 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,327 | mfu: 50.96 | epoch: 1 | total time: 59.33m | eta: 120.4m +step 05521/16704 (33.05%) | loss: 2.768733 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,251 | mfu: 50.83 | epoch: 1 | total time: 59.34m | eta: 120.4m +step 05522/16704 (33.06%) | loss: 2.772333 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,612 | mfu: 50.91 | epoch: 1 | total time: 59.35m | eta: 120.4m +step 05523/16704 (33.06%) | loss: 2.776877 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,472 | mfu: 50.91 | epoch: 1 | total time: 59.37m | eta: 120.4m +step 05524/16704 (33.07%) | loss: 2.771378 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,040 | mfu: 50.94 | epoch: 1 | total time: 59.38m | eta: 
120.4m +step 05525/16704 (33.08%) | loss: 2.780871 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,875 | mfu: 50.74 | epoch: 1 | total time: 59.39m | eta: 120.4m +step 05526/16704 (33.08%) | loss: 2.776440 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,977 | mfu: 50.75 | epoch: 1 | total time: 59.40m | eta: 120.4m +step 05527/16704 (33.09%) | loss: 2.782715 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,104 | mfu: 50.88 | epoch: 1 | total time: 59.41m | eta: 120.4m +step 05528/16704 (33.09%) | loss: 2.782456 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,734 | mfu: 50.80 | epoch: 1 | total time: 59.42m | eta: 120.3m +step 05529/16704 (33.10%) | loss: 2.776371 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,450 | mfu: 50.90 | epoch: 1 | total time: 59.43m | eta: 120.3m +step 05530/16704 (33.11%) | loss: 2.783366 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,360 | mfu: 50.71 | epoch: 1 | total time: 59.44m | eta: 120.3m +step 05531/16704 (33.11%) | loss: 2.774925 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,018 | mfu: 50.69 | epoch: 1 | total time: 59.45m | eta: 120.3m +step 05532/16704 (33.12%) | loss: 2.784863 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,304 | mfu: 50.83 | epoch: 1 | total time: 59.46m | eta: 120.3m +step 05533/16704 (33.12%) | loss: 2.782597 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,717 | mfu: 50.98 | epoch: 1 | total time: 59.47m | eta: 120.3m +step 05534/16704 (33.13%) | loss: 2.783646 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,413 | mfu: 50.90 | epoch: 1 | total time: 59.48m | eta: 120.3m +step 05535/16704 (33.14%) | loss: 2.762302 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,838 | mfu: 50.80 | epoch: 1 | total time: 59.49m | eta: 120.3m +step 05536/16704 (33.14%) | loss: 2.758831 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,991 | mfu: 50.81 | epoch: 1 | total time: 59.51m | eta: 120.3m +step 05537/16704 (33.15%) | loss: 2.756198 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 1 | total time: 59.52m | eta: 120.2m +step 05538/16704 (33.15%) | loss: 2.761603 | lrm: 1.00 
| dt: 644.33ms | tok/sec: 813,694 | mfu: 50.86 | epoch: 1 | total time: 59.53m | eta: 120.2m +step 05539/16704 (33.16%) | loss: 2.765764 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 1 | total time: 59.54m | eta: 120.2m +step 05540/16704 (33.17%) | loss: 2.773604 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,433 | mfu: 50.90 | epoch: 1 | total time: 59.55m | eta: 120.2m +step 05541/16704 (33.17%) | loss: 2.785618 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,430 | mfu: 50.90 | epoch: 1 | total time: 59.56m | eta: 120.2m +step 05542/16704 (33.18%) | loss: 2.775824 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,162 | mfu: 50.89 | epoch: 1 | total time: 59.57m | eta: 120.2m +step 05543/16704 (33.18%) | loss: 2.759737 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,414 | mfu: 50.90 | epoch: 1 | total time: 59.58m | eta: 120.2m +step 05544/16704 (33.19%) | loss: 2.758541 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,132 | mfu: 50.88 | epoch: 1 | total time: 59.59m | eta: 120.2m +step 05545/16704 (33.20%) | loss: 2.765736 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,751 | mfu: 50.92 | epoch: 1 | total time: 59.60m | eta: 120.2m +step 05546/16704 (33.20%) | loss: 2.757786 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,062 | mfu: 50.82 | epoch: 1 | total time: 59.61m | eta: 120.2m +step 05547/16704 (33.21%) | loss: 2.763633 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,542 | mfu: 50.91 | epoch: 1 | total time: 59.62m | eta: 120.1m +step 05548/16704 (33.21%) | loss: 2.760251 | lrm: 1.00 | dt: 648.36ms | tok/sec: 808,643 | mfu: 50.54 | epoch: 1 | total time: 59.63m | eta: 120.1m +step 05549/16704 (33.22%) | loss: 2.767348 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,378 | mfu: 51.02 | epoch: 1 | total time: 59.64m | eta: 120.1m +step 05550/16704 (33.23%) | loss: 2.761446 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,791 | mfu: 50.74 | epoch: 1 | total time: 59.66m | eta: 120.1m +step 05551/16704 (33.23%) | loss: 2.773448 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | 
total time: 59.67m | eta: 120.1m +step 05552/16704 (33.24%) | loss: 2.772447 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,683 | mfu: 50.86 | epoch: 1 | total time: 59.68m | eta: 120.1m +step 05553/16704 (33.24%) | loss: 2.778471 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,559 | mfu: 50.85 | epoch: 1 | total time: 59.69m | eta: 120.1m +step 05554/16704 (33.25%) | loss: 2.773678 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,292 | mfu: 50.64 | epoch: 1 | total time: 59.70m | eta: 120.1m +step 05555/16704 (33.26%) | loss: 2.767833 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,363 | mfu: 50.84 | epoch: 1 | total time: 59.71m | eta: 120.1m +step 05556/16704 (33.26%) | loss: 2.763681 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 1 | total time: 59.72m | eta: 120.0m +step 05557/16704 (33.27%) | loss: 2.768085 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,967 | mfu: 50.75 | epoch: 1 | total time: 59.73m | eta: 120.0m +step 05558/16704 (33.27%) | loss: 2.767832 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,263 | mfu: 50.89 | epoch: 1 | total time: 59.74m | eta: 120.0m +step 05559/16704 (33.28%) | loss: 2.766968 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,691 | mfu: 50.86 | epoch: 1 | total time: 59.75m | eta: 120.0m +step 05560/16704 (33.29%) | loss: 2.766165 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,277 | mfu: 50.77 | epoch: 1 | total time: 59.76m | eta: 120.0m +step 05561/16704 (33.29%) | loss: 2.759675 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,555 | mfu: 50.91 | epoch: 1 | total time: 59.77m | eta: 120.0m +step 05562/16704 (33.30%) | loss: 2.759726 | lrm: 1.00 | dt: 646.88ms | tok/sec: 810,484 | mfu: 50.66 | epoch: 1 | total time: 59.78m | eta: 120.0m +step 05563/16704 (33.30%) | loss: 2.762560 | lrm: 1.00 | dt: 647.67ms | tok/sec: 809,499 | mfu: 50.59 | epoch: 1 | total time: 59.80m | eta: 120.0m +step 05564/16704 (33.31%) | loss: 2.754500 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,095 | mfu: 50.94 | epoch: 1 | total time: 59.81m | eta: 120.0m +step 05565/16704 (33.32%) | 
loss: 2.761355 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,501 | mfu: 50.91 | epoch: 1 | total time: 59.82m | eta: 119.9m +step 05566/16704 (33.32%) | loss: 2.753735 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,108 | mfu: 50.76 | epoch: 1 | total time: 59.83m | eta: 119.9m +step 05567/16704 (33.33%) | loss: 2.770635 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,318 | mfu: 50.96 | epoch: 1 | total time: 59.84m | eta: 119.9m +step 05568/16704 (33.33%) | loss: 2.755982 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 1 | total time: 59.85m | eta: 119.9m +step 05569/16704 (33.34%) | loss: 2.756954 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,454 | mfu: 50.90 | epoch: 1 | total time: 59.86m | eta: 119.9m +step 05570/16704 (33.35%) | loss: 2.749165 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,904 | mfu: 50.93 | epoch: 1 | total time: 59.87m | eta: 119.9m +step 05571/16704 (33.35%) | loss: 2.749623 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,983 | mfu: 50.81 | epoch: 1 | total time: 59.88m | eta: 119.9m +step 05572/16704 (33.36%) | loss: 2.754950 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,880 | mfu: 50.99 | epoch: 1 | total time: 59.89m | eta: 119.9m +step 05573/16704 (33.36%) | loss: 2.760008 | lrm: 1.00 | dt: 646.19ms | tok/sec: 811,356 | mfu: 50.71 | epoch: 1 | total time: 59.90m | eta: 119.9m +step 05574/16704 (33.37%) | loss: 2.763191 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,853 | mfu: 50.87 | epoch: 1 | total time: 59.91m | eta: 119.8m +step 05575/16704 (33.38%) | loss: 2.771995 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,790 | mfu: 50.80 | epoch: 1 | total time: 59.92m | eta: 119.8m +step 05576/16704 (33.38%) | loss: 2.765604 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 1 | total time: 59.93m | eta: 119.8m +step 05577/16704 (33.39%) | loss: 2.768988 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,203 | mfu: 50.95 | epoch: 1 | total time: 59.95m | eta: 119.8m +step 05578/16704 (33.39%) | loss: 2.771058 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,972 | 
mfu: 50.87 | epoch: 1 | total time: 59.96m | eta: 119.8m +step 05579/16704 (33.40%) | loss: 2.776048 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,875 | mfu: 50.93 | epoch: 1 | total time: 59.97m | eta: 119.8m +step 05580/16704 (33.41%) | loss: 2.773343 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,449 | mfu: 50.72 | epoch: 1 | total time: 59.98m | eta: 119.8m +step 05581/16704 (33.41%) | loss: 2.786677 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,148 | mfu: 50.82 | epoch: 1 | total time: 59.99m | eta: 119.8m +step 05582/16704 (33.42%) | loss: 2.791383 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,615 | mfu: 50.98 | epoch: 1 | total time: 60.00m | eta: 119.8m +step 05583/16704 (33.42%) | loss: 2.773080 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,588 | mfu: 50.91 | epoch: 1 | total time: 60.01m | eta: 119.8m +step 05584/16704 (33.43%) | loss: 2.762765 | lrm: 1.00 | dt: 642.25ms | tok/sec: 816,327 | mfu: 51.02 | epoch: 1 | total time: 60.02m | eta: 119.7m +step 05585/16704 (33.44%) | loss: 2.754756 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 1 | total time: 60.03m | eta: 119.7m +step 05586/16704 (33.44%) | loss: 2.764617 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,269 | mfu: 50.89 | epoch: 1 | total time: 60.04m | eta: 119.7m +step 05587/16704 (33.45%) | loss: 2.769433 | lrm: 1.00 | dt: 641.90ms | tok/sec: 816,770 | mfu: 51.05 | epoch: 1 | total time: 60.05m | eta: 119.7m +step 05588/16704 (33.45%) | loss: 2.756219 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,204 | mfu: 51.01 | epoch: 1 | total time: 60.06m | eta: 119.7m +step 05589/16704 (33.46%) | loss: 2.762634 | lrm: 1.00 | dt: 641.97ms | tok/sec: 816,684 | mfu: 51.04 | epoch: 1 | total time: 60.07m | eta: 119.7m +step 05590/16704 (33.47%) | loss: 2.772669 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,477 | mfu: 50.84 | epoch: 1 | total time: 60.09m | eta: 119.7m +step 05591/16704 (33.47%) | loss: 2.762583 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,278 | mfu: 50.89 | epoch: 1 | total time: 60.10m | eta: 119.7m +step 
05592/16704 (33.48%) | loss: 2.749423 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,016 | mfu: 50.75 | epoch: 1 | total time: 60.11m | eta: 119.7m +step 05593/16704 (33.48%) | loss: 2.754054 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,509 | mfu: 50.78 | epoch: 1 | total time: 60.12m | eta: 119.6m +step 05594/16704 (33.49%) | loss: 2.750618 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,590 | mfu: 50.98 | epoch: 1 | total time: 60.13m | eta: 119.6m +step 05595/16704 (33.49%) | loss: 2.756006 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,450 | mfu: 50.97 | epoch: 1 | total time: 60.14m | eta: 119.6m +step 05596/16704 (33.50%) | loss: 2.756761 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,237 | mfu: 50.77 | epoch: 1 | total time: 60.15m | eta: 119.6m +step 05597/16704 (33.51%) | loss: 2.753140 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,331 | mfu: 50.90 | epoch: 1 | total time: 60.16m | eta: 119.6m +step 05598/16704 (33.51%) | loss: 2.763262 | lrm: 1.00 | dt: 641.52ms | tok/sec: 817,261 | mfu: 51.08 | epoch: 1 | total time: 60.17m | eta: 119.6m +step 05599/16704 (33.52%) | loss: 2.752082 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,158 | mfu: 50.95 | epoch: 1 | total time: 60.18m | eta: 119.6m +step 05600/16704 (33.52%) | loss: 2.744293 | lrm: 1.00 | dt: 641.51ms | tok/sec: 817,275 | mfu: 51.08 | epoch: 1 | total time: 60.19m | eta: 119.6m +step 05601/16704 (33.53%) | loss: 2.733404 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,548 | mfu: 50.97 | epoch: 1 | total time: 60.20m | eta: 119.6m +step 05602/16704 (33.54%) | loss: 2.734470 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,275 | mfu: 50.89 | epoch: 1 | total time: 60.21m | eta: 119.5m +step 05603/16704 (33.54%) | loss: 2.744699 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,522 | mfu: 50.72 | epoch: 1 | total time: 60.22m | eta: 119.5m +step 05604/16704 (33.55%) | loss: 2.744342 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 1 | total time: 60.24m | eta: 119.5m +step 05605/16704 (33.55%) | loss: 2.725855 | lrm: 1.00 | dt: 
642.37ms | tok/sec: 816,173 | mfu: 51.01 | epoch: 1 | total time: 60.25m | eta: 119.5m +step 05606/16704 (33.56%) | loss: 2.732573 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 1 | total time: 60.26m | eta: 119.5m +step 05607/16704 (33.57%) | loss: 2.735238 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,979 | mfu: 50.87 | epoch: 1 | total time: 60.27m | eta: 119.5m +step 05608/16704 (33.57%) | loss: 2.734350 | lrm: 1.00 | dt: 647.58ms | tok/sec: 809,605 | mfu: 50.60 | epoch: 1 | total time: 60.28m | eta: 119.5m +step 05609/16704 (33.58%) | loss: 2.749005 | lrm: 1.00 | dt: 641.37ms | tok/sec: 817,449 | mfu: 51.09 | epoch: 1 | total time: 60.29m | eta: 119.5m +step 05610/16704 (33.58%) | loss: 2.759705 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,258 | mfu: 50.83 | epoch: 1 | total time: 60.30m | eta: 119.5m +step 05611/16704 (33.59%) | loss: 2.767179 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | epoch: 1 | total time: 60.31m | eta: 119.4m +step 05612/16704 (33.60%) | loss: 2.753129 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 1 | total time: 60.32m | eta: 119.4m +step 05613/16704 (33.60%) | loss: 2.746675 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,031 | mfu: 50.94 | epoch: 1 | total time: 60.33m | eta: 119.4m +step 05614/16704 (33.61%) | loss: 2.740815 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,828 | mfu: 50.93 | epoch: 1 | total time: 60.34m | eta: 119.4m +step 05615/16704 (33.61%) | loss: 2.732189 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,897 | mfu: 50.99 | epoch: 1 | total time: 60.35m | eta: 119.4m +step 05616/16704 (33.62%) | loss: 2.738532 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,911 | mfu: 50.81 | epoch: 1 | total time: 60.36m | eta: 119.4m +step 05617/16704 (33.63%) | loss: 2.747795 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,513 | mfu: 50.85 | epoch: 1 | total time: 60.37m | eta: 119.4m +step 05618/16704 (33.63%) | loss: 2.767260 | lrm: 1.00 | dt: 642.85ms | tok/sec: 815,563 | mfu: 50.97 | epoch: 1 | total 
time: 60.39m | eta: 119.4m +step 05619/16704 (33.64%) | loss: 2.777564 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,197 | mfu: 50.95 | epoch: 1 | total time: 60.40m | eta: 119.4m +step 05620/16704 (33.64%) | loss: 2.779531 | lrm: 1.00 | dt: 641.71ms | tok/sec: 817,010 | mfu: 51.06 | epoch: 1 | total time: 60.41m | eta: 119.3m +step 05621/16704 (33.65%) | loss: 2.771476 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,509 | mfu: 50.85 | epoch: 1 | total time: 60.42m | eta: 119.3m +step 05622/16704 (33.66%) | loss: 2.758027 | lrm: 1.00 | dt: 642.19ms | tok/sec: 816,402 | mfu: 51.03 | epoch: 1 | total time: 60.43m | eta: 119.3m +step 05623/16704 (33.66%) | loss: 2.753745 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,887 | mfu: 50.81 | epoch: 1 | total time: 60.44m | eta: 119.3m +step 05624/16704 (33.67%) | loss: 2.754384 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,159 | mfu: 50.70 | epoch: 1 | total time: 60.45m | eta: 119.3m +step 05625/16704 (33.67%) | loss: 2.762533 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,887 | mfu: 50.74 | epoch: 1 | total time: 60.46m | eta: 119.3m +step 05626/16704 (33.68%) | loss: 2.756674 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,990 | mfu: 50.88 | epoch: 1 | total time: 60.47m | eta: 119.3m +step 05627/16704 (33.69%) | loss: 2.745970 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 1 | total time: 60.48m | eta: 119.3m +step 05628/16704 (33.69%) | loss: 2.734208 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,813 | mfu: 50.74 | epoch: 1 | total time: 60.49m | eta: 119.3m +step 05629/16704 (33.70%) | loss: 2.736334 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,134 | mfu: 51.01 | epoch: 1 | total time: 60.50m | eta: 119.3m +step 05630/16704 (33.70%) | loss: 2.737422 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,920 | mfu: 51.00 | epoch: 1 | total time: 60.51m | eta: 119.2m +step 05631/16704 (33.71%) | loss: 2.730741 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,812 | mfu: 50.86 | epoch: 1 | total time: 60.53m | eta: 119.2m +step 05632/16704 (33.72%) | loss: 
2.743695 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,623 | mfu: 50.98 | epoch: 1 | total time: 60.54m | eta: 119.2m +step 05633/16704 (33.72%) | loss: 2.756677 | lrm: 1.00 | dt: 642.01ms | tok/sec: 816,640 | mfu: 51.04 | epoch: 1 | total time: 60.55m | eta: 119.2m +step 05634/16704 (33.73%) | loss: 2.764731 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,186 | mfu: 50.70 | epoch: 1 | total time: 60.56m | eta: 119.2m +step 05635/16704 (33.73%) | loss: 2.763012 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,089 | mfu: 50.94 | epoch: 1 | total time: 60.57m | eta: 119.2m +step 05636/16704 (33.74%) | loss: 2.752632 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 1 | total time: 60.58m | eta: 119.2m +step 05637/16704 (33.75%) | loss: 2.760777 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,185 | mfu: 50.83 | epoch: 1 | total time: 60.59m | eta: 119.2m +step 05638/16704 (33.75%) | loss: 2.756733 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,832 | mfu: 50.99 | epoch: 1 | total time: 60.60m | eta: 119.2m +step 05639/16704 (33.76%) | loss: 2.759629 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,891 | mfu: 50.74 | epoch: 1 | total time: 60.61m | eta: 119.1m +step 05640/16704 (33.76%) | loss: 2.771096 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,603 | mfu: 50.98 | epoch: 1 | total time: 60.62m | eta: 119.1m +step 05641/16704 (33.77%) | loss: 2.770093 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,659 | mfu: 51.04 | epoch: 1 | total time: 60.63m | eta: 119.1m +step 05642/16704 (33.78%) | loss: 2.755564 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,278 | mfu: 50.96 | epoch: 1 | total time: 60.64m | eta: 119.1m +step 05643/16704 (33.78%) | loss: 2.767377 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,702 | mfu: 50.92 | epoch: 1 | total time: 60.65m | eta: 119.1m +step 05644/16704 (33.79%) | loss: 2.758455 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,029 | mfu: 51.00 | epoch: 1 | total time: 60.66m | eta: 119.1m +step 05645/16704 (33.79%) | loss: 2.779502 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,864 | mfu: 
50.93 | epoch: 1 | total time: 60.68m | eta: 119.1m +step 05646/16704 (33.80%) | loss: 2.789338 | lrm: 1.00 | dt: 639.97ms | tok/sec: 819,234 | mfu: 51.20 | epoch: 1 | total time: 60.69m | eta: 119.1m +step 05647/16704 (33.81%) | loss: 2.794803 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,843 | mfu: 50.80 | epoch: 1 | total time: 60.70m | eta: 119.1m +step 05648/16704 (33.81%) | loss: 2.791341 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 1 | total time: 60.71m | eta: 119.0m +step 05649/16704 (33.82%) | loss: 2.798489 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,198 | mfu: 50.89 | epoch: 1 | total time: 60.72m | eta: 119.0m +step 05650/16704 (33.82%) | loss: 2.795771 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,394 | mfu: 50.71 | epoch: 1 | total time: 60.73m | eta: 119.0m +step 05651/16704 (33.83%) | loss: 2.794551 | lrm: 1.00 | dt: 641.65ms | tok/sec: 817,087 | mfu: 51.07 | epoch: 1 | total time: 60.74m | eta: 119.0m +step 05652/16704 (33.84%) | loss: 2.786704 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,101 | mfu: 50.76 | epoch: 1 | total time: 60.75m | eta: 119.0m +step 05653/16704 (33.84%) | loss: 2.793471 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,404 | mfu: 50.96 | epoch: 1 | total time: 60.76m | eta: 119.0m +step 05654/16704 (33.85%) | loss: 2.797032 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,304 | mfu: 50.90 | epoch: 1 | total time: 60.77m | eta: 119.0m +step 05655/16704 (33.85%) | loss: 2.793777 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,136 | mfu: 50.76 | epoch: 1 | total time: 60.78m | eta: 119.0m +step 05656/16704 (33.86%) | loss: 2.804241 | lrm: 1.00 | dt: 640.02ms | tok/sec: 819,168 | mfu: 51.20 | epoch: 1 | total time: 60.79m | eta: 119.0m +step 05657/16704 (33.87%) | loss: 2.790837 | lrm: 1.00 | dt: 647.07ms | tok/sec: 810,248 | mfu: 50.64 | epoch: 1 | total time: 60.80m | eta: 118.9m +step 05658/16704 (33.87%) | loss: 2.801758 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,015 | mfu: 50.88 | epoch: 1 | total time: 60.81m | eta: 118.9m +step 
05659/16704 (33.88%) | loss: 2.804642 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,608 | mfu: 50.98 | epoch: 1 | total time: 60.83m | eta: 118.9m +step 05660/16704 (33.88%) | loss: 2.794299 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,667 | mfu: 50.86 | epoch: 1 | total time: 60.84m | eta: 118.9m +step 05661/16704 (33.89%) | loss: 2.781588 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,030 | mfu: 51.00 | epoch: 1 | total time: 60.85m | eta: 118.9m +step 05662/16704 (33.90%) | loss: 2.764956 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,196 | mfu: 50.76 | epoch: 1 | total time: 60.86m | eta: 118.9m +step 05663/16704 (33.90%) | loss: 2.773329 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,007 | mfu: 51.00 | epoch: 1 | total time: 60.87m | eta: 118.9m +step 05664/16704 (33.91%) | loss: 2.765971 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,307 | mfu: 50.90 | epoch: 1 | total time: 60.88m | eta: 118.9m +step 05665/16704 (33.91%) | loss: 2.758684 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 1 | total time: 60.89m | eta: 118.9m +step 05666/16704 (33.92%) | loss: 2.742019 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,835 | mfu: 50.99 | epoch: 1 | total time: 60.90m | eta: 118.9m +step 05667/16704 (33.93%) | loss: 2.738423 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,056 | mfu: 50.94 | epoch: 1 | total time: 60.91m | eta: 118.8m +step 05668/16704 (33.93%) | loss: 2.748372 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,312 | mfu: 50.90 | epoch: 1 | total time: 60.92m | eta: 118.8m +step 05669/16704 (33.94%) | loss: 2.731068 | lrm: 1.00 | dt: 642.76ms | tok/sec: 815,682 | mfu: 50.98 | epoch: 1 | total time: 60.93m | eta: 118.8m +step 05670/16704 (33.94%) | loss: 2.721045 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,235 | mfu: 50.89 | epoch: 1 | total time: 60.94m | eta: 118.8m +step 05671/16704 (33.95%) | loss: 2.726251 | lrm: 1.00 | dt: 641.77ms | tok/sec: 816,936 | mfu: 51.06 | epoch: 1 | total time: 60.95m | eta: 118.8m +step 05672/16704 (33.96%) | loss: 2.730009 | lrm: 1.00 | dt: 
643.97ms | tok/sec: 814,149 | mfu: 50.89 | epoch: 1 | total time: 60.96m | eta: 118.8m +step 05673/16704 (33.96%) | loss: 2.736358 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 1 | total time: 60.98m | eta: 118.8m +step 05674/16704 (33.97%) | loss: 2.718921 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,076 | mfu: 50.94 | epoch: 1 | total time: 60.99m | eta: 118.8m +step 05675/16704 (33.97%) | loss: 2.712580 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 1 | total time: 61.00m | eta: 118.8m +step 05676/16704 (33.98%) | loss: 2.719307 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,827 | mfu: 50.93 | epoch: 1 | total time: 61.01m | eta: 118.7m +step 05677/16704 (33.99%) | loss: 2.731947 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,058 | mfu: 50.82 | epoch: 1 | total time: 61.02m | eta: 118.7m +step 05678/16704 (33.99%) | loss: 2.736674 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,151 | mfu: 50.95 | epoch: 1 | total time: 61.03m | eta: 118.7m +step 05679/16704 (34.00%) | loss: 2.743046 | lrm: 1.00 | dt: 641.41ms | tok/sec: 817,400 | mfu: 51.09 | epoch: 1 | total time: 61.04m | eta: 118.7m +step 05680/16704 (34.00%) | loss: 2.747386 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,014 | mfu: 50.88 | epoch: 1 | total time: 61.05m | eta: 118.7m +step 05681/16704 (34.01%) | loss: 2.738962 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,379 | mfu: 51.02 | epoch: 1 | total time: 61.06m | eta: 118.7m +step 05682/16704 (34.02%) | loss: 2.739003 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,322 | mfu: 50.83 | epoch: 1 | total time: 61.07m | eta: 118.7m +step 05683/16704 (34.02%) | loss: 2.749570 | lrm: 1.00 | dt: 640.83ms | tok/sec: 818,143 | mfu: 51.14 | epoch: 1 | total time: 61.08m | eta: 118.7m +step 05684/16704 (34.03%) | loss: 2.754233 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,130 | mfu: 51.01 | epoch: 1 | total time: 61.09m | eta: 118.7m +step 05685/16704 (34.03%) | loss: 2.750756 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 1 | total 
time: 61.10m | eta: 118.6m +step 05686/16704 (34.04%) | loss: 2.741507 | lrm: 1.00 | dt: 642.23ms | tok/sec: 816,355 | mfu: 51.02 | epoch: 1 | total time: 61.11m | eta: 118.6m +step 05687/16704 (34.05%) | loss: 2.734550 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 1 | total time: 61.13m | eta: 118.6m +step 05688/16704 (34.05%) | loss: 2.732780 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,417 | mfu: 50.96 | epoch: 1 | total time: 61.14m | eta: 118.6m +step 05689/16704 (34.06%) | loss: 2.734868 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 1 | total time: 61.15m | eta: 118.6m +step 05690/16704 (34.06%) | loss: 2.745279 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 1 | total time: 61.16m | eta: 118.6m +step 05691/16704 (34.07%) | loss: 2.748575 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,081 | mfu: 50.88 | epoch: 1 | total time: 61.17m | eta: 118.6m +step 05692/16704 (34.08%) | loss: 2.753260 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,827 | mfu: 50.87 | epoch: 1 | total time: 61.18m | eta: 118.6m +step 05693/16704 (34.08%) | loss: 2.760483 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,278 | mfu: 50.89 | epoch: 1 | total time: 61.19m | eta: 118.6m +step 05694/16704 (34.09%) | loss: 2.766283 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,525 | mfu: 50.97 | epoch: 1 | total time: 61.20m | eta: 118.5m +step 05695/16704 (34.09%) | loss: 2.764921 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,082 | mfu: 50.76 | epoch: 1 | total time: 61.21m | eta: 118.5m +step 05696/16704 (34.10%) | loss: 2.775651 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,613 | mfu: 51.04 | epoch: 1 | total time: 61.22m | eta: 118.5m +step 05697/16704 (34.11%) | loss: 2.788838 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,869 | mfu: 50.93 | epoch: 1 | total time: 61.23m | eta: 118.5m +step 05698/16704 (34.11%) | loss: 2.782751 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,489 | mfu: 50.97 | epoch: 1 | total time: 61.24m | eta: 118.5m +step 05699/16704 (34.12%) | loss: 
2.787635 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,453 | mfu: 50.72 | epoch: 1 | total time: 61.25m | eta: 118.5m +step 05700/16704 (34.12%) | loss: 2.775194 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,896 | mfu: 50.68 | epoch: 1 | total time: 61.27m | eta: 118.5m +step 05701/16704 (34.13%) | loss: 2.768104 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,185 | mfu: 51.01 | epoch: 1 | total time: 61.28m | eta: 118.5m +step 05702/16704 (34.14%) | loss: 2.781165 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,654 | mfu: 50.79 | epoch: 1 | total time: 61.29m | eta: 118.5m +step 05703/16704 (34.14%) | loss: 2.794428 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,749 | mfu: 50.86 | epoch: 1 | total time: 61.30m | eta: 118.4m +step 05704/16704 (34.15%) | loss: 2.779236 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,090 | mfu: 50.94 | epoch: 1 | total time: 61.31m | eta: 118.4m +step 05705/16704 (34.15%) | loss: 2.783322 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,976 | mfu: 50.69 | epoch: 1 | total time: 61.32m | eta: 118.4m +step 05706/16704 (34.16%) | loss: 2.771984 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,729 | mfu: 51.05 | epoch: 1 | total time: 61.33m | eta: 118.4m +step 05707/16704 (34.17%) | loss: 2.779442 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,720 | mfu: 50.86 | epoch: 1 | total time: 61.34m | eta: 118.4m +step 05708/16704 (34.17%) | loss: 2.783066 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,007 | mfu: 50.88 | epoch: 1 | total time: 61.35m | eta: 118.4m +step 05709/16704 (34.18%) | loss: 2.791859 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,396 | mfu: 50.96 | epoch: 1 | total time: 61.36m | eta: 118.4m +step 05710/16704 (34.18%) | loss: 2.776504 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 1 | total time: 61.37m | eta: 118.4m +step 05711/16704 (34.19%) | loss: 2.778009 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,586 | mfu: 50.85 | epoch: 1 | total time: 61.38m | eta: 118.4m +step 05712/16704 (34.20%) | loss: 2.774080 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,454 | mfu: 
50.90 | epoch: 1 | total time: 61.39m | eta: 118.4m +step 05713/16704 (34.20%) | loss: 2.778253 | lrm: 1.00 | dt: 645.83ms | tok/sec: 811,807 | mfu: 50.74 | epoch: 1 | total time: 61.40m | eta: 118.3m +step 05714/16704 (34.21%) | loss: 2.772211 | lrm: 1.00 | dt: 642.16ms | tok/sec: 816,443 | mfu: 51.03 | epoch: 1 | total time: 61.42m | eta: 118.3m +step 05715/16704 (34.21%) | loss: 2.787687 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,166 | mfu: 50.89 | epoch: 1 | total time: 61.43m | eta: 118.3m +step 05716/16704 (34.22%) | loss: 2.781871 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,614 | mfu: 50.91 | epoch: 1 | total time: 61.44m | eta: 118.3m +step 05717/16704 (34.23%) | loss: 2.773655 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,614 | mfu: 50.91 | epoch: 1 | total time: 61.45m | eta: 118.3m +step 05718/16704 (34.23%) | loss: 2.762565 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,841 | mfu: 50.93 | epoch: 1 | total time: 61.46m | eta: 118.3m +step 05719/16704 (34.24%) | loss: 2.750909 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,819 | mfu: 50.99 | epoch: 1 | total time: 61.47m | eta: 118.3m +step 05720/16704 (34.24%) | loss: 2.750415 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,147 | mfu: 50.82 | epoch: 1 | total time: 61.48m | eta: 118.3m +step 05721/16704 (34.25%) | loss: 2.746419 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,017 | mfu: 51.00 | epoch: 1 | total time: 61.49m | eta: 118.3m +step 05722/16704 (34.26%) | loss: 2.747077 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,861 | mfu: 50.87 | epoch: 1 | total time: 61.50m | eta: 118.2m +step 05723/16704 (34.26%) | loss: 2.739970 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,559 | mfu: 50.79 | epoch: 1 | total time: 61.51m | eta: 118.2m +step 05724/16704 (34.27%) | loss: 2.748954 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,742 | mfu: 50.99 | epoch: 1 | total time: 61.52m | eta: 118.2m +step 05725/16704 (34.27%) | loss: 2.740178 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,734 | mfu: 50.80 | epoch: 1 | total time: 61.53m | eta: 118.2m +step 
05726/16704 (34.28%) | loss: 2.747122 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,386 | mfu: 51.03 | epoch: 1 | total time: 61.54m | eta: 118.2m +step 05727/16704 (34.29%) | loss: 2.752565 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,373 | mfu: 50.77 | epoch: 1 | total time: 61.55m | eta: 118.2m +step 05728/16704 (34.29%) | loss: 2.747639 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,455 | mfu: 50.78 | epoch: 1 | total time: 61.57m | eta: 118.2m +step 05729/16704 (34.30%) | loss: 2.749040 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,612 | mfu: 50.91 | epoch: 1 | total time: 61.58m | eta: 118.2m +step 05730/16704 (34.30%) | loss: 2.750723 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 1 | total time: 61.59m | eta: 118.2m +step 05731/16704 (34.31%) | loss: 2.756895 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 61.60m | eta: 118.1m +step 05732/16704 (34.32%) | loss: 2.758863 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,693 | mfu: 50.92 | epoch: 1 | total time: 61.61m | eta: 118.1m +step 05733/16704 (34.32%) | loss: 2.752616 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,699 | mfu: 50.92 | epoch: 1 | total time: 61.62m | eta: 118.1m +step 05734/16704 (34.33%) | loss: 2.737581 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 1 | total time: 61.63m | eta: 118.1m +step 05735/16704 (34.33%) | loss: 2.747086 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 1 | total time: 61.64m | eta: 118.1m +step 05736/16704 (34.34%) | loss: 2.740728 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,765 | mfu: 50.74 | epoch: 1 | total time: 61.65m | eta: 118.1m +step 05737/16704 (34.35%) | loss: 2.735061 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,161 | mfu: 50.95 | epoch: 1 | total time: 61.66m | eta: 118.1m +step 05738/16704 (34.35%) | loss: 2.734307 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,651 | mfu: 50.67 | epoch: 1 | total time: 61.67m | eta: 118.1m +step 05739/16704 (34.36%) | loss: 2.736645 | lrm: 1.00 | dt: 
643.26ms | tok/sec: 815,047 | mfu: 50.94 | epoch: 1 | total time: 61.68m | eta: 118.1m +step 05740/16704 (34.36%) | loss: 2.737383 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,993 | mfu: 50.81 | epoch: 1 | total time: 61.69m | eta: 118.0m +step 05741/16704 (34.37%) | loss: 2.723848 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,634 | mfu: 50.73 | epoch: 1 | total time: 61.71m | eta: 118.0m +step 05742/16704 (34.38%) | loss: 2.729082 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,874 | mfu: 50.93 | epoch: 1 | total time: 61.72m | eta: 118.0m +step 05743/16704 (34.38%) | loss: 2.742960 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,439 | mfu: 50.90 | epoch: 1 | total time: 61.73m | eta: 118.0m +step 05744/16704 (34.39%) | loss: 2.747827 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,569 | mfu: 50.85 | epoch: 1 | total time: 61.74m | eta: 118.0m +step 05745/16704 (34.39%) | loss: 2.738625 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,497 | mfu: 50.78 | epoch: 1 | total time: 61.75m | eta: 118.0m +step 05746/16704 (34.40%) | loss: 2.743749 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,189 | mfu: 51.01 | epoch: 1 | total time: 61.76m | eta: 118.0m +step 05747/16704 (34.40%) | loss: 2.741184 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,275 | mfu: 50.77 | epoch: 1 | total time: 61.77m | eta: 118.0m +step 05748/16704 (34.41%) | loss: 2.740381 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,068 | mfu: 50.88 | epoch: 1 | total time: 61.78m | eta: 118.0m +step 05749/16704 (34.42%) | loss: 2.737467 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,228 | mfu: 50.77 | epoch: 1 | total time: 61.79m | eta: 118.0m +Step 05750 | Validation bpb: 0.835459 +step 05750/16704 (34.42%) | loss: 2.747407 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,816 | mfu: 50.68 | epoch: 1 | total time: 61.80m | eta: 117.9m +step 05751/16704 (34.43%) | loss: 2.746117 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,710 | mfu: 50.80 | epoch: 1 | total time: 61.81m | eta: 117.9m +step 05752/16704 (34.43%) | loss: 2.751177 | lrm: 1.00 | dt: 646.13ms | tok/sec: 
811,428 | mfu: 50.72 | epoch: 1 | total time: 61.82m | eta: 117.9m +step 05753/16704 (34.44%) | loss: 2.747886 | lrm: 1.00 | dt: 640.07ms | tok/sec: 819,115 | mfu: 51.20 | epoch: 1 | total time: 61.83m | eta: 117.9m +step 05754/16704 (34.45%) | loss: 2.752475 | lrm: 1.00 | dt: 648.62ms | tok/sec: 808,309 | mfu: 50.52 | epoch: 1 | total time: 61.84m | eta: 117.9m +step 05755/16704 (34.45%) | loss: 2.753883 | lrm: 1.00 | dt: 641.02ms | tok/sec: 817,902 | mfu: 51.12 | epoch: 1 | total time: 61.86m | eta: 117.9m +step 05756/16704 (34.46%) | loss: 2.737744 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 1 | total time: 61.87m | eta: 117.9m +step 05757/16704 (34.46%) | loss: 2.734354 | lrm: 1.00 | dt: 648.88ms | tok/sec: 807,993 | mfu: 50.50 | epoch: 1 | total time: 61.88m | eta: 117.9m +step 05758/16704 (34.47%) | loss: 2.738878 | lrm: 1.00 | dt: 641.14ms | tok/sec: 817,743 | mfu: 51.11 | epoch: 1 | total time: 61.89m | eta: 117.9m +step 05759/16704 (34.48%) | loss: 2.735191 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,534 | mfu: 50.72 | epoch: 1 | total time: 61.90m | eta: 117.8m +step 05760/16704 (34.48%) | loss: 2.743861 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,664 | mfu: 50.79 | epoch: 1 | total time: 61.91m | eta: 117.8m +step 05761/16704 (34.49%) | loss: 2.735240 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,124 | mfu: 50.76 | epoch: 1 | total time: 61.92m | eta: 117.8m +step 05762/16704 (34.49%) | loss: 2.733419 | lrm: 1.00 | dt: 646.97ms | tok/sec: 810,376 | mfu: 50.65 | epoch: 1 | total time: 61.93m | eta: 117.8m +step 05763/16704 (34.50%) | loss: 2.737274 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 1 | total time: 61.94m | eta: 117.8m +step 05764/16704 (34.51%) | loss: 2.717538 | lrm: 1.00 | dt: 647.46ms | tok/sec: 809,758 | mfu: 50.61 | epoch: 1 | total time: 61.95m | eta: 117.8m +step 05765/16704 (34.51%) | loss: 2.724020 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,483 | mfu: 50.59 | epoch: 1 | total time: 61.96m | eta: 
117.8m +step 05766/16704 (34.52%) | loss: 2.732243 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,617 | mfu: 50.91 | epoch: 1 | total time: 61.97m | eta: 117.8m +step 05767/16704 (34.52%) | loss: 2.730571 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,646 | mfu: 50.79 | epoch: 1 | total time: 61.98m | eta: 117.8m +step 05768/16704 (34.53%) | loss: 2.732429 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,597 | mfu: 50.91 | epoch: 1 | total time: 62.00m | eta: 117.7m +step 05769/16704 (34.54%) | loss: 2.738301 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,172 | mfu: 50.82 | epoch: 1 | total time: 62.01m | eta: 117.7m +step 05770/16704 (34.54%) | loss: 2.747351 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 1 | total time: 62.02m | eta: 117.7m +step 05771/16704 (34.55%) | loss: 2.735341 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,964 | mfu: 50.75 | epoch: 1 | total time: 62.03m | eta: 117.7m +step 05772/16704 (34.55%) | loss: 2.744171 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,450 | mfu: 50.97 | epoch: 1 | total time: 62.04m | eta: 117.7m +step 05773/16704 (34.56%) | loss: 2.736155 | lrm: 1.00 | dt: 646.47ms | tok/sec: 811,000 | mfu: 50.69 | epoch: 1 | total time: 62.05m | eta: 117.7m +step 05774/16704 (34.57%) | loss: 2.733997 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,255 | mfu: 50.64 | epoch: 1 | total time: 62.06m | eta: 117.7m +step 05775/16704 (34.57%) | loss: 2.731252 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,005 | mfu: 50.88 | epoch: 1 | total time: 62.07m | eta: 117.7m +step 05776/16704 (34.58%) | loss: 2.737674 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,621 | mfu: 50.73 | epoch: 1 | total time: 62.08m | eta: 117.7m +step 05777/16704 (34.58%) | loss: 2.740512 | lrm: 1.00 | dt: 641.49ms | tok/sec: 817,301 | mfu: 51.08 | epoch: 1 | total time: 62.09m | eta: 117.6m +step 05778/16704 (34.59%) | loss: 2.742587 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,881 | mfu: 50.81 | epoch: 1 | total time: 62.10m | eta: 117.6m +step 05779/16704 (34.60%) | loss: 2.736757 | lrm: 1.00 
| dt: 644.38ms | tok/sec: 813,627 | mfu: 50.85 | epoch: 1 | total time: 62.11m | eta: 117.6m +step 05780/16704 (34.60%) | loss: 2.740218 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,138 | mfu: 50.88 | epoch: 1 | total time: 62.12m | eta: 117.6m +step 05781/16704 (34.61%) | loss: 2.739416 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,602 | mfu: 50.73 | epoch: 1 | total time: 62.14m | eta: 117.6m +step 05782/16704 (34.61%) | loss: 2.733724 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,336 | mfu: 50.96 | epoch: 1 | total time: 62.15m | eta: 117.6m +step 05783/16704 (34.62%) | loss: 2.737858 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,302 | mfu: 50.71 | epoch: 1 | total time: 62.16m | eta: 117.6m +step 05784/16704 (34.63%) | loss: 2.733799 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,795 | mfu: 50.74 | epoch: 1 | total time: 62.17m | eta: 117.6m +step 05785/16704 (34.63%) | loss: 2.740094 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,960 | mfu: 50.94 | epoch: 1 | total time: 62.18m | eta: 117.6m +step 05786/16704 (34.64%) | loss: 2.737129 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,955 | mfu: 50.94 | epoch: 1 | total time: 62.19m | eta: 117.6m +step 05787/16704 (34.64%) | loss: 2.724129 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 1 | total time: 62.20m | eta: 117.5m +step 05788/16704 (34.65%) | loss: 2.720123 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,174 | mfu: 50.82 | epoch: 1 | total time: 62.21m | eta: 117.5m +step 05789/16704 (34.66%) | loss: 2.717731 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,159 | mfu: 50.89 | epoch: 1 | total time: 62.22m | eta: 117.5m +step 05790/16704 (34.66%) | loss: 2.714639 | lrm: 1.00 | dt: 641.88ms | tok/sec: 816,798 | mfu: 51.05 | epoch: 1 | total time: 62.23m | eta: 117.5m +step 05791/16704 (34.67%) | loss: 2.719753 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,104 | mfu: 50.63 | epoch: 1 | total time: 62.24m | eta: 117.5m +step 05792/16704 (34.67%) | loss: 2.723973 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,208 | mfu: 50.83 | epoch: 1 | 
total time: 62.25m | eta: 117.5m +step 05793/16704 (34.68%) | loss: 2.727701 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,446 | mfu: 50.84 | epoch: 1 | total time: 62.26m | eta: 117.5m +step 05794/16704 (34.69%) | loss: 2.730119 | lrm: 1.00 | dt: 641.66ms | tok/sec: 817,078 | mfu: 51.07 | epoch: 1 | total time: 62.27m | eta: 117.5m +step 05795/16704 (34.69%) | loss: 2.736246 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,648 | mfu: 50.92 | epoch: 1 | total time: 62.29m | eta: 117.5m +step 05796/16704 (34.70%) | loss: 2.746573 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 1 | total time: 62.30m | eta: 117.4m +step 05797/16704 (34.70%) | loss: 2.744248 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,166 | mfu: 50.82 | epoch: 1 | total time: 62.31m | eta: 117.4m +step 05798/16704 (34.71%) | loss: 2.739781 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,411 | mfu: 50.90 | epoch: 1 | total time: 62.32m | eta: 117.4m +step 05799/16704 (34.72%) | loss: 2.737523 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,953 | mfu: 50.69 | epoch: 1 | total time: 62.33m | eta: 117.4m +step 05800/16704 (34.72%) | loss: 2.746135 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,747 | mfu: 50.80 | epoch: 1 | total time: 62.34m | eta: 117.4m +step 05801/16704 (34.73%) | loss: 2.737619 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,952 | mfu: 51.00 | epoch: 1 | total time: 62.35m | eta: 117.4m +step 05802/16704 (34.73%) | loss: 2.746234 | lrm: 1.00 | dt: 646.26ms | tok/sec: 811,266 | mfu: 50.71 | epoch: 1 | total time: 62.36m | eta: 117.4m +step 05803/16704 (34.74%) | loss: 2.755929 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,073 | mfu: 50.76 | epoch: 1 | total time: 62.37m | eta: 117.4m +step 05804/16704 (34.75%) | loss: 2.756118 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,301 | mfu: 50.89 | epoch: 1 | total time: 62.38m | eta: 117.4m +step 05805/16704 (34.75%) | loss: 2.764153 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,321 | mfu: 50.90 | epoch: 1 | total time: 62.39m | eta: 117.3m +step 05806/16704 (34.76%) | 
loss: 2.754141 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,101 | mfu: 50.82 | epoch: 1 | total time: 62.40m | eta: 117.3m +step 05807/16704 (34.76%) | loss: 2.752049 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,630 | mfu: 50.79 | epoch: 1 | total time: 62.41m | eta: 117.3m +step 05808/16704 (34.77%) | loss: 2.756002 | lrm: 1.00 | dt: 646.72ms | tok/sec: 810,686 | mfu: 50.67 | epoch: 1 | total time: 62.43m | eta: 117.3m +step 05809/16704 (34.78%) | loss: 2.770219 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 1 | total time: 62.44m | eta: 117.3m +step 05810/16704 (34.78%) | loss: 2.772297 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,377 | mfu: 50.71 | epoch: 1 | total time: 62.45m | eta: 117.3m +step 05811/16704 (34.79%) | loss: 2.760094 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,738 | mfu: 50.86 | epoch: 1 | total time: 62.46m | eta: 117.3m +step 05812/16704 (34.79%) | loss: 2.759455 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,673 | mfu: 50.73 | epoch: 1 | total time: 62.47m | eta: 117.3m +step 05813/16704 (34.80%) | loss: 2.763837 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 1 | total time: 62.48m | eta: 117.3m +step 05814/16704 (34.81%) | loss: 2.766777 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,432 | mfu: 50.84 | epoch: 1 | total time: 62.49m | eta: 117.2m +step 05815/16704 (34.81%) | loss: 2.763576 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,487 | mfu: 50.78 | epoch: 1 | total time: 62.50m | eta: 117.2m +step 05816/16704 (34.82%) | loss: 2.752994 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,056 | mfu: 50.88 | epoch: 1 | total time: 62.51m | eta: 117.2m +step 05817/16704 (34.82%) | loss: 2.762315 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,060 | mfu: 50.82 | epoch: 1 | total time: 62.52m | eta: 117.2m +step 05818/16704 (34.83%) | loss: 2.748049 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 1 | total time: 62.53m | eta: 117.2m +step 05819/16704 (34.84%) | loss: 2.755064 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,975 | 
mfu: 51.00 | epoch: 1 | total time: 62.54m | eta: 117.2m +step 05820/16704 (34.84%) | loss: 2.758661 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,453 | mfu: 50.78 | epoch: 1 | total time: 62.55m | eta: 117.2m +step 05821/16704 (34.85%) | loss: 2.776523 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,170 | mfu: 50.82 | epoch: 1 | total time: 62.57m | eta: 117.2m +step 05822/16704 (34.85%) | loss: 2.771512 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,577 | mfu: 50.97 | epoch: 1 | total time: 62.58m | eta: 117.2m +step 05823/16704 (34.86%) | loss: 2.757233 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,976 | mfu: 50.87 | epoch: 1 | total time: 62.59m | eta: 117.2m +step 05824/16704 (34.87%) | loss: 2.763276 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,188 | mfu: 50.89 | epoch: 1 | total time: 62.60m | eta: 117.1m +step 05825/16704 (34.87%) | loss: 2.758773 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,280 | mfu: 50.71 | epoch: 1 | total time: 62.61m | eta: 117.1m +step 05826/16704 (34.88%) | loss: 2.762890 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,287 | mfu: 50.89 | epoch: 1 | total time: 62.62m | eta: 117.1m +step 05827/16704 (34.88%) | loss: 2.770416 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,595 | mfu: 50.98 | epoch: 1 | total time: 62.63m | eta: 117.1m +step 05828/16704 (34.89%) | loss: 2.777542 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 1 | total time: 62.64m | eta: 117.1m +step 05829/16704 (34.90%) | loss: 2.782104 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,983 | mfu: 50.88 | epoch: 1 | total time: 62.65m | eta: 117.1m +step 05830/16704 (34.90%) | loss: 2.789212 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,407 | mfu: 50.96 | epoch: 1 | total time: 62.66m | eta: 117.1m +step 05831/16704 (34.91%) | loss: 2.770027 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,927 | mfu: 50.62 | epoch: 1 | total time: 62.67m | eta: 117.1m +step 05832/16704 (34.91%) | loss: 2.754646 | lrm: 1.00 | dt: 641.33ms | tok/sec: 817,505 | mfu: 51.10 | epoch: 1 | total time: 62.68m | eta: 117.1m +step 
05833/16704 (34.92%) | loss: 2.748316 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,929 | mfu: 50.62 | epoch: 1 | total time: 62.69m | eta: 117.0m +step 05834/16704 (34.93%) | loss: 2.749162 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,230 | mfu: 50.83 | epoch: 1 | total time: 62.70m | eta: 117.0m +step 05835/16704 (34.93%) | loss: 2.741012 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,460 | mfu: 50.90 | epoch: 1 | total time: 62.72m | eta: 117.0m +step 05836/16704 (34.94%) | loss: 2.733344 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 1 | total time: 62.73m | eta: 117.0m +step 05837/16704 (34.94%) | loss: 2.736079 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,491 | mfu: 51.03 | epoch: 1 | total time: 62.74m | eta: 117.0m +step 05838/16704 (34.95%) | loss: 2.738409 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,940 | mfu: 50.81 | epoch: 1 | total time: 62.75m | eta: 117.0m +step 05839/16704 (34.96%) | loss: 2.735400 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,732 | mfu: 50.86 | epoch: 1 | total time: 62.76m | eta: 117.0m +step 05840/16704 (34.96%) | loss: 2.741916 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,429 | mfu: 50.97 | epoch: 1 | total time: 62.77m | eta: 117.0m +step 05841/16704 (34.97%) | loss: 2.732743 | lrm: 1.00 | dt: 647.26ms | tok/sec: 810,017 | mfu: 50.63 | epoch: 1 | total time: 62.78m | eta: 117.0m +step 05842/16704 (34.97%) | loss: 2.742209 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,235 | mfu: 50.83 | epoch: 1 | total time: 62.79m | eta: 116.9m +step 05843/16704 (34.98%) | loss: 2.742031 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,015 | mfu: 50.81 | epoch: 1 | total time: 62.80m | eta: 116.9m +step 05844/16704 (34.99%) | loss: 2.744530 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,421 | mfu: 50.72 | epoch: 1 | total time: 62.81m | eta: 116.9m +step 05845/16704 (34.99%) | loss: 2.733023 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,602 | mfu: 50.91 | epoch: 1 | total time: 62.82m | eta: 116.9m +step 05846/16704 (35.00%) | loss: 2.754168 | lrm: 1.00 | dt: 
643.58ms | tok/sec: 814,639 | mfu: 50.92 | epoch: 1 | total time: 62.83m | eta: 116.9m +step 05847/16704 (35.00%) | loss: 2.751352 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,286 | mfu: 50.83 | epoch: 1 | total time: 62.84m | eta: 116.9m +step 05848/16704 (35.01%) | loss: 2.761580 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,559 | mfu: 50.91 | epoch: 1 | total time: 62.85m | eta: 116.9m +step 05849/16704 (35.02%) | loss: 2.765797 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 1 | total time: 62.87m | eta: 116.9m +step 05850/16704 (35.02%) | loss: 2.767204 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,456 | mfu: 50.97 | epoch: 1 | total time: 62.88m | eta: 116.9m +step 05851/16704 (35.03%) | loss: 2.774095 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,941 | mfu: 50.81 | epoch: 1 | total time: 62.89m | eta: 116.8m +step 05852/16704 (35.03%) | loss: 2.768714 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,913 | mfu: 50.87 | epoch: 1 | total time: 62.90m | eta: 116.8m +step 05853/16704 (35.04%) | loss: 2.758710 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,268 | mfu: 50.77 | epoch: 1 | total time: 62.91m | eta: 116.8m +step 05854/16704 (35.05%) | loss: 2.752192 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,385 | mfu: 50.71 | epoch: 1 | total time: 62.92m | eta: 116.8m +step 05855/16704 (35.05%) | loss: 2.745164 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,883 | mfu: 50.99 | epoch: 1 | total time: 62.93m | eta: 116.8m +step 05856/16704 (35.06%) | loss: 2.747802 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,874 | mfu: 50.74 | epoch: 1 | total time: 62.94m | eta: 116.8m +step 05857/16704 (35.06%) | loss: 2.743916 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,152 | mfu: 50.76 | epoch: 1 | total time: 62.95m | eta: 116.8m +step 05858/16704 (35.07%) | loss: 2.744940 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,868 | mfu: 50.68 | epoch: 1 | total time: 62.96m | eta: 116.8m +step 05859/16704 (35.08%) | loss: 2.750489 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,635 | mfu: 50.92 | epoch: 1 | total 
time: 62.97m | eta: 116.8m +step 05860/16704 (35.08%) | loss: 2.746753 | lrm: 1.00 | dt: 647.68ms | tok/sec: 809,485 | mfu: 50.59 | epoch: 1 | total time: 62.98m | eta: 116.8m +step 05861/16704 (35.09%) | loss: 2.745900 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,137 | mfu: 50.76 | epoch: 1 | total time: 62.99m | eta: 116.7m +step 05862/16704 (35.09%) | loss: 2.744818 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 1 | total time: 63.01m | eta: 116.7m +step 05863/16704 (35.10%) | loss: 2.744111 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,619 | mfu: 50.60 | epoch: 1 | total time: 63.02m | eta: 116.7m +step 05864/16704 (35.11%) | loss: 2.755156 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,007 | mfu: 50.94 | epoch: 1 | total time: 63.03m | eta: 116.7m +step 05865/16704 (35.11%) | loss: 2.753343 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,306 | mfu: 50.71 | epoch: 1 | total time: 63.04m | eta: 116.7m +step 05866/16704 (35.12%) | loss: 2.753159 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,584 | mfu: 50.79 | epoch: 1 | total time: 63.05m | eta: 116.7m +step 05867/16704 (35.12%) | loss: 2.759015 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,604 | mfu: 50.98 | epoch: 1 | total time: 63.06m | eta: 116.7m +step 05868/16704 (35.13%) | loss: 2.745102 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,782 | mfu: 50.68 | epoch: 1 | total time: 63.07m | eta: 116.7m +step 05869/16704 (35.14%) | loss: 2.741018 | lrm: 1.00 | dt: 646.98ms | tok/sec: 810,364 | mfu: 50.65 | epoch: 1 | total time: 63.08m | eta: 116.7m +step 05870/16704 (35.14%) | loss: 2.740516 | lrm: 1.00 | dt: 641.91ms | tok/sec: 816,768 | mfu: 51.05 | epoch: 1 | total time: 63.09m | eta: 116.6m +step 05871/16704 (35.15%) | loss: 2.748874 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,636 | mfu: 50.85 | epoch: 1 | total time: 63.10m | eta: 116.6m +step 05872/16704 (35.15%) | loss: 2.754768 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 1 | total time: 63.11m | eta: 116.6m +step 05873/16704 (35.16%) | loss: 
2.744435 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,475 | mfu: 50.84 | epoch: 1 | total time: 63.12m | eta: 116.6m +step 05874/16704 (35.17%) | loss: 2.727985 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,281 | mfu: 50.77 | epoch: 1 | total time: 63.13m | eta: 116.6m +step 05875/16704 (35.17%) | loss: 2.726156 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 1 | total time: 63.15m | eta: 116.6m +step 05876/16704 (35.18%) | loss: 2.729214 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,137 | mfu: 50.88 | epoch: 1 | total time: 63.16m | eta: 116.6m +step 05877/16704 (35.18%) | loss: 2.736074 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,341 | mfu: 50.90 | epoch: 1 | total time: 63.17m | eta: 116.6m +step 05878/16704 (35.19%) | loss: 2.733848 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,423 | mfu: 50.78 | epoch: 1 | total time: 63.18m | eta: 116.6m +step 05879/16704 (35.20%) | loss: 2.750810 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,334 | mfu: 50.90 | epoch: 1 | total time: 63.19m | eta: 116.5m +step 05880/16704 (35.20%) | loss: 2.752050 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,289 | mfu: 50.83 | epoch: 1 | total time: 63.20m | eta: 116.5m +step 05881/16704 (35.21%) | loss: 2.741075 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,955 | mfu: 50.94 | epoch: 1 | total time: 63.21m | eta: 116.5m +step 05882/16704 (35.21%) | loss: 2.736288 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,837 | mfu: 50.87 | epoch: 1 | total time: 63.22m | eta: 116.5m +step 05883/16704 (35.22%) | loss: 2.733764 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,229 | mfu: 50.77 | epoch: 1 | total time: 63.23m | eta: 116.5m +step 05884/16704 (35.23%) | loss: 2.735893 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,628 | mfu: 50.79 | epoch: 1 | total time: 63.24m | eta: 116.5m +step 05885/16704 (35.23%) | loss: 2.746497 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,071 | mfu: 50.69 | epoch: 1 | total time: 63.25m | eta: 116.5m +step 05886/16704 (35.24%) | loss: 2.744972 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,848 | mfu: 
50.87 | epoch: 1 | total time: 63.26m | eta: 116.5m +step 05887/16704 (35.24%) | loss: 2.729725 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,323 | mfu: 50.83 | epoch: 1 | total time: 63.27m | eta: 116.5m +step 05888/16704 (35.25%) | loss: 2.731913 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,406 | mfu: 50.78 | epoch: 1 | total time: 63.28m | eta: 116.4m +step 05889/16704 (35.26%) | loss: 2.729516 | lrm: 1.00 | dt: 641.25ms | tok/sec: 817,598 | mfu: 51.10 | epoch: 1 | total time: 63.30m | eta: 116.4m +step 05890/16704 (35.26%) | loss: 2.742622 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 1 | total time: 63.31m | eta: 116.4m +step 05891/16704 (35.27%) | loss: 2.748524 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,925 | mfu: 50.87 | epoch: 1 | total time: 63.32m | eta: 116.4m +step 05892/16704 (35.27%) | loss: 2.739547 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,075 | mfu: 50.88 | epoch: 1 | total time: 63.33m | eta: 116.4m +step 05893/16704 (35.28%) | loss: 2.736032 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,915 | mfu: 50.75 | epoch: 1 | total time: 63.34m | eta: 116.4m +step 05894/16704 (35.28%) | loss: 2.741923 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,525 | mfu: 50.97 | epoch: 1 | total time: 63.35m | eta: 116.4m +step 05895/16704 (35.29%) | loss: 2.748404 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,990 | mfu: 50.75 | epoch: 1 | total time: 63.36m | eta: 116.4m +step 05896/16704 (35.30%) | loss: 2.740496 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,045 | mfu: 50.69 | epoch: 1 | total time: 63.37m | eta: 116.4m +step 05897/16704 (35.30%) | loss: 2.750731 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,957 | mfu: 50.94 | epoch: 1 | total time: 63.38m | eta: 116.4m +step 05898/16704 (35.31%) | loss: 2.754135 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,453 | mfu: 50.78 | epoch: 1 | total time: 63.39m | eta: 116.3m +step 05899/16704 (35.31%) | loss: 2.745263 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,473 | mfu: 50.72 | epoch: 1 | total time: 63.40m | eta: 116.3m +step 
05900/16704 (35.32%) | loss: 2.743154 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,091 | mfu: 50.76 | epoch: 1 | total time: 63.41m | eta: 116.3m +step 05901/16704 (35.33%) | loss: 2.740627 | lrm: 1.00 | dt: 648.45ms | tok/sec: 808,531 | mfu: 50.53 | epoch: 1 | total time: 63.42m | eta: 116.3m +step 05902/16704 (35.33%) | loss: 2.732459 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,164 | mfu: 50.70 | epoch: 1 | total time: 63.44m | eta: 116.3m +step 05903/16704 (35.34%) | loss: 2.740460 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,422 | mfu: 50.90 | epoch: 1 | total time: 63.45m | eta: 116.3m +step 05904/16704 (35.34%) | loss: 2.739701 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,473 | mfu: 50.78 | epoch: 1 | total time: 63.46m | eta: 116.3m +step 05905/16704 (35.35%) | loss: 2.734732 | lrm: 1.00 | dt: 649.01ms | tok/sec: 807,824 | mfu: 50.49 | epoch: 1 | total time: 63.47m | eta: 116.3m +step 05906/16704 (35.36%) | loss: 2.746944 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,355 | mfu: 50.84 | epoch: 1 | total time: 63.48m | eta: 116.3m +step 05907/16704 (35.36%) | loss: 2.753424 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,811 | mfu: 50.80 | epoch: 1 | total time: 63.49m | eta: 116.2m +step 05908/16704 (35.37%) | loss: 2.754387 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,818 | mfu: 50.93 | epoch: 1 | total time: 63.50m | eta: 116.2m +step 05909/16704 (35.37%) | loss: 2.755802 | lrm: 1.00 | dt: 642.01ms | tok/sec: 816,631 | mfu: 51.04 | epoch: 1 | total time: 63.51m | eta: 116.2m +step 05910/16704 (35.38%) | loss: 2.753550 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 1 | total time: 63.52m | eta: 116.2m +step 05911/16704 (35.39%) | loss: 2.754783 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,390 | mfu: 50.84 | epoch: 1 | total time: 63.53m | eta: 116.2m +step 05912/16704 (35.39%) | loss: 2.760357 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,857 | mfu: 50.87 | epoch: 1 | total time: 63.54m | eta: 116.2m +step 05913/16704 (35.40%) | loss: 2.767780 | lrm: 1.00 | dt: 
643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 1 | total time: 63.55m | eta: 116.2m +step 05914/16704 (35.40%) | loss: 2.776568 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,182 | mfu: 50.89 | epoch: 1 | total time: 63.56m | eta: 116.2m +step 05915/16704 (35.41%) | loss: 2.785399 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,204 | mfu: 50.95 | epoch: 1 | total time: 63.57m | eta: 116.2m +step 05916/16704 (35.42%) | loss: 2.775595 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,694 | mfu: 50.92 | epoch: 1 | total time: 63.59m | eta: 116.1m +step 05917/16704 (35.42%) | loss: 2.777331 | lrm: 1.00 | dt: 646.98ms | tok/sec: 810,361 | mfu: 50.65 | epoch: 1 | total time: 63.60m | eta: 116.1m +step 05918/16704 (35.43%) | loss: 2.777765 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,282 | mfu: 50.64 | epoch: 1 | total time: 63.61m | eta: 116.1m +step 05919/16704 (35.43%) | loss: 2.790134 | lrm: 1.00 | dt: 640.26ms | tok/sec: 818,868 | mfu: 51.18 | epoch: 1 | total time: 63.62m | eta: 116.1m +step 05920/16704 (35.44%) | loss: 2.791327 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,482 | mfu: 50.78 | epoch: 1 | total time: 63.63m | eta: 116.1m +step 05921/16704 (35.45%) | loss: 2.792884 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,672 | mfu: 50.67 | epoch: 1 | total time: 63.64m | eta: 116.1m +step 05922/16704 (35.45%) | loss: 2.793805 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 1 | total time: 63.65m | eta: 116.1m +step 05923/16704 (35.46%) | loss: 2.796654 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,182 | mfu: 50.95 | epoch: 1 | total time: 63.66m | eta: 116.1m +step 05924/16704 (35.46%) | loss: 2.787794 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,229 | mfu: 50.83 | epoch: 1 | total time: 63.67m | eta: 116.1m +step 05925/16704 (35.47%) | loss: 2.786370 | lrm: 1.00 | dt: 645.97ms | tok/sec: 811,631 | mfu: 50.73 | epoch: 1 | total time: 63.68m | eta: 116.0m +step 05926/16704 (35.48%) | loss: 2.794634 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,388 | mfu: 50.90 | epoch: 1 | total 
time: 63.69m | eta: 116.0m +step 05927/16704 (35.48%) | loss: 2.776755 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,897 | mfu: 50.87 | epoch: 1 | total time: 63.70m | eta: 116.0m +step 05928/16704 (35.49%) | loss: 2.772256 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,587 | mfu: 50.85 | epoch: 1 | total time: 63.71m | eta: 116.0m +step 05929/16704 (35.49%) | loss: 2.773381 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,983 | mfu: 50.94 | epoch: 1 | total time: 63.73m | eta: 116.0m +step 05930/16704 (35.50%) | loss: 2.775032 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,255 | mfu: 50.77 | epoch: 1 | total time: 63.74m | eta: 116.0m +step 05931/16704 (35.51%) | loss: 2.769490 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,063 | mfu: 50.69 | epoch: 1 | total time: 63.75m | eta: 116.0m +step 05932/16704 (35.51%) | loss: 2.764368 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,925 | mfu: 50.87 | epoch: 1 | total time: 63.76m | eta: 116.0m +step 05933/16704 (35.52%) | loss: 2.754573 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,274 | mfu: 50.83 | epoch: 1 | total time: 63.77m | eta: 116.0m +step 05934/16704 (35.52%) | loss: 2.751219 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,879 | mfu: 50.68 | epoch: 1 | total time: 63.78m | eta: 116.0m +step 05935/16704 (35.53%) | loss: 2.741186 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,328 | mfu: 50.71 | epoch: 1 | total time: 63.79m | eta: 115.9m +step 05936/16704 (35.54%) | loss: 2.740181 | lrm: 1.00 | dt: 642.33ms | tok/sec: 816,223 | mfu: 51.02 | epoch: 1 | total time: 63.80m | eta: 115.9m +step 05937/16704 (35.54%) | loss: 2.736879 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,085 | mfu: 50.94 | epoch: 1 | total time: 63.81m | eta: 115.9m +step 05938/16704 (35.55%) | loss: 2.748888 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,820 | mfu: 50.80 | epoch: 1 | total time: 63.82m | eta: 115.9m +step 05939/16704 (35.55%) | loss: 2.751740 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,590 | mfu: 50.91 | epoch: 1 | total time: 63.83m | eta: 115.9m +step 05940/16704 (35.56%) | loss: 
2.753398 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,641 | mfu: 50.92 | epoch: 1 | total time: 63.84m | eta: 115.9m +step 05941/16704 (35.57%) | loss: 2.754142 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 63.85m | eta: 115.9m +step 05942/16704 (35.57%) | loss: 2.748505 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,949 | mfu: 50.75 | epoch: 1 | total time: 63.87m | eta: 115.9m +step 05943/16704 (35.58%) | loss: 2.742260 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,824 | mfu: 50.93 | epoch: 1 | total time: 63.88m | eta: 115.9m +step 05944/16704 (35.58%) | loss: 2.748737 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,776 | mfu: 50.86 | epoch: 1 | total time: 63.89m | eta: 115.8m +step 05945/16704 (35.59%) | loss: 2.751747 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,361 | mfu: 50.90 | epoch: 1 | total time: 63.90m | eta: 115.8m +step 05946/16704 (35.60%) | loss: 2.743258 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,954 | mfu: 51.00 | epoch: 1 | total time: 63.91m | eta: 115.8m +step 05947/16704 (35.60%) | loss: 2.738610 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,463 | mfu: 50.66 | epoch: 1 | total time: 63.92m | eta: 115.8m +step 05948/16704 (35.61%) | loss: 2.745436 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,511 | mfu: 50.85 | epoch: 1 | total time: 63.93m | eta: 115.8m +step 05949/16704 (35.61%) | loss: 2.735204 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 1 | total time: 63.94m | eta: 115.8m +step 05950/16704 (35.62%) | loss: 2.736901 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,493 | mfu: 50.97 | epoch: 1 | total time: 63.95m | eta: 115.8m +step 05951/16704 (35.63%) | loss: 2.741021 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 1 | total time: 63.96m | eta: 115.8m +step 05952/16704 (35.63%) | loss: 2.746983 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,188 | mfu: 50.95 | epoch: 1 | total time: 63.97m | eta: 115.8m +step 05953/16704 (35.64%) | loss: 2.745189 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,193 | mfu: 
50.76 | epoch: 1 | total time: 63.98m | eta: 115.7m +step 05954/16704 (35.64%) | loss: 2.740955 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,319 | mfu: 50.83 | epoch: 1 | total time: 63.99m | eta: 115.7m +step 05955/16704 (35.65%) | loss: 2.735386 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,196 | mfu: 50.64 | epoch: 1 | total time: 64.00m | eta: 115.7m +step 05956/16704 (35.66%) | loss: 2.748814 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,941 | mfu: 50.81 | epoch: 1 | total time: 64.02m | eta: 115.7m +step 05957/16704 (35.66%) | loss: 2.748130 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,201 | mfu: 50.89 | epoch: 1 | total time: 64.03m | eta: 115.7m +step 05958/16704 (35.67%) | loss: 2.747569 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,534 | mfu: 50.91 | epoch: 1 | total time: 64.04m | eta: 115.7m +step 05959/16704 (35.67%) | loss: 2.738356 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,029 | mfu: 50.75 | epoch: 1 | total time: 64.05m | eta: 115.7m +step 05960/16704 (35.68%) | loss: 2.735427 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,255 | mfu: 50.77 | epoch: 1 | total time: 64.06m | eta: 115.7m +step 05961/16704 (35.69%) | loss: 2.726554 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 1 | total time: 64.07m | eta: 115.7m +step 05962/16704 (35.69%) | loss: 2.736675 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,712 | mfu: 50.92 | epoch: 1 | total time: 64.08m | eta: 115.6m +step 05963/16704 (35.70%) | loss: 2.734867 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,815 | mfu: 50.86 | epoch: 1 | total time: 64.09m | eta: 115.6m +step 05964/16704 (35.70%) | loss: 2.729893 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,343 | mfu: 50.96 | epoch: 1 | total time: 64.10m | eta: 115.6m +step 05965/16704 (35.71%) | loss: 2.727731 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 64.11m | eta: 115.6m +step 05966/16704 (35.72%) | loss: 2.735758 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,652 | mfu: 50.79 | epoch: 1 | total time: 64.12m | eta: 115.6m +step 
05967/16704 (35.72%) | loss: 2.734945 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,820 | mfu: 50.68 | epoch: 1 | total time: 64.13m | eta: 115.6m +step 05968/16704 (35.73%) | loss: 2.736308 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,659 | mfu: 50.79 | epoch: 1 | total time: 64.14m | eta: 115.6m +step 05969/16704 (35.73%) | loss: 2.743433 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,365 | mfu: 50.84 | epoch: 1 | total time: 64.16m | eta: 115.6m +step 05970/16704 (35.74%) | loss: 2.736661 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,869 | mfu: 50.74 | epoch: 1 | total time: 64.17m | eta: 115.6m +step 05971/16704 (35.75%) | loss: 2.725953 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,179 | mfu: 50.89 | epoch: 1 | total time: 64.18m | eta: 115.6m +step 05972/16704 (35.75%) | loss: 2.746609 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,131 | mfu: 50.70 | epoch: 1 | total time: 64.19m | eta: 115.5m +step 05973/16704 (35.76%) | loss: 2.738633 | lrm: 1.00 | dt: 644.98ms | tok/sec: 812,873 | mfu: 50.81 | epoch: 1 | total time: 64.20m | eta: 115.5m +step 05974/16704 (35.76%) | loss: 2.735049 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,844 | mfu: 50.68 | epoch: 1 | total time: 64.21m | eta: 115.5m +step 05975/16704 (35.77%) | loss: 2.730663 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,278 | mfu: 50.83 | epoch: 1 | total time: 64.22m | eta: 115.5m +step 05976/16704 (35.78%) | loss: 2.739737 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,509 | mfu: 50.91 | epoch: 1 | total time: 64.23m | eta: 115.5m +step 05977/16704 (35.78%) | loss: 2.735785 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,754 | mfu: 50.86 | epoch: 1 | total time: 64.24m | eta: 115.5m +step 05978/16704 (35.79%) | loss: 2.751243 | lrm: 1.00 | dt: 642.72ms | tok/sec: 815,727 | mfu: 50.98 | epoch: 1 | total time: 64.25m | eta: 115.5m +step 05979/16704 (35.79%) | loss: 2.749635 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,698 | mfu: 50.79 | epoch: 1 | total time: 64.26m | eta: 115.5m +step 05980/16704 (35.80%) | loss: 2.756665 | lrm: 1.00 | dt: 
645.64ms | tok/sec: 812,046 | mfu: 50.75 | epoch: 1 | total time: 64.27m | eta: 115.5m +step 05981/16704 (35.81%) | loss: 2.751917 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,815 | mfu: 50.86 | epoch: 1 | total time: 64.28m | eta: 115.4m +step 05982/16704 (35.81%) | loss: 2.752560 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,085 | mfu: 50.82 | epoch: 1 | total time: 64.29m | eta: 115.4m +step 05983/16704 (35.82%) | loss: 2.752069 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,539 | mfu: 50.72 | epoch: 1 | total time: 64.31m | eta: 115.4m +step 05984/16704 (35.82%) | loss: 2.756269 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,032 | mfu: 50.88 | epoch: 1 | total time: 64.32m | eta: 115.4m +step 05985/16704 (35.83%) | loss: 2.757900 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 1 | total time: 64.33m | eta: 115.4m +step 05986/16704 (35.84%) | loss: 2.746072 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,896 | mfu: 50.81 | epoch: 1 | total time: 64.34m | eta: 115.4m +step 05987/16704 (35.84%) | loss: 2.753890 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,878 | mfu: 50.87 | epoch: 1 | total time: 64.35m | eta: 115.4m +step 05988/16704 (35.85%) | loss: 2.761609 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,122 | mfu: 50.70 | epoch: 1 | total time: 64.36m | eta: 115.4m +step 05989/16704 (35.85%) | loss: 2.765504 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,271 | mfu: 50.96 | epoch: 1 | total time: 64.37m | eta: 115.4m +step 05990/16704 (35.86%) | loss: 2.763456 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 1 | total time: 64.38m | eta: 115.3m +step 05991/16704 (35.87%) | loss: 2.767869 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,071 | mfu: 50.88 | epoch: 1 | total time: 64.39m | eta: 115.3m +step 05992/16704 (35.87%) | loss: 2.769205 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,566 | mfu: 50.85 | epoch: 1 | total time: 64.40m | eta: 115.3m +step 05993/16704 (35.88%) | loss: 2.772493 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,922 | mfu: 50.87 | epoch: 1 | total 
time: 64.41m | eta: 115.3m +step 05994/16704 (35.88%) | loss: 2.779726 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,657 | mfu: 50.85 | epoch: 1 | total time: 64.42m | eta: 115.3m +step 05995/16704 (35.89%) | loss: 2.767638 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 1 | total time: 64.43m | eta: 115.3m +step 05996/16704 (35.90%) | loss: 2.768638 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,421 | mfu: 50.65 | epoch: 1 | total time: 64.45m | eta: 115.3m +step 05997/16704 (35.90%) | loss: 2.758847 | lrm: 1.00 | dt: 649.90ms | tok/sec: 806,720 | mfu: 50.42 | epoch: 1 | total time: 64.46m | eta: 115.3m +step 05998/16704 (35.91%) | loss: 2.756100 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,146 | mfu: 51.01 | epoch: 1 | total time: 64.47m | eta: 115.3m +step 05999/16704 (35.91%) | loss: 2.738388 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,912 | mfu: 50.87 | epoch: 1 | total time: 64.48m | eta: 115.2m +[GC rank7] gen2: 42.1ms collected 0 objects +[GC rank6] gen2: 42.6ms collected 0 objects +[GC rank2] gen2: 45.2ms collected 0 objects +[GC rank0] gen2: 45.4ms collected 0 objects +[GC rank4] gen2: 54.7ms collected 0 objects[GC rank5] gen2: 54.7ms collected 0 objects + +[GC rank1] gen2: 55.3ms collected 0 objects +[GC rank3] gen2: 57.3ms collected 0 objects +Step 06000 | Validation bpb: 0.834056 +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.4177 | centered: 0.2237 | time: 23.17s +Evaluating: jeopardy (10-shot, type: language_modeling)... accuracy: 0.0746 | centered: 0.0746 | time: 4.93s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.4170 | centered: 0.4170 | time: 48.39s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.6225 | centered: 0.4966 | time: 6.05s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.3251 | centered: 0.1001 | time: 3.01s +Evaluating: copa (0-shot, type: multiple_choice)... 
accuracy: 0.6300 | centered: 0.2600 | time: 0.24s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.2146 | centered: 0.0182 | time: 3.19s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.6855 | centered: 0.3711 | time: 4.51s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... accuracy: 0.3560 | centered: 0.1413 | time: 1.16s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.3390 | centered: 0.3390 | time: 11.67s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.4178 | centered: 0.2238 | time: 35.47s +Evaluating: winograd (0-shot, type: schema)... accuracy: 0.6703 | centered: 0.3407 | time: 0.62s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5525 | centered: 0.1050 | time: 2.85s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.1340 | centered: 0.1340 | time: 2.43s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.2957 | centered: 0.1196 | time: 0.80s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.3826 | centered: 0.3826 | time: 3.16s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... accuracy: 0.1714 | centered: 0.1714 | time: 0.51s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0000 | centered: 0.0000 | time: 0.08s +Evaluating: squad (10-shot, type: language_modeling)... accuracy: 0.2363 | centered: 0.2363 | time: 29.01s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.1899 | centered: 0.1899 | time: 18.90s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.5080 | centered: -0.2949 | time: 10.79s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... 
accuracy: 0.2503 | centered: 0.1752 | time: 59.98s +Step 06000 | CORE metric: 0.1921 +step 06000/16704 (35.92%) | loss: 2.744007 | lrm: 1.00 | dt: 626.83ms | tok/sec: 836,414 | mfu: 52.28 | epoch: 1 | total time: 64.49m | eta: 115.2m +step 06001/16704 (35.93%) | loss: 2.740574 | lrm: 1.00 | dt: 654.71ms | tok/sec: 800,794 | mfu: 50.05 | epoch: 1 | total time: 64.50m | eta: 115.2m +step 06002/16704 (35.93%) | loss: 2.728201 | lrm: 1.00 | dt: 642.09ms | tok/sec: 816,535 | mfu: 51.03 | epoch: 1 | total time: 64.51m | eta: 115.2m +step 06003/16704 (35.94%) | loss: 2.730834 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,590 | mfu: 50.98 | epoch: 1 | total time: 64.52m | eta: 115.2m +step 06004/16704 (35.94%) | loss: 2.738945 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,441 | mfu: 50.72 | epoch: 1 | total time: 64.53m | eta: 115.2m +step 06005/16704 (35.95%) | loss: 2.742336 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,279 | mfu: 51.08 | epoch: 1 | total time: 64.54m | eta: 115.2m +step 06006/16704 (35.96%) | loss: 2.737215 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,344 | mfu: 50.90 | epoch: 1 | total time: 64.55m | eta: 115.2m +step 06007/16704 (35.96%) | loss: 2.725512 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,781 | mfu: 50.86 | epoch: 1 | total time: 64.56m | eta: 115.2m +step 06008/16704 (35.97%) | loss: 2.728327 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,607 | mfu: 51.04 | epoch: 1 | total time: 64.57m | eta: 115.2m +step 06009/16704 (35.97%) | loss: 2.738192 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,593 | mfu: 50.98 | epoch: 1 | total time: 64.58m | eta: 115.1m +step 06010/16704 (35.98%) | loss: 2.727527 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 1 | total time: 64.60m | eta: 115.1m +step 06011/16704 (35.99%) | loss: 2.735228 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 64.61m | eta: 115.1m +step 06012/16704 (35.99%) | loss: 2.752980 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,655 | mfu: 50.92 | epoch: 1 | total 
time: 64.62m | eta: 115.1m +step 06013/16704 (36.00%) | loss: 2.755519 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,711 | mfu: 50.92 | epoch: 1 | total time: 64.63m | eta: 115.1m +step 06014/16704 (36.00%) | loss: 2.751729 | lrm: 1.00 | dt: 643.30ms | tok/sec: 814,995 | mfu: 50.94 | epoch: 1 | total time: 64.64m | eta: 115.1m +step 06015/16704 (36.01%) | loss: 2.746377 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,942 | mfu: 50.87 | epoch: 1 | total time: 64.65m | eta: 115.1m +step 06016/16704 (36.02%) | loss: 2.748537 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,938 | mfu: 50.75 | epoch: 1 | total time: 64.66m | eta: 115.1m +step 06017/16704 (36.02%) | loss: 2.753581 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,595 | mfu: 50.91 | epoch: 1 | total time: 64.67m | eta: 115.1m +step 06018/16704 (36.03%) | loss: 2.740790 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,381 | mfu: 50.78 | epoch: 1 | total time: 64.68m | eta: 115.0m +step 06019/16704 (36.03%) | loss: 2.732301 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 1 | total time: 64.69m | eta: 115.0m +step 06020/16704 (36.04%) | loss: 2.739841 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,951 | mfu: 50.94 | epoch: 1 | total time: 64.70m | eta: 115.0m +step 06021/16704 (36.05%) | loss: 2.734724 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,780 | mfu: 50.80 | epoch: 1 | total time: 64.71m | eta: 115.0m +step 06022/16704 (36.05%) | loss: 2.740568 | lrm: 1.00 | dt: 642.44ms | tok/sec: 816,094 | mfu: 51.01 | epoch: 1 | total time: 64.72m | eta: 115.0m +step 06023/16704 (36.06%) | loss: 2.737493 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,243 | mfu: 51.02 | epoch: 1 | total time: 64.73m | eta: 115.0m +step 06024/16704 (36.06%) | loss: 2.732366 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,170 | mfu: 50.76 | epoch: 1 | total time: 64.75m | eta: 115.0m +step 06025/16704 (36.07%) | loss: 2.737442 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,387 | mfu: 50.96 | epoch: 1 | total time: 64.76m | eta: 115.0m +step 06026/16704 (36.08%) | loss: 
2.745005 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 1 | total time: 64.77m | eta: 115.0m +step 06027/16704 (36.08%) | loss: 2.741645 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,431 | mfu: 50.90 | epoch: 1 | total time: 64.78m | eta: 114.9m +step 06028/16704 (36.09%) | loss: 2.747819 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,613 | mfu: 51.04 | epoch: 1 | total time: 64.79m | eta: 114.9m +step 06029/16704 (36.09%) | loss: 2.740900 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,245 | mfu: 50.95 | epoch: 1 | total time: 64.80m | eta: 114.9m +step 06030/16704 (36.10%) | loss: 2.747026 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,779 | mfu: 50.86 | epoch: 1 | total time: 64.81m | eta: 114.9m +step 06031/16704 (36.11%) | loss: 2.747592 | lrm: 1.00 | dt: 641.80ms | tok/sec: 816,900 | mfu: 51.06 | epoch: 1 | total time: 64.82m | eta: 114.9m +step 06032/16704 (36.11%) | loss: 2.763877 | lrm: 1.00 | dt: 641.79ms | tok/sec: 816,916 | mfu: 51.06 | epoch: 1 | total time: 64.83m | eta: 114.9m +step 06033/16704 (36.12%) | loss: 2.772221 | lrm: 1.00 | dt: 642.46ms | tok/sec: 816,063 | mfu: 51.01 | epoch: 1 | total time: 64.84m | eta: 114.9m +step 06034/16704 (36.12%) | loss: 2.761136 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,831 | mfu: 50.99 | epoch: 1 | total time: 64.85m | eta: 114.9m +step 06035/16704 (36.13%) | loss: 2.758100 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,832 | mfu: 50.99 | epoch: 1 | total time: 64.86m | eta: 114.9m +step 06036/16704 (36.14%) | loss: 2.769025 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,614 | mfu: 50.98 | epoch: 1 | total time: 64.87m | eta: 114.8m +step 06037/16704 (36.14%) | loss: 2.766385 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,400 | mfu: 50.78 | epoch: 1 | total time: 64.88m | eta: 114.8m +step 06038/16704 (36.15%) | loss: 2.764673 | lrm: 1.00 | dt: 641.04ms | tok/sec: 817,867 | mfu: 51.12 | epoch: 1 | total time: 64.90m | eta: 114.8m +step 06039/16704 (36.15%) | loss: 2.757726 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,688 | mfu: 
50.92 | epoch: 1 | total time: 64.91m | eta: 114.8m +step 06040/16704 (36.16%) | loss: 2.745844 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,255 | mfu: 50.89 | epoch: 1 | total time: 64.92m | eta: 114.8m +step 06041/16704 (36.16%) | loss: 2.745695 | lrm: 1.00 | dt: 642.13ms | tok/sec: 816,477 | mfu: 51.03 | epoch: 1 | total time: 64.93m | eta: 114.8m +step 06042/16704 (36.17%) | loss: 2.753134 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 1 | total time: 64.94m | eta: 114.8m +step 06043/16704 (36.18%) | loss: 2.751263 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,661 | mfu: 51.04 | epoch: 1 | total time: 64.95m | eta: 114.8m +step 06044/16704 (36.18%) | loss: 2.749706 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,686 | mfu: 50.86 | epoch: 1 | total time: 64.96m | eta: 114.8m +step 06045/16704 (36.19%) | loss: 2.746734 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,934 | mfu: 50.93 | epoch: 1 | total time: 64.97m | eta: 114.8m +step 06046/16704 (36.19%) | loss: 2.747677 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,009 | mfu: 50.88 | epoch: 1 | total time: 64.98m | eta: 114.7m +step 06047/16704 (36.20%) | loss: 2.744726 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,009 | mfu: 51.00 | epoch: 1 | total time: 64.99m | eta: 114.7m +step 06048/16704 (36.21%) | loss: 2.746869 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,622 | mfu: 50.85 | epoch: 1 | total time: 65.00m | eta: 114.7m +step 06049/16704 (36.21%) | loss: 2.747595 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,746 | mfu: 50.92 | epoch: 1 | total time: 65.01m | eta: 114.7m +step 06050/16704 (36.22%) | loss: 2.743997 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,022 | mfu: 50.88 | epoch: 1 | total time: 65.02m | eta: 114.7m +step 06051/16704 (36.22%) | loss: 2.752739 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,665 | mfu: 50.86 | epoch: 1 | total time: 65.04m | eta: 114.7m +step 06052/16704 (36.23%) | loss: 2.750043 | lrm: 1.00 | dt: 640.48ms | tok/sec: 818,590 | mfu: 51.16 | epoch: 1 | total time: 65.05m | eta: 114.7m +step 
06053/16704 (36.24%) | loss: 2.747525 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,618 | mfu: 50.73 | epoch: 1 | total time: 65.06m | eta: 114.7m +step 06054/16704 (36.24%) | loss: 2.751184 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,293 | mfu: 51.02 | epoch: 1 | total time: 65.07m | eta: 114.7m +step 06055/16704 (36.25%) | loss: 2.747463 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,813 | mfu: 50.93 | epoch: 1 | total time: 65.08m | eta: 114.6m +step 06056/16704 (36.25%) | loss: 2.749138 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.86 | epoch: 1 | total time: 65.09m | eta: 114.6m +step 06057/16704 (36.26%) | loss: 2.746296 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,437 | mfu: 50.90 | epoch: 1 | total time: 65.10m | eta: 114.6m +step 06058/16704 (36.27%) | loss: 2.755026 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,412 | mfu: 50.90 | epoch: 1 | total time: 65.11m | eta: 114.6m +step 06059/16704 (36.27%) | loss: 2.744399 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,817 | mfu: 50.99 | epoch: 1 | total time: 65.12m | eta: 114.6m +step 06060/16704 (36.28%) | loss: 2.754998 | lrm: 1.00 | dt: 642.74ms | tok/sec: 815,706 | mfu: 50.98 | epoch: 1 | total time: 65.13m | eta: 114.6m +step 06061/16704 (36.28%) | loss: 2.741391 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,410 | mfu: 50.78 | epoch: 1 | total time: 65.14m | eta: 114.6m +step 06062/16704 (36.29%) | loss: 2.734939 | lrm: 1.00 | dt: 640.94ms | tok/sec: 817,998 | mfu: 51.13 | epoch: 1 | total time: 65.15m | eta: 114.6m +step 06063/16704 (36.30%) | loss: 2.744258 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,377 | mfu: 50.96 | epoch: 1 | total time: 65.16m | eta: 114.6m +step 06064/16704 (36.30%) | loss: 2.734201 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,426 | mfu: 50.90 | epoch: 1 | total time: 65.17m | eta: 114.5m +step 06065/16704 (36.31%) | loss: 2.741107 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 1 | total time: 65.19m | eta: 114.5m +step 06066/16704 (36.31%) | loss: 2.745833 | lrm: 1.00 | dt: 
643.28ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 1 | total time: 65.20m | eta: 114.5m +step 06067/16704 (36.32%) | loss: 2.755583 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,503 | mfu: 50.91 | epoch: 1 | total time: 65.21m | eta: 114.5m +step 06068/16704 (36.33%) | loss: 2.750095 | lrm: 1.00 | dt: 642.09ms | tok/sec: 816,538 | mfu: 51.03 | epoch: 1 | total time: 65.22m | eta: 114.5m +step 06069/16704 (36.33%) | loss: 2.747918 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 1 | total time: 65.23m | eta: 114.5m +step 06070/16704 (36.34%) | loss: 2.751298 | lrm: 1.00 | dt: 641.24ms | tok/sec: 817,609 | mfu: 51.10 | epoch: 1 | total time: 65.24m | eta: 114.5m +step 06071/16704 (36.34%) | loss: 2.754867 | lrm: 1.00 | dt: 641.05ms | tok/sec: 817,851 | mfu: 51.12 | epoch: 1 | total time: 65.25m | eta: 114.5m +step 06072/16704 (36.35%) | loss: 2.746151 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,274 | mfu: 50.83 | epoch: 1 | total time: 65.26m | eta: 114.5m +step 06073/16704 (36.36%) | loss: 2.740621 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,282 | mfu: 51.08 | epoch: 1 | total time: 65.27m | eta: 114.4m +step 06074/16704 (36.36%) | loss: 2.759403 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 1 | total time: 65.28m | eta: 114.4m +step 06075/16704 (36.37%) | loss: 2.743120 | lrm: 1.00 | dt: 641.14ms | tok/sec: 817,747 | mfu: 51.11 | epoch: 1 | total time: 65.29m | eta: 114.4m +step 06076/16704 (36.37%) | loss: 2.732935 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,579 | mfu: 50.97 | epoch: 1 | total time: 65.30m | eta: 114.4m +step 06077/16704 (36.38%) | loss: 2.730339 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,296 | mfu: 51.02 | epoch: 1 | total time: 65.31m | eta: 114.4m +step 06078/16704 (36.39%) | loss: 2.724362 | lrm: 1.00 | dt: 641.72ms | tok/sec: 817,007 | mfu: 51.06 | epoch: 1 | total time: 65.32m | eta: 114.4m +step 06079/16704 (36.39%) | loss: 2.723531 | lrm: 1.00 | dt: 642.85ms | tok/sec: 815,572 | mfu: 50.97 | epoch: 1 | total 
time: 65.34m | eta: 114.4m +step 06080/16704 (36.40%) | loss: 2.740897 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 1 | total time: 65.35m | eta: 114.4m +step 06081/16704 (36.40%) | loss: 2.749449 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,639 | mfu: 50.98 | epoch: 1 | total time: 65.36m | eta: 114.4m +step 06082/16704 (36.41%) | loss: 2.746902 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,857 | mfu: 50.99 | epoch: 1 | total time: 65.37m | eta: 114.3m +step 06083/16704 (36.42%) | loss: 2.741914 | lrm: 1.00 | dt: 641.34ms | tok/sec: 817,484 | mfu: 51.09 | epoch: 1 | total time: 65.38m | eta: 114.3m +step 06084/16704 (36.42%) | loss: 2.748888 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,930 | mfu: 51.00 | epoch: 1 | total time: 65.39m | eta: 114.3m +step 06085/16704 (36.43%) | loss: 2.733805 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,754 | mfu: 50.86 | epoch: 1 | total time: 65.40m | eta: 114.3m +step 06086/16704 (36.43%) | loss: 2.725074 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,305 | mfu: 50.96 | epoch: 1 | total time: 65.41m | eta: 114.3m +step 06087/16704 (36.44%) | loss: 2.727048 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,315 | mfu: 50.96 | epoch: 1 | total time: 65.42m | eta: 114.3m +step 06088/16704 (36.45%) | loss: 2.710601 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,795 | mfu: 50.93 | epoch: 1 | total time: 65.43m | eta: 114.3m +step 06089/16704 (36.45%) | loss: 2.707842 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,760 | mfu: 50.99 | epoch: 1 | total time: 65.44m | eta: 114.3m +step 06090/16704 (36.46%) | loss: 2.714754 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,472 | mfu: 50.91 | epoch: 1 | total time: 65.45m | eta: 114.3m +step 06091/16704 (36.46%) | loss: 2.712859 | lrm: 1.00 | dt: 641.95ms | tok/sec: 816,717 | mfu: 51.05 | epoch: 1 | total time: 65.46m | eta: 114.3m +step 06092/16704 (36.47%) | loss: 2.727707 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,991 | mfu: 50.94 | epoch: 1 | total time: 65.47m | eta: 114.2m +step 06093/16704 (36.48%) | loss: 
2.742772 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,767 | mfu: 50.92 | epoch: 1 | total time: 65.49m | eta: 114.2m +step 06094/16704 (36.48%) | loss: 2.747151 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,192 | mfu: 51.01 | epoch: 1 | total time: 65.50m | eta: 114.2m +step 06095/16704 (36.49%) | loss: 2.752382 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,796 | mfu: 50.86 | epoch: 1 | total time: 65.51m | eta: 114.2m +step 06096/16704 (36.49%) | loss: 2.753233 | lrm: 1.00 | dt: 641.19ms | tok/sec: 817,679 | mfu: 51.11 | epoch: 1 | total time: 65.52m | eta: 114.2m +step 06097/16704 (36.50%) | loss: 2.769119 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,874 | mfu: 50.93 | epoch: 1 | total time: 65.53m | eta: 114.2m +step 06098/16704 (36.51%) | loss: 2.764847 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,512 | mfu: 50.91 | epoch: 1 | total time: 65.54m | eta: 114.2m +step 06099/16704 (36.51%) | loss: 2.754750 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,434 | mfu: 50.90 | epoch: 1 | total time: 65.55m | eta: 114.2m +step 06100/16704 (36.52%) | loss: 2.752008 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 1 | total time: 65.56m | eta: 114.2m +step 06101/16704 (36.52%) | loss: 2.755050 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,252 | mfu: 50.83 | epoch: 1 | total time: 65.57m | eta: 114.1m +step 06102/16704 (36.53%) | loss: 2.754296 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,754 | mfu: 50.86 | epoch: 1 | total time: 65.58m | eta: 114.1m +step 06103/16704 (36.54%) | loss: 2.767755 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,491 | mfu: 51.03 | epoch: 1 | total time: 65.59m | eta: 114.1m +step 06104/16704 (36.54%) | loss: 2.764014 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,459 | mfu: 50.84 | epoch: 1 | total time: 65.60m | eta: 114.1m +step 06105/16704 (36.55%) | loss: 2.746946 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,931 | mfu: 51.00 | epoch: 1 | total time: 65.61m | eta: 114.1m +step 06106/16704 (36.55%) | loss: 2.736392 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,087 | mfu: 
50.94 | epoch: 1 | total time: 65.62m | eta: 114.1m +step 06107/16704 (36.56%) | loss: 2.745343 | lrm: 1.00 | dt: 640.13ms | tok/sec: 819,039 | mfu: 51.19 | epoch: 1 | total time: 65.64m | eta: 114.1m +step 06108/16704 (36.57%) | loss: 2.751823 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 1 | total time: 65.65m | eta: 114.1m +step 06109/16704 (36.57%) | loss: 2.756020 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,824 | mfu: 50.93 | epoch: 1 | total time: 65.66m | eta: 114.1m +step 06110/16704 (36.58%) | loss: 2.752258 | lrm: 1.00 | dt: 641.68ms | tok/sec: 817,055 | mfu: 51.07 | epoch: 1 | total time: 65.67m | eta: 114.0m +step 06111/16704 (36.58%) | loss: 2.751788 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,463 | mfu: 50.91 | epoch: 1 | total time: 65.68m | eta: 114.0m +step 06112/16704 (36.59%) | loss: 2.741419 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,748 | mfu: 50.99 | epoch: 1 | total time: 65.69m | eta: 114.0m +step 06113/16704 (36.60%) | loss: 2.751272 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 1 | total time: 65.70m | eta: 114.0m +step 06114/16704 (36.60%) | loss: 2.754349 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,373 | mfu: 50.84 | epoch: 1 | total time: 65.71m | eta: 114.0m +step 06115/16704 (36.61%) | loss: 2.762968 | lrm: 1.00 | dt: 642.16ms | tok/sec: 816,449 | mfu: 51.03 | epoch: 1 | total time: 65.72m | eta: 114.0m +step 06116/16704 (36.61%) | loss: 2.753756 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 1 | total time: 65.73m | eta: 114.0m +step 06117/16704 (36.62%) | loss: 2.773166 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 1 | total time: 65.74m | eta: 114.0m +step 06118/16704 (36.63%) | loss: 2.763146 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,906 | mfu: 51.00 | epoch: 1 | total time: 65.75m | eta: 114.0m +step 06119/16704 (36.63%) | loss: 2.762348 | lrm: 1.00 | dt: 642.43ms | tok/sec: 816,101 | mfu: 51.01 | epoch: 1 | total time: 65.76m | eta: 113.9m +step 
06120/16704 (36.64%) | loss: 2.767292 | lrm: 1.00 | dt: 642.15ms | tok/sec: 816,452 | mfu: 51.03 | epoch: 1 | total time: 65.77m | eta: 113.9m +step 06121/16704 (36.64%) | loss: 2.767488 | lrm: 1.00 | dt: 640.85ms | tok/sec: 818,110 | mfu: 51.13 | epoch: 1 | total time: 65.79m | eta: 113.9m +step 06122/16704 (36.65%) | loss: 2.765521 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,637 | mfu: 50.73 | epoch: 1 | total time: 65.80m | eta: 113.9m +step 06123/16704 (36.66%) | loss: 2.762843 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,381 | mfu: 51.02 | epoch: 1 | total time: 65.81m | eta: 113.9m +step 06124/16704 (36.66%) | loss: 2.761654 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,595 | mfu: 50.85 | epoch: 1 | total time: 65.82m | eta: 113.9m +step 06125/16704 (36.67%) | loss: 2.755109 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,023 | mfu: 50.75 | epoch: 1 | total time: 65.83m | eta: 113.9m +step 06126/16704 (36.67%) | loss: 2.732046 | lrm: 1.00 | dt: 641.91ms | tok/sec: 816,759 | mfu: 51.05 | epoch: 1 | total time: 65.84m | eta: 113.9m +step 06127/16704 (36.68%) | loss: 2.729601 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 1 | total time: 65.85m | eta: 113.9m +step 06128/16704 (36.69%) | loss: 2.729585 | lrm: 1.00 | dt: 642.20ms | tok/sec: 816,387 | mfu: 51.03 | epoch: 1 | total time: 65.86m | eta: 113.9m +step 06129/16704 (36.69%) | loss: 2.733639 | lrm: 1.00 | dt: 642.02ms | tok/sec: 816,616 | mfu: 51.04 | epoch: 1 | total time: 65.87m | eta: 113.8m +step 06130/16704 (36.70%) | loss: 2.743107 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,389 | mfu: 50.90 | epoch: 1 | total time: 65.88m | eta: 113.8m +step 06131/16704 (36.70%) | loss: 2.734920 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,925 | mfu: 50.93 | epoch: 1 | total time: 65.89m | eta: 113.8m +step 06132/16704 (36.71%) | loss: 2.740597 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,125 | mfu: 50.95 | epoch: 1 | total time: 65.90m | eta: 113.8m +step 06133/16704 (36.72%) | loss: 2.743949 | lrm: 1.00 | dt: 
642.87ms | tok/sec: 815,548 | mfu: 50.97 | epoch: 1 | total time: 65.91m | eta: 113.8m +step 06134/16704 (36.72%) | loss: 2.737821 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,568 | mfu: 50.91 | epoch: 1 | total time: 65.92m | eta: 113.8m +step 06135/16704 (36.73%) | loss: 2.737137 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,303 | mfu: 50.96 | epoch: 1 | total time: 65.94m | eta: 113.8m +step 06136/16704 (36.73%) | loss: 2.737212 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,684 | mfu: 50.86 | epoch: 1 | total time: 65.95m | eta: 113.8m +step 06137/16704 (36.74%) | loss: 2.738318 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,800 | mfu: 50.93 | epoch: 1 | total time: 65.96m | eta: 113.8m +step 06138/16704 (36.75%) | loss: 2.719275 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,947 | mfu: 51.00 | epoch: 1 | total time: 65.97m | eta: 113.7m +step 06139/16704 (36.75%) | loss: 2.733870 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,880 | mfu: 50.99 | epoch: 1 | total time: 65.98m | eta: 113.7m +step 06140/16704 (36.76%) | loss: 2.745728 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,528 | mfu: 50.97 | epoch: 1 | total time: 65.99m | eta: 113.7m +step 06141/16704 (36.76%) | loss: 2.749239 | lrm: 1.00 | dt: 641.81ms | tok/sec: 816,891 | mfu: 51.06 | epoch: 1 | total time: 66.00m | eta: 113.7m +step 06142/16704 (36.77%) | loss: 2.750682 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,575 | mfu: 50.97 | epoch: 1 | total time: 66.01m | eta: 113.7m +step 06143/16704 (36.78%) | loss: 2.744294 | lrm: 1.00 | dt: 642.18ms | tok/sec: 816,417 | mfu: 51.03 | epoch: 1 | total time: 66.02m | eta: 113.7m +step 06144/16704 (36.78%) | loss: 2.747256 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 1 | total time: 66.03m | eta: 113.7m +step 06145/16704 (36.79%) | loss: 2.728094 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,313 | mfu: 50.83 | epoch: 1 | total time: 66.04m | eta: 113.7m +step 06146/16704 (36.79%) | loss: 2.709441 | lrm: 1.00 | dt: 641.40ms | tok/sec: 817,416 | mfu: 51.09 | epoch: 1 | total 
time: 66.05m | eta: 113.7m +step 06147/16704 (36.80%) | loss: 2.715010 | lrm: 1.00 | dt: 642.25ms | tok/sec: 816,325 | mfu: 51.02 | epoch: 1 | total time: 66.06m | eta: 113.6m +step 06148/16704 (36.81%) | loss: 2.722998 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 1 | total time: 66.07m | eta: 113.6m +step 06149/16704 (36.81%) | loss: 2.729174 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,762 | mfu: 50.92 | epoch: 1 | total time: 66.09m | eta: 113.6m +step 06150/16704 (36.82%) | loss: 2.736594 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,835 | mfu: 50.87 | epoch: 1 | total time: 66.10m | eta: 113.6m +step 06151/16704 (36.82%) | loss: 2.736004 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,224 | mfu: 50.89 | epoch: 1 | total time: 66.11m | eta: 113.6m +step 06152/16704 (36.83%) | loss: 2.743084 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,127 | mfu: 51.01 | epoch: 1 | total time: 66.12m | eta: 113.6m +step 06153/16704 (36.84%) | loss: 2.731476 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,871 | mfu: 50.87 | epoch: 1 | total time: 66.13m | eta: 113.6m +step 06154/16704 (36.84%) | loss: 2.715477 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 1 | total time: 66.14m | eta: 113.6m +step 06155/16704 (36.85%) | loss: 2.717686 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,075 | mfu: 50.94 | epoch: 1 | total time: 66.15m | eta: 113.6m +step 06156/16704 (36.85%) | loss: 2.724970 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,337 | mfu: 50.65 | epoch: 1 | total time: 66.16m | eta: 113.5m +step 06157/16704 (36.86%) | loss: 2.714455 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,799 | mfu: 50.99 | epoch: 1 | total time: 66.17m | eta: 113.5m +step 06158/16704 (36.87%) | loss: 2.723661 | lrm: 1.00 | dt: 642.64ms | tok/sec: 815,831 | mfu: 50.99 | epoch: 1 | total time: 66.18m | eta: 113.5m +step 06159/16704 (36.87%) | loss: 2.727716 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,409 | mfu: 50.90 | epoch: 1 | total time: 66.19m | eta: 113.5m +step 06160/16704 (36.88%) | loss: 
2.733239 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,985 | mfu: 50.94 | epoch: 1 | total time: 66.20m | eta: 113.5m +step 06161/16704 (36.88%) | loss: 2.722202 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 1 | total time: 66.21m | eta: 113.5m +step 06162/16704 (36.89%) | loss: 2.732471 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,496 | mfu: 51.03 | epoch: 1 | total time: 66.22m | eta: 113.5m +step 06163/16704 (36.90%) | loss: 2.718171 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,387 | mfu: 50.96 | epoch: 1 | total time: 66.24m | eta: 113.5m +step 06164/16704 (36.90%) | loss: 2.712041 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,424 | mfu: 50.90 | epoch: 1 | total time: 66.25m | eta: 113.5m +step 06165/16704 (36.91%) | loss: 2.703197 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,253 | mfu: 50.89 | epoch: 1 | total time: 66.26m | eta: 113.4m +step 06166/16704 (36.91%) | loss: 2.699977 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,253 | mfu: 50.95 | epoch: 1 | total time: 66.27m | eta: 113.4m +step 06167/16704 (36.92%) | loss: 2.709837 | lrm: 1.00 | dt: 641.50ms | tok/sec: 817,280 | mfu: 51.08 | epoch: 1 | total time: 66.28m | eta: 113.4m +step 06168/16704 (36.93%) | loss: 2.710832 | lrm: 1.00 | dt: 641.79ms | tok/sec: 816,915 | mfu: 51.06 | epoch: 1 | total time: 66.29m | eta: 113.4m +step 06169/16704 (36.93%) | loss: 2.710720 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,194 | mfu: 50.89 | epoch: 1 | total time: 66.30m | eta: 113.4m +step 06170/16704 (36.94%) | loss: 2.714609 | lrm: 1.00 | dt: 641.43ms | tok/sec: 817,369 | mfu: 51.09 | epoch: 1 | total time: 66.31m | eta: 113.4m +step 06171/16704 (36.94%) | loss: 2.715814 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,818 | mfu: 50.99 | epoch: 1 | total time: 66.32m | eta: 113.4m +step 06172/16704 (36.95%) | loss: 2.720976 | lrm: 1.00 | dt: 642.19ms | tok/sec: 816,402 | mfu: 51.03 | epoch: 1 | total time: 66.33m | eta: 113.4m +step 06173/16704 (36.96%) | loss: 2.741791 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,234 | mfu: 
50.89 | epoch: 1 | total time: 66.34m | eta: 113.4m +step 06174/16704 (36.96%) | loss: 2.748468 | lrm: 1.00 | dt: 648.30ms | tok/sec: 808,715 | mfu: 50.55 | epoch: 1 | total time: 66.35m | eta: 113.4m +step 06175/16704 (36.97%) | loss: 2.753899 | lrm: 1.00 | dt: 640.80ms | tok/sec: 818,176 | mfu: 51.14 | epoch: 1 | total time: 66.36m | eta: 113.3m +step 06176/16704 (36.97%) | loss: 2.757030 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,426 | mfu: 51.03 | epoch: 1 | total time: 66.37m | eta: 113.3m +step 06177/16704 (36.98%) | loss: 2.765280 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,310 | mfu: 50.96 | epoch: 1 | total time: 66.39m | eta: 113.3m +step 06178/16704 (36.99%) | loss: 2.749282 | lrm: 1.00 | dt: 642.48ms | tok/sec: 816,038 | mfu: 51.00 | epoch: 1 | total time: 66.40m | eta: 113.3m +step 06179/16704 (36.99%) | loss: 2.760035 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,005 | mfu: 50.88 | epoch: 1 | total time: 66.41m | eta: 113.3m +step 06180/16704 (37.00%) | loss: 2.770127 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 1 | total time: 66.42m | eta: 113.3m +step 06181/16704 (37.00%) | loss: 2.768647 | lrm: 1.00 | dt: 640.90ms | tok/sec: 818,052 | mfu: 51.13 | epoch: 1 | total time: 66.43m | eta: 113.3m +step 06182/16704 (37.01%) | loss: 2.765045 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,544 | mfu: 51.04 | epoch: 1 | total time: 66.44m | eta: 113.3m +step 06183/16704 (37.02%) | loss: 2.757713 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,681 | mfu: 50.86 | epoch: 1 | total time: 66.45m | eta: 113.3m +step 06184/16704 (37.02%) | loss: 2.760220 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 1 | total time: 66.46m | eta: 113.2m +step 06185/16704 (37.03%) | loss: 2.757895 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,307 | mfu: 50.71 | epoch: 1 | total time: 66.47m | eta: 113.2m +step 06186/16704 (37.03%) | loss: 2.740192 | lrm: 1.00 | dt: 642.07ms | tok/sec: 816,558 | mfu: 51.04 | epoch: 1 | total time: 66.48m | eta: 113.2m +step 
06187/16704 (37.04%) | loss: 2.746412 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 1 | total time: 66.49m | eta: 113.2m +step 06188/16704 (37.05%) | loss: 2.735209 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,081 | mfu: 50.82 | epoch: 1 | total time: 66.50m | eta: 113.2m +step 06189/16704 (37.05%) | loss: 2.725995 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,375 | mfu: 50.90 | epoch: 1 | total time: 66.51m | eta: 113.2m +step 06190/16704 (37.06%) | loss: 2.729488 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,835 | mfu: 50.80 | epoch: 1 | total time: 66.53m | eta: 113.2m +step 06191/16704 (37.06%) | loss: 2.711541 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 1 | total time: 66.54m | eta: 113.2m +step 06192/16704 (37.07%) | loss: 2.708728 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,110 | mfu: 50.76 | epoch: 1 | total time: 66.55m | eta: 113.2m +step 06193/16704 (37.07%) | loss: 2.729087 | lrm: 1.00 | dt: 641.20ms | tok/sec: 817,664 | mfu: 51.11 | epoch: 1 | total time: 66.56m | eta: 113.1m +step 06194/16704 (37.08%) | loss: 2.720084 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,744 | mfu: 50.92 | epoch: 1 | total time: 66.57m | eta: 113.1m +step 06195/16704 (37.09%) | loss: 2.730441 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,385 | mfu: 50.84 | epoch: 1 | total time: 66.58m | eta: 113.1m +step 06196/16704 (37.09%) | loss: 2.731782 | lrm: 1.00 | dt: 641.63ms | tok/sec: 817,120 | mfu: 51.07 | epoch: 1 | total time: 66.59m | eta: 113.1m +step 06197/16704 (37.10%) | loss: 2.724897 | lrm: 1.00 | dt: 647.43ms | tok/sec: 809,801 | mfu: 50.61 | epoch: 1 | total time: 66.60m | eta: 113.1m +step 06198/16704 (37.10%) | loss: 2.714757 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,758 | mfu: 50.92 | epoch: 1 | total time: 66.61m | eta: 113.1m +step 06199/16704 (37.11%) | loss: 2.702850 | lrm: 1.00 | dt: 642.34ms | tok/sec: 816,210 | mfu: 51.01 | epoch: 1 | total time: 66.62m | eta: 113.1m +step 06200/16704 (37.12%) | loss: 2.716450 | lrm: 1.00 | dt: 
644.18ms | tok/sec: 813,879 | mfu: 50.87 | epoch: 1 | total time: 66.63m | eta: 113.1m +step 06201/16704 (37.12%) | loss: 2.737481 | lrm: 1.00 | dt: 641.93ms | tok/sec: 816,736 | mfu: 51.05 | epoch: 1 | total time: 66.64m | eta: 113.1m +step 06202/16704 (37.13%) | loss: 2.748317 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,759 | mfu: 50.99 | epoch: 1 | total time: 66.65m | eta: 113.0m +step 06203/16704 (37.13%) | loss: 2.754180 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,930 | mfu: 50.93 | epoch: 1 | total time: 66.66m | eta: 113.0m +step 06204/16704 (37.14%) | loss: 2.748698 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,496 | mfu: 50.97 | epoch: 1 | total time: 66.68m | eta: 113.0m +step 06205/16704 (37.15%) | loss: 2.757287 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,162 | mfu: 50.89 | epoch: 1 | total time: 66.69m | eta: 113.0m +step 06206/16704 (37.15%) | loss: 2.750414 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 1 | total time: 66.70m | eta: 113.0m +step 06207/16704 (37.16%) | loss: 2.751010 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,792 | mfu: 50.99 | epoch: 1 | total time: 66.71m | eta: 113.0m +step 06208/16704 (37.16%) | loss: 2.739721 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,694 | mfu: 50.79 | epoch: 1 | total time: 66.72m | eta: 113.0m +step 06209/16704 (37.17%) | loss: 2.744476 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,883 | mfu: 50.99 | epoch: 1 | total time: 66.73m | eta: 113.0m +step 06210/16704 (37.18%) | loss: 2.752934 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,634 | mfu: 50.85 | epoch: 1 | total time: 66.74m | eta: 113.0m +step 06211/16704 (37.18%) | loss: 2.750874 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,380 | mfu: 50.90 | epoch: 1 | total time: 66.75m | eta: 113.0m +step 06212/16704 (37.19%) | loss: 2.745245 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,960 | mfu: 50.94 | epoch: 1 | total time: 66.76m | eta: 112.9m +step 06213/16704 (37.19%) | loss: 2.744745 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,258 | mfu: 50.95 | epoch: 1 | total 
time: 66.77m | eta: 112.9m +step 06214/16704 (37.20%) | loss: 2.739783 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,973 | mfu: 51.00 | epoch: 1 | total time: 66.78m | eta: 112.9m +step 06215/16704 (37.21%) | loss: 2.740433 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,751 | mfu: 50.99 | epoch: 1 | total time: 66.79m | eta: 112.9m +step 06216/16704 (37.21%) | loss: 2.755796 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,715 | mfu: 50.73 | epoch: 1 | total time: 66.80m | eta: 112.9m +step 06217/16704 (37.22%) | loss: 2.757816 | lrm: 1.00 | dt: 642.22ms | tok/sec: 816,373 | mfu: 51.02 | epoch: 1 | total time: 66.81m | eta: 112.9m +step 06218/16704 (37.22%) | loss: 2.764120 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,140 | mfu: 50.88 | epoch: 1 | total time: 66.83m | eta: 112.9m +step 06219/16704 (37.23%) | loss: 2.760588 | lrm: 1.00 | dt: 644.18ms | tok/sec: 813,887 | mfu: 50.87 | epoch: 1 | total time: 66.84m | eta: 112.9m +step 06220/16704 (37.24%) | loss: 2.754375 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,921 | mfu: 50.87 | epoch: 1 | total time: 66.85m | eta: 112.9m +step 06221/16704 (37.24%) | loss: 2.746413 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,128 | mfu: 50.95 | epoch: 1 | total time: 66.86m | eta: 112.8m +step 06222/16704 (37.25%) | loss: 2.736057 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,002 | mfu: 50.75 | epoch: 1 | total time: 66.87m | eta: 112.8m +step 06223/16704 (37.25%) | loss: 2.732882 | lrm: 1.00 | dt: 641.40ms | tok/sec: 817,407 | mfu: 51.09 | epoch: 1 | total time: 66.88m | eta: 112.8m +step 06224/16704 (37.26%) | loss: 2.733619 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,598 | mfu: 50.98 | epoch: 1 | total time: 66.89m | eta: 112.8m +step 06225/16704 (37.27%) | loss: 2.741899 | lrm: 1.00 | dt: 641.16ms | tok/sec: 817,723 | mfu: 51.11 | epoch: 1 | total time: 66.90m | eta: 112.8m +step 06226/16704 (37.27%) | loss: 2.731620 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,715 | mfu: 50.86 | epoch: 1 | total time: 66.91m | eta: 112.8m +step 06227/16704 (37.28%) | loss: 
2.728116 | lrm: 1.00 | dt: 640.48ms | tok/sec: 818,583 | mfu: 51.16 | epoch: 1 | total time: 66.92m | eta: 112.8m +step 06228/16704 (37.28%) | loss: 2.736577 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,082 | mfu: 50.88 | epoch: 1 | total time: 66.93m | eta: 112.8m +step 06229/16704 (37.29%) | loss: 2.742755 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,517 | mfu: 50.91 | epoch: 1 | total time: 66.94m | eta: 112.8m +step 06230/16704 (37.30%) | loss: 2.750278 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 1 | total time: 66.95m | eta: 112.7m +step 06231/16704 (37.30%) | loss: 2.753051 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,125 | mfu: 50.95 | epoch: 1 | total time: 66.96m | eta: 112.7m +step 06232/16704 (37.31%) | loss: 2.745595 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,134 | mfu: 51.01 | epoch: 1 | total time: 66.98m | eta: 112.7m +step 06233/16704 (37.31%) | loss: 2.739388 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 1 | total time: 66.99m | eta: 112.7m +step 06234/16704 (37.32%) | loss: 2.724878 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 1 | total time: 67.00m | eta: 112.7m +step 06235/16704 (37.33%) | loss: 2.723487 | lrm: 1.00 | dt: 641.88ms | tok/sec: 816,798 | mfu: 51.05 | epoch: 1 | total time: 67.01m | eta: 112.7m +step 06236/16704 (37.33%) | loss: 2.734916 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,015 | mfu: 50.81 | epoch: 1 | total time: 67.02m | eta: 112.7m +step 06237/16704 (37.34%) | loss: 2.730164 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,269 | mfu: 51.02 | epoch: 1 | total time: 67.03m | eta: 112.7m +step 06238/16704 (37.34%) | loss: 2.742371 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,266 | mfu: 50.96 | epoch: 1 | total time: 67.04m | eta: 112.7m +step 06239/16704 (37.35%) | loss: 2.748329 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,069 | mfu: 51.01 | epoch: 1 | total time: 67.05m | eta: 112.6m +step 06240/16704 (37.36%) | loss: 2.743605 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,499 | mfu: 
51.03 | epoch: 1 | total time: 67.06m | eta: 112.6m +step 06241/16704 (37.36%) | loss: 2.745027 | lrm: 1.00 | dt: 641.57ms | tok/sec: 817,192 | mfu: 51.08 | epoch: 1 | total time: 67.07m | eta: 112.6m +step 06242/16704 (37.37%) | loss: 2.732988 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,330 | mfu: 50.90 | epoch: 1 | total time: 67.08m | eta: 112.6m +step 06243/16704 (37.37%) | loss: 2.735519 | lrm: 1.00 | dt: 642.04ms | tok/sec: 816,593 | mfu: 51.04 | epoch: 1 | total time: 67.09m | eta: 112.6m +step 06244/16704 (37.38%) | loss: 2.747711 | lrm: 1.00 | dt: 642.33ms | tok/sec: 816,232 | mfu: 51.02 | epoch: 1 | total time: 67.10m | eta: 112.6m +step 06245/16704 (37.39%) | loss: 2.743583 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 1 | total time: 67.11m | eta: 112.6m +step 06246/16704 (37.39%) | loss: 2.744963 | lrm: 1.00 | dt: 642.25ms | tok/sec: 816,333 | mfu: 51.02 | epoch: 1 | total time: 67.13m | eta: 112.6m +step 06247/16704 (37.40%) | loss: 2.745036 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,289 | mfu: 50.77 | epoch: 1 | total time: 67.14m | eta: 112.6m +step 06248/16704 (37.40%) | loss: 2.751711 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,932 | mfu: 50.87 | epoch: 1 | total time: 67.15m | eta: 112.6m +step 06249/16704 (37.41%) | loss: 2.747143 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,097 | mfu: 50.82 | epoch: 1 | total time: 67.16m | eta: 112.5m +Step 06250 | Validation bpb: 0.832226 +step 06250/16704 (37.42%) | loss: 2.751181 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,768 | mfu: 50.86 | epoch: 1 | total time: 67.17m | eta: 112.5m +step 06251/16704 (37.42%) | loss: 2.755159 | lrm: 1.00 | dt: 647.84ms | tok/sec: 809,282 | mfu: 50.58 | epoch: 1 | total time: 67.18m | eta: 112.5m +step 06252/16704 (37.43%) | loss: 2.763293 | lrm: 1.00 | dt: 647.98ms | tok/sec: 809,111 | mfu: 50.57 | epoch: 1 | total time: 67.19m | eta: 112.5m +step 06253/16704 (37.43%) | loss: 2.752947 | lrm: 1.00 | dt: 640.48ms | tok/sec: 818,579 | mfu: 51.16 | epoch: 1 | 
total time: 67.20m | eta: 112.5m +step 06254/16704 (37.44%) | loss: 2.753419 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,210 | mfu: 50.95 | epoch: 1 | total time: 67.21m | eta: 112.5m +step 06255/16704 (37.45%) | loss: 2.751606 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 1 | total time: 67.22m | eta: 112.5m +step 06256/16704 (37.45%) | loss: 2.763757 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,400 | mfu: 50.84 | epoch: 1 | total time: 67.23m | eta: 112.5m +step 06257/16704 (37.46%) | loss: 2.762929 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,867 | mfu: 50.87 | epoch: 1 | total time: 67.24m | eta: 112.5m +step 06258/16704 (37.46%) | loss: 2.742124 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,424 | mfu: 50.97 | epoch: 1 | total time: 67.25m | eta: 112.4m +step 06259/16704 (37.47%) | loss: 2.744754 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,248 | mfu: 50.95 | epoch: 1 | total time: 67.27m | eta: 112.4m +step 06260/16704 (37.48%) | loss: 2.737594 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,380 | mfu: 50.90 | epoch: 1 | total time: 67.28m | eta: 112.4m +step 06261/16704 (37.48%) | loss: 2.741873 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,643 | mfu: 50.98 | epoch: 1 | total time: 67.29m | eta: 112.4m +step 06262/16704 (37.49%) | loss: 2.729209 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,343 | mfu: 50.96 | epoch: 1 | total time: 67.30m | eta: 112.4m +step 06263/16704 (37.49%) | loss: 2.734676 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,542 | mfu: 50.85 | epoch: 1 | total time: 67.31m | eta: 112.4m +step 06264/16704 (37.50%) | loss: 2.728111 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,277 | mfu: 50.71 | epoch: 1 | total time: 67.32m | eta: 112.4m +step 06265/16704 (37.51%) | loss: 2.741829 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,290 | mfu: 50.77 | epoch: 1 | total time: 67.33m | eta: 112.4m +step 06266/16704 (37.51%) | loss: 2.733494 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,883 | mfu: 50.93 | epoch: 1 | total time: 67.34m | eta: 112.4m +step 06267/16704 (37.52%) | 
loss: 2.746839 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,377 | mfu: 50.84 | epoch: 1 | total time: 67.35m | eta: 112.3m +step 06268/16704 (37.52%) | loss: 2.774876 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,866 | mfu: 50.87 | epoch: 1 | total time: 67.36m | eta: 112.3m +step 06269/16704 (37.53%) | loss: 2.773794 | lrm: 1.00 | dt: 641.42ms | tok/sec: 817,392 | mfu: 51.09 | epoch: 1 | total time: 67.37m | eta: 112.3m +step 06270/16704 (37.54%) | loss: 2.773275 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,682 | mfu: 50.79 | epoch: 1 | total time: 67.38m | eta: 112.3m +step 06271/16704 (37.54%) | loss: 2.772911 | lrm: 1.00 | dt: 639.68ms | tok/sec: 819,614 | mfu: 51.23 | epoch: 1 | total time: 67.39m | eta: 112.3m +step 06272/16704 (37.55%) | loss: 2.770500 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,018 | mfu: 50.88 | epoch: 1 | total time: 67.40m | eta: 112.3m +step 06273/16704 (37.55%) | loss: 2.765814 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,613 | mfu: 50.85 | epoch: 1 | total time: 67.42m | eta: 112.3m +step 06274/16704 (37.56%) | loss: 2.768717 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,454 | mfu: 50.90 | epoch: 1 | total time: 67.43m | eta: 112.3m +step 06275/16704 (37.57%) | loss: 2.759319 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 1 | total time: 67.44m | eta: 112.3m +step 06276/16704 (37.57%) | loss: 2.756590 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,654 | mfu: 50.98 | epoch: 1 | total time: 67.45m | eta: 112.2m +step 06277/16704 (37.58%) | loss: 2.753348 | lrm: 1.00 | dt: 641.73ms | tok/sec: 816,989 | mfu: 51.06 | epoch: 1 | total time: 67.46m | eta: 112.2m +step 06278/16704 (37.58%) | loss: 2.763297 | lrm: 1.00 | dt: 642.04ms | tok/sec: 816,593 | mfu: 51.04 | epoch: 1 | total time: 67.47m | eta: 112.2m +step 06279/16704 (37.59%) | loss: 2.767684 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,076 | mfu: 50.88 | epoch: 1 | total time: 67.48m | eta: 112.2m +step 06280/16704 (37.60%) | loss: 2.762725 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,983 | 
mfu: 50.88 | epoch: 1 | total time: 67.49m | eta: 112.2m +step 06281/16704 (37.60%) | loss: 2.762151 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,054 | mfu: 50.88 | epoch: 1 | total time: 67.50m | eta: 112.2m +step 06282/16704 (37.61%) | loss: 2.741025 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,058 | mfu: 50.82 | epoch: 1 | total time: 67.51m | eta: 112.2m +step 06283/16704 (37.61%) | loss: 2.734533 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,278 | mfu: 50.96 | epoch: 1 | total time: 67.52m | eta: 112.2m +step 06284/16704 (37.62%) | loss: 2.730805 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,578 | mfu: 50.85 | epoch: 1 | total time: 67.53m | eta: 112.2m +step 06285/16704 (37.63%) | loss: 2.720881 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 1 | total time: 67.54m | eta: 112.1m +step 06286/16704 (37.63%) | loss: 2.724843 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,221 | mfu: 50.89 | epoch: 1 | total time: 67.55m | eta: 112.1m +step 06287/16704 (37.64%) | loss: 2.734971 | lrm: 1.00 | dt: 640.77ms | tok/sec: 818,211 | mfu: 51.14 | epoch: 1 | total time: 67.57m | eta: 112.1m +step 06288/16704 (37.64%) | loss: 2.733641 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,496 | mfu: 50.84 | epoch: 1 | total time: 67.58m | eta: 112.1m +step 06289/16704 (37.65%) | loss: 2.735327 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,677 | mfu: 50.92 | epoch: 1 | total time: 67.59m | eta: 112.1m +step 06290/16704 (37.66%) | loss: 2.737668 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,955 | mfu: 50.94 | epoch: 1 | total time: 67.60m | eta: 112.1m +step 06291/16704 (37.66%) | loss: 2.740466 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,346 | mfu: 50.84 | epoch: 1 | total time: 67.61m | eta: 112.1m +step 06292/16704 (37.67%) | loss: 2.739755 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,901 | mfu: 50.93 | epoch: 1 | total time: 67.62m | eta: 112.1m +step 06293/16704 (37.67%) | loss: 2.740411 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,887 | mfu: 50.99 | epoch: 1 | total time: 67.63m | eta: 112.1m +step 
06294/16704 (37.68%) | loss: 2.731787 | lrm: 1.00 | dt: 642.52ms | tok/sec: 815,988 | mfu: 51.00 | epoch: 1 | total time: 67.64m | eta: 112.1m +step 06295/16704 (37.69%) | loss: 2.726119 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,295 | mfu: 50.83 | epoch: 1 | total time: 67.65m | eta: 112.0m +step 06296/16704 (37.69%) | loss: 2.723718 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,080 | mfu: 50.94 | epoch: 1 | total time: 67.66m | eta: 112.0m +step 06297/16704 (37.70%) | loss: 2.736692 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,730 | mfu: 50.73 | epoch: 1 | total time: 67.67m | eta: 112.0m +step 06298/16704 (37.70%) | loss: 2.732048 | lrm: 1.00 | dt: 640.33ms | tok/sec: 818,775 | mfu: 51.17 | epoch: 1 | total time: 67.68m | eta: 112.0m +step 06299/16704 (37.71%) | loss: 2.730471 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,965 | mfu: 51.06 | epoch: 1 | total time: 67.69m | eta: 112.0m +step 06300/16704 (37.72%) | loss: 2.724864 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,530 | mfu: 50.85 | epoch: 1 | total time: 67.70m | eta: 112.0m +step 06301/16704 (37.72%) | loss: 2.733014 | lrm: 1.00 | dt: 640.77ms | tok/sec: 818,220 | mfu: 51.14 | epoch: 1 | total time: 67.72m | eta: 112.0m +step 06302/16704 (37.73%) | loss: 2.742157 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,244 | mfu: 50.89 | epoch: 1 | total time: 67.73m | eta: 112.0m +step 06303/16704 (37.73%) | loss: 2.741572 | lrm: 1.00 | dt: 642.68ms | tok/sec: 815,788 | mfu: 50.99 | epoch: 1 | total time: 67.74m | eta: 112.0m +step 06304/16704 (37.74%) | loss: 2.742306 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,904 | mfu: 50.93 | epoch: 1 | total time: 67.75m | eta: 111.9m +step 06305/16704 (37.75%) | loss: 2.733275 | lrm: 1.00 | dt: 641.47ms | tok/sec: 817,324 | mfu: 51.08 | epoch: 1 | total time: 67.76m | eta: 111.9m +step 06306/16704 (37.75%) | loss: 2.733372 | lrm: 1.00 | dt: 640.81ms | tok/sec: 818,170 | mfu: 51.14 | epoch: 1 | total time: 67.77m | eta: 111.9m +step 06307/16704 (37.76%) | loss: 2.719009 | lrm: 1.00 | dt: 
644.68ms | tok/sec: 813,248 | mfu: 50.83 | epoch: 1 | total time: 67.78m | eta: 111.9m +step 06308/16704 (37.76%) | loss: 2.706976 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,166 | mfu: 50.95 | epoch: 1 | total time: 67.79m | eta: 111.9m +step 06309/16704 (37.77%) | loss: 2.716914 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,753 | mfu: 50.99 | epoch: 1 | total time: 67.80m | eta: 111.9m +step 06310/16704 (37.78%) | loss: 2.716973 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,490 | mfu: 50.78 | epoch: 1 | total time: 67.81m | eta: 111.9m +step 06311/16704 (37.78%) | loss: 2.732645 | lrm: 1.00 | dt: 644.34ms | tok/sec: 813,687 | mfu: 50.86 | epoch: 1 | total time: 67.82m | eta: 111.9m +step 06312/16704 (37.79%) | loss: 2.718093 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 1 | total time: 67.83m | eta: 111.9m +step 06313/16704 (37.79%) | loss: 2.733995 | lrm: 1.00 | dt: 642.22ms | tok/sec: 816,370 | mfu: 51.02 | epoch: 1 | total time: 67.84m | eta: 111.8m +step 06314/16704 (37.80%) | loss: 2.723849 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,874 | mfu: 50.93 | epoch: 1 | total time: 67.85m | eta: 111.8m +step 06315/16704 (37.81%) | loss: 2.721395 | lrm: 1.00 | dt: 641.61ms | tok/sec: 817,141 | mfu: 51.07 | epoch: 1 | total time: 67.87m | eta: 111.8m +step 06316/16704 (37.81%) | loss: 2.735163 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,429 | mfu: 50.78 | epoch: 1 | total time: 67.88m | eta: 111.8m +step 06317/16704 (37.82%) | loss: 2.728173 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,631 | mfu: 50.92 | epoch: 1 | total time: 67.89m | eta: 111.8m +step 06318/16704 (37.82%) | loss: 2.732589 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 1 | total time: 67.90m | eta: 111.8m +step 06319/16704 (37.83%) | loss: 2.724693 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,831 | mfu: 50.87 | epoch: 1 | total time: 67.91m | eta: 111.8m +step 06320/16704 (37.84%) | loss: 2.721797 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,402 | mfu: 50.96 | epoch: 1 | total 
time: 67.92m | eta: 111.8m +step 06321/16704 (37.84%) | loss: 2.724177 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,639 | mfu: 50.98 | epoch: 1 | total time: 67.93m | eta: 111.8m +step 06322/16704 (37.85%) | loss: 2.735823 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,097 | mfu: 50.94 | epoch: 1 | total time: 67.94m | eta: 111.7m +step 06323/16704 (37.85%) | loss: 2.728412 | lrm: 1.00 | dt: 642.33ms | tok/sec: 816,222 | mfu: 51.02 | epoch: 1 | total time: 67.95m | eta: 111.7m +step 06324/16704 (37.86%) | loss: 2.719813 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,207 | mfu: 50.64 | epoch: 1 | total time: 67.96m | eta: 111.7m +step 06325/16704 (37.87%) | loss: 2.723082 | lrm: 1.00 | dt: 642.46ms | tok/sec: 816,068 | mfu: 51.01 | epoch: 1 | total time: 67.97m | eta: 111.7m +step 06326/16704 (37.87%) | loss: 2.732811 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,076 | mfu: 50.82 | epoch: 1 | total time: 67.98m | eta: 111.7m +step 06327/16704 (37.88%) | loss: 2.737286 | lrm: 1.00 | dt: 641.37ms | tok/sec: 817,452 | mfu: 51.09 | epoch: 1 | total time: 67.99m | eta: 111.7m +step 06328/16704 (37.88%) | loss: 2.721755 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,862 | mfu: 50.99 | epoch: 1 | total time: 68.01m | eta: 111.7m +step 06329/16704 (37.89%) | loss: 2.737697 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,921 | mfu: 50.93 | epoch: 1 | total time: 68.02m | eta: 111.7m +step 06330/16704 (37.90%) | loss: 2.747177 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,158 | mfu: 50.95 | epoch: 1 | total time: 68.03m | eta: 111.7m +step 06331/16704 (37.90%) | loss: 2.741003 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,012 | mfu: 50.88 | epoch: 1 | total time: 68.04m | eta: 111.7m +step 06332/16704 (37.91%) | loss: 2.737566 | lrm: 1.00 | dt: 641.74ms | tok/sec: 816,983 | mfu: 51.06 | epoch: 1 | total time: 68.05m | eta: 111.6m +step 06333/16704 (37.91%) | loss: 2.729469 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,659 | mfu: 50.98 | epoch: 1 | total time: 68.06m | eta: 111.6m +step 06334/16704 (37.92%) | loss: 
2.730297 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,520 | mfu: 50.91 | epoch: 1 | total time: 68.07m | eta: 111.6m +step 06335/16704 (37.93%) | loss: 2.729848 | lrm: 1.00 | dt: 641.57ms | tok/sec: 817,189 | mfu: 51.08 | epoch: 1 | total time: 68.08m | eta: 111.6m +step 06336/16704 (37.93%) | loss: 2.738736 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 1 | total time: 68.09m | eta: 111.6m +step 06337/16704 (37.94%) | loss: 2.747241 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 1 | total time: 68.10m | eta: 111.6m +step 06338/16704 (37.94%) | loss: 2.754984 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,858 | mfu: 51.05 | epoch: 1 | total time: 68.11m | eta: 111.6m +step 06339/16704 (37.95%) | loss: 2.755160 | lrm: 1.00 | dt: 647.70ms | tok/sec: 809,457 | mfu: 50.59 | epoch: 1 | total time: 68.12m | eta: 111.6m +step 06340/16704 (37.95%) | loss: 2.749061 | lrm: 1.00 | dt: 642.20ms | tok/sec: 816,391 | mfu: 51.03 | epoch: 1 | total time: 68.13m | eta: 111.6m +step 06341/16704 (37.96%) | loss: 2.748521 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,468 | mfu: 50.72 | epoch: 1 | total time: 68.14m | eta: 111.5m +step 06342/16704 (37.97%) | loss: 2.745551 | lrm: 1.00 | dt: 648.47ms | tok/sec: 808,505 | mfu: 50.53 | epoch: 1 | total time: 68.16m | eta: 111.5m +step 06343/16704 (37.97%) | loss: 2.749743 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,951 | mfu: 51.00 | epoch: 1 | total time: 68.17m | eta: 111.5m +step 06344/16704 (37.98%) | loss: 2.753889 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,594 | mfu: 50.79 | epoch: 1 | total time: 68.18m | eta: 111.5m +step 06345/16704 (37.98%) | loss: 2.744106 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,539 | mfu: 50.91 | epoch: 1 | total time: 68.19m | eta: 111.5m +step 06346/16704 (37.99%) | loss: 2.738890 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,984 | mfu: 50.88 | epoch: 1 | total time: 68.20m | eta: 111.5m +step 06347/16704 (38.00%) | loss: 2.738447 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,904 | mfu: 
50.93 | epoch: 1 | total time: 68.21m | eta: 111.5m +step 06348/16704 (38.00%) | loss: 2.729001 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,118 | mfu: 50.82 | epoch: 1 | total time: 68.22m | eta: 111.5m +step 06349/16704 (38.01%) | loss: 2.717371 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,315 | mfu: 50.71 | epoch: 1 | total time: 68.23m | eta: 111.5m +step 06350/16704 (38.01%) | loss: 2.716280 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,319 | mfu: 50.90 | epoch: 1 | total time: 68.24m | eta: 111.4m +step 06351/16704 (38.02%) | loss: 2.716738 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,509 | mfu: 50.85 | epoch: 1 | total time: 68.25m | eta: 111.4m +step 06352/16704 (38.03%) | loss: 2.724409 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,898 | mfu: 50.87 | epoch: 1 | total time: 68.26m | eta: 111.4m +step 06353/16704 (38.03%) | loss: 2.720237 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 1 | total time: 68.27m | eta: 111.4m +step 06354/16704 (38.04%) | loss: 2.715185 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,552 | mfu: 50.91 | epoch: 1 | total time: 68.28m | eta: 111.4m +step 06355/16704 (38.04%) | loss: 2.705236 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 68.29m | eta: 111.4m +step 06356/16704 (38.05%) | loss: 2.709766 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,778 | mfu: 50.86 | epoch: 1 | total time: 68.31m | eta: 111.4m +step 06357/16704 (38.06%) | loss: 2.718452 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,495 | mfu: 50.84 | epoch: 1 | total time: 68.32m | eta: 111.4m +step 06358/16704 (38.06%) | loss: 2.721954 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,267 | mfu: 50.83 | epoch: 1 | total time: 68.33m | eta: 111.4m +step 06359/16704 (38.07%) | loss: 2.727143 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,636 | mfu: 50.85 | epoch: 1 | total time: 68.34m | eta: 111.3m +step 06360/16704 (38.07%) | loss: 2.729968 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 1 | total time: 68.35m | eta: 111.3m +step 
06361/16704 (38.08%) | loss: 2.731180 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,765 | mfu: 50.92 | epoch: 1 | total time: 68.36m | eta: 111.3m +step 06362/16704 (38.09%) | loss: 2.738173 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,495 | mfu: 50.97 | epoch: 1 | total time: 68.37m | eta: 111.3m +step 06363/16704 (38.09%) | loss: 2.747671 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,397 | mfu: 50.96 | epoch: 1 | total time: 68.38m | eta: 111.3m +step 06364/16704 (38.10%) | loss: 2.752654 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,348 | mfu: 50.84 | epoch: 1 | total time: 68.39m | eta: 111.3m +step 06365/16704 (38.10%) | loss: 2.746275 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,307 | mfu: 50.83 | epoch: 1 | total time: 68.40m | eta: 111.3m +step 06366/16704 (38.11%) | loss: 2.745569 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,640 | mfu: 50.98 | epoch: 1 | total time: 68.41m | eta: 111.3m +step 06367/16704 (38.12%) | loss: 2.747613 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,121 | mfu: 50.70 | epoch: 1 | total time: 68.42m | eta: 111.3m +step 06368/16704 (38.12%) | loss: 2.741600 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,753 | mfu: 50.99 | epoch: 1 | total time: 68.43m | eta: 111.3m +step 06369/16704 (38.13%) | loss: 2.734524 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,156 | mfu: 50.82 | epoch: 1 | total time: 68.45m | eta: 111.2m +step 06370/16704 (38.13%) | loss: 2.729693 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,287 | mfu: 50.96 | epoch: 1 | total time: 68.46m | eta: 111.2m +step 06371/16704 (38.14%) | loss: 2.736082 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,606 | mfu: 50.73 | epoch: 1 | total time: 68.47m | eta: 111.2m +step 06372/16704 (38.15%) | loss: 2.741117 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,461 | mfu: 50.90 | epoch: 1 | total time: 68.48m | eta: 111.2m +step 06373/16704 (38.15%) | loss: 2.748884 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,036 | mfu: 50.88 | epoch: 1 | total time: 68.49m | eta: 111.2m +step 06374/16704 (38.16%) | loss: 2.737916 | lrm: 1.00 | dt: 
644.61ms | tok/sec: 813,341 | mfu: 50.84 | epoch: 1 | total time: 68.50m | eta: 111.2m +step 06375/16704 (38.16%) | loss: 2.737445 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,609 | mfu: 50.85 | epoch: 1 | total time: 68.51m | eta: 111.2m +step 06376/16704 (38.17%) | loss: 2.755032 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 1 | total time: 68.52m | eta: 111.2m +step 06377/16704 (38.18%) | loss: 2.742628 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,428 | mfu: 50.97 | epoch: 1 | total time: 68.53m | eta: 111.2m +step 06378/16704 (38.18%) | loss: 2.744943 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,458 | mfu: 50.90 | epoch: 1 | total time: 68.54m | eta: 111.1m +step 06379/16704 (38.19%) | loss: 2.743028 | lrm: 1.00 | dt: 642.92ms | tok/sec: 815,482 | mfu: 50.97 | epoch: 1 | total time: 68.55m | eta: 111.1m +step 06380/16704 (38.19%) | loss: 2.737519 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,559 | mfu: 50.72 | epoch: 1 | total time: 68.56m | eta: 111.1m +step 06381/16704 (38.20%) | loss: 2.750447 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,530 | mfu: 50.97 | epoch: 1 | total time: 68.57m | eta: 111.1m +step 06382/16704 (38.21%) | loss: 2.737297 | lrm: 1.00 | dt: 641.21ms | tok/sec: 817,652 | mfu: 51.10 | epoch: 1 | total time: 68.58m | eta: 111.1m +step 06383/16704 (38.21%) | loss: 2.738850 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,920 | mfu: 50.81 | epoch: 1 | total time: 68.60m | eta: 111.1m +step 06384/16704 (38.22%) | loss: 2.742317 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 1 | total time: 68.61m | eta: 111.1m +step 06385/16704 (38.22%) | loss: 2.741568 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,431 | mfu: 50.90 | epoch: 1 | total time: 68.62m | eta: 111.1m +step 06386/16704 (38.23%) | loss: 2.748214 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,323 | mfu: 50.83 | epoch: 1 | total time: 68.63m | eta: 111.1m +step 06387/16704 (38.24%) | loss: 2.755007 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,691 | mfu: 50.98 | epoch: 1 | total 
time: 68.64m | eta: 111.0m +step 06388/16704 (38.24%) | loss: 2.762080 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,623 | mfu: 50.85 | epoch: 1 | total time: 68.65m | eta: 111.0m +step 06389/16704 (38.25%) | loss: 2.749932 | lrm: 1.00 | dt: 646.37ms | tok/sec: 811,126 | mfu: 50.70 | epoch: 1 | total time: 68.66m | eta: 111.0m +step 06390/16704 (38.25%) | loss: 2.754853 | lrm: 1.00 | dt: 641.63ms | tok/sec: 817,121 | mfu: 51.07 | epoch: 1 | total time: 68.67m | eta: 111.0m +step 06391/16704 (38.26%) | loss: 2.747557 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,554 | mfu: 50.72 | epoch: 1 | total time: 68.68m | eta: 111.0m +step 06392/16704 (38.27%) | loss: 2.758852 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,463 | mfu: 50.84 | epoch: 1 | total time: 68.69m | eta: 111.0m +step 06393/16704 (38.27%) | loss: 2.754135 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,918 | mfu: 50.75 | epoch: 1 | total time: 68.70m | eta: 111.0m +step 06394/16704 (38.28%) | loss: 2.753460 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,522 | mfu: 50.85 | epoch: 1 | total time: 68.71m | eta: 111.0m +step 06395/16704 (38.28%) | loss: 2.766963 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,187 | mfu: 51.01 | epoch: 1 | total time: 68.72m | eta: 111.0m +step 06396/16704 (38.29%) | loss: 2.768519 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,192 | mfu: 50.95 | epoch: 1 | total time: 68.73m | eta: 110.9m +step 06397/16704 (38.30%) | loss: 2.771457 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,920 | mfu: 50.93 | epoch: 1 | total time: 68.75m | eta: 110.9m +step 06398/16704 (38.30%) | loss: 2.772403 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,427 | mfu: 50.90 | epoch: 1 | total time: 68.76m | eta: 110.9m +step 06399/16704 (38.31%) | loss: 2.765505 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,352 | mfu: 50.90 | epoch: 1 | total time: 68.77m | eta: 110.9m +step 06400/16704 (38.31%) | loss: 2.768754 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,005 | mfu: 50.94 | epoch: 1 | total time: 68.78m | eta: 110.9m +step 06401/16704 (38.32%) | loss: 
2.772473 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,713 | mfu: 50.73 | epoch: 1 | total time: 68.79m | eta: 110.9m +step 06402/16704 (38.33%) | loss: 2.770757 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,279 | mfu: 50.89 | epoch: 1 | total time: 68.80m | eta: 110.9m +step 06403/16704 (38.33%) | loss: 2.764997 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,383 | mfu: 50.78 | epoch: 1 | total time: 68.81m | eta: 110.9m +step 06404/16704 (38.34%) | loss: 2.764692 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,196 | mfu: 51.01 | epoch: 1 | total time: 68.82m | eta: 110.9m +step 06405/16704 (38.34%) | loss: 2.753632 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,011 | mfu: 50.81 | epoch: 1 | total time: 68.83m | eta: 110.9m +step 06406/16704 (38.35%) | loss: 2.758036 | lrm: 1.00 | dt: 641.86ms | tok/sec: 816,831 | mfu: 51.05 | epoch: 1 | total time: 68.84m | eta: 110.8m +step 06407/16704 (38.36%) | loss: 2.758031 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 1 | total time: 68.85m | eta: 110.8m +step 06408/16704 (38.36%) | loss: 2.751393 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,891 | mfu: 50.74 | epoch: 1 | total time: 68.86m | eta: 110.8m +step 06409/16704 (38.37%) | loss: 2.728219 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,951 | mfu: 50.94 | epoch: 1 | total time: 68.87m | eta: 110.8m +step 06410/16704 (38.37%) | loss: 2.741572 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,901 | mfu: 50.81 | epoch: 1 | total time: 68.89m | eta: 110.8m +step 06411/16704 (38.38%) | loss: 2.749929 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,793 | mfu: 50.93 | epoch: 1 | total time: 68.90m | eta: 110.8m +step 06412/16704 (38.39%) | loss: 2.752868 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,255 | mfu: 50.83 | epoch: 1 | total time: 68.91m | eta: 110.8m +step 06413/16704 (38.39%) | loss: 2.749227 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 1 | total time: 68.92m | eta: 110.8m +step 06414/16704 (38.40%) | loss: 2.760652 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,863 | mfu: 
51.06 | epoch: 1 | total time: 68.93m | eta: 110.8m +step 06415/16704 (38.40%) | loss: 2.762356 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,788 | mfu: 50.93 | epoch: 1 | total time: 68.94m | eta: 110.7m +step 06416/16704 (38.41%) | loss: 2.757331 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,717 | mfu: 50.92 | epoch: 1 | total time: 68.95m | eta: 110.7m +step 06417/16704 (38.42%) | loss: 2.746148 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,295 | mfu: 50.96 | epoch: 1 | total time: 68.96m | eta: 110.7m +step 06418/16704 (38.42%) | loss: 2.741197 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,310 | mfu: 50.83 | epoch: 1 | total time: 68.97m | eta: 110.7m +step 06419/16704 (38.43%) | loss: 2.745966 | lrm: 1.00 | dt: 641.89ms | tok/sec: 816,785 | mfu: 51.05 | epoch: 1 | total time: 68.98m | eta: 110.7m +step 06420/16704 (38.43%) | loss: 2.747228 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,254 | mfu: 50.89 | epoch: 1 | total time: 68.99m | eta: 110.7m +step 06421/16704 (38.44%) | loss: 2.755086 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,304 | mfu: 50.90 | epoch: 1 | total time: 69.00m | eta: 110.7m +step 06422/16704 (38.45%) | loss: 2.754956 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,364 | mfu: 50.96 | epoch: 1 | total time: 69.01m | eta: 110.7m +step 06423/16704 (38.45%) | loss: 2.743436 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,060 | mfu: 50.82 | epoch: 1 | total time: 69.02m | eta: 110.7m +step 06424/16704 (38.46%) | loss: 2.744911 | lrm: 1.00 | dt: 642.29ms | tok/sec: 816,275 | mfu: 51.02 | epoch: 1 | total time: 69.04m | eta: 110.6m +step 06425/16704 (38.46%) | loss: 2.742055 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | total time: 69.05m | eta: 110.6m +step 06426/16704 (38.47%) | loss: 2.745313 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,430 | mfu: 50.84 | epoch: 1 | total time: 69.06m | eta: 110.6m +step 06427/16704 (38.48%) | loss: 2.741031 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,906 | mfu: 50.93 | epoch: 1 | total time: 69.07m | eta: 110.6m +step 
06428/16704 (38.48%) | loss: 2.745491 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,488 | mfu: 50.91 | epoch: 1 | total time: 69.08m | eta: 110.6m +step 06429/16704 (38.49%) | loss: 2.741674 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,179 | mfu: 50.82 | epoch: 1 | total time: 69.09m | eta: 110.6m +step 06430/16704 (38.49%) | loss: 2.735710 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,849 | mfu: 50.80 | epoch: 1 | total time: 69.10m | eta: 110.6m +step 06431/16704 (38.50%) | loss: 2.736666 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,263 | mfu: 50.96 | epoch: 1 | total time: 69.11m | eta: 110.6m +step 06432/16704 (38.51%) | loss: 2.731980 | lrm: 1.00 | dt: 647.11ms | tok/sec: 810,198 | mfu: 50.64 | epoch: 1 | total time: 69.12m | eta: 110.6m +step 06433/16704 (38.51%) | loss: 2.734970 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,209 | mfu: 50.89 | epoch: 1 | total time: 69.13m | eta: 110.5m +step 06434/16704 (38.52%) | loss: 2.725707 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,666 | mfu: 50.86 | epoch: 1 | total time: 69.14m | eta: 110.5m +step 06435/16704 (38.52%) | loss: 2.727598 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,909 | mfu: 50.75 | epoch: 1 | total time: 69.15m | eta: 110.5m +step 06436/16704 (38.53%) | loss: 2.727580 | lrm: 1.00 | dt: 641.76ms | tok/sec: 816,950 | mfu: 51.06 | epoch: 1 | total time: 69.16m | eta: 110.5m +step 06437/16704 (38.54%) | loss: 2.724289 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,578 | mfu: 50.97 | epoch: 1 | total time: 69.17m | eta: 110.5m +step 06438/16704 (38.54%) | loss: 2.719746 | lrm: 1.00 | dt: 642.00ms | tok/sec: 816,643 | mfu: 51.04 | epoch: 1 | total time: 69.19m | eta: 110.5m +step 06439/16704 (38.55%) | loss: 2.711866 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,698 | mfu: 50.73 | epoch: 1 | total time: 69.20m | eta: 110.5m +step 06440/16704 (38.55%) | loss: 2.714885 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,249 | mfu: 50.95 | epoch: 1 | total time: 69.21m | eta: 110.5m +step 06441/16704 (38.56%) | loss: 2.722244 | lrm: 1.00 | dt: 
643.92ms | tok/sec: 814,218 | mfu: 50.89 | epoch: 1 | total time: 69.22m | eta: 110.5m +step 06442/16704 (38.57%) | loss: 2.722746 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,547 | mfu: 50.97 | epoch: 1 | total time: 69.23m | eta: 110.5m +step 06443/16704 (38.57%) | loss: 2.734774 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,117 | mfu: 50.82 | epoch: 1 | total time: 69.24m | eta: 110.4m +step 06444/16704 (38.58%) | loss: 2.747480 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,934 | mfu: 50.93 | epoch: 1 | total time: 69.25m | eta: 110.4m +step 06445/16704 (38.58%) | loss: 2.746799 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,603 | mfu: 50.73 | epoch: 1 | total time: 69.26m | eta: 110.4m +step 06446/16704 (38.59%) | loss: 2.739816 | lrm: 1.00 | dt: 642.07ms | tok/sec: 816,558 | mfu: 51.04 | epoch: 1 | total time: 69.27m | eta: 110.4m +step 06447/16704 (38.60%) | loss: 2.751503 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,084 | mfu: 50.94 | epoch: 1 | total time: 69.28m | eta: 110.4m +step 06448/16704 (38.60%) | loss: 2.745267 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,860 | mfu: 50.93 | epoch: 1 | total time: 69.29m | eta: 110.4m +step 06449/16704 (38.61%) | loss: 2.757869 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 1 | total time: 69.30m | eta: 110.4m +step 06450/16704 (38.61%) | loss: 2.756128 | lrm: 1.00 | dt: 641.78ms | tok/sec: 816,931 | mfu: 51.06 | epoch: 1 | total time: 69.31m | eta: 110.4m +step 06451/16704 (38.62%) | loss: 2.738117 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,136 | mfu: 50.82 | epoch: 1 | total time: 69.33m | eta: 110.4m +step 06452/16704 (38.63%) | loss: 2.731573 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 1 | total time: 69.34m | eta: 110.3m +step 06453/16704 (38.63%) | loss: 2.742359 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,062 | mfu: 50.82 | epoch: 1 | total time: 69.35m | eta: 110.3m +step 06454/16704 (38.64%) | loss: 2.765564 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,788 | mfu: 50.93 | epoch: 1 | total 
time: 69.36m | eta: 110.3m +step 06455/16704 (38.64%) | loss: 2.758050 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,232 | mfu: 50.95 | epoch: 1 | total time: 69.37m | eta: 110.3m +step 06456/16704 (38.65%) | loss: 2.753453 | lrm: 1.00 | dt: 641.77ms | tok/sec: 816,943 | mfu: 51.06 | epoch: 1 | total time: 69.38m | eta: 110.3m +step 06457/16704 (38.66%) | loss: 2.756163 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 1 | total time: 69.39m | eta: 110.3m +step 06458/16704 (38.66%) | loss: 2.748978 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,268 | mfu: 51.02 | epoch: 1 | total time: 69.40m | eta: 110.3m +step 06459/16704 (38.67%) | loss: 2.744504 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 1 | total time: 69.41m | eta: 110.3m +step 06460/16704 (38.67%) | loss: 2.760792 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,700 | mfu: 50.79 | epoch: 1 | total time: 69.42m | eta: 110.3m +step 06461/16704 (38.68%) | loss: 2.757444 | lrm: 1.00 | dt: 641.82ms | tok/sec: 816,881 | mfu: 51.06 | epoch: 1 | total time: 69.43m | eta: 110.2m +step 06462/16704 (38.69%) | loss: 2.747565 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,009 | mfu: 50.81 | epoch: 1 | total time: 69.44m | eta: 110.2m +step 06463/16704 (38.69%) | loss: 2.742594 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,159 | mfu: 50.70 | epoch: 1 | total time: 69.45m | eta: 110.2m +step 06464/16704 (38.70%) | loss: 2.742915 | lrm: 1.00 | dt: 639.81ms | tok/sec: 819,438 | mfu: 51.22 | epoch: 1 | total time: 69.46m | eta: 110.2m +step 06465/16704 (38.70%) | loss: 2.740999 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,231 | mfu: 50.89 | epoch: 1 | total time: 69.48m | eta: 110.2m +step 06466/16704 (38.71%) | loss: 2.732238 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,438 | mfu: 50.84 | epoch: 1 | total time: 69.49m | eta: 110.2m +step 06467/16704 (38.72%) | loss: 2.736902 | lrm: 1.00 | dt: 640.32ms | tok/sec: 818,795 | mfu: 51.18 | epoch: 1 | total time: 69.50m | eta: 110.2m +step 06468/16704 (38.72%) | loss: 
2.728027 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 1 | total time: 69.51m | eta: 110.2m +step 06469/16704 (38.73%) | loss: 2.721430 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,092 | mfu: 50.88 | epoch: 1 | total time: 69.52m | eta: 110.2m +step 06470/16704 (38.73%) | loss: 2.703156 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,839 | mfu: 50.93 | epoch: 1 | total time: 69.53m | eta: 110.1m +step 06471/16704 (38.74%) | loss: 2.710396 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,066 | mfu: 50.94 | epoch: 1 | total time: 69.54m | eta: 110.1m +step 06472/16704 (38.75%) | loss: 2.707762 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,717 | mfu: 50.86 | epoch: 1 | total time: 69.55m | eta: 110.1m +step 06473/16704 (38.75%) | loss: 2.698119 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 1 | total time: 69.56m | eta: 110.1m +step 06474/16704 (38.76%) | loss: 2.704145 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,315 | mfu: 50.96 | epoch: 1 | total time: 69.57m | eta: 110.1m +step 06475/16704 (38.76%) | loss: 2.707543 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,463 | mfu: 50.78 | epoch: 1 | total time: 69.58m | eta: 110.1m +step 06476/16704 (38.77%) | loss: 2.704867 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,056 | mfu: 50.94 | epoch: 1 | total time: 69.59m | eta: 110.1m +step 06477/16704 (38.78%) | loss: 2.715468 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,430 | mfu: 50.97 | epoch: 1 | total time: 69.60m | eta: 110.1m +step 06478/16704 (38.78%) | loss: 2.713624 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,443 | mfu: 50.84 | epoch: 1 | total time: 69.61m | eta: 110.1m +step 06479/16704 (38.79%) | loss: 2.707372 | lrm: 1.00 | dt: 642.11ms | tok/sec: 816,513 | mfu: 51.03 | epoch: 1 | total time: 69.63m | eta: 110.1m +step 06480/16704 (38.79%) | loss: 2.711630 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,774 | mfu: 50.86 | epoch: 1 | total time: 69.64m | eta: 110.0m +step 06481/16704 (38.80%) | loss: 2.709334 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,263 | mfu: 
50.89 | epoch: 1 | total time: 69.65m | eta: 110.0m +step 06482/16704 (38.81%) | loss: 2.715989 | lrm: 1.00 | dt: 642.51ms | tok/sec: 815,993 | mfu: 51.00 | epoch: 1 | total time: 69.66m | eta: 110.0m +step 06483/16704 (38.81%) | loss: 2.724751 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,397 | mfu: 50.78 | epoch: 1 | total time: 69.67m | eta: 110.0m +step 06484/16704 (38.82%) | loss: 2.725129 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,488 | mfu: 50.84 | epoch: 1 | total time: 69.68m | eta: 110.0m +step 06485/16704 (38.82%) | loss: 2.738565 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,104 | mfu: 50.95 | epoch: 1 | total time: 69.69m | eta: 110.0m +step 06486/16704 (38.83%) | loss: 2.735424 | lrm: 1.00 | dt: 641.80ms | tok/sec: 816,908 | mfu: 51.06 | epoch: 1 | total time: 69.70m | eta: 110.0m +step 06487/16704 (38.84%) | loss: 2.733962 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 1 | total time: 69.71m | eta: 110.0m +step 06488/16704 (38.84%) | loss: 2.733107 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,286 | mfu: 51.02 | epoch: 1 | total time: 69.72m | eta: 110.0m +step 06489/16704 (38.85%) | loss: 2.731076 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,677 | mfu: 50.73 | epoch: 1 | total time: 69.73m | eta: 109.9m +step 06490/16704 (38.85%) | loss: 2.729692 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,140 | mfu: 50.88 | epoch: 1 | total time: 69.74m | eta: 109.9m +step 06491/16704 (38.86%) | loss: 2.728594 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,844 | mfu: 50.80 | epoch: 1 | total time: 69.75m | eta: 109.9m +step 06492/16704 (38.86%) | loss: 2.731860 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,674 | mfu: 50.73 | epoch: 1 | total time: 69.77m | eta: 109.9m +step 06493/16704 (38.87%) | loss: 2.739489 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,198 | mfu: 50.70 | epoch: 1 | total time: 69.78m | eta: 109.9m +step 06494/16704 (38.88%) | loss: 2.722930 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,138 | mfu: 50.95 | epoch: 1 | total time: 69.79m | eta: 109.9m +step 
06495/16704 (38.88%) | loss: 2.728962 | lrm: 1.00 | dt: 641.76ms | tok/sec: 816,955 | mfu: 51.06 | epoch: 1 | total time: 69.80m | eta: 109.9m +step 06496/16704 (38.89%) | loss: 2.722375 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,355 | mfu: 50.84 | epoch: 1 | total time: 69.81m | eta: 109.9m +step 06497/16704 (38.89%) | loss: 2.718561 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,219 | mfu: 50.95 | epoch: 1 | total time: 69.82m | eta: 109.9m +step 06498/16704 (38.90%) | loss: 2.725082 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,048 | mfu: 50.69 | epoch: 1 | total time: 69.83m | eta: 109.8m +step 06499/16704 (38.91%) | loss: 2.723092 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,298 | mfu: 50.96 | epoch: 1 | total time: 69.84m | eta: 109.8m +Step 06500 | Validation bpb: 0.830462 +step 06500/16704 (38.91%) | loss: 2.715284 | lrm: 1.00 | dt: 640.88ms | tok/sec: 818,074 | mfu: 51.13 | epoch: 1 | total time: 69.85m | eta: 109.8m +step 06501/16704 (38.92%) | loss: 2.726587 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,421 | mfu: 50.65 | epoch: 1 | total time: 69.86m | eta: 109.8m +step 06502/16704 (38.92%) | loss: 2.726422 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,261 | mfu: 50.64 | epoch: 1 | total time: 69.87m | eta: 109.8m +step 06503/16704 (38.93%) | loss: 2.736516 | lrm: 1.00 | dt: 641.96ms | tok/sec: 816,698 | mfu: 51.04 | epoch: 1 | total time: 69.88m | eta: 109.8m +step 06504/16704 (38.94%) | loss: 2.738055 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,350 | mfu: 50.77 | epoch: 1 | total time: 69.89m | eta: 109.8m +step 06505/16704 (38.94%) | loss: 2.733230 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,954 | mfu: 50.75 | epoch: 1 | total time: 69.90m | eta: 109.8m +step 06506/16704 (38.95%) | loss: 2.725396 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,665 | mfu: 51.04 | epoch: 1 | total time: 69.92m | eta: 109.8m +step 06507/16704 (38.95%) | loss: 2.722059 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,542 | mfu: 50.79 | epoch: 1 | total time: 69.93m | eta: 109.7m +step 06508/16704 (38.96%) | 
loss: 2.722910 | lrm: 1.00 | dt: 640.67ms | tok/sec: 818,340 | mfu: 51.15 | epoch: 1 | total time: 69.94m | eta: 109.7m +step 06509/16704 (38.97%) | loss: 2.707248 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 1 | total time: 69.95m | eta: 109.7m +step 06510/16704 (38.97%) | loss: 2.706266 | lrm: 1.00 | dt: 643.26ms | tok/sec: 815,043 | mfu: 50.94 | epoch: 1 | total time: 69.96m | eta: 109.7m +step 06511/16704 (38.98%) | loss: 2.696915 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,326 | mfu: 50.90 | epoch: 1 | total time: 69.97m | eta: 109.7m +step 06512/16704 (38.98%) | loss: 2.704631 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 1 | total time: 69.98m | eta: 109.7m +step 06513/16704 (38.99%) | loss: 2.718969 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 1 | total time: 69.99m | eta: 109.7m +step 06514/16704 (39.00%) | loss: 2.738468 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 1 | total time: 70.00m | eta: 109.7m +step 06515/16704 (39.00%) | loss: 2.739294 | lrm: 1.00 | dt: 642.42ms | tok/sec: 816,115 | mfu: 51.01 | epoch: 1 | total time: 70.01m | eta: 109.7m +step 06516/16704 (39.01%) | loss: 2.728048 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,550 | mfu: 50.85 | epoch: 1 | total time: 70.02m | eta: 109.7m +step 06517/16704 (39.01%) | loss: 2.730454 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,308 | mfu: 50.96 | epoch: 1 | total time: 70.03m | eta: 109.6m +step 06518/16704 (39.02%) | loss: 2.735863 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,471 | mfu: 50.97 | epoch: 1 | total time: 70.04m | eta: 109.6m +step 06519/16704 (39.03%) | loss: 2.737953 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,785 | mfu: 50.68 | epoch: 1 | total time: 70.05m | eta: 109.6m +step 06520/16704 (39.03%) | loss: 2.733972 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,180 | mfu: 50.89 | epoch: 1 | total time: 70.07m | eta: 109.6m +step 06521/16704 (39.04%) | loss: 2.728223 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,306 | 
mfu: 50.90 | epoch: 1 | total time: 70.08m | eta: 109.6m +step 06522/16704 (39.04%) | loss: 2.728185 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,559 | mfu: 50.79 | epoch: 1 | total time: 70.09m | eta: 109.6m +step 06523/16704 (39.05%) | loss: 2.724478 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,816 | mfu: 50.93 | epoch: 1 | total time: 70.10m | eta: 109.6m +step 06524/16704 (39.06%) | loss: 2.738033 | lrm: 1.00 | dt: 642.24ms | tok/sec: 816,346 | mfu: 51.02 | epoch: 1 | total time: 70.11m | eta: 109.6m +step 06525/16704 (39.06%) | loss: 2.731314 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,426 | mfu: 50.78 | epoch: 1 | total time: 70.12m | eta: 109.6m +step 06526/16704 (39.07%) | loss: 2.739748 | lrm: 1.00 | dt: 641.06ms | tok/sec: 817,840 | mfu: 51.12 | epoch: 1 | total time: 70.13m | eta: 109.5m +step 06527/16704 (39.07%) | loss: 2.744994 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,381 | mfu: 50.96 | epoch: 1 | total time: 70.14m | eta: 109.5m +step 06528/16704 (39.08%) | loss: 2.739854 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,379 | mfu: 50.96 | epoch: 1 | total time: 70.15m | eta: 109.5m +step 06529/16704 (39.09%) | loss: 2.747271 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,217 | mfu: 50.95 | epoch: 1 | total time: 70.16m | eta: 109.5m +step 06530/16704 (39.09%) | loss: 2.750223 | lrm: 1.00 | dt: 641.99ms | tok/sec: 816,661 | mfu: 51.04 | epoch: 1 | total time: 70.17m | eta: 109.5m +step 06531/16704 (39.10%) | loss: 2.744428 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,385 | mfu: 50.90 | epoch: 1 | total time: 70.18m | eta: 109.5m +step 06532/16704 (39.10%) | loss: 2.735723 | lrm: 1.00 | dt: 642.72ms | tok/sec: 815,733 | mfu: 50.98 | epoch: 1 | total time: 70.19m | eta: 109.5m +step 06533/16704 (39.11%) | loss: 2.745958 | lrm: 1.00 | dt: 641.91ms | tok/sec: 816,762 | mfu: 51.05 | epoch: 1 | total time: 70.20m | eta: 109.5m +step 06534/16704 (39.12%) | loss: 2.749786 | lrm: 1.00 | dt: 640.75ms | tok/sec: 818,235 | mfu: 51.14 | epoch: 1 | total time: 70.22m | eta: 109.5m +step 
06535/16704 (39.12%) | loss: 2.754257 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,201 | mfu: 51.01 | epoch: 1 | total time: 70.23m | eta: 109.4m +step 06536/16704 (39.13%) | loss: 2.738417 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,166 | mfu: 51.01 | epoch: 1 | total time: 70.24m | eta: 109.4m +step 06537/16704 (39.13%) | loss: 2.746724 | lrm: 1.00 | dt: 642.28ms | tok/sec: 816,286 | mfu: 51.02 | epoch: 1 | total time: 70.25m | eta: 109.4m +step 06538/16704 (39.14%) | loss: 2.736118 | lrm: 1.00 | dt: 641.53ms | tok/sec: 817,249 | mfu: 51.08 | epoch: 1 | total time: 70.26m | eta: 109.4m +step 06539/16704 (39.15%) | loss: 2.720505 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,054 | mfu: 50.82 | epoch: 1 | total time: 70.27m | eta: 109.4m +step 06540/16704 (39.15%) | loss: 2.717733 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,712 | mfu: 50.86 | epoch: 1 | total time: 70.28m | eta: 109.4m +step 06541/16704 (39.16%) | loss: 2.717971 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,389 | mfu: 50.84 | epoch: 1 | total time: 70.29m | eta: 109.4m +step 06542/16704 (39.16%) | loss: 2.730739 | lrm: 1.00 | dt: 641.06ms | tok/sec: 817,842 | mfu: 51.12 | epoch: 1 | total time: 70.30m | eta: 109.4m +step 06543/16704 (39.17%) | loss: 2.724044 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,056 | mfu: 50.82 | epoch: 1 | total time: 70.31m | eta: 109.4m +step 06544/16704 (39.18%) | loss: 2.717836 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,316 | mfu: 51.02 | epoch: 1 | total time: 70.32m | eta: 109.3m +step 06545/16704 (39.18%) | loss: 2.719097 | lrm: 1.00 | dt: 648.06ms | tok/sec: 809,009 | mfu: 50.56 | epoch: 1 | total time: 70.33m | eta: 109.3m +step 06546/16704 (39.19%) | loss: 2.734864 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,814 | mfu: 50.86 | epoch: 1 | total time: 70.34m | eta: 109.3m +step 06547/16704 (39.19%) | loss: 2.725745 | lrm: 1.00 | dt: 641.60ms | tok/sec: 817,160 | mfu: 51.07 | epoch: 1 | total time: 70.36m | eta: 109.3m +step 06548/16704 (39.20%) | loss: 2.726082 | lrm: 1.00 | dt: 
644.08ms | tok/sec: 814,005 | mfu: 50.88 | epoch: 1 | total time: 70.37m | eta: 109.3m +step 06549/16704 (39.21%) | loss: 2.731949 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,612 | mfu: 50.98 | epoch: 1 | total time: 70.38m | eta: 109.3m +step 06550/16704 (39.21%) | loss: 2.733006 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,937 | mfu: 50.87 | epoch: 1 | total time: 70.39m | eta: 109.3m +step 06551/16704 (39.22%) | loss: 2.739802 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,827 | mfu: 50.80 | epoch: 1 | total time: 70.40m | eta: 109.3m +step 06552/16704 (39.22%) | loss: 2.733085 | lrm: 1.00 | dt: 641.76ms | tok/sec: 816,953 | mfu: 51.06 | epoch: 1 | total time: 70.41m | eta: 109.3m +step 06553/16704 (39.23%) | loss: 2.728074 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 1 | total time: 70.42m | eta: 109.3m +step 06554/16704 (39.24%) | loss: 2.735371 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,032 | mfu: 50.88 | epoch: 1 | total time: 70.43m | eta: 109.2m +step 06555/16704 (39.24%) | loss: 2.739688 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,636 | mfu: 50.73 | epoch: 1 | total time: 70.44m | eta: 109.2m +step 06556/16704 (39.25%) | loss: 2.730821 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,788 | mfu: 50.86 | epoch: 1 | total time: 70.45m | eta: 109.2m +step 06557/16704 (39.25%) | loss: 2.726319 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,913 | mfu: 50.87 | epoch: 1 | total time: 70.46m | eta: 109.2m +step 06558/16704 (39.26%) | loss: 2.719296 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,389 | mfu: 50.71 | epoch: 1 | total time: 70.47m | eta: 109.2m +step 06559/16704 (39.27%) | loss: 2.736008 | lrm: 1.00 | dt: 646.66ms | tok/sec: 810,762 | mfu: 50.67 | epoch: 1 | total time: 70.48m | eta: 109.2m +step 06560/16704 (39.27%) | loss: 2.738208 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,748 | mfu: 50.92 | epoch: 1 | total time: 70.49m | eta: 109.2m +step 06561/16704 (39.28%) | loss: 2.726675 | lrm: 1.00 | dt: 642.43ms | tok/sec: 816,106 | mfu: 51.01 | epoch: 1 | total 
time: 70.51m | eta: 109.2m +step 06562/16704 (39.28%) | loss: 2.711251 | lrm: 1.00 | dt: 645.55ms | tok/sec: 812,153 | mfu: 50.76 | epoch: 1 | total time: 70.52m | eta: 109.2m +step 06563/16704 (39.29%) | loss: 2.719401 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,645 | mfu: 50.92 | epoch: 1 | total time: 70.53m | eta: 109.1m +step 06564/16704 (39.30%) | loss: 2.721478 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,853 | mfu: 50.93 | epoch: 1 | total time: 70.54m | eta: 109.1m +step 06565/16704 (39.30%) | loss: 2.717302 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,316 | mfu: 50.83 | epoch: 1 | total time: 70.55m | eta: 109.1m +step 06566/16704 (39.31%) | loss: 2.723861 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,168 | mfu: 50.89 | epoch: 1 | total time: 70.56m | eta: 109.1m +step 06567/16704 (39.31%) | loss: 2.715075 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,187 | mfu: 50.76 | epoch: 1 | total time: 70.57m | eta: 109.1m +step 06568/16704 (39.32%) | loss: 2.712993 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,010 | mfu: 51.00 | epoch: 1 | total time: 70.58m | eta: 109.1m +step 06569/16704 (39.33%) | loss: 2.706451 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 1 | total time: 70.59m | eta: 109.1m +step 06570/16704 (39.33%) | loss: 2.715709 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,221 | mfu: 50.83 | epoch: 1 | total time: 70.60m | eta: 109.1m +step 06571/16704 (39.34%) | loss: 2.713100 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,203 | mfu: 50.89 | epoch: 1 | total time: 70.61m | eta: 109.1m +step 06572/16704 (39.34%) | loss: 2.718193 | lrm: 1.00 | dt: 646.68ms | tok/sec: 810,735 | mfu: 50.67 | epoch: 1 | total time: 70.62m | eta: 109.0m +step 06573/16704 (39.35%) | loss: 2.718925 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,315 | mfu: 50.77 | epoch: 1 | total time: 70.63m | eta: 109.0m +step 06574/16704 (39.36%) | loss: 2.726147 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,423 | mfu: 50.78 | epoch: 1 | total time: 70.65m | eta: 109.0m +step 06575/16704 (39.36%) | loss: 
2.729074 | lrm: 1.00 | dt: 648.37ms | tok/sec: 808,630 | mfu: 50.54 | epoch: 1 | total time: 70.66m | eta: 109.0m +step 06576/16704 (39.37%) | loss: 2.733638 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,743 | mfu: 50.74 | epoch: 1 | total time: 70.67m | eta: 109.0m +step 06577/16704 (39.37%) | loss: 2.727301 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,543 | mfu: 50.72 | epoch: 1 | total time: 70.68m | eta: 109.0m +step 06578/16704 (39.38%) | loss: 2.733408 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,779 | mfu: 50.67 | epoch: 1 | total time: 70.69m | eta: 109.0m +step 06579/16704 (39.39%) | loss: 2.721963 | lrm: 1.00 | dt: 646.93ms | tok/sec: 810,422 | mfu: 50.65 | epoch: 1 | total time: 70.70m | eta: 109.0m +step 06580/16704 (39.39%) | loss: 2.722020 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,405 | mfu: 50.90 | epoch: 1 | total time: 70.71m | eta: 109.0m +step 06581/16704 (39.40%) | loss: 2.711578 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,242 | mfu: 50.70 | epoch: 1 | total time: 70.72m | eta: 108.9m +step 06582/16704 (39.40%) | loss: 2.723293 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 1 | total time: 70.73m | eta: 108.9m +step 06583/16704 (39.41%) | loss: 2.719254 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,612 | mfu: 50.66 | epoch: 1 | total time: 70.74m | eta: 108.9m +step 06584/16704 (39.42%) | loss: 2.714650 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 1 | total time: 70.75m | eta: 108.9m +step 06585/16704 (39.42%) | loss: 2.719161 | lrm: 1.00 | dt: 646.78ms | tok/sec: 810,611 | mfu: 50.66 | epoch: 1 | total time: 70.76m | eta: 108.9m +step 06586/16704 (39.43%) | loss: 2.712742 | lrm: 1.00 | dt: 648.05ms | tok/sec: 809,027 | mfu: 50.57 | epoch: 1 | total time: 70.77m | eta: 108.9m +step 06587/16704 (39.43%) | loss: 2.709761 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,454 | mfu: 50.84 | epoch: 1 | total time: 70.79m | eta: 108.9m +step 06588/16704 (39.44%) | loss: 2.709224 | lrm: 1.00 | dt: 647.52ms | tok/sec: 809,687 | mfu: 
50.61 | epoch: 1 | total time: 70.80m | eta: 108.9m +step 06589/16704 (39.45%) | loss: 2.713454 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 1 | total time: 70.81m | eta: 108.9m +step 06590/16704 (39.45%) | loss: 2.706573 | lrm: 1.00 | dt: 647.86ms | tok/sec: 809,257 | mfu: 50.58 | epoch: 1 | total time: 70.82m | eta: 108.9m +step 06591/16704 (39.46%) | loss: 2.727941 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,920 | mfu: 50.87 | epoch: 1 | total time: 70.83m | eta: 108.8m +step 06592/16704 (39.46%) | loss: 2.717264 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,252 | mfu: 50.77 | epoch: 1 | total time: 70.84m | eta: 108.8m +step 06593/16704 (39.47%) | loss: 2.721837 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,547 | mfu: 50.85 | epoch: 1 | total time: 70.85m | eta: 108.8m +step 06594/16704 (39.48%) | loss: 2.725372 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,864 | mfu: 50.74 | epoch: 1 | total time: 70.86m | eta: 108.8m +step 06595/16704 (39.48%) | loss: 2.727515 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,865 | mfu: 50.68 | epoch: 1 | total time: 70.87m | eta: 108.8m +step 06596/16704 (39.49%) | loss: 2.734188 | lrm: 1.00 | dt: 646.57ms | tok/sec: 810,878 | mfu: 50.68 | epoch: 1 | total time: 70.88m | eta: 108.8m +step 06597/16704 (39.49%) | loss: 2.730865 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,391 | mfu: 50.71 | epoch: 1 | total time: 70.89m | eta: 108.8m +step 06598/16704 (39.50%) | loss: 2.735659 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,705 | mfu: 50.73 | epoch: 1 | total time: 70.90m | eta: 108.8m +step 06599/16704 (39.51%) | loss: 2.734048 | lrm: 1.00 | dt: 647.27ms | tok/sec: 809,999 | mfu: 50.63 | epoch: 1 | total time: 70.91m | eta: 108.8m +step 06600/16704 (39.51%) | loss: 2.744689 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,187 | mfu: 50.76 | epoch: 1 | total time: 70.93m | eta: 108.7m +step 06601/16704 (39.52%) | loss: 2.745206 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,431 | mfu: 50.65 | epoch: 1 | total time: 70.94m | eta: 108.7m +step 
06602/16704 (39.52%) | loss: 2.747540 | lrm: 1.00 | dt: 647.10ms | tok/sec: 810,206 | mfu: 50.64 | epoch: 1 | total time: 70.95m | eta: 108.7m +step 06603/16704 (39.53%) | loss: 2.747721 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,790 | mfu: 50.74 | epoch: 1 | total time: 70.96m | eta: 108.7m +step 06604/16704 (39.54%) | loss: 2.747822 | lrm: 1.00 | dt: 647.06ms | tok/sec: 810,265 | mfu: 50.64 | epoch: 1 | total time: 70.97m | eta: 108.7m +step 06605/16704 (39.54%) | loss: 2.749257 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,949 | mfu: 50.62 | epoch: 1 | total time: 70.98m | eta: 108.7m +step 06606/16704 (39.55%) | loss: 2.744697 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,377 | mfu: 50.77 | epoch: 1 | total time: 70.99m | eta: 108.7m +step 06607/16704 (39.55%) | loss: 2.737601 | lrm: 1.00 | dt: 645.83ms | tok/sec: 811,808 | mfu: 50.74 | epoch: 1 | total time: 71.00m | eta: 108.7m +step 06608/16704 (39.56%) | loss: 2.742026 | lrm: 1.00 | dt: 647.97ms | tok/sec: 809,128 | mfu: 50.57 | epoch: 1 | total time: 71.01m | eta: 108.7m +step 06609/16704 (39.57%) | loss: 2.729189 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,541 | mfu: 50.78 | epoch: 1 | total time: 71.02m | eta: 108.6m +step 06610/16704 (39.57%) | loss: 2.736820 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,004 | mfu: 50.88 | epoch: 1 | total time: 71.03m | eta: 108.6m +step 06611/16704 (39.58%) | loss: 2.752862 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,772 | mfu: 50.55 | epoch: 1 | total time: 71.04m | eta: 108.6m +step 06612/16704 (39.58%) | loss: 2.750381 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,765 | mfu: 50.80 | epoch: 1 | total time: 71.05m | eta: 108.6m +step 06613/16704 (39.59%) | loss: 2.749599 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,654 | mfu: 50.60 | epoch: 1 | total time: 71.07m | eta: 108.6m +step 06614/16704 (39.60%) | loss: 2.738749 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,169 | mfu: 50.95 | epoch: 1 | total time: 71.08m | eta: 108.6m +step 06615/16704 (39.60%) | loss: 2.733617 | lrm: 1.00 | dt: 
645.98ms | tok/sec: 811,620 | mfu: 50.73 | epoch: 1 | total time: 71.09m | eta: 108.6m +step 06616/16704 (39.61%) | loss: 2.726657 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 1 | total time: 71.10m | eta: 108.6m +step 06617/16704 (39.61%) | loss: 2.727579 | lrm: 1.00 | dt: 646.67ms | tok/sec: 810,753 | mfu: 50.67 | epoch: 1 | total time: 71.11m | eta: 108.6m +step 06618/16704 (39.62%) | loss: 2.715022 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,723 | mfu: 50.73 | epoch: 1 | total time: 71.12m | eta: 108.6m +step 06619/16704 (39.63%) | loss: 2.721199 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,237 | mfu: 50.64 | epoch: 1 | total time: 71.13m | eta: 108.5m +step 06620/16704 (39.63%) | loss: 2.716526 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,998 | mfu: 50.75 | epoch: 1 | total time: 71.14m | eta: 108.5m +step 06621/16704 (39.64%) | loss: 2.712049 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,280 | mfu: 50.89 | epoch: 1 | total time: 71.15m | eta: 108.5m +step 06622/16704 (39.64%) | loss: 2.727430 | lrm: 1.00 | dt: 651.28ms | tok/sec: 805,013 | mfu: 50.31 | epoch: 1 | total time: 71.16m | eta: 108.5m +step 06623/16704 (39.65%) | loss: 2.724285 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,444 | mfu: 50.90 | epoch: 1 | total time: 71.17m | eta: 108.5m +step 06624/16704 (39.66%) | loss: 2.723138 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,644 | mfu: 50.67 | epoch: 1 | total time: 71.18m | eta: 108.5m +step 06625/16704 (39.66%) | loss: 2.727728 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,407 | mfu: 50.84 | epoch: 1 | total time: 71.19m | eta: 108.5m +step 06626/16704 (39.67%) | loss: 2.720728 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,255 | mfu: 50.95 | epoch: 1 | total time: 71.20m | eta: 108.5m +step 06627/16704 (39.67%) | loss: 2.718064 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,658 | mfu: 50.67 | epoch: 1 | total time: 71.22m | eta: 108.5m +step 06628/16704 (39.68%) | loss: 2.721300 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,498 | mfu: 50.84 | epoch: 1 | total 
time: 71.23m | eta: 108.4m +step 06629/16704 (39.69%) | loss: 2.723325 | lrm: 1.00 | dt: 647.91ms | tok/sec: 809,204 | mfu: 50.58 | epoch: 1 | total time: 71.24m | eta: 108.4m +step 06630/16704 (39.69%) | loss: 2.722153 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,773 | mfu: 50.67 | epoch: 1 | total time: 71.25m | eta: 108.4m +step 06631/16704 (39.70%) | loss: 2.721076 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,203 | mfu: 50.89 | epoch: 1 | total time: 71.26m | eta: 108.4m +step 06632/16704 (39.70%) | loss: 2.718091 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 1 | total time: 71.27m | eta: 108.4m +step 06633/16704 (39.71%) | loss: 2.723609 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,067 | mfu: 50.69 | epoch: 1 | total time: 71.28m | eta: 108.4m +step 06634/16704 (39.72%) | loss: 2.719933 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,986 | mfu: 50.88 | epoch: 1 | total time: 71.29m | eta: 108.4m +step 06635/16704 (39.72%) | loss: 2.713051 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,776 | mfu: 50.86 | epoch: 1 | total time: 71.30m | eta: 108.4m +step 06636/16704 (39.73%) | loss: 2.720727 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,708 | mfu: 50.80 | epoch: 1 | total time: 71.31m | eta: 108.4m +step 06637/16704 (39.73%) | loss: 2.730052 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 1 | total time: 71.32m | eta: 108.3m +step 06638/16704 (39.74%) | loss: 2.729629 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 1 | total time: 71.33m | eta: 108.3m +step 06639/16704 (39.74%) | loss: 2.738024 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,775 | mfu: 50.99 | epoch: 1 | total time: 71.34m | eta: 108.3m +step 06640/16704 (39.75%) | loss: 2.749648 | lrm: 1.00 | dt: 648.83ms | tok/sec: 808,052 | mfu: 50.50 | epoch: 1 | total time: 71.36m | eta: 108.3m +step 06641/16704 (39.76%) | loss: 2.732478 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,963 | mfu: 51.06 | epoch: 1 | total time: 71.37m | eta: 108.3m +step 06642/16704 (39.76%) | loss: 
2.723873 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,294 | mfu: 50.71 | epoch: 1 | total time: 71.38m | eta: 108.3m +step 06643/16704 (39.77%) | loss: 2.722302 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,606 | mfu: 50.85 | epoch: 1 | total time: 71.39m | eta: 108.3m +step 06644/16704 (39.77%) | loss: 2.729649 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,868 | mfu: 51.06 | epoch: 1 | total time: 71.40m | eta: 108.3m +step 06645/16704 (39.78%) | loss: 2.737158 | lrm: 1.00 | dt: 649.92ms | tok/sec: 806,701 | mfu: 50.42 | epoch: 1 | total time: 71.41m | eta: 108.3m +step 06646/16704 (39.79%) | loss: 2.737221 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,910 | mfu: 51.00 | epoch: 1 | total time: 71.42m | eta: 108.2m +step 06647/16704 (39.79%) | loss: 2.757690 | lrm: 1.00 | dt: 647.62ms | tok/sec: 809,562 | mfu: 50.60 | epoch: 1 | total time: 71.43m | eta: 108.2m +step 06648/16704 (39.80%) | loss: 2.762929 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,549 | mfu: 50.85 | epoch: 1 | total time: 71.44m | eta: 108.2m +step 06649/16704 (39.80%) | loss: 2.771645 | lrm: 1.00 | dt: 642.49ms | tok/sec: 816,027 | mfu: 51.00 | epoch: 1 | total time: 71.45m | eta: 108.2m +step 06650/16704 (39.81%) | loss: 2.757108 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,026 | mfu: 50.88 | epoch: 1 | total time: 71.46m | eta: 108.2m +step 06651/16704 (39.82%) | loss: 2.750530 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 1 | total time: 71.47m | eta: 108.2m +step 06652/16704 (39.82%) | loss: 2.753731 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,726 | mfu: 51.05 | epoch: 1 | total time: 71.48m | eta: 108.2m +step 06653/16704 (39.83%) | loss: 2.758304 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,724 | mfu: 50.67 | epoch: 1 | total time: 71.50m | eta: 108.2m +step 06654/16704 (39.83%) | loss: 2.751584 | lrm: 1.00 | dt: 641.67ms | tok/sec: 817,068 | mfu: 51.07 | epoch: 1 | total time: 71.51m | eta: 108.2m +step 06655/16704 (39.84%) | loss: 2.750123 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,425 | mfu: 
50.90 | epoch: 1 | total time: 71.52m | eta: 108.2m +step 06656/16704 (39.85%) | loss: 2.735450 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 1 | total time: 71.53m | eta: 108.1m +step 06657/16704 (39.85%) | loss: 2.736319 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,568 | mfu: 50.91 | epoch: 1 | total time: 71.54m | eta: 108.1m +step 06658/16704 (39.86%) | loss: 2.728018 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,597 | mfu: 50.91 | epoch: 1 | total time: 71.55m | eta: 108.1m +step 06659/16704 (39.86%) | loss: 2.725000 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,795 | mfu: 50.93 | epoch: 1 | total time: 71.56m | eta: 108.1m +step 06660/16704 (39.87%) | loss: 2.733354 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,009 | mfu: 50.88 | epoch: 1 | total time: 71.57m | eta: 108.1m +step 06661/16704 (39.88%) | loss: 2.738578 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,848 | mfu: 50.93 | epoch: 1 | total time: 71.58m | eta: 108.1m +step 06662/16704 (39.88%) | loss: 2.744000 | lrm: 1.00 | dt: 641.57ms | tok/sec: 817,199 | mfu: 51.08 | epoch: 1 | total time: 71.59m | eta: 108.1m +step 06663/16704 (39.89%) | loss: 2.732870 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,211 | mfu: 50.95 | epoch: 1 | total time: 71.60m | eta: 108.1m +step 06664/16704 (39.89%) | loss: 2.744826 | lrm: 1.00 | dt: 641.92ms | tok/sec: 816,745 | mfu: 51.05 | epoch: 1 | total time: 71.61m | eta: 108.1m +step 06665/16704 (39.90%) | loss: 2.748030 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,196 | mfu: 50.83 | epoch: 1 | total time: 71.62m | eta: 108.0m +step 06666/16704 (39.91%) | loss: 2.749163 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,522 | mfu: 50.91 | epoch: 1 | total time: 71.63m | eta: 108.0m +step 06667/16704 (39.91%) | loss: 2.751941 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 1 | total time: 71.65m | eta: 108.0m +step 06668/16704 (39.92%) | loss: 2.752820 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,432 | mfu: 50.78 | epoch: 1 | total time: 71.66m | eta: 108.0m +step 
06669/16704 (39.92%) | loss: 2.763815 | lrm: 1.00 | dt: 641.23ms | tok/sec: 817,625 | mfu: 51.10 | epoch: 1 | total time: 71.67m | eta: 108.0m +step 06670/16704 (39.93%) | loss: 2.762892 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,039 | mfu: 50.75 | epoch: 1 | total time: 71.68m | eta: 108.0m +step 06671/16704 (39.94%) | loss: 2.774365 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,549 | mfu: 51.04 | epoch: 1 | total time: 71.69m | eta: 108.0m +step 06672/16704 (39.94%) | loss: 2.772500 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,198 | mfu: 50.95 | epoch: 1 | total time: 71.70m | eta: 108.0m +step 06673/16704 (39.95%) | loss: 2.770018 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 1 | total time: 71.71m | eta: 108.0m +step 06674/16704 (39.95%) | loss: 2.758002 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,451 | mfu: 50.97 | epoch: 1 | total time: 71.72m | eta: 107.9m +step 06675/16704 (39.96%) | loss: 2.754771 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 1 | total time: 71.73m | eta: 107.9m +step 06676/16704 (39.97%) | loss: 2.751521 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,776 | mfu: 50.99 | epoch: 1 | total time: 71.74m | eta: 107.9m +step 06677/16704 (39.97%) | loss: 2.751717 | lrm: 1.00 | dt: 642.10ms | tok/sec: 816,523 | mfu: 51.03 | epoch: 1 | total time: 71.75m | eta: 107.9m +step 06678/16704 (39.98%) | loss: 2.744070 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,139 | mfu: 50.82 | epoch: 1 | total time: 71.76m | eta: 107.9m +step 06679/16704 (39.98%) | loss: 2.723277 | lrm: 1.00 | dt: 640.67ms | tok/sec: 818,344 | mfu: 51.15 | epoch: 1 | total time: 71.77m | eta: 107.9m +step 06680/16704 (39.99%) | loss: 2.728620 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,042 | mfu: 50.75 | epoch: 1 | total time: 71.78m | eta: 107.9m +step 06681/16704 (40.00%) | loss: 2.731185 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,506 | mfu: 50.97 | epoch: 1 | total time: 71.80m | eta: 107.9m +step 06682/16704 (40.00%) | loss: 2.734198 | lrm: 1.00 | dt: 
643.50ms | tok/sec: 814,748 | mfu: 50.92 | epoch: 1 | total time: 71.81m | eta: 107.9m +step 06683/16704 (40.01%) | loss: 2.734616 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,989 | mfu: 50.88 | epoch: 1 | total time: 71.82m | eta: 107.8m +step 06684/16704 (40.01%) | loss: 2.729380 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,208 | mfu: 50.89 | epoch: 1 | total time: 71.83m | eta: 107.8m +step 06685/16704 (40.02%) | loss: 2.736055 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,817 | mfu: 50.86 | epoch: 1 | total time: 71.84m | eta: 107.8m +step 06686/16704 (40.03%) | loss: 2.728810 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,057 | mfu: 50.94 | epoch: 1 | total time: 71.85m | eta: 107.8m +step 06687/16704 (40.03%) | loss: 2.733167 | lrm: 1.00 | dt: 640.79ms | tok/sec: 818,193 | mfu: 51.14 | epoch: 1 | total time: 71.86m | eta: 107.8m +step 06688/16704 (40.04%) | loss: 2.736199 | lrm: 1.00 | dt: 642.27ms | tok/sec: 816,299 | mfu: 51.02 | epoch: 1 | total time: 71.87m | eta: 107.8m +step 06689/16704 (40.04%) | loss: 2.741497 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,196 | mfu: 50.89 | epoch: 1 | total time: 71.88m | eta: 107.8m +step 06690/16704 (40.05%) | loss: 2.749239 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,501 | mfu: 50.72 | epoch: 1 | total time: 71.89m | eta: 107.8m +step 06691/16704 (40.06%) | loss: 2.761928 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,744 | mfu: 50.99 | epoch: 1 | total time: 71.90m | eta: 107.8m +step 06692/16704 (40.06%) | loss: 2.764527 | lrm: 1.00 | dt: 642.52ms | tok/sec: 815,981 | mfu: 51.00 | epoch: 1 | total time: 71.91m | eta: 107.8m +step 06693/16704 (40.07%) | loss: 2.767950 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,162 | mfu: 50.95 | epoch: 1 | total time: 71.92m | eta: 107.7m +step 06694/16704 (40.07%) | loss: 2.759152 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,638 | mfu: 50.98 | epoch: 1 | total time: 71.93m | eta: 107.7m +step 06695/16704 (40.08%) | loss: 2.751782 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,472 | mfu: 50.97 | epoch: 1 | total 
time: 71.95m | eta: 107.7m +step 06696/16704 (40.09%) | loss: 2.743498 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,274 | mfu: 50.83 | epoch: 1 | total time: 71.96m | eta: 107.7m +step 06697/16704 (40.09%) | loss: 2.733376 | lrm: 1.00 | dt: 641.25ms | tok/sec: 817,600 | mfu: 51.10 | epoch: 1 | total time: 71.97m | eta: 107.7m +step 06698/16704 (40.10%) | loss: 2.736365 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,557 | mfu: 50.85 | epoch: 1 | total time: 71.98m | eta: 107.7m +step 06699/16704 (40.10%) | loss: 2.718985 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,931 | mfu: 50.87 | epoch: 1 | total time: 71.99m | eta: 107.7m +step 06700/16704 (40.11%) | loss: 2.708662 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,184 | mfu: 50.76 | epoch: 1 | total time: 72.00m | eta: 107.7m +step 06701/16704 (40.12%) | loss: 2.713382 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,262 | mfu: 50.83 | epoch: 1 | total time: 72.01m | eta: 107.7m +step 06702/16704 (40.12%) | loss: 2.718458 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | total time: 72.02m | eta: 107.6m +step 06703/16704 (40.13%) | loss: 2.713590 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,808 | mfu: 50.86 | epoch: 1 | total time: 72.03m | eta: 107.6m +step 06704/16704 (40.13%) | loss: 2.712529 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,504 | mfu: 50.91 | epoch: 1 | total time: 72.04m | eta: 107.6m +step 06705/16704 (40.14%) | loss: 2.710378 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,957 | mfu: 50.94 | epoch: 1 | total time: 72.05m | eta: 107.6m +step 06706/16704 (40.15%) | loss: 2.715824 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,145 | mfu: 50.89 | epoch: 1 | total time: 72.06m | eta: 107.6m +step 06707/16704 (40.15%) | loss: 2.725078 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,092 | mfu: 50.88 | epoch: 1 | total time: 72.07m | eta: 107.6m +step 06708/16704 (40.16%) | loss: 2.722648 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 1 | total time: 72.09m | eta: 107.6m +step 06709/16704 (40.16%) | loss: 
2.722260 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,259 | mfu: 51.02 | epoch: 1 | total time: 72.10m | eta: 107.6m +step 06710/16704 (40.17%) | loss: 2.724868 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,254 | mfu: 50.95 | epoch: 1 | total time: 72.11m | eta: 107.6m +step 06711/16704 (40.18%) | loss: 2.722888 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,578 | mfu: 50.79 | epoch: 1 | total time: 72.12m | eta: 107.5m +step 06712/16704 (40.18%) | loss: 2.723538 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,341 | mfu: 50.96 | epoch: 1 | total time: 72.13m | eta: 107.5m +step 06713/16704 (40.19%) | loss: 2.711316 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,461 | mfu: 50.97 | epoch: 1 | total time: 72.14m | eta: 107.5m +step 06714/16704 (40.19%) | loss: 2.717133 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,972 | mfu: 50.94 | epoch: 1 | total time: 72.15m | eta: 107.5m +step 06715/16704 (40.20%) | loss: 2.710266 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,248 | mfu: 50.95 | epoch: 1 | total time: 72.16m | eta: 107.5m +step 06716/16704 (40.21%) | loss: 2.712302 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,328 | mfu: 50.71 | epoch: 1 | total time: 72.17m | eta: 107.5m +step 06717/16704 (40.21%) | loss: 2.717118 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,920 | mfu: 50.93 | epoch: 1 | total time: 72.18m | eta: 107.5m +step 06718/16704 (40.22%) | loss: 2.701962 | lrm: 1.00 | dt: 641.52ms | tok/sec: 817,254 | mfu: 51.08 | epoch: 1 | total time: 72.19m | eta: 107.5m +step 06719/16704 (40.22%) | loss: 2.710158 | lrm: 1.00 | dt: 647.81ms | tok/sec: 809,328 | mfu: 50.58 | epoch: 1 | total time: 72.20m | eta: 107.5m +step 06720/16704 (40.23%) | loss: 2.708506 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,936 | mfu: 50.87 | epoch: 1 | total time: 72.21m | eta: 107.4m +step 06721/16704 (40.24%) | loss: 2.704666 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,204 | mfu: 50.70 | epoch: 1 | total time: 72.22m | eta: 107.4m +step 06722/16704 (40.24%) | loss: 2.699135 | lrm: 1.00 | dt: 642.05ms | tok/sec: 816,582 | mfu: 
51.04 | epoch: 1 | total time: 72.24m | eta: 107.4m +step 06723/16704 (40.25%) | loss: 2.697195 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,608 | mfu: 50.73 | epoch: 1 | total time: 72.25m | eta: 107.4m +step 06724/16704 (40.25%) | loss: 2.695193 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,764 | mfu: 50.80 | epoch: 1 | total time: 72.26m | eta: 107.4m +step 06725/16704 (40.26%) | loss: 2.683348 | lrm: 1.00 | dt: 642.58ms | tok/sec: 815,910 | mfu: 51.00 | epoch: 1 | total time: 72.27m | eta: 107.4m +step 06726/16704 (40.27%) | loss: 2.683203 | lrm: 1.00 | dt: 648.25ms | tok/sec: 808,776 | mfu: 50.55 | epoch: 1 | total time: 72.28m | eta: 107.4m +step 06727/16704 (40.27%) | loss: 2.680015 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,152 | mfu: 51.01 | epoch: 1 | total time: 72.29m | eta: 107.4m +step 06728/16704 (40.28%) | loss: 2.683341 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,318 | mfu: 50.83 | epoch: 1 | total time: 72.30m | eta: 107.4m +step 06729/16704 (40.28%) | loss: 2.686088 | lrm: 1.00 | dt: 642.52ms | tok/sec: 815,988 | mfu: 51.00 | epoch: 1 | total time: 72.31m | eta: 107.4m +step 06730/16704 (40.29%) | loss: 2.692388 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,702 | mfu: 50.92 | epoch: 1 | total time: 72.32m | eta: 107.3m +step 06731/16704 (40.30%) | loss: 2.693399 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,867 | mfu: 50.87 | epoch: 1 | total time: 72.33m | eta: 107.3m +step 06732/16704 (40.30%) | loss: 2.703005 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,352 | mfu: 50.90 | epoch: 1 | total time: 72.34m | eta: 107.3m +step 06733/16704 (40.31%) | loss: 2.698444 | lrm: 1.00 | dt: 642.74ms | tok/sec: 815,705 | mfu: 50.98 | epoch: 1 | total time: 72.35m | eta: 107.3m +step 06734/16704 (40.31%) | loss: 2.701240 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,252 | mfu: 50.95 | epoch: 1 | total time: 72.36m | eta: 107.3m +step 06735/16704 (40.32%) | loss: 2.708587 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,120 | mfu: 50.88 | epoch: 1 | total time: 72.37m | eta: 107.3m +step 
06736/16704 (40.33%) | loss: 2.712648 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,269 | mfu: 50.89 | epoch: 1 | total time: 72.39m | eta: 107.3m +step 06737/16704 (40.33%) | loss: 2.714771 | lrm: 1.00 | dt: 640.99ms | tok/sec: 817,939 | mfu: 51.12 | epoch: 1 | total time: 72.40m | eta: 107.3m +step 06738/16704 (40.34%) | loss: 2.724785 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,032 | mfu: 50.94 | epoch: 1 | total time: 72.41m | eta: 107.3m +step 06739/16704 (40.34%) | loss: 2.737011 | lrm: 1.00 | dt: 641.71ms | tok/sec: 817,015 | mfu: 51.06 | epoch: 1 | total time: 72.42m | eta: 107.2m +step 06740/16704 (40.35%) | loss: 2.737153 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,441 | mfu: 50.97 | epoch: 1 | total time: 72.43m | eta: 107.2m +step 06741/16704 (40.36%) | loss: 2.735191 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,409 | mfu: 50.90 | epoch: 1 | total time: 72.44m | eta: 107.2m +step 06742/16704 (40.36%) | loss: 2.738131 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 1 | total time: 72.45m | eta: 107.2m +step 06743/16704 (40.37%) | loss: 2.734838 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,151 | mfu: 50.82 | epoch: 1 | total time: 72.46m | eta: 107.2m +step 06744/16704 (40.37%) | loss: 2.733242 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 1 | total time: 72.47m | eta: 107.2m +step 06745/16704 (40.38%) | loss: 2.725747 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,157 | mfu: 51.01 | epoch: 1 | total time: 72.48m | eta: 107.2m +step 06746/16704 (40.39%) | loss: 2.713820 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,981 | mfu: 50.81 | epoch: 1 | total time: 72.49m | eta: 107.2m +step 06747/16704 (40.39%) | loss: 2.714017 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,697 | mfu: 50.98 | epoch: 1 | total time: 72.50m | eta: 107.2m +step 06748/16704 (40.40%) | loss: 2.716664 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,689 | mfu: 50.98 | epoch: 1 | total time: 72.51m | eta: 107.1m +step 06749/16704 (40.40%) | loss: 2.724355 | lrm: 1.00 | dt: 
644.30ms | tok/sec: 813,732 | mfu: 50.86 | epoch: 1 | total time: 72.52m | eta: 107.1m +Step 06750 | Validation bpb: 0.829086 +step 06750/16704 (40.41%) | loss: 2.729222 | lrm: 1.00 | dt: 646.91ms | tok/sec: 810,449 | mfu: 50.65 | epoch: 1 | total time: 72.54m | eta: 107.1m +step 06751/16704 (40.42%) | loss: 2.724434 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,210 | mfu: 50.76 | epoch: 1 | total time: 72.55m | eta: 107.1m +step 06752/16704 (40.42%) | loss: 2.723576 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,331 | mfu: 50.71 | epoch: 1 | total time: 72.56m | eta: 107.1m +step 06753/16704 (40.43%) | loss: 2.717646 | lrm: 1.00 | dt: 640.72ms | tok/sec: 818,282 | mfu: 51.14 | epoch: 1 | total time: 72.57m | eta: 107.1m +step 06754/16704 (40.43%) | loss: 2.721954 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,207 | mfu: 50.83 | epoch: 1 | total time: 72.58m | eta: 107.1m +step 06755/16704 (40.44%) | loss: 2.727863 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,928 | mfu: 50.87 | epoch: 1 | total time: 72.59m | eta: 107.1m +step 06756/16704 (40.45%) | loss: 2.714226 | lrm: 1.00 | dt: 640.80ms | tok/sec: 818,176 | mfu: 51.14 | epoch: 1 | total time: 72.60m | eta: 107.1m +step 06757/16704 (40.45%) | loss: 2.716706 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,711 | mfu: 50.73 | epoch: 1 | total time: 72.61m | eta: 107.0m +step 06758/16704 (40.46%) | loss: 2.709638 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,185 | mfu: 50.95 | epoch: 1 | total time: 72.62m | eta: 107.0m +step 06759/16704 (40.46%) | loss: 2.703558 | lrm: 1.00 | dt: 642.00ms | tok/sec: 816,642 | mfu: 51.04 | epoch: 1 | total time: 72.63m | eta: 107.0m +step 06760/16704 (40.47%) | loss: 2.704413 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,827 | mfu: 50.87 | epoch: 1 | total time: 72.64m | eta: 107.0m +step 06761/16704 (40.48%) | loss: 2.688121 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,005 | mfu: 50.81 | epoch: 1 | total time: 72.65m | eta: 107.0m +step 06762/16704 (40.48%) | loss: 2.701498 | lrm: 1.00 | dt: 646.29ms | tok/sec: 
811,230 | mfu: 50.70 | epoch: 1 | total time: 72.66m | eta: 107.0m +step 06763/16704 (40.49%) | loss: 2.703370 | lrm: 1.00 | dt: 642.44ms | tok/sec: 816,089 | mfu: 51.01 | epoch: 1 | total time: 72.68m | eta: 107.0m +step 06764/16704 (40.49%) | loss: 2.711833 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,435 | mfu: 51.03 | epoch: 1 | total time: 72.69m | eta: 107.0m +step 06765/16704 (40.50%) | loss: 2.708908 | lrm: 1.00 | dt: 643.73ms | tok/sec: 814,447 | mfu: 50.90 | epoch: 1 | total time: 72.70m | eta: 107.0m +step 06766/16704 (40.51%) | loss: 2.720153 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,946 | mfu: 50.87 | epoch: 1 | total time: 72.71m | eta: 107.0m +step 06767/16704 (40.51%) | loss: 2.713112 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,376 | mfu: 50.84 | epoch: 1 | total time: 72.72m | eta: 106.9m +step 06768/16704 (40.52%) | loss: 2.703676 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,249 | mfu: 50.83 | epoch: 1 | total time: 72.73m | eta: 106.9m +step 06769/16704 (40.52%) | loss: 2.702562 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,148 | mfu: 50.89 | epoch: 1 | total time: 72.74m | eta: 106.9m +step 06770/16704 (40.53%) | loss: 2.707548 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,781 | mfu: 50.92 | epoch: 1 | total time: 72.75m | eta: 106.9m +step 06771/16704 (40.54%) | loss: 2.712407 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,245 | mfu: 50.95 | epoch: 1 | total time: 72.76m | eta: 106.9m +step 06772/16704 (40.54%) | loss: 2.716666 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 1 | total time: 72.77m | eta: 106.9m +step 06773/16704 (40.55%) | loss: 2.708876 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 1 | total time: 72.78m | eta: 106.9m +step 06774/16704 (40.55%) | loss: 2.716606 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,912 | mfu: 50.93 | epoch: 1 | total time: 72.79m | eta: 106.9m +step 06775/16704 (40.56%) | loss: 2.718231 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,891 | mfu: 50.81 | epoch: 1 | total time: 72.80m | eta: 
106.9m +step 06776/16704 (40.57%) | loss: 2.709529 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,406 | mfu: 50.78 | epoch: 1 | total time: 72.81m | eta: 106.8m +step 06777/16704 (40.57%) | loss: 2.722875 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,168 | mfu: 50.70 | epoch: 1 | total time: 72.83m | eta: 106.8m +step 06778/16704 (40.58%) | loss: 2.723534 | lrm: 1.00 | dt: 643.95ms | tok/sec: 814,170 | mfu: 50.89 | epoch: 1 | total time: 72.84m | eta: 106.8m +step 06779/16704 (40.58%) | loss: 2.714760 | lrm: 1.00 | dt: 646.24ms | tok/sec: 811,285 | mfu: 50.71 | epoch: 1 | total time: 72.85m | eta: 106.8m +step 06780/16704 (40.59%) | loss: 2.712242 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,563 | mfu: 50.91 | epoch: 1 | total time: 72.86m | eta: 106.8m +step 06781/16704 (40.60%) | loss: 2.714592 | lrm: 1.00 | dt: 646.05ms | tok/sec: 811,525 | mfu: 50.72 | epoch: 1 | total time: 72.87m | eta: 106.8m +step 06782/16704 (40.60%) | loss: 2.699837 | lrm: 1.00 | dt: 644.10ms | tok/sec: 813,986 | mfu: 50.88 | epoch: 1 | total time: 72.88m | eta: 106.8m +step 06783/16704 (40.61%) | loss: 2.701770 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,939 | mfu: 50.75 | epoch: 1 | total time: 72.89m | eta: 106.8m +step 06784/16704 (40.61%) | loss: 2.697459 | lrm: 1.00 | dt: 641.86ms | tok/sec: 816,825 | mfu: 51.05 | epoch: 1 | total time: 72.90m | eta: 106.8m +step 06785/16704 (40.62%) | loss: 2.678461 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,546 | mfu: 50.97 | epoch: 1 | total time: 72.91m | eta: 106.7m +step 06786/16704 (40.62%) | loss: 2.690755 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,497 | mfu: 50.84 | epoch: 1 | total time: 72.92m | eta: 106.7m +step 06787/16704 (40.63%) | loss: 2.691747 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,985 | mfu: 50.81 | epoch: 1 | total time: 72.93m | eta: 106.7m +step 06788/16704 (40.64%) | loss: 2.694176 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,155 | mfu: 50.95 | epoch: 1 | total time: 72.94m | eta: 106.7m +step 06789/16704 (40.64%) | loss: 2.680687 | lrm: 1.00 
| dt: 642.84ms | tok/sec: 815,579 | mfu: 50.97 | epoch: 1 | total time: 72.95m | eta: 106.7m +step 06790/16704 (40.65%) | loss: 2.687474 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 1 | total time: 72.97m | eta: 106.7m +step 06791/16704 (40.65%) | loss: 2.694635 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,841 | mfu: 50.99 | epoch: 1 | total time: 72.98m | eta: 106.7m +step 06792/16704 (40.66%) | loss: 2.700914 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,544 | mfu: 50.97 | epoch: 1 | total time: 72.99m | eta: 106.7m +step 06793/16704 (40.67%) | loss: 2.687010 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,656 | mfu: 50.92 | epoch: 1 | total time: 73.00m | eta: 106.7m +step 06794/16704 (40.67%) | loss: 2.693835 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,548 | mfu: 50.91 | epoch: 1 | total time: 73.01m | eta: 106.6m +step 06795/16704 (40.68%) | loss: 2.691465 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,913 | mfu: 50.87 | epoch: 1 | total time: 73.02m | eta: 106.6m +step 06796/16704 (40.68%) | loss: 2.678835 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,135 | mfu: 50.95 | epoch: 1 | total time: 73.03m | eta: 106.6m +step 06797/16704 (40.69%) | loss: 2.694222 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,944 | mfu: 50.94 | epoch: 1 | total time: 73.04m | eta: 106.6m +step 06798/16704 (40.70%) | loss: 2.698502 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 1 | total time: 73.05m | eta: 106.6m +step 06799/16704 (40.70%) | loss: 2.694653 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,056 | mfu: 50.88 | epoch: 1 | total time: 73.06m | eta: 106.6m +step 06800/16704 (40.71%) | loss: 2.695515 | lrm: 1.00 | dt: 643.35ms | tok/sec: 814,931 | mfu: 50.93 | epoch: 1 | total time: 73.07m | eta: 106.6m +step 06801/16704 (40.71%) | loss: 2.683511 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,207 | mfu: 51.01 | epoch: 1 | total time: 73.08m | eta: 106.6m +step 06802/16704 (40.72%) | loss: 2.686371 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,606 | mfu: 50.85 | epoch: 1 | 
total time: 73.09m | eta: 106.6m +step 06803/16704 (40.73%) | loss: 2.681594 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,192 | mfu: 50.83 | epoch: 1 | total time: 73.10m | eta: 106.6m +step 06804/16704 (40.73%) | loss: 2.693413 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,860 | mfu: 50.87 | epoch: 1 | total time: 73.12m | eta: 106.5m +step 06805/16704 (40.74%) | loss: 2.694714 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,264 | mfu: 50.89 | epoch: 1 | total time: 73.13m | eta: 106.5m +step 06806/16704 (40.74%) | loss: 2.694420 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,529 | mfu: 50.78 | epoch: 1 | total time: 73.14m | eta: 106.5m +step 06807/16704 (40.75%) | loss: 2.687968 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,944 | mfu: 50.87 | epoch: 1 | total time: 73.15m | eta: 106.5m +step 06808/16704 (40.76%) | loss: 2.682452 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 1 | total time: 73.16m | eta: 106.5m +step 06809/16704 (40.76%) | loss: 2.703323 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,606 | mfu: 50.79 | epoch: 1 | total time: 73.17m | eta: 106.5m +step 06810/16704 (40.77%) | loss: 2.700658 | lrm: 1.00 | dt: 647.57ms | tok/sec: 809,618 | mfu: 50.60 | epoch: 1 | total time: 73.18m | eta: 106.5m +step 06811/16704 (40.77%) | loss: 2.717209 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,632 | mfu: 50.92 | epoch: 1 | total time: 73.19m | eta: 106.5m +step 06812/16704 (40.78%) | loss: 2.716070 | lrm: 1.00 | dt: 646.58ms | tok/sec: 810,867 | mfu: 50.68 | epoch: 1 | total time: 73.20m | eta: 106.5m +step 06813/16704 (40.79%) | loss: 2.715155 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,241 | mfu: 51.02 | epoch: 1 | total time: 73.21m | eta: 106.4m +step 06814/16704 (40.79%) | loss: 2.721854 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,731 | mfu: 50.80 | epoch: 1 | total time: 73.22m | eta: 106.4m +step 06815/16704 (40.80%) | loss: 2.728185 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,893 | mfu: 50.93 | epoch: 1 | total time: 73.23m | eta: 106.4m +step 06816/16704 (40.80%) | 
loss: 2.724415 | lrm: 1.00 | dt: 645.19ms | tok/sec: 812,612 | mfu: 50.79 | epoch: 1 | total time: 73.24m | eta: 106.4m +step 06817/16704 (40.81%) | loss: 2.726624 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,991 | mfu: 50.81 | epoch: 1 | total time: 73.25m | eta: 106.4m +step 06818/16704 (40.82%) | loss: 2.731972 | lrm: 1.00 | dt: 642.76ms | tok/sec: 815,682 | mfu: 50.98 | epoch: 1 | total time: 73.27m | eta: 106.4m +step 06819/16704 (40.82%) | loss: 2.729044 | lrm: 1.00 | dt: 649.20ms | tok/sec: 807,595 | mfu: 50.48 | epoch: 1 | total time: 73.28m | eta: 106.4m +step 06820/16704 (40.83%) | loss: 2.733878 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,527 | mfu: 50.97 | epoch: 1 | total time: 73.29m | eta: 106.4m +step 06821/16704 (40.83%) | loss: 2.732145 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,524 | mfu: 50.66 | epoch: 1 | total time: 73.30m | eta: 106.4m +step 06822/16704 (40.84%) | loss: 2.728879 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,168 | mfu: 50.89 | epoch: 1 | total time: 73.31m | eta: 106.3m +step 06823/16704 (40.85%) | loss: 2.735151 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,222 | mfu: 50.95 | epoch: 1 | total time: 73.32m | eta: 106.3m +step 06824/16704 (40.85%) | loss: 2.735394 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,171 | mfu: 50.64 | epoch: 1 | total time: 73.33m | eta: 106.3m +step 06825/16704 (40.86%) | loss: 2.748135 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 1 | total time: 73.34m | eta: 106.3m +step 06826/16704 (40.86%) | loss: 2.756461 | lrm: 1.00 | dt: 646.53ms | tok/sec: 810,929 | mfu: 50.68 | epoch: 1 | total time: 73.35m | eta: 106.3m +step 06827/16704 (40.87%) | loss: 2.753887 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,300 | mfu: 50.83 | epoch: 1 | total time: 73.36m | eta: 106.3m +step 06828/16704 (40.88%) | loss: 2.743661 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,583 | mfu: 50.98 | epoch: 1 | total time: 73.37m | eta: 106.3m +step 06829/16704 (40.88%) | loss: 2.750820 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,370 | 
mfu: 50.77 | epoch: 1 | total time: 73.38m | eta: 106.3m +step 06830/16704 (40.89%) | loss: 2.749096 | lrm: 1.00 | dt: 649.78ms | tok/sec: 806,874 | mfu: 50.43 | epoch: 1 | total time: 73.39m | eta: 106.3m +step 06831/16704 (40.89%) | loss: 2.755066 | lrm: 1.00 | dt: 644.64ms | tok/sec: 813,302 | mfu: 50.83 | epoch: 1 | total time: 73.41m | eta: 106.3m +step 06832/16704 (40.90%) | loss: 2.745848 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,779 | mfu: 50.86 | epoch: 1 | total time: 73.42m | eta: 106.2m +step 06833/16704 (40.91%) | loss: 2.749327 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.87 | epoch: 1 | total time: 73.43m | eta: 106.2m +step 06834/16704 (40.91%) | loss: 2.757032 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,051 | mfu: 50.88 | epoch: 1 | total time: 73.44m | eta: 106.2m +step 06835/16704 (40.92%) | loss: 2.758236 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,487 | mfu: 50.72 | epoch: 1 | total time: 73.45m | eta: 106.2m +step 06836/16704 (40.92%) | loss: 2.750287 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,498 | mfu: 50.78 | epoch: 1 | total time: 73.46m | eta: 106.2m +step 06837/16704 (40.93%) | loss: 2.758237 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,937 | mfu: 50.68 | epoch: 1 | total time: 73.47m | eta: 106.2m +step 06838/16704 (40.94%) | loss: 2.767632 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,445 | mfu: 50.78 | epoch: 1 | total time: 73.48m | eta: 106.2m +step 06839/16704 (40.94%) | loss: 2.760922 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,325 | mfu: 50.83 | epoch: 1 | total time: 73.49m | eta: 106.2m +step 06840/16704 (40.95%) | loss: 2.758990 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,178 | mfu: 50.76 | epoch: 1 | total time: 73.50m | eta: 106.2m +step 06841/16704 (40.95%) | loss: 2.765009 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,544 | mfu: 50.97 | epoch: 1 | total time: 73.51m | eta: 106.1m +step 06842/16704 (40.96%) | loss: 2.754687 | lrm: 1.00 | dt: 647.35ms | tok/sec: 809,895 | mfu: 50.62 | epoch: 1 | total time: 73.52m | eta: 106.1m +step 
06843/16704 (40.97%) | loss: 2.744025 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,193 | mfu: 51.01 | epoch: 1 | total time: 73.53m | eta: 106.1m +step 06844/16704 (40.97%) | loss: 2.738477 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,376 | mfu: 50.77 | epoch: 1 | total time: 73.55m | eta: 106.1m +step 06845/16704 (40.98%) | loss: 2.730989 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,682 | mfu: 50.79 | epoch: 1 | total time: 73.56m | eta: 106.1m +step 06846/16704 (40.98%) | loss: 2.728489 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,732 | mfu: 50.92 | epoch: 1 | total time: 73.57m | eta: 106.1m +step 06847/16704 (40.99%) | loss: 2.729834 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,303 | mfu: 50.71 | epoch: 1 | total time: 73.58m | eta: 106.1m +step 06848/16704 (41.00%) | loss: 2.726728 | lrm: 1.00 | dt: 647.50ms | tok/sec: 809,713 | mfu: 50.61 | epoch: 1 | total time: 73.59m | eta: 106.1m +step 06849/16704 (41.00%) | loss: 2.719611 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,417 | mfu: 50.96 | epoch: 1 | total time: 73.60m | eta: 106.1m +step 06850/16704 (41.01%) | loss: 2.713161 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,088 | mfu: 50.82 | epoch: 1 | total time: 73.61m | eta: 106.0m +step 06851/16704 (41.01%) | loss: 2.712368 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,370 | mfu: 50.90 | epoch: 1 | total time: 73.62m | eta: 106.0m +step 06852/16704 (41.02%) | loss: 2.703200 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 1 | total time: 73.63m | eta: 106.0m +step 06853/16704 (41.03%) | loss: 2.700903 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,445 | mfu: 50.97 | epoch: 1 | total time: 73.64m | eta: 106.0m +step 06854/16704 (41.03%) | loss: 2.693404 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,426 | mfu: 50.84 | epoch: 1 | total time: 73.65m | eta: 106.0m +step 06855/16704 (41.04%) | loss: 2.706363 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,711 | mfu: 50.92 | epoch: 1 | total time: 73.66m | eta: 106.0m +step 06856/16704 (41.04%) | loss: 2.703544 | lrm: 1.00 | dt: 
645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 1 | total time: 73.67m | eta: 106.0m +step 06857/16704 (41.05%) | loss: 2.714379 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,659 | mfu: 50.85 | epoch: 1 | total time: 73.68m | eta: 106.0m +step 06858/16704 (41.06%) | loss: 2.719794 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,571 | mfu: 50.60 | epoch: 1 | total time: 73.70m | eta: 106.0m +step 06859/16704 (41.06%) | loss: 2.722757 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,318 | mfu: 51.02 | epoch: 1 | total time: 73.71m | eta: 105.9m +step 06860/16704 (41.07%) | loss: 2.713320 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 1 | total time: 73.72m | eta: 105.9m +step 06861/16704 (41.07%) | loss: 2.701721 | lrm: 1.00 | dt: 643.62ms | tok/sec: 814,587 | mfu: 50.91 | epoch: 1 | total time: 73.73m | eta: 105.9m +step 06862/16704 (41.08%) | loss: 2.721692 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,739 | mfu: 50.92 | epoch: 1 | total time: 73.74m | eta: 105.9m +step 06863/16704 (41.09%) | loss: 2.718483 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,132 | mfu: 51.01 | epoch: 1 | total time: 73.75m | eta: 105.9m +step 06864/16704 (41.09%) | loss: 2.724098 | lrm: 1.00 | dt: 646.03ms | tok/sec: 811,551 | mfu: 50.72 | epoch: 1 | total time: 73.76m | eta: 105.9m +step 06865/16704 (41.10%) | loss: 2.735932 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,919 | mfu: 50.93 | epoch: 1 | total time: 73.77m | eta: 105.9m +step 06866/16704 (41.10%) | loss: 2.736468 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,539 | mfu: 50.60 | epoch: 1 | total time: 73.78m | eta: 105.9m +step 06867/16704 (41.11%) | loss: 2.741690 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,279 | mfu: 50.96 | epoch: 1 | total time: 73.79m | eta: 105.9m +step 06868/16704 (41.12%) | loss: 2.731128 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 1 | total time: 73.80m | eta: 105.9m +step 06869/16704 (41.12%) | loss: 2.734373 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,116 | mfu: 50.95 | epoch: 1 | total 
time: 73.81m | eta: 105.8m +step 06870/16704 (41.13%) | loss: 2.742385 | lrm: 1.00 | dt: 642.68ms | tok/sec: 815,783 | mfu: 50.99 | epoch: 1 | total time: 73.82m | eta: 105.8m +step 06871/16704 (41.13%) | loss: 2.735028 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,745 | mfu: 50.74 | epoch: 1 | total time: 73.84m | eta: 105.8m +step 06872/16704 (41.14%) | loss: 2.752780 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,855 | mfu: 50.93 | epoch: 1 | total time: 73.85m | eta: 105.8m +step 06873/16704 (41.15%) | loss: 2.730459 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,174 | mfu: 50.95 | epoch: 1 | total time: 73.86m | eta: 105.8m +step 06874/16704 (41.15%) | loss: 2.729919 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,140 | mfu: 50.70 | epoch: 1 | total time: 73.87m | eta: 105.8m +step 06875/16704 (41.16%) | loss: 2.736620 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,480 | mfu: 50.78 | epoch: 1 | total time: 73.88m | eta: 105.8m +step 06876/16704 (41.16%) | loss: 2.718295 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,673 | mfu: 50.92 | epoch: 1 | total time: 73.89m | eta: 105.8m +step 06877/16704 (41.17%) | loss: 2.715568 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,369 | mfu: 50.77 | epoch: 1 | total time: 73.90m | eta: 105.8m +step 06878/16704 (41.18%) | loss: 2.722267 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,316 | mfu: 50.96 | epoch: 1 | total time: 73.91m | eta: 105.7m +step 06879/16704 (41.18%) | loss: 2.722298 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 1 | total time: 73.92m | eta: 105.7m +step 06880/16704 (41.19%) | loss: 2.726370 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 1 | total time: 73.93m | eta: 105.7m +step 06881/16704 (41.19%) | loss: 2.736221 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,952 | mfu: 50.81 | epoch: 1 | total time: 73.94m | eta: 105.7m +step 06882/16704 (41.20%) | loss: 2.737470 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 1 | total time: 73.95m | eta: 105.7m +step 06883/16704 (41.21%) | loss: 
2.743505 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,378 | mfu: 50.71 | epoch: 1 | total time: 73.96m | eta: 105.7m +step 06884/16704 (41.21%) | loss: 2.749719 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,974 | mfu: 50.94 | epoch: 1 | total time: 73.97m | eta: 105.7m +step 06885/16704 (41.22%) | loss: 2.753276 | lrm: 1.00 | dt: 645.91ms | tok/sec: 811,709 | mfu: 50.73 | epoch: 1 | total time: 73.99m | eta: 105.7m +step 06886/16704 (41.22%) | loss: 2.754398 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,970 | mfu: 51.00 | epoch: 1 | total time: 74.00m | eta: 105.7m +step 06887/16704 (41.23%) | loss: 2.750924 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,677 | mfu: 50.79 | epoch: 1 | total time: 74.01m | eta: 105.6m +step 06888/16704 (41.24%) | loss: 2.737688 | lrm: 1.00 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 1 | total time: 74.02m | eta: 105.6m +step 06889/16704 (41.24%) | loss: 2.741080 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,499 | mfu: 51.03 | epoch: 1 | total time: 74.03m | eta: 105.6m +step 06890/16704 (41.25%) | loss: 2.739141 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,702 | mfu: 50.67 | epoch: 1 | total time: 74.04m | eta: 105.6m +step 06891/16704 (41.25%) | loss: 2.749032 | lrm: 1.00 | dt: 642.20ms | tok/sec: 816,396 | mfu: 51.03 | epoch: 1 | total time: 74.05m | eta: 105.6m +step 06892/16704 (41.26%) | loss: 2.746231 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,106 | mfu: 50.70 | epoch: 1 | total time: 74.06m | eta: 105.6m +step 06893/16704 (41.27%) | loss: 2.737326 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,695 | mfu: 50.98 | epoch: 1 | total time: 74.07m | eta: 105.6m +step 06894/16704 (41.27%) | loss: 2.736780 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,520 | mfu: 50.85 | epoch: 1 | total time: 74.08m | eta: 105.6m +step 06895/16704 (41.28%) | loss: 2.729587 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,880 | mfu: 50.93 | epoch: 1 | total time: 74.09m | eta: 105.6m +step 06896/16704 (41.28%) | loss: 2.731358 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,025 | mfu: 
50.75 | epoch: 1 | total time: 74.10m | eta: 105.5m +step 06897/16704 (41.29%) | loss: 2.730495 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,263 | mfu: 50.83 | epoch: 1 | total time: 74.11m | eta: 105.5m +step 06898/16704 (41.30%) | loss: 2.729982 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,855 | mfu: 50.99 | epoch: 1 | total time: 74.13m | eta: 105.5m +step 06899/16704 (41.30%) | loss: 2.733387 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 1 | total time: 74.14m | eta: 105.5m +step 06900/16704 (41.31%) | loss: 2.722832 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 1 | total time: 74.15m | eta: 105.5m +step 06901/16704 (41.31%) | loss: 2.723957 | lrm: 1.00 | dt: 647.25ms | tok/sec: 810,028 | mfu: 50.63 | epoch: 1 | total time: 74.16m | eta: 105.5m +step 06902/16704 (41.32%) | loss: 2.722267 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,339 | mfu: 50.90 | epoch: 1 | total time: 74.17m | eta: 105.5m +step 06903/16704 (41.33%) | loss: 2.727212 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 1 | total time: 74.18m | eta: 105.5m +step 06904/16704 (41.33%) | loss: 2.732643 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,883 | mfu: 50.81 | epoch: 1 | total time: 74.19m | eta: 105.5m +step 06905/16704 (41.34%) | loss: 2.727156 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 1 | total time: 74.20m | eta: 105.5m +step 06906/16704 (41.34%) | loss: 2.735679 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,416 | mfu: 50.65 | epoch: 1 | total time: 74.21m | eta: 105.4m +step 06907/16704 (41.35%) | loss: 2.735976 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,296 | mfu: 50.96 | epoch: 1 | total time: 74.22m | eta: 105.4m +step 06908/16704 (41.36%) | loss: 2.732113 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,136 | mfu: 50.88 | epoch: 1 | total time: 74.23m | eta: 105.4m +step 06909/16704 (41.36%) | loss: 2.736256 | lrm: 1.00 | dt: 642.85ms | tok/sec: 815,563 | mfu: 50.97 | epoch: 1 | total time: 74.24m | eta: 105.4m +step 
06910/16704 (41.37%) | loss: 2.739465 | lrm: 1.00 | dt: 641.54ms | tok/sec: 817,227 | mfu: 51.08 | epoch: 1 | total time: 74.25m | eta: 105.4m +step 06911/16704 (41.37%) | loss: 2.751205 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,818 | mfu: 50.80 | epoch: 1 | total time: 74.26m | eta: 105.4m +step 06912/16704 (41.38%) | loss: 2.744659 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 1 | total time: 74.28m | eta: 105.4m +step 06913/16704 (41.39%) | loss: 2.739296 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,782 | mfu: 50.86 | epoch: 1 | total time: 74.29m | eta: 105.4m +step 06914/16704 (41.39%) | loss: 2.736553 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,241 | mfu: 50.77 | epoch: 1 | total time: 74.30m | eta: 105.4m +step 06915/16704 (41.40%) | loss: 2.736789 | lrm: 1.00 | dt: 646.79ms | tok/sec: 810,597 | mfu: 50.66 | epoch: 1 | total time: 74.31m | eta: 105.3m +step 06916/16704 (41.40%) | loss: 2.733779 | lrm: 1.00 | dt: 641.31ms | tok/sec: 817,523 | mfu: 51.10 | epoch: 1 | total time: 74.32m | eta: 105.3m +step 06917/16704 (41.41%) | loss: 2.731009 | lrm: 1.00 | dt: 642.13ms | tok/sec: 816,487 | mfu: 51.03 | epoch: 1 | total time: 74.33m | eta: 105.3m +step 06918/16704 (41.42%) | loss: 2.731166 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,083 | mfu: 50.94 | epoch: 1 | total time: 74.34m | eta: 105.3m +step 06919/16704 (41.42%) | loss: 2.719520 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,564 | mfu: 50.85 | epoch: 1 | total time: 74.35m | eta: 105.3m +step 06920/16704 (41.43%) | loss: 2.715236 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,265 | mfu: 50.89 | epoch: 1 | total time: 74.36m | eta: 105.3m +step 06921/16704 (41.43%) | loss: 2.709566 | lrm: 1.00 | dt: 646.85ms | tok/sec: 810,523 | mfu: 50.66 | epoch: 1 | total time: 74.37m | eta: 105.3m +step 06922/16704 (41.44%) | loss: 2.702034 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,015 | mfu: 50.94 | epoch: 1 | total time: 74.38m | eta: 105.3m +step 06923/16704 (41.45%) | loss: 2.693604 | lrm: 1.00 | dt: 
645.12ms | tok/sec: 812,695 | mfu: 50.79 | epoch: 1 | total time: 74.39m | eta: 105.3m +step 06924/16704 (41.45%) | loss: 2.701808 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 1 | total time: 74.40m | eta: 105.2m +step 06925/16704 (41.46%) | loss: 2.710963 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,860 | mfu: 51.05 | epoch: 1 | total time: 74.42m | eta: 105.2m +step 06926/16704 (41.46%) | loss: 2.722688 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,283 | mfu: 50.96 | epoch: 1 | total time: 74.43m | eta: 105.2m +step 06927/16704 (41.47%) | loss: 2.733625 | lrm: 1.00 | dt: 643.27ms | tok/sec: 815,032 | mfu: 50.94 | epoch: 1 | total time: 74.44m | eta: 105.2m +step 06928/16704 (41.48%) | loss: 2.730379 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,907 | mfu: 50.68 | epoch: 1 | total time: 74.45m | eta: 105.2m +step 06929/16704 (41.48%) | loss: 2.733580 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,164 | mfu: 50.95 | epoch: 1 | total time: 74.46m | eta: 105.2m +step 06930/16704 (41.49%) | loss: 2.731875 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,606 | mfu: 50.85 | epoch: 1 | total time: 74.47m | eta: 105.2m +step 06931/16704 (41.49%) | loss: 2.729953 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,175 | mfu: 50.95 | epoch: 1 | total time: 74.48m | eta: 105.2m +step 06932/16704 (41.50%) | loss: 2.727211 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,389 | mfu: 50.90 | epoch: 1 | total time: 74.49m | eta: 105.2m +step 06933/16704 (41.51%) | loss: 2.714319 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 1 | total time: 74.50m | eta: 105.1m +step 06934/16704 (41.51%) | loss: 2.713529 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,439 | mfu: 50.84 | epoch: 1 | total time: 74.51m | eta: 105.1m +step 06935/16704 (41.52%) | loss: 2.705520 | lrm: 1.00 | dt: 642.29ms | tok/sec: 816,277 | mfu: 51.02 | epoch: 1 | total time: 74.52m | eta: 105.1m +step 06936/16704 (41.52%) | loss: 2.717244 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,966 | mfu: 50.81 | epoch: 1 | total 
time: 74.53m | eta: 105.1m +step 06937/16704 (41.53%) | loss: 2.713467 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,026 | mfu: 50.88 | epoch: 1 | total time: 74.54m | eta: 105.1m +step 06938/16704 (41.53%) | loss: 2.728694 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,142 | mfu: 50.82 | epoch: 1 | total time: 74.55m | eta: 105.1m +step 06939/16704 (41.54%) | loss: 2.726211 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,393 | mfu: 50.71 | epoch: 1 | total time: 74.57m | eta: 105.1m +step 06940/16704 (41.55%) | loss: 2.726513 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 1 | total time: 74.58m | eta: 105.1m +step 06941/16704 (41.55%) | loss: 2.712528 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,142 | mfu: 50.89 | epoch: 1 | total time: 74.59m | eta: 105.1m +step 06942/16704 (41.56%) | loss: 2.698358 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,638 | mfu: 50.85 | epoch: 1 | total time: 74.60m | eta: 105.1m +step 06943/16704 (41.56%) | loss: 2.694462 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,188 | mfu: 50.76 | epoch: 1 | total time: 74.61m | eta: 105.0m +step 06944/16704 (41.57%) | loss: 2.701087 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,401 | mfu: 50.96 | epoch: 1 | total time: 74.62m | eta: 105.0m +step 06945/16704 (41.58%) | loss: 2.699121 | lrm: 1.00 | dt: 645.23ms | tok/sec: 812,556 | mfu: 50.79 | epoch: 1 | total time: 74.63m | eta: 105.0m +step 06946/16704 (41.58%) | loss: 2.698542 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,309 | mfu: 50.83 | epoch: 1 | total time: 74.64m | eta: 105.0m +step 06947/16704 (41.59%) | loss: 2.694735 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,407 | mfu: 50.65 | epoch: 1 | total time: 74.65m | eta: 105.0m +step 06948/16704 (41.59%) | loss: 2.704792 | lrm: 1.00 | dt: 641.79ms | tok/sec: 816,914 | mfu: 51.06 | epoch: 1 | total time: 74.66m | eta: 105.0m +step 06949/16704 (41.60%) | loss: 2.699683 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,482 | mfu: 50.72 | epoch: 1 | total time: 74.67m | eta: 105.0m +step 06950/16704 (41.61%) | loss: 
2.705851 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,239 | mfu: 50.77 | epoch: 1 | total time: 74.68m | eta: 105.0m +step 06951/16704 (41.61%) | loss: 2.722626 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,999 | mfu: 50.75 | epoch: 1 | total time: 74.69m | eta: 105.0m +step 06952/16704 (41.62%) | loss: 2.712312 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,470 | mfu: 50.72 | epoch: 1 | total time: 74.71m | eta: 104.9m +step 06953/16704 (41.62%) | loss: 2.717224 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,636 | mfu: 50.92 | epoch: 1 | total time: 74.72m | eta: 104.9m +step 06954/16704 (41.63%) | loss: 2.731207 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,817 | mfu: 50.99 | epoch: 1 | total time: 74.73m | eta: 104.9m +step 06955/16704 (41.64%) | loss: 2.734417 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,128 | mfu: 50.88 | epoch: 1 | total time: 74.74m | eta: 104.9m +step 06956/16704 (41.64%) | loss: 2.731701 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,640 | mfu: 50.92 | epoch: 1 | total time: 74.75m | eta: 104.9m +step 06957/16704 (41.65%) | loss: 2.726987 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,917 | mfu: 51.00 | epoch: 1 | total time: 74.76m | eta: 104.9m +step 06958/16704 (41.65%) | loss: 2.722571 | lrm: 1.00 | dt: 645.32ms | tok/sec: 812,449 | mfu: 50.78 | epoch: 1 | total time: 74.77m | eta: 104.9m +step 06959/16704 (41.66%) | loss: 2.723718 | lrm: 1.00 | dt: 643.37ms | tok/sec: 814,907 | mfu: 50.93 | epoch: 1 | total time: 74.78m | eta: 104.9m +step 06960/16704 (41.67%) | loss: 2.737332 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,613 | mfu: 50.73 | epoch: 1 | total time: 74.79m | eta: 104.9m +step 06961/16704 (41.67%) | loss: 2.727842 | lrm: 1.00 | dt: 642.42ms | tok/sec: 816,110 | mfu: 51.01 | epoch: 1 | total time: 74.80m | eta: 104.8m +step 06962/16704 (41.68%) | loss: 2.722324 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,356 | mfu: 50.90 | epoch: 1 | total time: 74.81m | eta: 104.8m +step 06963/16704 (41.68%) | loss: 2.723157 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,526 | mfu: 
50.85 | epoch: 1 | total time: 74.82m | eta: 104.8m +step 06964/16704 (41.69%) | loss: 2.723798 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,087 | mfu: 50.88 | epoch: 1 | total time: 74.83m | eta: 104.8m +step 06965/16704 (41.70%) | loss: 2.710881 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 1 | total time: 74.84m | eta: 104.8m +step 06966/16704 (41.70%) | loss: 2.733053 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,551 | mfu: 50.85 | epoch: 1 | total time: 74.86m | eta: 104.8m +step 06967/16704 (41.71%) | loss: 2.734627 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,011 | mfu: 50.88 | epoch: 1 | total time: 74.87m | eta: 104.8m +step 06968/16704 (41.71%) | loss: 2.735359 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,481 | mfu: 50.84 | epoch: 1 | total time: 74.88m | eta: 104.8m +step 06969/16704 (41.72%) | loss: 2.725002 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 1 | total time: 74.89m | eta: 104.8m +step 06970/16704 (41.73%) | loss: 2.723685 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,724 | mfu: 50.92 | epoch: 1 | total time: 74.90m | eta: 104.8m +step 06971/16704 (41.73%) | loss: 2.714830 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,399 | mfu: 50.84 | epoch: 1 | total time: 74.91m | eta: 104.7m +step 06972/16704 (41.74%) | loss: 2.708454 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,096 | mfu: 50.82 | epoch: 1 | total time: 74.92m | eta: 104.7m +step 06973/16704 (41.74%) | loss: 2.718714 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,817 | mfu: 50.80 | epoch: 1 | total time: 74.93m | eta: 104.7m +step 06974/16704 (41.75%) | loss: 2.718950 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,621 | mfu: 50.79 | epoch: 1 | total time: 74.94m | eta: 104.7m +step 06975/16704 (41.76%) | loss: 2.698034 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,896 | mfu: 50.93 | epoch: 1 | total time: 74.95m | eta: 104.7m +step 06976/16704 (41.76%) | loss: 2.704849 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,122 | mfu: 50.82 | epoch: 1 | total time: 74.96m | eta: 104.7m +step 
06977/16704 (41.77%) | loss: 2.716223 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,753 | mfu: 50.74 | epoch: 1 | total time: 74.97m | eta: 104.7m +step 06978/16704 (41.77%) | loss: 2.714947 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,461 | mfu: 50.78 | epoch: 1 | total time: 74.98m | eta: 104.7m +step 06979/16704 (41.78%) | loss: 2.720564 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,604 | mfu: 50.85 | epoch: 1 | total time: 75.00m | eta: 104.7m +step 06980/16704 (41.79%) | loss: 2.725818 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,639 | mfu: 50.85 | epoch: 1 | total time: 75.01m | eta: 104.6m +step 06981/16704 (41.79%) | loss: 2.726239 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 1 | total time: 75.02m | eta: 104.6m +step 06982/16704 (41.80%) | loss: 2.711026 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 1 | total time: 75.03m | eta: 104.6m +step 06983/16704 (41.80%) | loss: 2.715942 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,019 | mfu: 50.69 | epoch: 1 | total time: 75.04m | eta: 104.6m +step 06984/16704 (41.81%) | loss: 2.712973 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,289 | mfu: 50.83 | epoch: 1 | total time: 75.05m | eta: 104.6m +step 06985/16704 (41.82%) | loss: 2.707956 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,486 | mfu: 50.72 | epoch: 1 | total time: 75.06m | eta: 104.6m +step 06986/16704 (41.82%) | loss: 2.723744 | lrm: 1.00 | dt: 647.67ms | tok/sec: 809,502 | mfu: 50.60 | epoch: 1 | total time: 75.07m | eta: 104.6m +step 06987/16704 (41.83%) | loss: 2.719308 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,145 | mfu: 50.95 | epoch: 1 | total time: 75.08m | eta: 104.6m +step 06988/16704 (41.83%) | loss: 2.727077 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,928 | mfu: 51.00 | epoch: 1 | total time: 75.09m | eta: 104.6m +step 06989/16704 (41.84%) | loss: 2.720759 | lrm: 1.00 | dt: 645.83ms | tok/sec: 811,808 | mfu: 50.74 | epoch: 1 | total time: 75.10m | eta: 104.5m +step 06990/16704 (41.85%) | loss: 2.712289 | lrm: 1.00 | dt: 
642.82ms | tok/sec: 815,609 | mfu: 50.98 | epoch: 1 | total time: 75.11m | eta: 104.5m +step 06991/16704 (41.85%) | loss: 2.700617 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,476 | mfu: 50.84 | epoch: 1 | total time: 75.12m | eta: 104.5m +step 06992/16704 (41.86%) | loss: 2.691737 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,386 | mfu: 50.84 | epoch: 1 | total time: 75.13m | eta: 104.5m +step 06993/16704 (41.86%) | loss: 2.689640 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,478 | mfu: 50.91 | epoch: 1 | total time: 75.15m | eta: 104.5m +step 06994/16704 (41.87%) | loss: 2.691824 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,108 | mfu: 50.95 | epoch: 1 | total time: 75.16m | eta: 104.5m +step 06995/16704 (41.88%) | loss: 2.709872 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,753 | mfu: 50.92 | epoch: 1 | total time: 75.17m | eta: 104.5m +step 06996/16704 (41.88%) | loss: 2.715280 | lrm: 1.00 | dt: 641.61ms | tok/sec: 817,148 | mfu: 51.07 | epoch: 1 | total time: 75.18m | eta: 104.5m +step 06997/16704 (41.89%) | loss: 2.713310 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,379 | mfu: 50.96 | epoch: 1 | total time: 75.19m | eta: 104.5m +step 06998/16704 (41.89%) | loss: 2.722618 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,265 | mfu: 51.02 | epoch: 1 | total time: 75.20m | eta: 104.4m +step 06999/16704 (41.90%) | loss: 2.735415 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,030 | mfu: 50.82 | epoch: 1 | total time: 75.21m | eta: 104.4m +Step 07000 | Validation bpb: 0.827505 +step 07000/16704 (41.91%) | loss: 2.734109 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,979 | mfu: 50.81 | epoch: 1 | total time: 75.22m | eta: 104.4m +step 07001/16704 (41.91%) | loss: 2.754594 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,565 | mfu: 50.66 | epoch: 1 | total time: 75.23m | eta: 104.4m +step 07002/16704 (41.92%) | loss: 2.748083 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,536 | mfu: 50.66 | epoch: 1 | total time: 75.24m | eta: 104.4m +step 07003/16704 (41.92%) | loss: 2.742305 | lrm: 1.00 | dt: 644.37ms | tok/sec: 
813,639 | mfu: 50.85 | epoch: 1 | total time: 75.25m | eta: 104.4m +step 07004/16704 (41.93%) | loss: 2.734615 | lrm: 1.00 | dt: 642.51ms | tok/sec: 815,994 | mfu: 51.00 | epoch: 1 | total time: 75.26m | eta: 104.4m +step 07005/16704 (41.94%) | loss: 2.722028 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,580 | mfu: 50.91 | epoch: 1 | total time: 75.27m | eta: 104.4m +step 07006/16704 (41.94%) | loss: 2.726674 | lrm: 1.00 | dt: 642.42ms | tok/sec: 816,110 | mfu: 51.01 | epoch: 1 | total time: 75.29m | eta: 104.4m +step 07007/16704 (41.95%) | loss: 2.733981 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,176 | mfu: 50.95 | epoch: 1 | total time: 75.30m | eta: 104.4m +step 07008/16704 (41.95%) | loss: 2.732630 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,231 | mfu: 50.77 | epoch: 1 | total time: 75.31m | eta: 104.3m +step 07009/16704 (41.96%) | loss: 2.740724 | lrm: 1.00 | dt: 645.46ms | tok/sec: 812,273 | mfu: 50.77 | epoch: 1 | total time: 75.32m | eta: 104.3m +step 07010/16704 (41.97%) | loss: 2.736930 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,683 | mfu: 50.92 | epoch: 1 | total time: 75.33m | eta: 104.3m +step 07011/16704 (41.97%) | loss: 2.746303 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,568 | mfu: 50.79 | epoch: 1 | total time: 75.34m | eta: 104.3m +step 07012/16704 (41.98%) | loss: 2.752596 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,441 | mfu: 50.84 | epoch: 1 | total time: 75.35m | eta: 104.3m +step 07013/16704 (41.98%) | loss: 2.755898 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,273 | mfu: 50.71 | epoch: 1 | total time: 75.36m | eta: 104.3m +step 07014/16704 (41.99%) | loss: 2.750066 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,131 | mfu: 50.95 | epoch: 1 | total time: 75.37m | eta: 104.3m +step 07015/16704 (42.00%) | loss: 2.745007 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,925 | mfu: 51.00 | epoch: 1 | total time: 75.38m | eta: 104.3m +step 07016/16704 (42.00%) | loss: 2.735516 | lrm: 1.00 | dt: 643.86ms | tok/sec: 814,283 | mfu: 50.89 | epoch: 1 | total time: 75.39m | eta: 
104.3m +step 07017/16704 (42.01%) | loss: 2.723115 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,246 | mfu: 50.95 | epoch: 1 | total time: 75.40m | eta: 104.2m +step 07018/16704 (42.01%) | loss: 2.700483 | lrm: 1.00 | dt: 645.21ms | tok/sec: 812,589 | mfu: 50.79 | epoch: 1 | total time: 75.41m | eta: 104.2m +step 07019/16704 (42.02%) | loss: 2.705789 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,239 | mfu: 50.77 | epoch: 1 | total time: 75.42m | eta: 104.2m +step 07020/16704 (42.03%) | loss: 2.705704 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,473 | mfu: 50.91 | epoch: 1 | total time: 75.44m | eta: 104.2m +step 07021/16704 (42.03%) | loss: 2.718240 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,849 | mfu: 50.87 | epoch: 1 | total time: 75.45m | eta: 104.2m +step 07022/16704 (42.04%) | loss: 2.729289 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,310 | mfu: 50.90 | epoch: 1 | total time: 75.46m | eta: 104.2m +step 07023/16704 (42.04%) | loss: 2.721497 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,578 | mfu: 50.85 | epoch: 1 | total time: 75.47m | eta: 104.2m +step 07024/16704 (42.05%) | loss: 2.717965 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,917 | mfu: 50.62 | epoch: 1 | total time: 75.48m | eta: 104.2m +step 07025/16704 (42.06%) | loss: 2.707372 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,777 | mfu: 50.80 | epoch: 1 | total time: 75.49m | eta: 104.2m +step 07026/16704 (42.06%) | loss: 2.705993 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,240 | mfu: 50.89 | epoch: 1 | total time: 75.50m | eta: 104.1m +step 07027/16704 (42.07%) | loss: 2.695625 | lrm: 1.00 | dt: 641.34ms | tok/sec: 817,489 | mfu: 51.09 | epoch: 1 | total time: 75.51m | eta: 104.1m +step 07028/16704 (42.07%) | loss: 2.708015 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,691 | mfu: 50.79 | epoch: 1 | total time: 75.52m | eta: 104.1m +step 07029/16704 (42.08%) | loss: 2.706459 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,348 | mfu: 50.96 | epoch: 1 | total time: 75.53m | eta: 104.1m +step 07030/16704 (42.09%) | loss: 2.713981 | lrm: 1.00 
| dt: 644.52ms | tok/sec: 813,456 | mfu: 50.84 | epoch: 1 | total time: 75.54m | eta: 104.1m +step 07031/16704 (42.09%) | loss: 2.705339 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 1 | total time: 75.55m | eta: 104.1m +step 07032/16704 (42.10%) | loss: 2.712061 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,359 | mfu: 50.84 | epoch: 1 | total time: 75.56m | eta: 104.1m +step 07033/16704 (42.10%) | loss: 2.724622 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,395 | mfu: 50.90 | epoch: 1 | total time: 75.57m | eta: 104.1m +step 07034/16704 (42.11%) | loss: 2.728750 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,655 | mfu: 50.85 | epoch: 1 | total time: 75.59m | eta: 104.1m +step 07035/16704 (42.12%) | loss: 2.719755 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,406 | mfu: 50.96 | epoch: 1 | total time: 75.60m | eta: 104.0m +step 07036/16704 (42.12%) | loss: 2.725538 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,661 | mfu: 50.92 | epoch: 1 | total time: 75.61m | eta: 104.0m +step 07037/16704 (42.13%) | loss: 2.729116 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,913 | mfu: 50.81 | epoch: 1 | total time: 75.62m | eta: 104.0m +step 07038/16704 (42.13%) | loss: 2.731875 | lrm: 1.00 | dt: 642.50ms | tok/sec: 816,016 | mfu: 51.00 | epoch: 1 | total time: 75.63m | eta: 104.0m +step 07039/16704 (42.14%) | loss: 2.739639 | lrm: 1.00 | dt: 647.13ms | tok/sec: 810,171 | mfu: 50.64 | epoch: 1 | total time: 75.64m | eta: 104.0m +step 07040/16704 (42.15%) | loss: 2.731983 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,785 | mfu: 50.93 | epoch: 1 | total time: 75.65m | eta: 104.0m +step 07041/16704 (42.15%) | loss: 2.741413 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,243 | mfu: 50.70 | epoch: 1 | total time: 75.66m | eta: 104.0m +step 07042/16704 (42.16%) | loss: 2.742686 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,050 | mfu: 50.88 | epoch: 1 | total time: 75.67m | eta: 104.0m +step 07043/16704 (42.16%) | loss: 2.742328 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,461 | mfu: 50.90 | epoch: 1 | 
total time: 75.68m | eta: 104.0m +step 07044/16704 (42.17%) | loss: 2.734573 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,164 | mfu: 50.89 | epoch: 1 | total time: 75.69m | eta: 104.0m +step 07045/16704 (42.18%) | loss: 2.727071 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,743 | mfu: 50.86 | epoch: 1 | total time: 75.70m | eta: 103.9m +step 07046/16704 (42.18%) | loss: 2.727498 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,277 | mfu: 50.89 | epoch: 1 | total time: 75.71m | eta: 103.9m +step 07047/16704 (42.19%) | loss: 2.728001 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,286 | mfu: 50.83 | epoch: 1 | total time: 75.73m | eta: 103.9m +step 07048/16704 (42.19%) | loss: 2.724764 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,647 | mfu: 50.98 | epoch: 1 | total time: 75.74m | eta: 103.9m +step 07049/16704 (42.20%) | loss: 2.734496 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 1 | total time: 75.75m | eta: 103.9m +step 07050/16704 (42.21%) | loss: 2.733240 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,667 | mfu: 50.79 | epoch: 1 | total time: 75.76m | eta: 103.9m +step 07051/16704 (42.21%) | loss: 2.730446 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,270 | mfu: 50.96 | epoch: 1 | total time: 75.77m | eta: 103.9m +step 07052/16704 (42.22%) | loss: 2.744257 | lrm: 1.00 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 1 | total time: 75.78m | eta: 103.9m +step 07053/16704 (42.22%) | loss: 2.744609 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 1 | total time: 75.79m | eta: 103.9m +step 07054/16704 (42.23%) | loss: 2.744325 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,085 | mfu: 50.82 | epoch: 1 | total time: 75.80m | eta: 103.8m +step 07055/16704 (42.24%) | loss: 2.742376 | lrm: 1.00 | dt: 642.76ms | tok/sec: 815,682 | mfu: 50.98 | epoch: 1 | total time: 75.81m | eta: 103.8m +step 07056/16704 (42.24%) | loss: 2.730747 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,639 | mfu: 50.92 | epoch: 1 | total time: 75.82m | eta: 103.8m +step 07057/16704 (42.25%) | 
loss: 2.725312 | lrm: 1.00 | dt: 641.94ms | tok/sec: 816,725 | mfu: 51.05 | epoch: 1 | total time: 75.83m | eta: 103.8m +step 07058/16704 (42.25%) | loss: 2.719124 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,664 | mfu: 50.92 | epoch: 1 | total time: 75.84m | eta: 103.8m +step 07059/16704 (42.26%) | loss: 2.724727 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 1 | total time: 75.85m | eta: 103.8m +step 07060/16704 (42.27%) | loss: 2.723362 | lrm: 1.00 | dt: 645.94ms | tok/sec: 811,669 | mfu: 50.73 | epoch: 1 | total time: 75.86m | eta: 103.8m +step 07061/16704 (42.27%) | loss: 2.724938 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,947 | mfu: 51.00 | epoch: 1 | total time: 75.88m | eta: 103.8m +step 07062/16704 (42.28%) | loss: 2.731134 | lrm: 1.00 | dt: 644.89ms | tok/sec: 812,992 | mfu: 50.81 | epoch: 1 | total time: 75.89m | eta: 103.8m +step 07063/16704 (42.28%) | loss: 2.716153 | lrm: 1.00 | dt: 641.64ms | tok/sec: 817,112 | mfu: 51.07 | epoch: 1 | total time: 75.90m | eta: 103.7m +step 07064/16704 (42.29%) | loss: 2.718993 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,739 | mfu: 50.92 | epoch: 1 | total time: 75.91m | eta: 103.7m +step 07065/16704 (42.30%) | loss: 2.726255 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,379 | mfu: 50.77 | epoch: 1 | total time: 75.92m | eta: 103.7m +step 07066/16704 (42.30%) | loss: 2.731446 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,197 | mfu: 50.95 | epoch: 1 | total time: 75.93m | eta: 103.7m +step 07067/16704 (42.31%) | loss: 2.731665 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,317 | mfu: 51.02 | epoch: 1 | total time: 75.94m | eta: 103.7m +step 07068/16704 (42.31%) | loss: 2.729742 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 1 | total time: 75.95m | eta: 103.7m +step 07069/16704 (42.32%) | loss: 2.727723 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,908 | mfu: 50.87 | epoch: 1 | total time: 75.96m | eta: 103.7m +step 07070/16704 (42.33%) | loss: 2.720692 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,490 | 
mfu: 51.03 | epoch: 1 | total time: 75.97m | eta: 103.7m +step 07071/16704 (42.33%) | loss: 2.724447 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 1 | total time: 75.98m | eta: 103.7m +step 07072/16704 (42.34%) | loss: 2.733346 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,122 | mfu: 50.95 | epoch: 1 | total time: 75.99m | eta: 103.6m +step 07073/16704 (42.34%) | loss: 2.741928 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,112 | mfu: 50.95 | epoch: 1 | total time: 76.00m | eta: 103.6m +step 07074/16704 (42.35%) | loss: 2.735591 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,421 | mfu: 50.84 | epoch: 1 | total time: 76.01m | eta: 103.6m +step 07075/16704 (42.36%) | loss: 2.736831 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,934 | mfu: 50.75 | epoch: 1 | total time: 76.03m | eta: 103.6m +step 07076/16704 (42.36%) | loss: 2.744815 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,410 | mfu: 50.84 | epoch: 1 | total time: 76.04m | eta: 103.6m +step 07077/16704 (42.37%) | loss: 2.737838 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,861 | mfu: 50.93 | epoch: 1 | total time: 76.05m | eta: 103.6m +step 07078/16704 (42.37%) | loss: 2.732845 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,694 | mfu: 50.79 | epoch: 1 | total time: 76.06m | eta: 103.6m +step 07079/16704 (42.38%) | loss: 2.731284 | lrm: 1.00 | dt: 642.45ms | tok/sec: 816,071 | mfu: 51.01 | epoch: 1 | total time: 76.07m | eta: 103.6m +step 07080/16704 (42.39%) | loss: 2.724617 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,953 | mfu: 50.87 | epoch: 1 | total time: 76.08m | eta: 103.6m +step 07081/16704 (42.39%) | loss: 2.718322 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 1 | total time: 76.09m | eta: 103.6m +step 07082/16704 (42.40%) | loss: 2.713696 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,347 | mfu: 50.90 | epoch: 1 | total time: 76.10m | eta: 103.5m +step 07083/16704 (42.40%) | loss: 2.711428 | lrm: 1.00 | dt: 642.63ms | tok/sec: 815,849 | mfu: 50.99 | epoch: 1 | total time: 76.11m | eta: 103.5m +step 
07084/16704 (42.41%) | loss: 2.719214 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,101 | mfu: 50.76 | epoch: 1 | total time: 76.12m | eta: 103.5m +step 07085/16704 (42.41%) | loss: 2.725088 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,600 | mfu: 50.85 | epoch: 1 | total time: 76.13m | eta: 103.5m +step 07086/16704 (42.42%) | loss: 2.740209 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 1 | total time: 76.14m | eta: 103.5m +step 07087/16704 (42.43%) | loss: 2.739235 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,633 | mfu: 50.92 | epoch: 1 | total time: 76.15m | eta: 103.5m +step 07088/16704 (42.43%) | loss: 2.730984 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,596 | mfu: 50.98 | epoch: 1 | total time: 76.17m | eta: 103.5m +step 07089/16704 (42.44%) | loss: 2.751233 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,975 | mfu: 50.94 | epoch: 1 | total time: 76.18m | eta: 103.5m +step 07090/16704 (42.44%) | loss: 2.742381 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 1 | total time: 76.19m | eta: 103.5m +step 07091/16704 (42.45%) | loss: 2.737634 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,556 | mfu: 50.91 | epoch: 1 | total time: 76.20m | eta: 103.4m +step 07092/16704 (42.46%) | loss: 2.744466 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 1 | total time: 76.21m | eta: 103.4m +step 07093/16704 (42.46%) | loss: 2.732728 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,127 | mfu: 50.76 | epoch: 1 | total time: 76.22m | eta: 103.4m +step 07094/16704 (42.47%) | loss: 2.728330 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 1 | total time: 76.23m | eta: 103.4m +step 07095/16704 (42.47%) | loss: 2.729993 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,275 | mfu: 50.71 | epoch: 1 | total time: 76.24m | eta: 103.4m +step 07096/16704 (42.48%) | loss: 2.726365 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 1 | total time: 76.25m | eta: 103.4m +step 07097/16704 (42.49%) | loss: 2.730959 | lrm: 1.00 | dt: 
641.64ms | tok/sec: 817,102 | mfu: 51.07 | epoch: 1 | total time: 76.26m | eta: 103.4m +step 07098/16704 (42.49%) | loss: 2.732609 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,195 | mfu: 50.89 | epoch: 1 | total time: 76.27m | eta: 103.4m +step 07099/16704 (42.50%) | loss: 2.742157 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 1 | total time: 76.28m | eta: 103.4m +step 07100/16704 (42.50%) | loss: 2.729394 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 1 | total time: 76.29m | eta: 103.3m +step 07101/16704 (42.51%) | loss: 2.729488 | lrm: 1.00 | dt: 646.54ms | tok/sec: 810,909 | mfu: 50.68 | epoch: 1 | total time: 76.30m | eta: 103.3m +step 07102/16704 (42.52%) | loss: 2.736160 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,174 | mfu: 50.95 | epoch: 1 | total time: 76.32m | eta: 103.3m +step 07103/16704 (42.52%) | loss: 2.736060 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,175 | mfu: 50.82 | epoch: 1 | total time: 76.33m | eta: 103.3m +step 07104/16704 (42.53%) | loss: 2.736613 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,283 | mfu: 50.83 | epoch: 1 | total time: 76.34m | eta: 103.3m +step 07105/16704 (42.53%) | loss: 2.742220 | lrm: 1.00 | dt: 642.06ms | tok/sec: 816,577 | mfu: 51.04 | epoch: 1 | total time: 76.35m | eta: 103.3m +step 07106/16704 (42.54%) | loss: 2.740381 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,242 | mfu: 50.95 | epoch: 1 | total time: 76.36m | eta: 103.3m +step 07107/16704 (42.55%) | loss: 2.739229 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,009 | mfu: 50.94 | epoch: 1 | total time: 76.37m | eta: 103.3m +step 07108/16704 (42.55%) | loss: 2.722175 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,639 | mfu: 50.79 | epoch: 1 | total time: 76.38m | eta: 103.3m +step 07109/16704 (42.56%) | loss: 2.726449 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,008 | mfu: 50.88 | epoch: 1 | total time: 76.39m | eta: 103.2m +step 07110/16704 (42.56%) | loss: 2.731924 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,007 | mfu: 50.94 | epoch: 1 | total 
time: 76.40m | eta: 103.2m +step 07111/16704 (42.57%) | loss: 2.718923 | lrm: 1.00 | dt: 641.10ms | tok/sec: 817,797 | mfu: 51.11 | epoch: 1 | total time: 76.41m | eta: 103.2m +step 07112/16704 (42.58%) | loss: 2.722303 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,093 | mfu: 50.94 | epoch: 1 | total time: 76.42m | eta: 103.2m +step 07113/16704 (42.58%) | loss: 2.719909 | lrm: 1.00 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 1 | total time: 76.43m | eta: 103.2m +step 07114/16704 (42.59%) | loss: 2.713088 | lrm: 1.00 | dt: 640.95ms | tok/sec: 817,983 | mfu: 51.13 | epoch: 1 | total time: 76.44m | eta: 103.2m +step 07115/16704 (42.59%) | loss: 2.717029 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,055 | mfu: 50.88 | epoch: 1 | total time: 76.46m | eta: 103.2m +step 07116/16704 (42.60%) | loss: 2.721806 | lrm: 1.00 | dt: 642.85ms | tok/sec: 815,574 | mfu: 50.97 | epoch: 1 | total time: 76.47m | eta: 103.2m +step 07117/16704 (42.61%) | loss: 2.713218 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,276 | mfu: 50.77 | epoch: 1 | total time: 76.48m | eta: 103.2m +step 07118/16704 (42.61%) | loss: 2.713273 | lrm: 1.00 | dt: 644.56ms | tok/sec: 813,410 | mfu: 50.84 | epoch: 1 | total time: 76.49m | eta: 103.2m +step 07119/16704 (42.62%) | loss: 2.706056 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,391 | mfu: 50.96 | epoch: 1 | total time: 76.50m | eta: 103.1m +step 07120/16704 (42.62%) | loss: 2.710797 | lrm: 1.00 | dt: 644.39ms | tok/sec: 813,624 | mfu: 50.85 | epoch: 1 | total time: 76.51m | eta: 103.1m +step 07121/16704 (42.63%) | loss: 2.720367 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,809 | mfu: 50.86 | epoch: 1 | total time: 76.52m | eta: 103.1m +step 07122/16704 (42.64%) | loss: 2.718824 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,331 | mfu: 50.90 | epoch: 1 | total time: 76.53m | eta: 103.1m +step 07123/16704 (42.64%) | loss: 2.716481 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,079 | mfu: 50.88 | epoch: 1 | total time: 76.54m | eta: 103.1m +step 07124/16704 (42.65%) | loss: 
2.715090 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,227 | mfu: 50.89 | epoch: 1 | total time: 76.55m | eta: 103.1m +step 07125/16704 (42.65%) | loss: 2.717377 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,791 | mfu: 50.86 | epoch: 1 | total time: 76.56m | eta: 103.1m +step 07126/16704 (42.66%) | loss: 2.703072 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,775 | mfu: 50.74 | epoch: 1 | total time: 76.57m | eta: 103.1m +step 07127/16704 (42.67%) | loss: 2.708946 | lrm: 1.00 | dt: 640.90ms | tok/sec: 818,049 | mfu: 51.13 | epoch: 1 | total time: 76.58m | eta: 103.1m +step 07128/16704 (42.67%) | loss: 2.700833 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,457 | mfu: 50.78 | epoch: 1 | total time: 76.59m | eta: 103.0m +step 07129/16704 (42.68%) | loss: 2.704920 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,954 | mfu: 50.75 | epoch: 1 | total time: 76.61m | eta: 103.0m +step 07130/16704 (42.68%) | loss: 2.708637 | lrm: 1.00 | dt: 641.55ms | tok/sec: 817,215 | mfu: 51.08 | epoch: 1 | total time: 76.62m | eta: 103.0m +step 07131/16704 (42.69%) | loss: 2.731316 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,339 | mfu: 50.83 | epoch: 1 | total time: 76.63m | eta: 103.0m +step 07132/16704 (42.70%) | loss: 2.738117 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,543 | mfu: 50.85 | epoch: 1 | total time: 76.64m | eta: 103.0m +step 07133/16704 (42.70%) | loss: 2.743376 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 1 | total time: 76.65m | eta: 103.0m +step 07134/16704 (42.71%) | loss: 2.750940 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 1 | total time: 76.66m | eta: 103.0m +step 07135/16704 (42.71%) | loss: 2.733200 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,642 | mfu: 50.79 | epoch: 1 | total time: 76.67m | eta: 103.0m +step 07136/16704 (42.72%) | loss: 2.733712 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,321 | mfu: 50.96 | epoch: 1 | total time: 76.68m | eta: 103.0m +step 07137/16704 (42.73%) | loss: 2.735914 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,842 | mfu: 
50.80 | epoch: 1 | total time: 76.69m | eta: 102.9m +step 07138/16704 (42.73%) | loss: 2.728382 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,643 | mfu: 50.67 | epoch: 1 | total time: 76.70m | eta: 102.9m +step 07139/16704 (42.74%) | loss: 2.746386 | lrm: 1.00 | dt: 640.77ms | tok/sec: 818,219 | mfu: 51.14 | epoch: 1 | total time: 76.71m | eta: 102.9m +step 07140/16704 (42.74%) | loss: 2.742632 | lrm: 1.00 | dt: 648.76ms | tok/sec: 808,136 | mfu: 50.51 | epoch: 1 | total time: 76.72m | eta: 102.9m +step 07141/16704 (42.75%) | loss: 2.745515 | lrm: 1.00 | dt: 642.18ms | tok/sec: 816,416 | mfu: 51.03 | epoch: 1 | total time: 76.73m | eta: 102.9m +step 07142/16704 (42.76%) | loss: 2.734403 | lrm: 1.00 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 1 | total time: 76.74m | eta: 102.9m +step 07143/16704 (42.76%) | loss: 2.732099 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,010 | mfu: 50.69 | epoch: 1 | total time: 76.76m | eta: 102.9m +step 07144/16704 (42.77%) | loss: 2.726693 | lrm: 1.00 | dt: 640.83ms | tok/sec: 818,136 | mfu: 51.13 | epoch: 1 | total time: 76.77m | eta: 102.9m +step 07145/16704 (42.77%) | loss: 2.731098 | lrm: 1.00 | dt: 647.62ms | tok/sec: 809,566 | mfu: 50.60 | epoch: 1 | total time: 76.78m | eta: 102.9m +step 07146/16704 (42.78%) | loss: 2.713547 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,185 | mfu: 51.01 | epoch: 1 | total time: 76.79m | eta: 102.9m +step 07147/16704 (42.79%) | loss: 2.721886 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,161 | mfu: 50.82 | epoch: 1 | total time: 76.80m | eta: 102.8m +step 07148/16704 (42.79%) | loss: 2.729359 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,106 | mfu: 50.76 | epoch: 1 | total time: 76.81m | eta: 102.8m +step 07149/16704 (42.80%) | loss: 2.735653 | lrm: 1.00 | dt: 641.58ms | tok/sec: 817,186 | mfu: 51.08 | epoch: 1 | total time: 76.82m | eta: 102.8m +step 07150/16704 (42.80%) | loss: 2.723912 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 50.75 | epoch: 1 | total time: 76.83m | eta: 102.8m +step 
07151/16704 (42.81%) | loss: 2.721371 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,513 | mfu: 50.97 | epoch: 1 | total time: 76.84m | eta: 102.8m +step 07152/16704 (42.82%) | loss: 2.728307 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,663 | mfu: 50.92 | epoch: 1 | total time: 76.85m | eta: 102.8m +step 07153/16704 (42.82%) | loss: 2.731563 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,374 | mfu: 50.71 | epoch: 1 | total time: 76.86m | eta: 102.8m +step 07154/16704 (42.83%) | loss: 2.736494 | lrm: 1.00 | dt: 641.89ms | tok/sec: 816,782 | mfu: 51.05 | epoch: 1 | total time: 76.87m | eta: 102.8m +step 07155/16704 (42.83%) | loss: 2.731834 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,308 | mfu: 50.90 | epoch: 1 | total time: 76.88m | eta: 102.8m +step 07156/16704 (42.84%) | loss: 2.719390 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,344 | mfu: 50.96 | epoch: 1 | total time: 76.90m | eta: 102.7m +step 07157/16704 (42.85%) | loss: 2.733891 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,863 | mfu: 50.81 | epoch: 1 | total time: 76.91m | eta: 102.7m +step 07158/16704 (42.85%) | loss: 2.749666 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,383 | mfu: 50.96 | epoch: 1 | total time: 76.92m | eta: 102.7m +step 07159/16704 (42.86%) | loss: 2.749030 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,391 | mfu: 50.96 | epoch: 1 | total time: 76.93m | eta: 102.7m +step 07160/16704 (42.86%) | loss: 2.743060 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,281 | mfu: 50.77 | epoch: 1 | total time: 76.94m | eta: 102.7m +step 07161/16704 (42.87%) | loss: 2.741007 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | total time: 76.95m | eta: 102.7m +step 07162/16704 (42.88%) | loss: 2.733821 | lrm: 1.00 | dt: 649.58ms | tok/sec: 807,123 | mfu: 50.45 | epoch: 1 | total time: 76.96m | eta: 102.7m +step 07163/16704 (42.88%) | loss: 2.731888 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,147 | mfu: 50.89 | epoch: 1 | total time: 76.97m | eta: 102.7m +step 07164/16704 (42.89%) | loss: 2.733457 | lrm: 1.00 | dt: 
642.67ms | tok/sec: 815,797 | mfu: 50.99 | epoch: 1 | total time: 76.98m | eta: 102.7m +step 07165/16704 (42.89%) | loss: 2.731516 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,102 | mfu: 50.70 | epoch: 1 | total time: 76.99m | eta: 102.6m +step 07166/16704 (42.90%) | loss: 2.733717 | lrm: 1.00 | dt: 642.72ms | tok/sec: 815,730 | mfu: 50.98 | epoch: 1 | total time: 77.00m | eta: 102.6m +step 07167/16704 (42.91%) | loss: 2.725678 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,005 | mfu: 50.81 | epoch: 1 | total time: 77.01m | eta: 102.6m +step 07168/16704 (42.91%) | loss: 2.724613 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,740 | mfu: 50.92 | epoch: 1 | total time: 77.02m | eta: 102.6m +step 07169/16704 (42.92%) | loss: 2.738063 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,653 | mfu: 50.79 | epoch: 1 | total time: 77.03m | eta: 102.6m +step 07170/16704 (42.92%) | loss: 2.732788 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,090 | mfu: 50.82 | epoch: 1 | total time: 77.05m | eta: 102.6m +step 07171/16704 (42.93%) | loss: 2.730867 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,359 | mfu: 50.90 | epoch: 1 | total time: 77.06m | eta: 102.6m +step 07172/16704 (42.94%) | loss: 2.727271 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,755 | mfu: 50.80 | epoch: 1 | total time: 77.07m | eta: 102.6m +step 07173/16704 (42.94%) | loss: 2.729900 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,142 | mfu: 50.89 | epoch: 1 | total time: 77.08m | eta: 102.6m +step 07174/16704 (42.95%) | loss: 2.731934 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 1 | total time: 77.09m | eta: 102.5m +step 07175/16704 (42.95%) | loss: 2.723687 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,356 | mfu: 50.77 | epoch: 1 | total time: 77.10m | eta: 102.5m +step 07176/16704 (42.96%) | loss: 2.732150 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,774 | mfu: 50.99 | epoch: 1 | total time: 77.11m | eta: 102.5m +step 07177/16704 (42.97%) | loss: 2.724516 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,000 | mfu: 50.63 | epoch: 1 | total 
time: 77.12m | eta: 102.5m +step 07178/16704 (42.97%) | loss: 2.715361 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,480 | mfu: 50.91 | epoch: 1 | total time: 77.13m | eta: 102.5m +step 07179/16704 (42.98%) | loss: 2.714315 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,945 | mfu: 50.87 | epoch: 1 | total time: 77.14m | eta: 102.5m +step 07180/16704 (42.98%) | loss: 2.713681 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,058 | mfu: 50.94 | epoch: 1 | total time: 77.15m | eta: 102.5m +step 07181/16704 (42.99%) | loss: 2.728475 | lrm: 1.00 | dt: 641.39ms | tok/sec: 817,427 | mfu: 51.09 | epoch: 1 | total time: 77.16m | eta: 102.5m +step 07182/16704 (43.00%) | loss: 2.728419 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,322 | mfu: 50.77 | epoch: 1 | total time: 77.17m | eta: 102.5m +step 07183/16704 (43.00%) | loss: 2.726986 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,513 | mfu: 50.85 | epoch: 1 | total time: 77.19m | eta: 102.5m +step 07184/16704 (43.01%) | loss: 2.732438 | lrm: 1.00 | dt: 642.88ms | tok/sec: 815,529 | mfu: 50.97 | epoch: 1 | total time: 77.20m | eta: 102.4m +step 07185/16704 (43.01%) | loss: 2.725997 | lrm: 1.00 | dt: 643.19ms | tok/sec: 815,136 | mfu: 50.95 | epoch: 1 | total time: 77.21m | eta: 102.4m +step 07186/16704 (43.02%) | loss: 2.713981 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,551 | mfu: 50.97 | epoch: 1 | total time: 77.22m | eta: 102.4m +step 07187/16704 (43.03%) | loss: 2.714640 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,691 | mfu: 50.73 | epoch: 1 | total time: 77.23m | eta: 102.4m +step 07188/16704 (43.03%) | loss: 2.727470 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,457 | mfu: 50.84 | epoch: 1 | total time: 77.24m | eta: 102.4m +step 07189/16704 (43.04%) | loss: 2.728312 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,464 | mfu: 50.72 | epoch: 1 | total time: 77.25m | eta: 102.4m +step 07190/16704 (43.04%) | loss: 2.727134 | lrm: 1.00 | dt: 642.27ms | tok/sec: 816,302 | mfu: 51.02 | epoch: 1 | total time: 77.26m | eta: 102.4m +step 07191/16704 (43.05%) | loss: 
2.728733 | lrm: 1.00 | dt: 645.43ms | tok/sec: 812,307 | mfu: 50.77 | epoch: 1 | total time: 77.27m | eta: 102.4m +step 07192/16704 (43.06%) | loss: 2.736167 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,473 | mfu: 50.91 | epoch: 1 | total time: 77.28m | eta: 102.4m +step 07193/16704 (43.06%) | loss: 2.725573 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,862 | mfu: 51.06 | epoch: 1 | total time: 77.29m | eta: 102.3m +step 07194/16704 (43.07%) | loss: 2.720714 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,486 | mfu: 50.84 | epoch: 1 | total time: 77.30m | eta: 102.3m +step 07195/16704 (43.07%) | loss: 2.701389 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,513 | mfu: 50.85 | epoch: 1 | total time: 77.31m | eta: 102.3m +step 07196/16704 (43.08%) | loss: 2.694163 | lrm: 1.00 | dt: 641.56ms | tok/sec: 817,213 | mfu: 51.08 | epoch: 1 | total time: 77.32m | eta: 102.3m +step 07197/16704 (43.09%) | loss: 2.694816 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 1 | total time: 77.34m | eta: 102.3m +step 07198/16704 (43.09%) | loss: 2.698558 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,216 | mfu: 50.89 | epoch: 1 | total time: 77.35m | eta: 102.3m +step 07199/16704 (43.10%) | loss: 2.709953 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,375 | mfu: 50.96 | epoch: 1 | total time: 77.36m | eta: 102.3m +step 07200/16704 (43.10%) | loss: 2.705547 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 1 | total time: 77.37m | eta: 102.3m +step 07201/16704 (43.11%) | loss: 2.719652 | lrm: 1.00 | dt: 641.63ms | tok/sec: 817,121 | mfu: 51.07 | epoch: 1 | total time: 77.38m | eta: 102.3m +step 07202/16704 (43.12%) | loss: 2.730496 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,245 | mfu: 50.83 | epoch: 1 | total time: 77.39m | eta: 102.2m +step 07203/16704 (43.12%) | loss: 2.737697 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,521 | mfu: 50.97 | epoch: 1 | total time: 77.40m | eta: 102.2m +step 07204/16704 (43.13%) | loss: 2.734810 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,715 | mfu: 
50.92 | epoch: 1 | total time: 77.41m | eta: 102.2m +step 07205/16704 (43.13%) | loss: 2.750795 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,752 | mfu: 50.74 | epoch: 1 | total time: 77.42m | eta: 102.2m +step 07206/16704 (43.14%) | loss: 2.756872 | lrm: 1.00 | dt: 642.61ms | tok/sec: 815,871 | mfu: 50.99 | epoch: 1 | total time: 77.43m | eta: 102.2m +step 07207/16704 (43.15%) | loss: 2.759784 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,367 | mfu: 50.90 | epoch: 1 | total time: 77.44m | eta: 102.2m +step 07208/16704 (43.15%) | loss: 2.757705 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,160 | mfu: 50.82 | epoch: 1 | total time: 77.45m | eta: 102.2m +step 07209/16704 (43.16%) | loss: 2.753848 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,022 | mfu: 50.94 | epoch: 1 | total time: 77.46m | eta: 102.2m +step 07210/16704 (43.16%) | loss: 2.753960 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 1 | total time: 77.47m | eta: 102.2m +step 07211/16704 (43.17%) | loss: 2.746338 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,901 | mfu: 50.81 | epoch: 1 | total time: 77.49m | eta: 102.1m +step 07212/16704 (43.18%) | loss: 2.739476 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 1 | total time: 77.50m | eta: 102.1m +step 07213/16704 (43.18%) | loss: 2.728679 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,585 | mfu: 50.85 | epoch: 1 | total time: 77.51m | eta: 102.1m +step 07214/16704 (43.19%) | loss: 2.726156 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,215 | mfu: 50.76 | epoch: 1 | total time: 77.52m | eta: 102.1m +step 07215/16704 (43.19%) | loss: 2.724053 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,921 | mfu: 50.87 | epoch: 1 | total time: 77.53m | eta: 102.1m +step 07216/16704 (43.20%) | loss: 2.716443 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,618 | mfu: 50.91 | epoch: 1 | total time: 77.54m | eta: 102.1m +step 07217/16704 (43.21%) | loss: 2.717770 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,963 | mfu: 50.69 | epoch: 1 | total time: 77.55m | eta: 102.1m +step 
07218/16704 (43.21%) | loss: 2.706559 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,324 | mfu: 50.83 | epoch: 1 | total time: 77.56m | eta: 102.1m +step 07219/16704 (43.22%) | loss: 2.708947 | lrm: 1.00 | dt: 642.43ms | tok/sec: 816,098 | mfu: 51.01 | epoch: 1 | total time: 77.57m | eta: 102.1m +step 07220/16704 (43.22%) | loss: 2.700795 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,967 | mfu: 50.87 | epoch: 1 | total time: 77.58m | eta: 102.1m +step 07221/16704 (43.23%) | loss: 2.714321 | lrm: 1.00 | dt: 642.61ms | tok/sec: 815,869 | mfu: 50.99 | epoch: 1 | total time: 77.59m | eta: 102.0m +step 07222/16704 (43.24%) | loss: 2.720245 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,061 | mfu: 50.76 | epoch: 1 | total time: 77.60m | eta: 102.0m +step 07223/16704 (43.24%) | loss: 2.725715 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 1 | total time: 77.61m | eta: 102.0m +step 07224/16704 (43.25%) | loss: 2.726628 | lrm: 1.00 | dt: 650.38ms | tok/sec: 806,127 | mfu: 50.38 | epoch: 1 | total time: 77.63m | eta: 102.0m +step 07225/16704 (43.25%) | loss: 2.720081 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,540 | mfu: 51.03 | epoch: 1 | total time: 77.64m | eta: 102.0m +step 07226/16704 (43.26%) | loss: 2.718037 | lrm: 1.00 | dt: 642.81ms | tok/sec: 815,613 | mfu: 50.98 | epoch: 1 | total time: 77.65m | eta: 102.0m +step 07227/16704 (43.27%) | loss: 2.712112 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,801 | mfu: 50.93 | epoch: 1 | total time: 77.66m | eta: 102.0m +step 07228/16704 (43.27%) | loss: 2.720385 | lrm: 1.00 | dt: 641.87ms | tok/sec: 816,816 | mfu: 51.05 | epoch: 1 | total time: 77.67m | eta: 102.0m +step 07229/16704 (43.28%) | loss: 2.719815 | lrm: 1.00 | dt: 648.59ms | tok/sec: 808,347 | mfu: 50.52 | epoch: 1 | total time: 77.68m | eta: 102.0m +step 07230/16704 (43.28%) | loss: 2.736573 | lrm: 1.00 | dt: 641.95ms | tok/sec: 816,706 | mfu: 51.05 | epoch: 1 | total time: 77.69m | eta: 101.9m +step 07231/16704 (43.29%) | loss: 2.736973 | lrm: 1.00 | dt: 
646.25ms | tok/sec: 811,277 | mfu: 50.71 | epoch: 1 | total time: 77.70m | eta: 101.9m +step 07232/16704 (43.30%) | loss: 2.744613 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,339 | mfu: 50.83 | epoch: 1 | total time: 77.71m | eta: 101.9m +step 07233/16704 (43.30%) | loss: 2.738290 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,038 | mfu: 50.88 | epoch: 1 | total time: 77.72m | eta: 101.9m +step 07234/16704 (43.31%) | loss: 2.748928 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,571 | mfu: 50.85 | epoch: 1 | total time: 77.73m | eta: 101.9m +step 07235/16704 (43.31%) | loss: 2.756390 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,190 | mfu: 50.95 | epoch: 1 | total time: 77.74m | eta: 101.9m +step 07236/16704 (43.32%) | loss: 2.759511 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 1 | total time: 77.75m | eta: 101.9m +step 07237/16704 (43.32%) | loss: 2.757313 | lrm: 1.00 | dt: 642.30ms | tok/sec: 816,266 | mfu: 51.02 | epoch: 1 | total time: 77.76m | eta: 101.9m +step 07238/16704 (43.33%) | loss: 2.750187 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,110 | mfu: 50.95 | epoch: 1 | total time: 77.78m | eta: 101.9m +step 07239/16704 (43.34%) | loss: 2.741377 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,814 | mfu: 50.80 | epoch: 1 | total time: 77.79m | eta: 101.8m +step 07240/16704 (43.34%) | loss: 2.734420 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,849 | mfu: 50.80 | epoch: 1 | total time: 77.80m | eta: 101.8m +step 07241/16704 (43.35%) | loss: 2.724187 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,838 | mfu: 50.74 | epoch: 1 | total time: 77.81m | eta: 101.8m +step 07242/16704 (43.35%) | loss: 2.714973 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,762 | mfu: 50.92 | epoch: 1 | total time: 77.82m | eta: 101.8m +step 07243/16704 (43.36%) | loss: 2.708587 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,349 | mfu: 50.90 | epoch: 1 | total time: 77.83m | eta: 101.8m +step 07244/16704 (43.37%) | loss: 2.693660 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 1 | total 
time: 77.84m | eta: 101.8m +step 07245/16704 (43.37%) | loss: 2.682869 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,165 | mfu: 51.01 | epoch: 1 | total time: 77.85m | eta: 101.8m +step 07246/16704 (43.38%) | loss: 2.664752 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,530 | mfu: 50.85 | epoch: 1 | total time: 77.86m | eta: 101.8m +step 07247/16704 (43.38%) | loss: 2.661515 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,671 | mfu: 50.86 | epoch: 1 | total time: 77.87m | eta: 101.8m +step 07248/16704 (43.39%) | loss: 2.675387 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,884 | mfu: 50.93 | epoch: 1 | total time: 77.88m | eta: 101.7m +step 07249/16704 (43.40%) | loss: 2.673665 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,438 | mfu: 50.84 | epoch: 1 | total time: 77.89m | eta: 101.7m +Step 07250 | Validation bpb: 0.826183 +step 07250/16704 (43.40%) | loss: 2.675266 | lrm: 1.00 | dt: 647.42ms | tok/sec: 809,806 | mfu: 50.61 | epoch: 1 | total time: 77.90m | eta: 101.7m +step 07251/16704 (43.41%) | loss: 2.690196 | lrm: 1.00 | dt: 649.08ms | tok/sec: 807,742 | mfu: 50.49 | epoch: 1 | total time: 77.92m | eta: 101.7m +step 07252/16704 (43.41%) | loss: 2.695714 | lrm: 1.00 | dt: 646.15ms | tok/sec: 811,405 | mfu: 50.71 | epoch: 1 | total time: 77.93m | eta: 101.7m +step 07253/16704 (43.42%) | loss: 2.688495 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,141 | mfu: 51.01 | epoch: 1 | total time: 77.94m | eta: 101.7m +step 07254/16704 (43.43%) | loss: 2.697733 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,941 | mfu: 50.75 | epoch: 1 | total time: 77.95m | eta: 101.7m +step 07255/16704 (43.43%) | loss: 2.711023 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,508 | mfu: 50.97 | epoch: 1 | total time: 77.96m | eta: 101.7m +step 07256/16704 (43.44%) | loss: 2.701196 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,518 | mfu: 50.97 | epoch: 1 | total time: 77.97m | eta: 101.7m +step 07257/16704 (43.44%) | loss: 2.706760 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,940 | mfu: 50.75 | epoch: 1 | total time: 77.98m | eta: 
101.7m +step 07258/16704 (43.45%) | loss: 2.715623 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,272 | mfu: 50.83 | epoch: 1 | total time: 77.99m | eta: 101.6m +step 07259/16704 (43.46%) | loss: 2.719315 | lrm: 1.00 | dt: 643.26ms | tok/sec: 815,046 | mfu: 50.94 | epoch: 1 | total time: 78.00m | eta: 101.6m +step 07260/16704 (43.46%) | loss: 2.717646 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,534 | mfu: 50.78 | epoch: 1 | total time: 78.01m | eta: 101.6m +step 07261/16704 (43.47%) | loss: 2.725966 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 1 | total time: 78.02m | eta: 101.6m +step 07262/16704 (43.47%) | loss: 2.739819 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,391 | mfu: 50.96 | epoch: 1 | total time: 78.03m | eta: 101.6m +step 07263/16704 (43.48%) | loss: 2.749702 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 1 | total time: 78.04m | eta: 101.6m +step 07264/16704 (43.49%) | loss: 2.741072 | lrm: 1.00 | dt: 642.90ms | tok/sec: 815,501 | mfu: 50.97 | epoch: 1 | total time: 78.05m | eta: 101.6m +step 07265/16704 (43.49%) | loss: 2.753487 | lrm: 1.00 | dt: 645.96ms | tok/sec: 811,642 | mfu: 50.73 | epoch: 1 | total time: 78.07m | eta: 101.6m +step 07266/16704 (43.50%) | loss: 2.754708 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,825 | mfu: 50.80 | epoch: 1 | total time: 78.08m | eta: 101.6m +step 07267/16704 (43.50%) | loss: 2.748649 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,926 | mfu: 50.93 | epoch: 1 | total time: 78.09m | eta: 101.5m +step 07268/16704 (43.51%) | loss: 2.755265 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,552 | mfu: 50.85 | epoch: 1 | total time: 78.10m | eta: 101.5m +step 07269/16704 (43.52%) | loss: 2.745920 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 1 | total time: 78.11m | eta: 101.5m +step 07270/16704 (43.52%) | loss: 2.743111 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,554 | mfu: 50.85 | epoch: 1 | total time: 78.12m | eta: 101.5m +step 07271/16704 (43.53%) | loss: 2.759174 | lrm: 1.00 
| dt: 643.83ms | tok/sec: 814,329 | mfu: 50.90 | epoch: 1 | total time: 78.13m | eta: 101.5m +step 07272/16704 (43.53%) | loss: 2.767323 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,771 | mfu: 50.92 | epoch: 1 | total time: 78.14m | eta: 101.5m +step 07273/16704 (43.54%) | loss: 2.759834 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,204 | mfu: 50.89 | epoch: 1 | total time: 78.15m | eta: 101.5m +step 07274/16704 (43.55%) | loss: 2.749283 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,551 | mfu: 50.85 | epoch: 1 | total time: 78.16m | eta: 101.5m +step 07275/16704 (43.55%) | loss: 2.737646 | lrm: 1.00 | dt: 641.97ms | tok/sec: 816,687 | mfu: 51.04 | epoch: 1 | total time: 78.17m | eta: 101.5m +step 07276/16704 (43.56%) | loss: 2.734878 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,648 | mfu: 50.85 | epoch: 1 | total time: 78.18m | eta: 101.4m +step 07277/16704 (43.56%) | loss: 2.737054 | lrm: 1.00 | dt: 646.40ms | tok/sec: 811,086 | mfu: 50.69 | epoch: 1 | total time: 78.19m | eta: 101.4m +step 07278/16704 (43.57%) | loss: 2.725022 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,615 | mfu: 51.04 | epoch: 1 | total time: 78.21m | eta: 101.4m +step 07279/16704 (43.58%) | loss: 2.726978 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,195 | mfu: 50.70 | epoch: 1 | total time: 78.22m | eta: 101.4m +step 07280/16704 (43.58%) | loss: 2.721778 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 1 | total time: 78.23m | eta: 101.4m +step 07281/16704 (43.59%) | loss: 2.722755 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,490 | mfu: 51.03 | epoch: 1 | total time: 78.24m | eta: 101.4m +step 07282/16704 (43.59%) | loss: 2.720852 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,281 | mfu: 50.77 | epoch: 1 | total time: 78.25m | eta: 101.4m +step 07283/16704 (43.60%) | loss: 2.727450 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,558 | mfu: 50.91 | epoch: 1 | total time: 78.26m | eta: 101.4m +step 07284/16704 (43.61%) | loss: 2.737274 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 1 | 
total time: 78.27m | eta: 101.4m +step 07285/16704 (43.61%) | loss: 2.737053 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,831 | mfu: 50.68 | epoch: 1 | total time: 78.28m | eta: 101.4m +step 07286/16704 (43.62%) | loss: 2.732027 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 1 | total time: 78.29m | eta: 101.3m +step 07287/16704 (43.62%) | loss: 2.734911 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,373 | mfu: 50.71 | epoch: 1 | total time: 78.30m | eta: 101.3m +step 07288/16704 (43.63%) | loss: 2.735568 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,627 | mfu: 50.85 | epoch: 1 | total time: 78.31m | eta: 101.3m +step 07289/16704 (43.64%) | loss: 2.741263 | lrm: 1.00 | dt: 645.10ms | tok/sec: 812,728 | mfu: 50.80 | epoch: 1 | total time: 78.32m | eta: 101.3m +step 07290/16704 (43.64%) | loss: 2.740881 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,892 | mfu: 50.93 | epoch: 1 | total time: 78.33m | eta: 101.3m +step 07291/16704 (43.65%) | loss: 2.730302 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,703 | mfu: 50.80 | epoch: 1 | total time: 78.34m | eta: 101.3m +step 07292/16704 (43.65%) | loss: 2.723175 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,770 | mfu: 50.86 | epoch: 1 | total time: 78.36m | eta: 101.3m +step 07293/16704 (43.66%) | loss: 2.733065 | lrm: 1.00 | dt: 645.56ms | tok/sec: 812,149 | mfu: 50.76 | epoch: 1 | total time: 78.37m | eta: 101.3m +step 07294/16704 (43.67%) | loss: 2.749242 | lrm: 1.00 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 1 | total time: 78.38m | eta: 101.3m +step 07295/16704 (43.67%) | loss: 2.740964 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,603 | mfu: 50.98 | epoch: 1 | total time: 78.39m | eta: 101.2m +step 07296/16704 (43.68%) | loss: 2.740246 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,321 | mfu: 50.77 | epoch: 1 | total time: 78.40m | eta: 101.2m +step 07297/16704 (43.68%) | loss: 2.742188 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,060 | mfu: 50.69 | epoch: 1 | total time: 78.41m | eta: 101.2m +step 07298/16704 (43.69%) | 
loss: 2.739717 | lrm: 1.00 | dt: 647.74ms | tok/sec: 809,408 | mfu: 50.59 | epoch: 1 | total time: 78.42m | eta: 101.2m +step 07299/16704 (43.70%) | loss: 2.732298 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,859 | mfu: 50.99 | epoch: 1 | total time: 78.43m | eta: 101.2m +step 07300/16704 (43.70%) | loss: 2.728067 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 1 | total time: 78.44m | eta: 101.2m +step 07301/16704 (43.71%) | loss: 2.733998 | lrm: 1.00 | dt: 647.99ms | tok/sec: 809,103 | mfu: 50.57 | epoch: 1 | total time: 78.45m | eta: 101.2m +step 07302/16704 (43.71%) | loss: 2.729888 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,497 | mfu: 50.84 | epoch: 1 | total time: 78.46m | eta: 101.2m +step 07303/16704 (43.72%) | loss: 2.733623 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,076 | mfu: 50.76 | epoch: 1 | total time: 78.47m | eta: 101.2m +step 07304/16704 (43.73%) | loss: 2.749672 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,312 | mfu: 50.83 | epoch: 1 | total time: 78.48m | eta: 101.1m +step 07305/16704 (43.73%) | loss: 2.743169 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,068 | mfu: 50.76 | epoch: 1 | total time: 78.50m | eta: 101.1m +step 07306/16704 (43.74%) | loss: 2.735535 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,859 | mfu: 50.87 | epoch: 1 | total time: 78.51m | eta: 101.1m +step 07307/16704 (43.74%) | loss: 2.729507 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,081 | mfu: 50.88 | epoch: 1 | total time: 78.52m | eta: 101.1m +step 07308/16704 (43.75%) | loss: 2.726299 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,819 | mfu: 50.93 | epoch: 1 | total time: 78.53m | eta: 101.1m +step 07309/16704 (43.76%) | loss: 2.720253 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,030 | mfu: 50.75 | epoch: 1 | total time: 78.54m | eta: 101.1m +step 07310/16704 (43.76%) | loss: 2.714487 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,751 | mfu: 50.99 | epoch: 1 | total time: 78.55m | eta: 101.1m +step 07311/16704 (43.77%) | loss: 2.717581 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,871 | 
mfu: 50.93 | epoch: 1 | total time: 78.56m | eta: 101.1m +step 07312/16704 (43.77%) | loss: 2.725025 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 1 | total time: 78.57m | eta: 101.1m +step 07313/16704 (43.78%) | loss: 2.734824 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,311 | mfu: 50.96 | epoch: 1 | total time: 78.58m | eta: 101.0m +step 07314/16704 (43.79%) | loss: 2.727172 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,703 | mfu: 50.80 | epoch: 1 | total time: 78.59m | eta: 101.0m +step 07315/16704 (43.79%) | loss: 2.731183 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 1 | total time: 78.60m | eta: 101.0m +step 07316/16704 (43.80%) | loss: 2.726781 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,154 | mfu: 50.95 | epoch: 1 | total time: 78.61m | eta: 101.0m +step 07317/16704 (43.80%) | loss: 2.723727 | lrm: 1.00 | dt: 646.96ms | tok/sec: 810,391 | mfu: 50.65 | epoch: 1 | total time: 78.62m | eta: 101.0m +step 07318/16704 (43.81%) | loss: 2.720079 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,565 | mfu: 50.91 | epoch: 1 | total time: 78.64m | eta: 101.0m +step 07319/16704 (43.82%) | loss: 2.708682 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,198 | mfu: 50.76 | epoch: 1 | total time: 78.65m | eta: 101.0m +step 07320/16704 (43.82%) | loss: 2.709980 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,600 | mfu: 50.73 | epoch: 1 | total time: 78.66m | eta: 101.0m +step 07321/16704 (43.83%) | loss: 2.713424 | lrm: 1.00 | dt: 641.43ms | tok/sec: 817,375 | mfu: 51.09 | epoch: 1 | total time: 78.67m | eta: 101.0m +step 07322/16704 (43.83%) | loss: 2.719470 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,758 | mfu: 50.74 | epoch: 1 | total time: 78.68m | eta: 101.0m +step 07323/16704 (43.84%) | loss: 2.721574 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,348 | mfu: 50.77 | epoch: 1 | total time: 78.69m | eta: 100.9m +step 07324/16704 (43.85%) | loss: 2.724346 | lrm: 1.00 | dt: 640.96ms | tok/sec: 817,976 | mfu: 51.12 | epoch: 1 | total time: 78.70m | eta: 100.9m +step 
07325/16704 (43.85%) | loss: 2.726609 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,741 | mfu: 50.98 | epoch: 1 | total time: 78.71m | eta: 100.9m +step 07326/16704 (43.86%) | loss: 2.731779 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,952 | mfu: 50.81 | epoch: 1 | total time: 78.72m | eta: 100.9m +step 07327/16704 (43.86%) | loss: 2.717325 | lrm: 1.00 | dt: 642.55ms | tok/sec: 815,943 | mfu: 51.00 | epoch: 1 | total time: 78.73m | eta: 100.9m +step 07328/16704 (43.87%) | loss: 2.719404 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,578 | mfu: 50.85 | epoch: 1 | total time: 78.74m | eta: 100.9m +step 07329/16704 (43.88%) | loss: 2.726233 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,310 | mfu: 50.96 | epoch: 1 | total time: 78.75m | eta: 100.9m +step 07330/16704 (43.88%) | loss: 2.708098 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,812 | mfu: 50.86 | epoch: 1 | total time: 78.76m | eta: 100.9m +step 07331/16704 (43.89%) | loss: 2.714820 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,243 | mfu: 50.95 | epoch: 1 | total time: 78.77m | eta: 100.9m +step 07332/16704 (43.89%) | loss: 2.711202 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,864 | mfu: 50.87 | epoch: 1 | total time: 78.79m | eta: 100.8m +step 07333/16704 (43.90%) | loss: 2.715085 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,425 | mfu: 50.97 | epoch: 1 | total time: 78.80m | eta: 100.8m +step 07334/16704 (43.91%) | loss: 2.708460 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 1 | total time: 78.81m | eta: 100.8m +step 07335/16704 (43.91%) | loss: 2.722162 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,625 | mfu: 50.79 | epoch: 1 | total time: 78.82m | eta: 100.8m +step 07336/16704 (43.92%) | loss: 2.719241 | lrm: 1.00 | dt: 640.74ms | tok/sec: 818,249 | mfu: 51.14 | epoch: 1 | total time: 78.83m | eta: 100.8m +step 07337/16704 (43.92%) | loss: 2.722122 | lrm: 1.00 | dt: 641.84ms | tok/sec: 816,849 | mfu: 51.05 | epoch: 1 | total time: 78.84m | eta: 100.8m +step 07338/16704 (43.93%) | loss: 2.704690 | lrm: 1.00 | dt: 
642.13ms | tok/sec: 816,481 | mfu: 51.03 | epoch: 1 | total time: 78.85m | eta: 100.8m +step 07339/16704 (43.94%) | loss: 2.701937 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,826 | mfu: 50.99 | epoch: 1 | total time: 78.86m | eta: 100.8m +step 07340/16704 (43.94%) | loss: 2.682767 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,428 | mfu: 51.03 | epoch: 1 | total time: 78.87m | eta: 100.8m +step 07341/16704 (43.95%) | loss: 2.692865 | lrm: 1.00 | dt: 646.45ms | tok/sec: 811,025 | mfu: 50.69 | epoch: 1 | total time: 78.88m | eta: 100.7m +step 07342/16704 (43.95%) | loss: 2.704880 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,749 | mfu: 50.86 | epoch: 1 | total time: 78.89m | eta: 100.7m +step 07343/16704 (43.96%) | loss: 2.687419 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,358 | mfu: 50.84 | epoch: 1 | total time: 78.90m | eta: 100.7m +step 07344/16704 (43.97%) | loss: 2.695505 | lrm: 1.00 | dt: 641.68ms | tok/sec: 817,052 | mfu: 51.07 | epoch: 1 | total time: 78.91m | eta: 100.7m +step 07345/16704 (43.97%) | loss: 2.697255 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,315 | mfu: 50.71 | epoch: 1 | total time: 78.92m | eta: 100.7m +step 07346/16704 (43.98%) | loss: 2.702226 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,570 | mfu: 50.91 | epoch: 1 | total time: 78.94m | eta: 100.7m +step 07347/16704 (43.98%) | loss: 2.697157 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,867 | mfu: 51.06 | epoch: 1 | total time: 78.95m | eta: 100.7m +step 07348/16704 (43.99%) | loss: 2.703472 | lrm: 1.00 | dt: 642.27ms | tok/sec: 816,306 | mfu: 51.02 | epoch: 1 | total time: 78.96m | eta: 100.7m +step 07349/16704 (44.00%) | loss: 2.709319 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,203 | mfu: 51.01 | epoch: 1 | total time: 78.97m | eta: 100.7m +step 07350/16704 (44.00%) | loss: 2.713828 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,168 | mfu: 50.95 | epoch: 1 | total time: 78.98m | eta: 100.6m +step 07351/16704 (44.01%) | loss: 2.710781 | lrm: 1.00 | dt: 642.03ms | tok/sec: 816,605 | mfu: 51.04 | epoch: 1 | total 
time: 78.99m | eta: 100.6m +step 07352/16704 (44.01%) | loss: 2.717193 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,427 | mfu: 50.84 | epoch: 1 | total time: 79.00m | eta: 100.6m +step 07353/16704 (44.02%) | loss: 2.716206 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,001 | mfu: 50.81 | epoch: 1 | total time: 79.01m | eta: 100.6m +step 07354/16704 (44.03%) | loss: 2.718801 | lrm: 1.00 | dt: 641.81ms | tok/sec: 816,883 | mfu: 51.06 | epoch: 1 | total time: 79.02m | eta: 100.6m +step 07355/16704 (44.03%) | loss: 2.721998 | lrm: 1.00 | dt: 642.94ms | tok/sec: 815,456 | mfu: 50.97 | epoch: 1 | total time: 79.03m | eta: 100.6m +step 07356/16704 (44.04%) | loss: 2.724526 | lrm: 1.00 | dt: 641.14ms | tok/sec: 817,738 | mfu: 51.11 | epoch: 1 | total time: 79.04m | eta: 100.6m +step 07357/16704 (44.04%) | loss: 2.737100 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,125 | mfu: 51.01 | epoch: 1 | total time: 79.05m | eta: 100.6m +step 07358/16704 (44.05%) | loss: 2.743589 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,381 | mfu: 50.71 | epoch: 1 | total time: 79.06m | eta: 100.6m +step 07359/16704 (44.06%) | loss: 2.745135 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,125 | mfu: 50.95 | epoch: 1 | total time: 79.07m | eta: 100.6m +step 07360/16704 (44.06%) | loss: 2.743289 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,950 | mfu: 50.94 | epoch: 1 | total time: 79.09m | eta: 100.5m +step 07361/16704 (44.07%) | loss: 2.738471 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,323 | mfu: 50.96 | epoch: 1 | total time: 79.10m | eta: 100.5m +step 07362/16704 (44.07%) | loss: 2.725966 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 1 | total time: 79.11m | eta: 100.5m +step 07363/16704 (44.08%) | loss: 2.730952 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,333 | mfu: 50.71 | epoch: 1 | total time: 79.12m | eta: 100.5m +step 07364/16704 (44.09%) | loss: 2.735730 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,373 | mfu: 50.77 | epoch: 1 | total time: 79.13m | eta: 100.5m +step 07365/16704 (44.09%) | loss: 
2.731009 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,884 | mfu: 50.99 | epoch: 1 | total time: 79.14m | eta: 100.5m +step 07366/16704 (44.10%) | loss: 2.736695 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,597 | mfu: 50.79 | epoch: 1 | total time: 79.15m | eta: 100.5m +step 07367/16704 (44.10%) | loss: 2.733282 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,859 | mfu: 50.99 | epoch: 1 | total time: 79.16m | eta: 100.5m +step 07368/16704 (44.11%) | loss: 2.734467 | lrm: 1.00 | dt: 646.55ms | tok/sec: 810,902 | mfu: 50.68 | epoch: 1 | total time: 79.17m | eta: 100.5m +step 07369/16704 (44.12%) | loss: 2.740373 | lrm: 1.00 | dt: 641.23ms | tok/sec: 817,622 | mfu: 51.10 | epoch: 1 | total time: 79.18m | eta: 100.4m +step 07370/16704 (44.12%) | loss: 2.758622 | lrm: 1.00 | dt: 642.74ms | tok/sec: 815,712 | mfu: 50.98 | epoch: 1 | total time: 79.19m | eta: 100.4m +step 07371/16704 (44.13%) | loss: 2.737434 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,583 | mfu: 50.98 | epoch: 1 | total time: 79.20m | eta: 100.4m +step 07372/16704 (44.13%) | loss: 2.731945 | lrm: 1.00 | dt: 643.85ms | tok/sec: 814,295 | mfu: 50.89 | epoch: 1 | total time: 79.21m | eta: 100.4m +step 07373/16704 (44.14%) | loss: 2.732267 | lrm: 1.00 | dt: 646.30ms | tok/sec: 811,213 | mfu: 50.70 | epoch: 1 | total time: 79.22m | eta: 100.4m +step 07374/16704 (44.15%) | loss: 2.723733 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,120 | mfu: 51.01 | epoch: 1 | total time: 79.24m | eta: 100.4m +step 07375/16704 (44.15%) | loss: 2.726536 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,497 | mfu: 50.97 | epoch: 1 | total time: 79.25m | eta: 100.4m +step 07376/16704 (44.16%) | loss: 2.730131 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,443 | mfu: 50.97 | epoch: 1 | total time: 79.26m | eta: 100.4m +step 07377/16704 (44.16%) | loss: 2.728171 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,319 | mfu: 50.90 | epoch: 1 | total time: 79.27m | eta: 100.4m +step 07378/16704 (44.17%) | loss: 2.726308 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,853 | mfu: 
50.93 | epoch: 1 | total time: 79.28m | eta: 100.3m +step 07379/16704 (44.18%) | loss: 2.718199 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,919 | mfu: 50.75 | epoch: 1 | total time: 79.29m | eta: 100.3m +step 07380/16704 (44.18%) | loss: 2.726733 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,551 | mfu: 50.97 | epoch: 1 | total time: 79.30m | eta: 100.3m +step 07381/16704 (44.19%) | loss: 2.728788 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,821 | mfu: 50.80 | epoch: 1 | total time: 79.31m | eta: 100.3m +step 07382/16704 (44.19%) | loss: 2.723690 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,974 | mfu: 50.81 | epoch: 1 | total time: 79.32m | eta: 100.3m +step 07383/16704 (44.20%) | loss: 2.727432 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,465 | mfu: 50.91 | epoch: 1 | total time: 79.33m | eta: 100.3m +step 07384/16704 (44.20%) | loss: 2.715956 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,830 | mfu: 50.74 | epoch: 1 | total time: 79.34m | eta: 100.3m +step 07385/16704 (44.21%) | loss: 2.720347 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,266 | mfu: 50.83 | epoch: 1 | total time: 79.35m | eta: 100.3m +step 07386/16704 (44.22%) | loss: 2.709986 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,220 | mfu: 50.89 | epoch: 1 | total time: 79.36m | eta: 100.3m +step 07387/16704 (44.22%) | loss: 2.715735 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 1 | total time: 79.38m | eta: 100.2m +step 07388/16704 (44.23%) | loss: 2.716387 | lrm: 1.00 | dt: 643.01ms | tok/sec: 815,367 | mfu: 50.96 | epoch: 1 | total time: 79.39m | eta: 100.2m +step 07389/16704 (44.23%) | loss: 2.714684 | lrm: 1.00 | dt: 642.53ms | tok/sec: 815,975 | mfu: 51.00 | epoch: 1 | total time: 79.40m | eta: 100.2m +step 07390/16704 (44.24%) | loss: 2.723469 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,143 | mfu: 50.89 | epoch: 1 | total time: 79.41m | eta: 100.2m +step 07391/16704 (44.25%) | loss: 2.729478 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,197 | mfu: 50.89 | epoch: 1 | total time: 79.42m | eta: 100.2m +step 
07392/16704 (44.25%) | loss: 2.740018 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,273 | mfu: 50.89 | epoch: 1 | total time: 79.43m | eta: 100.2m +step 07393/16704 (44.26%) | loss: 2.739047 | lrm: 1.00 | dt: 646.31ms | tok/sec: 811,199 | mfu: 50.70 | epoch: 1 | total time: 79.44m | eta: 100.2m +step 07394/16704 (44.26%) | loss: 2.743621 | lrm: 1.00 | dt: 641.53ms | tok/sec: 817,248 | mfu: 51.08 | epoch: 1 | total time: 79.45m | eta: 100.2m +step 07395/16704 (44.27%) | loss: 2.745967 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,896 | mfu: 50.99 | epoch: 1 | total time: 79.46m | eta: 100.2m +step 07396/16704 (44.28%) | loss: 2.747883 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,730 | mfu: 50.92 | epoch: 1 | total time: 79.47m | eta: 100.2m +step 07397/16704 (44.28%) | loss: 2.750992 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,722 | mfu: 50.92 | epoch: 1 | total time: 79.48m | eta: 100.1m +step 07398/16704 (44.29%) | loss: 2.748233 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,761 | mfu: 50.80 | epoch: 1 | total time: 79.49m | eta: 100.1m +step 07399/16704 (44.29%) | loss: 2.738187 | lrm: 1.00 | dt: 641.46ms | tok/sec: 817,339 | mfu: 51.08 | epoch: 1 | total time: 79.50m | eta: 100.1m +step 07400/16704 (44.30%) | loss: 2.731826 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,092 | mfu: 50.82 | epoch: 1 | total time: 79.51m | eta: 100.1m +step 07401/16704 (44.31%) | loss: 2.741855 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,753 | mfu: 50.86 | epoch: 1 | total time: 79.53m | eta: 100.1m +step 07402/16704 (44.31%) | loss: 2.752166 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,119 | mfu: 50.95 | epoch: 1 | total time: 79.54m | eta: 100.1m +step 07403/16704 (44.32%) | loss: 2.737481 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,822 | mfu: 50.93 | epoch: 1 | total time: 79.55m | eta: 100.1m +step 07404/16704 (44.32%) | loss: 2.726341 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,610 | mfu: 50.98 | epoch: 1 | total time: 79.56m | eta: 100.1m +step 07405/16704 (44.33%) | loss: 2.738318 | lrm: 1.00 | dt: 
645.97ms | tok/sec: 811,633 | mfu: 50.73 | epoch: 1 | total time: 79.57m | eta: 100.1m +step 07406/16704 (44.34%) | loss: 2.752070 | lrm: 1.00 | dt: 640.54ms | tok/sec: 818,508 | mfu: 51.16 | epoch: 1 | total time: 79.58m | eta: 100.0m +step 07407/16704 (44.34%) | loss: 2.741629 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,732 | mfu: 50.92 | epoch: 1 | total time: 79.59m | eta: 100.0m +step 07408/16704 (44.35%) | loss: 2.740510 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,795 | mfu: 50.86 | epoch: 1 | total time: 79.60m | eta: 100.0m +step 07409/16704 (44.35%) | loss: 2.733911 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,282 | mfu: 50.89 | epoch: 1 | total time: 79.61m | eta: 100.0m +step 07410/16704 (44.36%) | loss: 2.743117 | lrm: 1.00 | dt: 643.09ms | tok/sec: 815,258 | mfu: 50.95 | epoch: 1 | total time: 79.62m | eta: 100.0m +step 07411/16704 (44.37%) | loss: 2.743078 | lrm: 1.00 | dt: 642.41ms | tok/sec: 816,128 | mfu: 51.01 | epoch: 1 | total time: 79.63m | eta: 100.0m +step 07412/16704 (44.37%) | loss: 2.732001 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,622 | mfu: 50.79 | epoch: 1 | total time: 79.64m | eta: 100.0m +step 07413/16704 (44.38%) | loss: 2.732600 | lrm: 1.00 | dt: 642.76ms | tok/sec: 815,683 | mfu: 50.98 | epoch: 1 | total time: 79.65m | eta: 100.0m +step 07414/16704 (44.38%) | loss: 2.723166 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,513 | mfu: 50.72 | epoch: 1 | total time: 79.66m | eta: 100.0m +step 07415/16704 (44.39%) | loss: 2.717359 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,136 | mfu: 50.88 | epoch: 1 | total time: 79.68m | eta: 99.9m +step 07416/16704 (44.40%) | loss: 2.714817 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,100 | mfu: 50.88 | epoch: 1 | total time: 79.69m | eta: 99.9m +step 07417/16704 (44.40%) | loss: 2.720286 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,134 | mfu: 50.70 | epoch: 1 | total time: 79.70m | eta: 99.9m +step 07418/16704 (44.41%) | loss: 2.718611 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,314 | mfu: 50.90 | epoch: 1 | total time: 
79.71m | eta: 99.9m +step 07419/16704 (44.41%) | loss: 2.725537 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,626 | mfu: 50.85 | epoch: 1 | total time: 79.72m | eta: 99.9m +step 07420/16704 (44.42%) | loss: 2.739003 | lrm: 1.00 | dt: 641.13ms | tok/sec: 817,756 | mfu: 51.11 | epoch: 1 | total time: 79.73m | eta: 99.9m +step 07421/16704 (44.43%) | loss: 2.750169 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,645 | mfu: 50.85 | epoch: 1 | total time: 79.74m | eta: 99.9m +step 07422/16704 (44.43%) | loss: 2.745431 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,478 | mfu: 50.84 | epoch: 1 | total time: 79.75m | eta: 99.9m +step 07423/16704 (44.44%) | loss: 2.738915 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,342 | mfu: 50.84 | epoch: 1 | total time: 79.76m | eta: 99.9m +step 07424/16704 (44.44%) | loss: 2.731977 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,148 | mfu: 50.89 | epoch: 1 | total time: 79.77m | eta: 99.8m +step 07425/16704 (44.45%) | loss: 2.723591 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,638 | mfu: 50.98 | epoch: 1 | total time: 79.78m | eta: 99.8m +step 07426/16704 (44.46%) | loss: 2.721828 | lrm: 1.00 | dt: 643.66ms | tok/sec: 814,548 | mfu: 50.91 | epoch: 1 | total time: 79.79m | eta: 99.8m +step 07427/16704 (44.46%) | loss: 2.712036 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,851 | mfu: 50.80 | epoch: 1 | total time: 79.80m | eta: 99.8m +step 07428/16704 (44.47%) | loss: 2.721219 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,222 | mfu: 50.83 | epoch: 1 | total time: 79.82m | eta: 99.8m +step 07429/16704 (44.47%) | loss: 2.723927 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,156 | mfu: 50.95 | epoch: 1 | total time: 79.83m | eta: 99.8m +step 07430/16704 (44.48%) | loss: 2.716495 | lrm: 1.00 | dt: 642.95ms | tok/sec: 815,443 | mfu: 50.97 | epoch: 1 | total time: 79.84m | eta: 99.8m +step 07431/16704 (44.49%) | loss: 2.709441 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,952 | mfu: 50.94 | epoch: 1 | total time: 79.85m | eta: 99.8m +step 07432/16704 (44.49%) | loss: 2.712036 | lrm: 1.00 
| dt: 645.51ms | tok/sec: 812,208 | mfu: 50.76 | epoch: 1 | total time: 79.86m | eta: 99.8m +step 07433/16704 (44.50%) | loss: 2.715012 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,055 | mfu: 50.69 | epoch: 1 | total time: 79.87m | eta: 99.8m +step 07434/16704 (44.50%) | loss: 2.728029 | lrm: 1.00 | dt: 641.57ms | tok/sec: 817,197 | mfu: 51.08 | epoch: 1 | total time: 79.88m | eta: 99.7m +step 07435/16704 (44.51%) | loss: 2.729078 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,692 | mfu: 50.79 | epoch: 1 | total time: 79.89m | eta: 99.7m +step 07436/16704 (44.52%) | loss: 2.724348 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,675 | mfu: 50.92 | epoch: 1 | total time: 79.90m | eta: 99.7m +step 07437/16704 (44.52%) | loss: 2.730328 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,277 | mfu: 50.77 | epoch: 1 | total time: 79.91m | eta: 99.7m +step 07438/16704 (44.53%) | loss: 2.733367 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,919 | mfu: 50.93 | epoch: 1 | total time: 79.92m | eta: 99.7m +step 07439/16704 (44.53%) | loss: 2.728018 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 1 | total time: 79.93m | eta: 99.7m +step 07440/16704 (44.54%) | loss: 2.723334 | lrm: 1.00 | dt: 644.58ms | tok/sec: 813,375 | mfu: 50.84 | epoch: 1 | total time: 79.94m | eta: 99.7m +step 07441/16704 (44.55%) | loss: 2.730761 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,420 | mfu: 50.84 | epoch: 1 | total time: 79.95m | eta: 99.7m +step 07442/16704 (44.55%) | loss: 2.719945 | lrm: 1.00 | dt: 643.13ms | tok/sec: 815,218 | mfu: 50.95 | epoch: 1 | total time: 79.97m | eta: 99.7m +step 07443/16704 (44.56%) | loss: 2.711365 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,741 | mfu: 50.86 | epoch: 1 | total time: 79.98m | eta: 99.6m +step 07444/16704 (44.56%) | loss: 2.716686 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,360 | mfu: 50.90 | epoch: 1 | total time: 79.99m | eta: 99.6m +step 07445/16704 (44.57%) | loss: 2.710154 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,162 | mfu: 50.89 | epoch: 1 | total time: 
80.00m | eta: 99.6m +step 07446/16704 (44.58%) | loss: 2.714776 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 1 | total time: 80.01m | eta: 99.6m +step 07447/16704 (44.58%) | loss: 2.730429 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,467 | mfu: 50.78 | epoch: 1 | total time: 80.02m | eta: 99.6m +step 07448/16704 (44.59%) | loss: 2.735849 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,270 | mfu: 50.89 | epoch: 1 | total time: 80.03m | eta: 99.6m +step 07449/16704 (44.59%) | loss: 2.738062 | lrm: 1.00 | dt: 641.98ms | tok/sec: 816,677 | mfu: 51.04 | epoch: 1 | total time: 80.04m | eta: 99.6m +step 07450/16704 (44.60%) | loss: 2.745856 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,738 | mfu: 50.86 | epoch: 1 | total time: 80.05m | eta: 99.6m +step 07451/16704 (44.61%) | loss: 2.753308 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,928 | mfu: 50.81 | epoch: 1 | total time: 80.06m | eta: 99.6m +step 07452/16704 (44.61%) | loss: 2.745497 | lrm: 1.00 | dt: 643.82ms | tok/sec: 814,334 | mfu: 50.90 | epoch: 1 | total time: 80.07m | eta: 99.5m +step 07453/16704 (44.62%) | loss: 2.731655 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,924 | mfu: 50.87 | epoch: 1 | total time: 80.08m | eta: 99.5m +step 07454/16704 (44.62%) | loss: 2.733630 | lrm: 1.00 | dt: 642.56ms | tok/sec: 815,939 | mfu: 51.00 | epoch: 1 | total time: 80.09m | eta: 99.5m +step 07455/16704 (44.63%) | loss: 2.735637 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 1 | total time: 80.10m | eta: 99.5m +step 07456/16704 (44.64%) | loss: 2.736605 | lrm: 1.00 | dt: 643.21ms | tok/sec: 815,114 | mfu: 50.95 | epoch: 1 | total time: 80.12m | eta: 99.5m +step 07457/16704 (44.64%) | loss: 2.736778 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,348 | mfu: 50.96 | epoch: 1 | total time: 80.13m | eta: 99.5m +step 07458/16704 (44.65%) | loss: 2.740194 | lrm: 1.00 | dt: 645.44ms | tok/sec: 812,299 | mfu: 50.77 | epoch: 1 | total time: 80.14m | eta: 99.5m +step 07459/16704 (44.65%) | loss: 2.735349 | lrm: 1.00 
| dt: 643.37ms | tok/sec: 814,907 | mfu: 50.93 | epoch: 1 | total time: 80.15m | eta: 99.5m +step 07460/16704 (44.66%) | loss: 2.738562 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 1 | total time: 80.16m | eta: 99.5m +step 07461/16704 (44.67%) | loss: 2.733381 | lrm: 1.00 | dt: 642.79ms | tok/sec: 815,642 | mfu: 50.98 | epoch: 1 | total time: 80.17m | eta: 99.5m +step 07462/16704 (44.67%) | loss: 2.729898 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,636 | mfu: 50.79 | epoch: 1 | total time: 80.18m | eta: 99.4m +step 07463/16704 (44.68%) | loss: 2.725796 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,643 | mfu: 50.85 | epoch: 1 | total time: 80.19m | eta: 99.4m +step 07464/16704 (44.68%) | loss: 2.739484 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 1 | total time: 80.20m | eta: 99.4m +step 07465/16704 (44.69%) | loss: 2.749236 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,144 | mfu: 50.89 | epoch: 1 | total time: 80.21m | eta: 99.4m +step 07466/16704 (44.70%) | loss: 2.744040 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,371 | mfu: 50.96 | epoch: 1 | total time: 80.22m | eta: 99.4m +step 07467/16704 (44.70%) | loss: 2.733867 | lrm: 1.00 | dt: 641.46ms | tok/sec: 817,331 | mfu: 51.08 | epoch: 1 | total time: 80.23m | eta: 99.4m +step 07468/16704 (44.71%) | loss: 2.735335 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,616 | mfu: 50.91 | epoch: 1 | total time: 80.24m | eta: 99.4m +step 07469/16704 (44.71%) | loss: 2.728211 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,373 | mfu: 50.90 | epoch: 1 | total time: 80.26m | eta: 99.4m +step 07470/16704 (44.72%) | loss: 2.729653 | lrm: 1.00 | dt: 642.57ms | tok/sec: 815,928 | mfu: 51.00 | epoch: 1 | total time: 80.27m | eta: 99.4m +step 07471/16704 (44.73%) | loss: 2.726232 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,314 | mfu: 50.90 | epoch: 1 | total time: 80.28m | eta: 99.3m +step 07472/16704 (44.73%) | loss: 2.725847 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,157 | mfu: 50.70 | epoch: 1 | total time: 
80.29m | eta: 99.3m +step 07473/16704 (44.74%) | loss: 2.716476 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,977 | mfu: 50.75 | epoch: 1 | total time: 80.30m | eta: 99.3m +step 07474/16704 (44.74%) | loss: 2.722128 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,265 | mfu: 50.89 | epoch: 1 | total time: 80.31m | eta: 99.3m +step 07475/16704 (44.75%) | loss: 2.719031 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,493 | mfu: 50.91 | epoch: 1 | total time: 80.32m | eta: 99.3m +step 07476/16704 (44.76%) | loss: 2.720544 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,864 | mfu: 50.87 | epoch: 1 | total time: 80.33m | eta: 99.3m +step 07477/16704 (44.76%) | loss: 2.721207 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,174 | mfu: 50.82 | epoch: 1 | total time: 80.34m | eta: 99.3m +step 07478/16704 (44.77%) | loss: 2.716450 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,645 | mfu: 50.79 | epoch: 1 | total time: 80.35m | eta: 99.3m +step 07479/16704 (44.77%) | loss: 2.711917 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,021 | mfu: 50.94 | epoch: 1 | total time: 80.36m | eta: 99.3m +step 07480/16704 (44.78%) | loss: 2.710864 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,279 | mfu: 50.77 | epoch: 1 | total time: 80.37m | eta: 99.2m +step 07481/16704 (44.79%) | loss: 2.714197 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 1 | total time: 80.38m | eta: 99.2m +step 07482/16704 (44.79%) | loss: 2.708878 | lrm: 1.00 | dt: 644.11ms | tok/sec: 813,973 | mfu: 50.87 | epoch: 1 | total time: 80.39m | eta: 99.2m +step 07483/16704 (44.80%) | loss: 2.716287 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,808 | mfu: 50.86 | epoch: 1 | total time: 80.41m | eta: 99.2m +step 07484/16704 (44.80%) | loss: 2.718644 | lrm: 1.00 | dt: 643.14ms | tok/sec: 815,198 | mfu: 50.95 | epoch: 1 | total time: 80.42m | eta: 99.2m +step 07485/16704 (44.81%) | loss: 2.714826 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,644 | mfu: 50.85 | epoch: 1 | total time: 80.43m | eta: 99.2m +step 07486/16704 (44.82%) | loss: 2.714289 | lrm: 1.00 
| dt: 644.46ms | tok/sec: 813,530 | mfu: 50.85 | epoch: 1 | total time: 80.44m | eta: 99.2m +step 07487/16704 (44.82%) | loss: 2.721357 | lrm: 1.00 | dt: 642.89ms | tok/sec: 815,515 | mfu: 50.97 | epoch: 1 | total time: 80.45m | eta: 99.2m +step 07488/16704 (44.83%) | loss: 2.716380 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,440 | mfu: 50.90 | epoch: 1 | total time: 80.46m | eta: 99.2m +step 07489/16704 (44.83%) | loss: 2.713736 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,463 | mfu: 50.84 | epoch: 1 | total time: 80.47m | eta: 99.1m +step 07490/16704 (44.84%) | loss: 2.700162 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,188 | mfu: 50.70 | epoch: 1 | total time: 80.48m | eta: 99.1m +step 07491/16704 (44.85%) | loss: 2.697824 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,123 | mfu: 50.88 | epoch: 1 | total time: 80.49m | eta: 99.1m +step 07492/16704 (44.85%) | loss: 2.697235 | lrm: 1.00 | dt: 643.33ms | tok/sec: 814,964 | mfu: 50.94 | epoch: 1 | total time: 80.50m | eta: 99.1m +step 07493/16704 (44.86%) | loss: 2.696817 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,361 | mfu: 50.84 | epoch: 1 | total time: 80.51m | eta: 99.1m +step 07494/16704 (44.86%) | loss: 2.707160 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,026 | mfu: 50.82 | epoch: 1 | total time: 80.52m | eta: 99.1m +step 07495/16704 (44.87%) | loss: 2.704161 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,469 | mfu: 50.78 | epoch: 1 | total time: 80.53m | eta: 99.1m +step 07496/16704 (44.88%) | loss: 2.706774 | lrm: 1.00 | dt: 645.09ms | tok/sec: 812,737 | mfu: 50.80 | epoch: 1 | total time: 80.55m | eta: 99.1m +step 07497/16704 (44.88%) | loss: 2.691313 | lrm: 1.00 | dt: 642.19ms | tok/sec: 816,405 | mfu: 51.03 | epoch: 1 | total time: 80.56m | eta: 99.1m +step 07498/16704 (44.89%) | loss: 2.692790 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 1 | total time: 80.57m | eta: 99.1m +step 07499/16704 (44.89%) | loss: 2.679011 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,841 | mfu: 50.74 | epoch: 1 | total time: 
80.58m | eta: 99.0m +Step 07500 | Validation bpb: 0.824761 +step 07500/16704 (44.90%) | loss: 2.681175 | lrm: 1.00 | dt: 641.47ms | tok/sec: 817,322 | mfu: 51.08 | epoch: 1 | total time: 80.59m | eta: 99.0m +step 07501/16704 (44.91%) | loss: 2.691116 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,015 | mfu: 50.88 | epoch: 1 | total time: 80.60m | eta: 99.0m +step 07502/16704 (44.91%) | loss: 2.683425 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,463 | mfu: 50.97 | epoch: 1 | total time: 80.61m | eta: 99.0m +step 07503/16704 (44.92%) | loss: 2.690743 | lrm: 1.00 | dt: 641.56ms | tok/sec: 817,210 | mfu: 51.08 | epoch: 1 | total time: 80.62m | eta: 99.0m +step 07504/16704 (44.92%) | loss: 2.683964 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,769 | mfu: 50.92 | epoch: 1 | total time: 80.63m | eta: 99.0m +step 07505/16704 (44.93%) | loss: 2.677057 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,791 | mfu: 50.80 | epoch: 1 | total time: 80.64m | eta: 99.0m +step 07506/16704 (44.94%) | loss: 2.689922 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,793 | mfu: 50.99 | epoch: 1 | total time: 80.65m | eta: 99.0m +step 07507/16704 (44.94%) | loss: 2.692630 | lrm: 1.00 | dt: 647.09ms | tok/sec: 810,220 | mfu: 50.64 | epoch: 1 | total time: 80.66m | eta: 99.0m +step 07508/16704 (44.95%) | loss: 2.686574 | lrm: 1.00 | dt: 642.52ms | tok/sec: 815,986 | mfu: 51.00 | epoch: 1 | total time: 80.67m | eta: 98.9m +step 07509/16704 (44.95%) | loss: 2.705927 | lrm: 1.00 | dt: 642.17ms | tok/sec: 816,437 | mfu: 51.03 | epoch: 1 | total time: 80.68m | eta: 98.9m +step 07510/16704 (44.96%) | loss: 2.725126 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,245 | mfu: 50.77 | epoch: 1 | total time: 80.70m | eta: 98.9m +step 07511/16704 (44.97%) | loss: 2.714644 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,028 | mfu: 50.94 | epoch: 1 | total time: 80.71m | eta: 98.9m +step 07512/16704 (44.97%) | loss: 2.722071 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,830 | mfu: 50.93 | epoch: 1 | total time: 80.72m | eta: 98.9m +step 
07513/16704 (44.98%) | loss: 2.722088 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,958 | mfu: 50.87 | epoch: 1 | total time: 80.73m | eta: 98.9m +step 07514/16704 (44.98%) | loss: 2.743087 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 1 | total time: 80.74m | eta: 98.9m +step 07515/16704 (44.99%) | loss: 2.737083 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,984 | mfu: 50.94 | epoch: 1 | total time: 80.75m | eta: 98.9m +step 07516/16704 (45.00%) | loss: 2.746832 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,496 | mfu: 50.91 | epoch: 1 | total time: 80.76m | eta: 98.9m +step 07517/16704 (45.00%) | loss: 2.741203 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 1 | total time: 80.77m | eta: 98.8m +step 07518/16704 (45.01%) | loss: 2.732176 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 1 | total time: 80.78m | eta: 98.8m +step 07519/16704 (45.01%) | loss: 2.713539 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 1 | total time: 80.79m | eta: 98.8m +step 07520/16704 (45.02%) | loss: 2.719391 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 1 | total time: 80.80m | eta: 98.8m +step 07521/16704 (45.03%) | loss: 2.724675 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,876 | mfu: 50.87 | epoch: 1 | total time: 80.81m | eta: 98.8m +step 07522/16704 (45.03%) | loss: 2.714996 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,784 | mfu: 50.93 | epoch: 1 | total time: 80.82m | eta: 98.8m +step 07523/16704 (45.04%) | loss: 2.720803 | lrm: 1.00 | dt: 643.04ms | tok/sec: 815,326 | mfu: 50.96 | epoch: 1 | total time: 80.83m | eta: 98.8m +step 07524/16704 (45.04%) | loss: 2.727262 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,243 | mfu: 50.83 | epoch: 1 | total time: 80.85m | eta: 98.8m +step 07525/16704 (45.05%) | loss: 2.741604 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,310 | mfu: 50.90 | epoch: 1 | total time: 80.86m | eta: 98.8m +step 07526/16704 (45.06%) | loss: 2.738131 | lrm: 1.00 | dt: 644.37ms | tok/sec: 
813,646 | mfu: 50.85 | epoch: 1 | total time: 80.87m | eta: 98.7m +step 07527/16704 (45.06%) | loss: 2.732733 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,628 | mfu: 50.79 | epoch: 1 | total time: 80.88m | eta: 98.7m +step 07528/16704 (45.07%) | loss: 2.725877 | lrm: 1.00 | dt: 645.72ms | tok/sec: 811,948 | mfu: 50.75 | epoch: 1 | total time: 80.89m | eta: 98.7m +step 07529/16704 (45.07%) | loss: 2.727727 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,365 | mfu: 50.90 | epoch: 1 | total time: 80.90m | eta: 98.7m +step 07530/16704 (45.08%) | loss: 2.718305 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,514 | mfu: 50.85 | epoch: 1 | total time: 80.91m | eta: 98.7m +step 07531/16704 (45.09%) | loss: 2.716798 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 1 | total time: 80.92m | eta: 98.7m +step 07532/16704 (45.09%) | loss: 2.718910 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 1 | total time: 80.93m | eta: 98.7m +step 07533/16704 (45.10%) | loss: 2.714655 | lrm: 1.00 | dt: 644.20ms | tok/sec: 813,861 | mfu: 50.87 | epoch: 1 | total time: 80.94m | eta: 98.7m +step 07534/16704 (45.10%) | loss: 2.708112 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,960 | mfu: 50.87 | epoch: 1 | total time: 80.95m | eta: 98.7m +step 07535/16704 (45.11%) | loss: 2.706125 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 1 | total time: 80.96m | eta: 98.7m +step 07536/16704 (45.11%) | loss: 2.697664 | lrm: 1.00 | dt: 642.62ms | tok/sec: 815,863 | mfu: 50.99 | epoch: 1 | total time: 80.97m | eta: 98.6m +step 07537/16704 (45.12%) | loss: 2.691447 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,415 | mfu: 50.71 | epoch: 1 | total time: 80.99m | eta: 98.6m +step 07538/16704 (45.13%) | loss: 2.684174 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 1 | total time: 81.00m | eta: 98.6m +step 07539/16704 (45.13%) | loss: 2.671710 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,430 | mfu: 50.97 | epoch: 1 | total time: 81.01m | eta: 98.6m +step 
07540/16704 (45.14%) | loss: 2.676320 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,089 | mfu: 50.76 | epoch: 1 | total time: 81.02m | eta: 98.6m +step 07541/16704 (45.14%) | loss: 2.674926 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,815 | mfu: 50.93 | epoch: 1 | total time: 81.03m | eta: 98.6m +step 07542/16704 (45.15%) | loss: 2.674156 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,008 | mfu: 50.94 | epoch: 1 | total time: 81.04m | eta: 98.6m +step 07543/16704 (45.16%) | loss: 2.668381 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,745 | mfu: 50.74 | epoch: 1 | total time: 81.05m | eta: 98.6m +step 07544/16704 (45.16%) | loss: 2.675420 | lrm: 1.00 | dt: 645.57ms | tok/sec: 812,135 | mfu: 50.76 | epoch: 1 | total time: 81.06m | eta: 98.6m +step 07545/16704 (45.17%) | loss: 2.705537 | lrm: 1.00 | dt: 645.42ms | tok/sec: 812,320 | mfu: 50.77 | epoch: 1 | total time: 81.07m | eta: 98.5m +step 07546/16704 (45.17%) | loss: 2.715746 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,865 | mfu: 50.87 | epoch: 1 | total time: 81.08m | eta: 98.5m +step 07547/16704 (45.18%) | loss: 2.715036 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,774 | mfu: 50.74 | epoch: 1 | total time: 81.09m | eta: 98.5m +step 07548/16704 (45.19%) | loss: 2.715514 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,620 | mfu: 50.79 | epoch: 1 | total time: 81.10m | eta: 98.5m +step 07549/16704 (45.19%) | loss: 2.711432 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,960 | mfu: 51.06 | epoch: 1 | total time: 81.11m | eta: 98.5m +step 07550/16704 (45.20%) | loss: 2.706683 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 1 | total time: 81.12m | eta: 98.5m +step 07551/16704 (45.20%) | loss: 2.711835 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,276 | mfu: 50.83 | epoch: 1 | total time: 81.14m | eta: 98.5m +step 07552/16704 (45.21%) | loss: 2.719415 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,350 | mfu: 50.84 | epoch: 1 | total time: 81.15m | eta: 98.5m +step 07553/16704 (45.22%) | loss: 2.725023 | lrm: 1.00 | dt: 646.84ms | tok/sec: 
810,534 | mfu: 50.66 | epoch: 1 | total time: 81.16m | eta: 98.5m +step 07554/16704 (45.22%) | loss: 2.716065 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,889 | mfu: 50.93 | epoch: 1 | total time: 81.17m | eta: 98.4m +step 07555/16704 (45.23%) | loss: 2.713686 | lrm: 1.00 | dt: 643.84ms | tok/sec: 814,315 | mfu: 50.90 | epoch: 1 | total time: 81.18m | eta: 98.4m +step 07556/16704 (45.23%) | loss: 2.704081 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,535 | mfu: 50.78 | epoch: 1 | total time: 81.19m | eta: 98.4m +step 07557/16704 (45.24%) | loss: 2.703290 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,595 | mfu: 50.79 | epoch: 1 | total time: 81.20m | eta: 98.4m +step 07558/16704 (45.25%) | loss: 2.703829 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,051 | mfu: 50.82 | epoch: 1 | total time: 81.21m | eta: 98.4m +step 07559/16704 (45.25%) | loss: 2.700400 | lrm: 1.00 | dt: 642.27ms | tok/sec: 816,301 | mfu: 51.02 | epoch: 1 | total time: 81.22m | eta: 98.4m +step 07560/16704 (45.26%) | loss: 2.705200 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,225 | mfu: 50.89 | epoch: 1 | total time: 81.23m | eta: 98.4m +step 07561/16704 (45.26%) | loss: 2.681619 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,050 | mfu: 50.69 | epoch: 1 | total time: 81.24m | eta: 98.4m +step 07562/16704 (45.27%) | loss: 2.691589 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,924 | mfu: 50.93 | epoch: 1 | total time: 81.25m | eta: 98.4m +step 07563/16704 (45.28%) | loss: 2.704104 | lrm: 1.00 | dt: 647.61ms | tok/sec: 809,574 | mfu: 50.60 | epoch: 1 | total time: 81.26m | eta: 98.4m +step 07564/16704 (45.28%) | loss: 2.709754 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 1 | total time: 81.28m | eta: 98.3m +step 07565/16704 (45.29%) | loss: 2.728316 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,182 | mfu: 50.76 | epoch: 1 | total time: 81.29m | eta: 98.3m +step 07566/16704 (45.29%) | loss: 2.731875 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,581 | mfu: 50.85 | epoch: 1 | total time: 81.30m | eta: 98.3m +step 
07567/16704 (45.30%) | loss: 2.745414 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,570 | mfu: 50.85 | epoch: 1 | total time: 81.31m | eta: 98.3m +step 07568/16704 (45.31%) | loss: 2.750258 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,348 | mfu: 50.90 | epoch: 1 | total time: 81.32m | eta: 98.3m +step 07569/16704 (45.31%) | loss: 2.738513 | lrm: 1.00 | dt: 646.50ms | tok/sec: 810,965 | mfu: 50.69 | epoch: 1 | total time: 81.33m | eta: 98.3m +step 07570/16704 (45.32%) | loss: 2.726645 | lrm: 1.00 | dt: 642.15ms | tok/sec: 816,461 | mfu: 51.03 | epoch: 1 | total time: 81.34m | eta: 98.3m +step 07571/16704 (45.32%) | loss: 2.723573 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,356 | mfu: 50.90 | epoch: 1 | total time: 81.35m | eta: 98.3m +step 07572/16704 (45.33%) | loss: 2.710303 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,915 | mfu: 50.81 | epoch: 1 | total time: 81.36m | eta: 98.3m +step 07573/16704 (45.34%) | loss: 2.705219 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,489 | mfu: 50.72 | epoch: 1 | total time: 81.37m | eta: 98.2m +step 07574/16704 (45.34%) | loss: 2.690839 | lrm: 1.00 | dt: 643.89ms | tok/sec: 814,252 | mfu: 50.89 | epoch: 1 | total time: 81.38m | eta: 98.2m +step 07575/16704 (45.35%) | loss: 2.677880 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,850 | mfu: 50.87 | epoch: 1 | total time: 81.39m | eta: 98.2m +step 07576/16704 (45.35%) | loss: 2.687040 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,678 | mfu: 50.73 | epoch: 1 | total time: 81.40m | eta: 98.2m +step 07577/16704 (45.36%) | loss: 2.684326 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,375 | mfu: 50.90 | epoch: 1 | total time: 81.41m | eta: 98.2m +step 07578/16704 (45.37%) | loss: 2.691678 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,534 | mfu: 50.78 | epoch: 1 | total time: 81.43m | eta: 98.2m +step 07579/16704 (45.37%) | loss: 2.697439 | lrm: 1.00 | dt: 643.91ms | tok/sec: 814,220 | mfu: 50.89 | epoch: 1 | total time: 81.44m | eta: 98.2m +step 07580/16704 (45.38%) | loss: 2.697920 | lrm: 1.00 | dt: 643.06ms | tok/sec: 
815,296 | mfu: 50.96 | epoch: 1 | total time: 81.45m | eta: 98.2m +step 07581/16704 (45.38%) | loss: 2.700605 | lrm: 1.00 | dt: 645.40ms | tok/sec: 812,342 | mfu: 50.77 | epoch: 1 | total time: 81.46m | eta: 98.2m +step 07582/16704 (45.39%) | loss: 2.707594 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,211 | mfu: 50.89 | epoch: 1 | total time: 81.47m | eta: 98.1m +step 07583/16704 (45.40%) | loss: 2.701934 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,455 | mfu: 50.84 | epoch: 1 | total time: 81.48m | eta: 98.1m +step 07584/16704 (45.40%) | loss: 2.712017 | lrm: 1.00 | dt: 644.12ms | tok/sec: 813,963 | mfu: 50.87 | epoch: 1 | total time: 81.49m | eta: 98.1m +step 07585/16704 (45.41%) | loss: 2.710763 | lrm: 1.00 | dt: 644.46ms | tok/sec: 813,534 | mfu: 50.85 | epoch: 1 | total time: 81.50m | eta: 98.1m +step 07586/16704 (45.41%) | loss: 2.710252 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,794 | mfu: 50.93 | epoch: 1 | total time: 81.51m | eta: 98.1m +step 07587/16704 (45.42%) | loss: 2.697111 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,973 | mfu: 50.81 | epoch: 1 | total time: 81.52m | eta: 98.1m +step 07588/16704 (45.43%) | loss: 2.692002 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,114 | mfu: 50.82 | epoch: 1 | total time: 81.53m | eta: 98.1m +step 07589/16704 (45.43%) | loss: 2.710676 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,819 | mfu: 50.86 | epoch: 1 | total time: 81.54m | eta: 98.1m +step 07590/16704 (45.44%) | loss: 2.687662 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,091 | mfu: 50.76 | epoch: 1 | total time: 81.55m | eta: 98.1m +step 07591/16704 (45.44%) | loss: 2.693830 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,558 | mfu: 50.91 | epoch: 1 | total time: 81.57m | eta: 98.0m +step 07592/16704 (45.45%) | loss: 2.697928 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,129 | mfu: 50.82 | epoch: 1 | total time: 81.58m | eta: 98.0m +step 07593/16704 (45.46%) | loss: 2.699559 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,472 | mfu: 50.78 | epoch: 1 | total time: 81.59m | eta: 98.0m +step 
07594/16704 (45.46%) | loss: 2.689674 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,292 | mfu: 50.96 | epoch: 1 | total time: 81.60m | eta: 98.0m +step 07595/16704 (45.47%) | loss: 2.690184 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,714 | mfu: 50.86 | epoch: 1 | total time: 81.61m | eta: 98.0m +step 07596/16704 (45.47%) | loss: 2.693154 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,927 | mfu: 50.87 | epoch: 1 | total time: 81.62m | eta: 98.0m +step 07597/16704 (45.48%) | loss: 2.688381 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,110 | mfu: 50.82 | epoch: 1 | total time: 81.63m | eta: 98.0m +step 07598/16704 (45.49%) | loss: 2.688679 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,901 | mfu: 50.93 | epoch: 1 | total time: 81.64m | eta: 98.0m +step 07599/16704 (45.49%) | loss: 2.683718 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,043 | mfu: 50.82 | epoch: 1 | total time: 81.65m | eta: 98.0m +step 07600/16704 (45.50%) | loss: 2.689363 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 1 | total time: 81.66m | eta: 98.0m +step 07601/16704 (45.50%) | loss: 2.701387 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,685 | mfu: 50.79 | epoch: 1 | total time: 81.67m | eta: 97.9m +step 07602/16704 (45.51%) | loss: 2.694232 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,243 | mfu: 50.83 | epoch: 1 | total time: 81.68m | eta: 97.9m +step 07603/16704 (45.52%) | loss: 2.691433 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 1 | total time: 81.69m | eta: 97.9m +step 07604/16704 (45.52%) | loss: 2.685738 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,846 | mfu: 50.80 | epoch: 1 | total time: 81.70m | eta: 97.9m +step 07605/16704 (45.53%) | loss: 2.680158 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,657 | mfu: 50.79 | epoch: 1 | total time: 81.72m | eta: 97.9m +step 07606/16704 (45.53%) | loss: 2.678227 | lrm: 1.00 | dt: 643.43ms | tok/sec: 814,838 | mfu: 50.93 | epoch: 1 | total time: 81.73m | eta: 97.9m +step 07607/16704 (45.54%) | loss: 2.686056 | lrm: 1.00 | dt: 643.62ms | tok/sec: 
814,590 | mfu: 50.91 | epoch: 1 | total time: 81.74m | eta: 97.9m +step 07608/16704 (45.55%) | loss: 2.693710 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,011 | mfu: 50.81 | epoch: 1 | total time: 81.75m | eta: 97.9m +step 07609/16704 (45.55%) | loss: 2.697677 | lrm: 1.00 | dt: 643.46ms | tok/sec: 814,792 | mfu: 50.93 | epoch: 1 | total time: 81.76m | eta: 97.9m +step 07610/16704 (45.56%) | loss: 2.699770 | lrm: 1.00 | dt: 646.75ms | tok/sec: 810,654 | mfu: 50.67 | epoch: 1 | total time: 81.77m | eta: 97.8m +step 07611/16704 (45.56%) | loss: 2.701565 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,835 | mfu: 50.74 | epoch: 1 | total time: 81.78m | eta: 97.8m +step 07612/16704 (45.57%) | loss: 2.706937 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,944 | mfu: 50.81 | epoch: 1 | total time: 81.79m | eta: 97.8m +step 07613/16704 (45.58%) | loss: 2.710086 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 1 | total time: 81.80m | eta: 97.8m +step 07614/16704 (45.58%) | loss: 2.703863 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,006 | mfu: 50.75 | epoch: 1 | total time: 81.81m | eta: 97.8m +step 07615/16704 (45.59%) | loss: 2.718166 | lrm: 1.00 | dt: 641.62ms | tok/sec: 817,131 | mfu: 51.07 | epoch: 1 | total time: 81.82m | eta: 97.8m +step 07616/16704 (45.59%) | loss: 2.707564 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,497 | mfu: 50.72 | epoch: 1 | total time: 81.83m | eta: 97.8m +step 07617/16704 (45.60%) | loss: 2.713281 | lrm: 1.00 | dt: 645.90ms | tok/sec: 811,712 | mfu: 50.73 | epoch: 1 | total time: 81.84m | eta: 97.8m +step 07618/16704 (45.61%) | loss: 2.713530 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,798 | mfu: 50.86 | epoch: 1 | total time: 81.86m | eta: 97.8m +step 07619/16704 (45.61%) | loss: 2.730174 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 1 | total time: 81.87m | eta: 97.7m +step 07620/16704 (45.62%) | loss: 2.725725 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,847 | mfu: 50.74 | epoch: 1 | total time: 81.88m | eta: 97.7m +step 
07621/16704 (45.62%) | loss: 2.703675 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,627 | mfu: 50.92 | epoch: 1 | total time: 81.89m | eta: 97.7m +step 07622/16704 (45.63%) | loss: 2.701147 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,984 | mfu: 50.94 | epoch: 1 | total time: 81.90m | eta: 97.7m +step 07623/16704 (45.64%) | loss: 2.695449 | lrm: 1.00 | dt: 649.21ms | tok/sec: 807,579 | mfu: 50.47 | epoch: 1 | total time: 81.91m | eta: 97.7m +step 07624/16704 (45.64%) | loss: 2.694849 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,969 | mfu: 51.06 | epoch: 1 | total time: 81.92m | eta: 97.7m +step 07625/16704 (45.65%) | loss: 2.686031 | lrm: 1.00 | dt: 648.24ms | tok/sec: 808,789 | mfu: 50.55 | epoch: 1 | total time: 81.93m | eta: 97.7m +step 07626/16704 (45.65%) | loss: 2.681984 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,160 | mfu: 50.82 | epoch: 1 | total time: 81.94m | eta: 97.7m +step 07627/16704 (45.66%) | loss: 2.696397 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,665 | mfu: 50.86 | epoch: 1 | total time: 81.95m | eta: 97.7m +step 07628/16704 (45.67%) | loss: 2.687392 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,761 | mfu: 50.74 | epoch: 1 | total time: 81.96m | eta: 97.6m +step 07629/16704 (45.67%) | loss: 2.676425 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,127 | mfu: 50.95 | epoch: 1 | total time: 81.97m | eta: 97.6m +step 07630/16704 (45.68%) | loss: 2.682118 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,736 | mfu: 50.73 | epoch: 1 | total time: 81.98m | eta: 97.6m +step 07631/16704 (45.68%) | loss: 2.691232 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,375 | mfu: 50.77 | epoch: 1 | total time: 82.00m | eta: 97.6m +step 07632/16704 (45.69%) | loss: 2.686495 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,190 | mfu: 50.83 | epoch: 1 | total time: 82.01m | eta: 97.6m +step 07633/16704 (45.70%) | loss: 2.695474 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,440 | mfu: 50.84 | epoch: 1 | total time: 82.02m | eta: 97.6m +step 07634/16704 (45.70%) | loss: 2.689996 | lrm: 1.00 | dt: 646.35ms | tok/sec: 
811,151 | mfu: 50.70 | epoch: 1 | total time: 82.03m | eta: 97.6m +step 07635/16704 (45.71%) | loss: 2.685241 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,075 | mfu: 50.88 | epoch: 1 | total time: 82.04m | eta: 97.6m +step 07636/16704 (45.71%) | loss: 2.694061 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,682 | mfu: 50.79 | epoch: 1 | total time: 82.05m | eta: 97.6m +step 07637/16704 (45.72%) | loss: 2.696773 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,791 | mfu: 50.74 | epoch: 1 | total time: 82.06m | eta: 97.6m +step 07638/16704 (45.73%) | loss: 2.696536 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,406 | mfu: 50.90 | epoch: 1 | total time: 82.07m | eta: 97.5m +step 07639/16704 (45.73%) | loss: 2.703857 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,856 | mfu: 50.74 | epoch: 1 | total time: 82.08m | eta: 97.5m +step 07640/16704 (45.74%) | loss: 2.697259 | lrm: 1.00 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 1 | total time: 82.09m | eta: 97.5m +step 07641/16704 (45.74%) | loss: 2.697333 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,108 | mfu: 50.88 | epoch: 1 | total time: 82.10m | eta: 97.5m +step 07642/16704 (45.75%) | loss: 2.695891 | lrm: 1.00 | dt: 644.76ms | tok/sec: 813,147 | mfu: 50.82 | epoch: 1 | total time: 82.11m | eta: 97.5m +step 07643/16704 (45.76%) | loss: 2.697260 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 1 | total time: 82.12m | eta: 97.5m +step 07644/16704 (45.76%) | loss: 2.706564 | lrm: 1.00 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 1 | total time: 82.13m | eta: 97.5m +step 07645/16704 (45.77%) | loss: 2.705123 | lrm: 1.00 | dt: 643.30ms | tok/sec: 815,002 | mfu: 50.94 | epoch: 1 | total time: 82.15m | eta: 97.5m +step 07646/16704 (45.77%) | loss: 2.711840 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,232 | mfu: 50.83 | epoch: 1 | total time: 82.16m | eta: 97.5m +step 07647/16704 (45.78%) | loss: 2.720447 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 1 | total time: 82.17m | eta: 97.4m +step 
07648/16704 (45.79%) | loss: 2.726118 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,273 | mfu: 50.83 | epoch: 1 | total time: 82.18m | eta: 97.4m +step 07649/16704 (45.79%) | loss: 2.730713 | lrm: 1.00 | dt: 644.88ms | tok/sec: 813,004 | mfu: 50.81 | epoch: 1 | total time: 82.19m | eta: 97.4m +step 07650/16704 (45.80%) | loss: 2.728698 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,600 | mfu: 50.91 | epoch: 1 | total time: 82.20m | eta: 97.4m +step 07651/16704 (45.80%) | loss: 2.727638 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,464 | mfu: 50.72 | epoch: 1 | total time: 82.21m | eta: 97.4m +step 07652/16704 (45.81%) | loss: 2.731991 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,601 | mfu: 50.85 | epoch: 1 | total time: 82.22m | eta: 97.4m +step 07653/16704 (45.82%) | loss: 2.729344 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,794 | mfu: 50.99 | epoch: 1 | total time: 82.23m | eta: 97.4m +step 07654/16704 (45.82%) | loss: 2.712861 | lrm: 1.00 | dt: 645.47ms | tok/sec: 812,256 | mfu: 50.77 | epoch: 1 | total time: 82.24m | eta: 97.4m +step 07655/16704 (45.83%) | loss: 2.710668 | lrm: 1.00 | dt: 643.34ms | tok/sec: 814,943 | mfu: 50.94 | epoch: 1 | total time: 82.25m | eta: 97.4m +step 07656/16704 (45.83%) | loss: 2.711290 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,468 | mfu: 50.66 | epoch: 1 | total time: 82.26m | eta: 97.3m +step 07657/16704 (45.84%) | loss: 2.704980 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,600 | mfu: 50.85 | epoch: 1 | total time: 82.27m | eta: 97.3m +step 07658/16704 (45.85%) | loss: 2.697305 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,805 | mfu: 50.80 | epoch: 1 | total time: 82.29m | eta: 97.3m +step 07659/16704 (45.85%) | loss: 2.702458 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,222 | mfu: 50.77 | epoch: 1 | total time: 82.30m | eta: 97.3m +step 07660/16704 (45.86%) | loss: 2.694234 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,347 | mfu: 50.90 | epoch: 1 | total time: 82.31m | eta: 97.3m +step 07661/16704 (45.86%) | loss: 2.704074 | lrm: 1.00 | dt: 646.83ms | tok/sec: 
810,556 | mfu: 50.66 | epoch: 1 | total time: 82.32m | eta: 97.3m +step 07662/16704 (45.87%) | loss: 2.706376 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,442 | mfu: 50.90 | epoch: 1 | total time: 82.33m | eta: 97.3m +step 07663/16704 (45.88%) | loss: 2.707277 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,361 | mfu: 50.90 | epoch: 1 | total time: 82.34m | eta: 97.3m +step 07664/16704 (45.88%) | loss: 2.710873 | lrm: 1.00 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 1 | total time: 82.35m | eta: 97.3m +step 07665/16704 (45.89%) | loss: 2.706673 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 1 | total time: 82.36m | eta: 97.3m +step 07666/16704 (45.89%) | loss: 2.723546 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,474 | mfu: 50.84 | epoch: 1 | total time: 82.37m | eta: 97.2m +step 07667/16704 (45.90%) | loss: 2.709126 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,092 | mfu: 50.88 | epoch: 1 | total time: 82.38m | eta: 97.2m +step 07668/16704 (45.91%) | loss: 2.706755 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,881 | mfu: 50.81 | epoch: 1 | total time: 82.39m | eta: 97.2m +step 07669/16704 (45.91%) | loss: 2.712145 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,577 | mfu: 50.91 | epoch: 1 | total time: 82.40m | eta: 97.2m +step 07670/16704 (45.92%) | loss: 2.714696 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,662 | mfu: 50.79 | epoch: 1 | total time: 82.41m | eta: 97.2m +step 07671/16704 (45.92%) | loss: 2.701130 | lrm: 1.00 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 1 | total time: 82.42m | eta: 97.2m +step 07672/16704 (45.93%) | loss: 2.703268 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,498 | mfu: 50.78 | epoch: 1 | total time: 82.44m | eta: 97.2m +step 07673/16704 (45.94%) | loss: 2.710069 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,179 | mfu: 50.95 | epoch: 1 | total time: 82.45m | eta: 97.2m +step 07674/16704 (45.94%) | loss: 2.710939 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 1 | total time: 82.46m | eta: 97.2m +step 
07675/16704 (45.95%) | loss: 2.709140 | lrm: 1.00 | dt: 646.46ms | tok/sec: 811,009 | mfu: 50.69 | epoch: 1 | total time: 82.47m | eta: 97.1m +step 07676/16704 (45.95%) | loss: 2.700237 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,261 | mfu: 50.89 | epoch: 1 | total time: 82.48m | eta: 97.1m +step 07677/16704 (45.96%) | loss: 2.708517 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,100 | mfu: 50.88 | epoch: 1 | total time: 82.49m | eta: 97.1m +step 07678/16704 (45.97%) | loss: 2.707077 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,418 | mfu: 50.71 | epoch: 1 | total time: 82.50m | eta: 97.1m +step 07679/16704 (45.97%) | loss: 2.702868 | lrm: 1.00 | dt: 642.01ms | tok/sec: 816,638 | mfu: 51.04 | epoch: 1 | total time: 82.51m | eta: 97.1m +step 07680/16704 (45.98%) | loss: 2.699406 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,917 | mfu: 50.87 | epoch: 1 | total time: 82.52m | eta: 97.1m +step 07681/16704 (45.98%) | loss: 2.710276 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,181 | mfu: 50.76 | epoch: 1 | total time: 82.53m | eta: 97.1m +step 07682/16704 (45.99%) | loss: 2.711880 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 1 | total time: 82.54m | eta: 97.1m +step 07683/16704 (45.99%) | loss: 2.707408 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 1 | total time: 82.55m | eta: 97.1m +step 07684/16704 (46.00%) | loss: 2.711365 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,927 | mfu: 50.87 | epoch: 1 | total time: 82.56m | eta: 97.0m +step 07685/16704 (46.01%) | loss: 2.712015 | lrm: 1.00 | dt: 642.68ms | tok/sec: 815,790 | mfu: 50.99 | epoch: 1 | total time: 82.58m | eta: 97.0m +step 07686/16704 (46.01%) | loss: 2.715846 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,171 | mfu: 50.82 | epoch: 1 | total time: 82.59m | eta: 97.0m +step 07687/16704 (46.02%) | loss: 2.708504 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,164 | mfu: 50.89 | epoch: 1 | total time: 82.60m | eta: 97.0m +step 07688/16704 (46.02%) | loss: 2.702434 | lrm: 1.00 | dt: 644.34ms | tok/sec: 
813,676 | mfu: 50.86 | epoch: 1 | total time: 82.61m | eta: 97.0m +step 07689/16704 (46.03%) | loss: 2.692414 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,502 | mfu: 50.85 | epoch: 1 | total time: 82.62m | eta: 97.0m +step 07690/16704 (46.04%) | loss: 2.684228 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,742 | mfu: 50.86 | epoch: 1 | total time: 82.63m | eta: 97.0m +step 07691/16704 (46.04%) | loss: 2.686441 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,081 | mfu: 50.94 | epoch: 1 | total time: 82.64m | eta: 97.0m +step 07692/16704 (46.05%) | loss: 2.694889 | lrm: 1.00 | dt: 647.16ms | tok/sec: 810,132 | mfu: 50.63 | epoch: 1 | total time: 82.65m | eta: 97.0m +step 07693/16704 (46.05%) | loss: 2.689816 | lrm: 1.00 | dt: 641.23ms | tok/sec: 817,626 | mfu: 51.10 | epoch: 1 | total time: 82.66m | eta: 96.9m +step 07694/16704 (46.06%) | loss: 2.691790 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,798 | mfu: 50.74 | epoch: 1 | total time: 82.67m | eta: 96.9m +step 07695/16704 (46.07%) | loss: 2.689305 | lrm: 1.00 | dt: 642.78ms | tok/sec: 815,653 | mfu: 50.98 | epoch: 1 | total time: 82.68m | eta: 96.9m +step 07696/16704 (46.07%) | loss: 2.691463 | lrm: 1.00 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 1 | total time: 82.69m | eta: 96.9m +step 07697/16704 (46.08%) | loss: 2.693163 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,837 | mfu: 50.87 | epoch: 1 | total time: 82.70m | eta: 96.9m +step 07698/16704 (46.08%) | loss: 2.685835 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,502 | mfu: 50.85 | epoch: 1 | total time: 82.71m | eta: 96.9m +step 07699/16704 (46.09%) | loss: 2.698361 | lrm: 1.00 | dt: 643.92ms | tok/sec: 814,217 | mfu: 50.89 | epoch: 1 | total time: 82.73m | eta: 96.9m +step 07700/16704 (46.10%) | loss: 2.699793 | lrm: 1.00 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 1 | total time: 82.74m | eta: 96.9m +step 07701/16704 (46.10%) | loss: 2.706504 | lrm: 1.00 | dt: 646.62ms | tok/sec: 810,815 | mfu: 50.68 | epoch: 1 | total time: 82.75m | eta: 96.9m +step 
07702/16704 (46.11%) | loss: 2.703880 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,394 | mfu: 50.90 | epoch: 1 | total time: 82.76m | eta: 96.9m +step 07703/16704 (46.11%) | loss: 2.694099 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,071 | mfu: 50.82 | epoch: 1 | total time: 82.77m | eta: 96.8m +step 07704/16704 (46.12%) | loss: 2.710779 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 1 | total time: 82.78m | eta: 96.8m +step 07705/16704 (46.13%) | loss: 2.705206 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,022 | mfu: 50.88 | epoch: 1 | total time: 82.79m | eta: 96.8m +step 07706/16704 (46.13%) | loss: 2.693545 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,540 | mfu: 50.78 | epoch: 1 | total time: 82.80m | eta: 96.8m +step 07707/16704 (46.14%) | loss: 2.691599 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,267 | mfu: 50.83 | epoch: 1 | total time: 82.81m | eta: 96.8m +step 07708/16704 (46.14%) | loss: 2.686347 | lrm: 1.00 | dt: 643.06ms | tok/sec: 815,297 | mfu: 50.96 | epoch: 1 | total time: 82.82m | eta: 96.8m +step 07709/16704 (46.15%) | loss: 2.688158 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,883 | mfu: 50.81 | epoch: 1 | total time: 82.83m | eta: 96.8m +step 07710/16704 (46.16%) | loss: 2.703150 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,099 | mfu: 50.88 | epoch: 1 | total time: 82.84m | eta: 96.8m +step 07711/16704 (46.16%) | loss: 2.720306 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,468 | mfu: 50.97 | epoch: 1 | total time: 82.85m | eta: 96.8m +step 07712/16704 (46.17%) | loss: 2.728986 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,764 | mfu: 50.92 | epoch: 1 | total time: 82.87m | eta: 96.7m +step 07713/16704 (46.17%) | loss: 2.718690 | lrm: 1.00 | dt: 643.63ms | tok/sec: 814,575 | mfu: 50.91 | epoch: 1 | total time: 82.88m | eta: 96.7m +step 07714/16704 (46.18%) | loss: 2.714262 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,495 | mfu: 50.91 | epoch: 1 | total time: 82.89m | eta: 96.7m +step 07715/16704 (46.19%) | loss: 2.711319 | lrm: 1.00 | dt: 646.01ms | tok/sec: 
811,575 | mfu: 50.72 | epoch: 1 | total time: 82.90m | eta: 96.7m +step 07716/16704 (46.19%) | loss: 2.711197 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,010 | mfu: 50.94 | epoch: 1 | total time: 82.91m | eta: 96.7m +step 07717/16704 (46.20%) | loss: 2.710582 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,468 | mfu: 50.78 | epoch: 1 | total time: 82.92m | eta: 96.7m +step 07718/16704 (46.20%) | loss: 2.722260 | lrm: 1.00 | dt: 643.53ms | tok/sec: 814,710 | mfu: 50.92 | epoch: 1 | total time: 82.93m | eta: 96.7m +step 07719/16704 (46.21%) | loss: 2.704784 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,907 | mfu: 50.75 | epoch: 1 | total time: 82.94m | eta: 96.7m +step 07720/16704 (46.22%) | loss: 2.709831 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 1 | total time: 82.95m | eta: 96.7m +step 07721/16704 (46.22%) | loss: 2.714882 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,816 | mfu: 50.93 | epoch: 1 | total time: 82.96m | eta: 96.6m +step 07722/16704 (46.23%) | loss: 2.711852 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,145 | mfu: 50.89 | epoch: 1 | total time: 82.97m | eta: 96.6m +step 07723/16704 (46.23%) | loss: 2.719453 | lrm: 1.00 | dt: 646.90ms | tok/sec: 810,460 | mfu: 50.65 | epoch: 1 | total time: 82.98m | eta: 96.6m +step 07724/16704 (46.24%) | loss: 2.711130 | lrm: 1.00 | dt: 642.98ms | tok/sec: 815,408 | mfu: 50.96 | epoch: 1 | total time: 82.99m | eta: 96.6m +step 07725/16704 (46.25%) | loss: 2.713561 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,906 | mfu: 50.81 | epoch: 1 | total time: 83.00m | eta: 96.6m +step 07726/16704 (46.25%) | loss: 2.706443 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,505 | mfu: 50.78 | epoch: 1 | total time: 83.02m | eta: 96.6m +step 07727/16704 (46.26%) | loss: 2.691398 | lrm: 1.00 | dt: 641.29ms | tok/sec: 817,552 | mfu: 51.10 | epoch: 1 | total time: 83.03m | eta: 96.6m +step 07728/16704 (46.26%) | loss: 2.679608 | lrm: 1.00 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 1 | total time: 83.04m | eta: 96.6m +step 
07729/16704 (46.27%) | loss: 2.690233 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,030 | mfu: 50.88 | epoch: 1 | total time: 83.05m | eta: 96.6m +step 07730/16704 (46.28%) | loss: 2.682288 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,597 | mfu: 50.79 | epoch: 1 | total time: 83.06m | eta: 96.6m +step 07731/16704 (46.28%) | loss: 2.683508 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,143 | mfu: 50.70 | epoch: 1 | total time: 83.07m | eta: 96.5m +step 07732/16704 (46.29%) | loss: 2.671896 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,916 | mfu: 50.81 | epoch: 1 | total time: 83.08m | eta: 96.5m +step 07733/16704 (46.29%) | loss: 2.675514 | lrm: 1.00 | dt: 643.74ms | tok/sec: 814,437 | mfu: 50.90 | epoch: 1 | total time: 83.09m | eta: 96.5m +step 07734/16704 (46.30%) | loss: 2.674893 | lrm: 1.00 | dt: 642.35ms | tok/sec: 816,197 | mfu: 51.01 | epoch: 1 | total time: 83.10m | eta: 96.5m +step 07735/16704 (46.31%) | loss: 2.684741 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,092 | mfu: 50.76 | epoch: 1 | total time: 83.11m | eta: 96.5m +step 07736/16704 (46.31%) | loss: 2.678091 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 1 | total time: 83.12m | eta: 96.5m +step 07737/16704 (46.32%) | loss: 2.674597 | lrm: 1.00 | dt: 645.33ms | tok/sec: 812,435 | mfu: 50.78 | epoch: 1 | total time: 83.13m | eta: 96.5m +step 07738/16704 (46.32%) | loss: 2.668116 | lrm: 1.00 | dt: 643.07ms | tok/sec: 815,289 | mfu: 50.96 | epoch: 1 | total time: 83.14m | eta: 96.5m +step 07739/16704 (46.33%) | loss: 2.681729 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,728 | mfu: 50.86 | epoch: 1 | total time: 83.16m | eta: 96.5m +step 07740/16704 (46.34%) | loss: 2.672258 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,616 | mfu: 50.73 | epoch: 1 | total time: 83.17m | eta: 96.4m +step 07741/16704 (46.34%) | loss: 2.667517 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 1 | total time: 83.18m | eta: 96.4m +step 07742/16704 (46.35%) | loss: 2.681860 | lrm: 1.00 | dt: 644.88ms | tok/sec: 
813,005 | mfu: 50.81 | epoch: 1 | total time: 83.19m | eta: 96.4m +step 07743/16704 (46.35%) | loss: 2.684431 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,208 | mfu: 50.76 | epoch: 1 | total time: 83.20m | eta: 96.4m +step 07744/16704 (46.36%) | loss: 2.679099 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,895 | mfu: 50.93 | epoch: 1 | total time: 83.21m | eta: 96.4m +step 07745/16704 (46.37%) | loss: 2.680728 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 1 | total time: 83.22m | eta: 96.4m +step 07746/16704 (46.37%) | loss: 2.683566 | lrm: 1.00 | dt: 644.38ms | tok/sec: 813,632 | mfu: 50.85 | epoch: 1 | total time: 83.23m | eta: 96.4m +step 07747/16704 (46.38%) | loss: 2.682498 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,020 | mfu: 50.81 | epoch: 1 | total time: 83.24m | eta: 96.4m +step 07748/16704 (46.38%) | loss: 2.679384 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,628 | mfu: 50.79 | epoch: 1 | total time: 83.25m | eta: 96.4m +step 07749/16704 (46.39%) | loss: 2.685008 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,276 | mfu: 50.89 | epoch: 1 | total time: 83.26m | eta: 96.3m +Step 07750 | Validation bpb: 0.823356 +step 07750/16704 (46.40%) | loss: 2.680440 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,485 | mfu: 50.84 | epoch: 1 | total time: 83.27m | eta: 96.3m +step 07751/16704 (46.40%) | loss: 2.677304 | lrm: 1.00 | dt: 647.95ms | tok/sec: 809,142 | mfu: 50.57 | epoch: 1 | total time: 83.28m | eta: 96.3m +step 07752/16704 (46.41%) | loss: 2.684460 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,111 | mfu: 50.76 | epoch: 1 | total time: 83.29m | eta: 96.3m +step 07753/16704 (46.41%) | loss: 2.683157 | lrm: 1.00 | dt: 642.46ms | tok/sec: 816,058 | mfu: 51.00 | epoch: 1 | total time: 83.31m | eta: 96.3m +step 07754/16704 (46.42%) | loss: 2.675701 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,917 | mfu: 50.87 | epoch: 1 | total time: 83.32m | eta: 96.3m +step 07755/16704 (46.43%) | loss: 2.700591 | lrm: 1.00 | dt: 645.69ms | tok/sec: 811,982 | mfu: 50.75 | epoch: 1 | 
total time: 83.33m | eta: 96.3m +step 07756/16704 (46.43%) | loss: 2.699834 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,561 | mfu: 50.91 | epoch: 1 | total time: 83.34m | eta: 96.3m +step 07757/16704 (46.44%) | loss: 2.690350 | lrm: 1.00 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 1 | total time: 83.35m | eta: 96.3m +step 07758/16704 (46.44%) | loss: 2.692971 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,779 | mfu: 50.92 | epoch: 1 | total time: 83.36m | eta: 96.2m +step 07759/16704 (46.45%) | loss: 2.683100 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,210 | mfu: 50.83 | epoch: 1 | total time: 83.37m | eta: 96.2m +step 07760/16704 (46.46%) | loss: 2.683983 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,041 | mfu: 50.75 | epoch: 1 | total time: 83.38m | eta: 96.2m +step 07761/16704 (46.46%) | loss: 2.692502 | lrm: 1.00 | dt: 643.10ms | tok/sec: 815,255 | mfu: 50.95 | epoch: 1 | total time: 83.39m | eta: 96.2m +step 07762/16704 (46.47%) | loss: 2.698007 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,841 | mfu: 50.74 | epoch: 1 | total time: 83.40m | eta: 96.2m +step 07763/16704 (46.47%) | loss: 2.697087 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,917 | mfu: 50.81 | epoch: 1 | total time: 83.41m | eta: 96.2m +step 07764/16704 (46.48%) | loss: 2.695323 | lrm: 1.00 | dt: 642.61ms | tok/sec: 815,868 | mfu: 50.99 | epoch: 1 | total time: 83.42m | eta: 96.2m +step 07765/16704 (46.49%) | loss: 2.695076 | lrm: 1.00 | dt: 646.56ms | tok/sec: 810,887 | mfu: 50.68 | epoch: 1 | total time: 83.43m | eta: 96.2m +step 07766/16704 (46.49%) | loss: 2.695831 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 1 | total time: 83.45m | eta: 96.2m +step 07767/16704 (46.50%) | loss: 2.706354 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,069 | mfu: 50.94 | epoch: 1 | total time: 83.46m | eta: 96.2m +step 07768/16704 (46.50%) | loss: 2.716323 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 1 | total time: 83.47m | eta: 96.1m +step 07769/16704 (46.51%) | loss: 2.738494 
| lrm: 1.00 | dt: 645.95ms | tok/sec: 811,653 | mfu: 50.73 | epoch: 1 | total time: 83.48m | eta: 96.1m +step 07770/16704 (46.52%) | loss: 2.735667 | lrm: 1.00 | dt: 643.00ms | tok/sec: 815,383 | mfu: 50.96 | epoch: 1 | total time: 83.49m | eta: 96.1m +step 07771/16704 (46.52%) | loss: 2.736247 | lrm: 1.00 | dt: 645.77ms | tok/sec: 811,882 | mfu: 50.74 | epoch: 1 | total time: 83.50m | eta: 96.1m +step 07772/16704 (46.53%) | loss: 2.714661 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,273 | mfu: 50.83 | epoch: 1 | total time: 83.51m | eta: 96.1m +step 07773/16704 (46.53%) | loss: 2.709873 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,527 | mfu: 50.91 | epoch: 1 | total time: 83.52m | eta: 96.1m +step 07774/16704 (46.54%) | loss: 2.709539 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,303 | mfu: 50.71 | epoch: 1 | total time: 83.53m | eta: 96.1m +step 07775/16704 (46.55%) | loss: 2.721093 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,570 | mfu: 50.79 | epoch: 1 | total time: 83.54m | eta: 96.1m +step 07776/16704 (46.55%) | loss: 2.728712 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,977 | mfu: 50.81 | epoch: 1 | total time: 83.55m | eta: 96.1m +step 07777/16704 (46.56%) | loss: 2.737074 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,054 | mfu: 50.82 | epoch: 1 | total time: 83.56m | eta: 96.0m +step 07778/16704 (46.56%) | loss: 2.734770 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,664 | mfu: 50.79 | epoch: 1 | total time: 83.57m | eta: 96.0m +step 07779/16704 (46.57%) | loss: 2.732658 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,821 | mfu: 50.74 | epoch: 1 | total time: 83.58m | eta: 96.0m +step 07780/16704 (46.58%) | loss: 2.719377 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 1 | total time: 83.60m | eta: 96.0m +step 07781/16704 (46.58%) | loss: 2.717578 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,598 | mfu: 50.85 | epoch: 1 | total time: 83.61m | eta: 96.0m +step 07782/16704 (46.59%) | loss: 2.717685 | lrm: 1.00 | dt: 645.13ms | tok/sec: 812,688 | mfu: 50.79 | epoch: 1 | 
total time: 83.62m | eta: 96.0m +step 07783/16704 (46.59%) | loss: 2.722330 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,230 | mfu: 50.83 | epoch: 1 | total time: 83.63m | eta: 96.0m +step 07784/16704 (46.60%) | loss: 2.720705 | lrm: 1.00 | dt: 643.03ms | tok/sec: 815,345 | mfu: 50.96 | epoch: 1 | total time: 83.64m | eta: 96.0m +step 07785/16704 (46.61%) | loss: 2.723460 | lrm: 1.00 | dt: 644.51ms | tok/sec: 813,466 | mfu: 50.84 | epoch: 1 | total time: 83.65m | eta: 96.0m +step 07786/16704 (46.61%) | loss: 2.729671 | lrm: 1.00 | dt: 645.24ms | tok/sec: 812,552 | mfu: 50.79 | epoch: 1 | total time: 83.66m | eta: 95.9m +step 07787/16704 (46.62%) | loss: 2.732351 | lrm: 1.00 | dt: 645.45ms | tok/sec: 812,277 | mfu: 50.77 | epoch: 1 | total time: 83.67m | eta: 95.9m +step 07788/16704 (46.62%) | loss: 2.732223 | lrm: 1.00 | dt: 646.61ms | tok/sec: 810,825 | mfu: 50.68 | epoch: 1 | total time: 83.68m | eta: 95.9m +step 07789/16704 (46.63%) | loss: 2.722357 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,598 | mfu: 50.85 | epoch: 1 | total time: 83.69m | eta: 95.9m +step 07790/16704 (46.64%) | loss: 2.722189 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,915 | mfu: 50.81 | epoch: 1 | total time: 83.70m | eta: 95.9m +step 07791/16704 (46.64%) | loss: 2.728692 | lrm: 1.00 | dt: 647.64ms | tok/sec: 809,535 | mfu: 50.60 | epoch: 1 | total time: 83.71m | eta: 95.9m +step 07792/16704 (46.65%) | loss: 2.724333 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,905 | mfu: 50.81 | epoch: 1 | total time: 83.72m | eta: 95.9m +step 07793/16704 (46.65%) | loss: 2.723213 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,682 | mfu: 50.73 | epoch: 1 | total time: 83.74m | eta: 95.9m +step 07794/16704 (46.66%) | loss: 2.727516 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,819 | mfu: 50.86 | epoch: 1 | total time: 83.75m | eta: 95.9m +step 07795/16704 (46.67%) | loss: 2.723694 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,426 | mfu: 50.72 | epoch: 1 | total time: 83.76m | eta: 95.8m +step 07796/16704 (46.67%) | loss: 2.720342 
| lrm: 1.00 | dt: 645.38ms | tok/sec: 812,373 | mfu: 50.77 | epoch: 1 | total time: 83.77m | eta: 95.8m +step 07797/16704 (46.68%) | loss: 2.700501 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 1 | total time: 83.78m | eta: 95.8m +step 07798/16704 (46.68%) | loss: 2.691589 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,321 | mfu: 50.83 | epoch: 1 | total time: 83.79m | eta: 95.8m +step 07799/16704 (46.69%) | loss: 2.695974 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,015 | mfu: 50.75 | epoch: 1 | total time: 83.80m | eta: 95.8m +step 07800/16704 (46.70%) | loss: 2.681047 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,151 | mfu: 50.70 | epoch: 1 | total time: 83.81m | eta: 95.8m +step 07801/16704 (46.70%) | loss: 2.677053 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,191 | mfu: 50.83 | epoch: 1 | total time: 83.82m | eta: 95.8m +step 07802/16704 (46.71%) | loss: 2.685465 | lrm: 1.00 | dt: 648.24ms | tok/sec: 808,790 | mfu: 50.55 | epoch: 1 | total time: 83.83m | eta: 95.8m +step 07803/16704 (46.71%) | loss: 2.685010 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,393 | mfu: 50.96 | epoch: 1 | total time: 83.84m | eta: 95.8m +step 07804/16704 (46.72%) | loss: 2.698524 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,288 | mfu: 50.83 | epoch: 1 | total time: 83.85m | eta: 95.8m +step 07805/16704 (46.73%) | loss: 2.703321 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,289 | mfu: 50.64 | epoch: 1 | total time: 83.86m | eta: 95.7m +step 07806/16704 (46.73%) | loss: 2.710449 | lrm: 1.00 | dt: 645.58ms | tok/sec: 812,121 | mfu: 50.76 | epoch: 1 | total time: 83.88m | eta: 95.7m +step 07807/16704 (46.74%) | loss: 2.716243 | lrm: 1.00 | dt: 643.90ms | tok/sec: 814,239 | mfu: 50.89 | epoch: 1 | total time: 83.89m | eta: 95.7m +step 07808/16704 (46.74%) | loss: 2.721434 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,515 | mfu: 50.72 | epoch: 1 | total time: 83.90m | eta: 95.7m +step 07809/16704 (46.75%) | loss: 2.724819 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,391 | mfu: 50.78 | epoch: 1 | 
total time: 83.91m | eta: 95.7m +step 07810/16704 (46.76%) | loss: 2.717528 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 1 | total time: 83.92m | eta: 95.7m +step 07811/16704 (46.76%) | loss: 2.726338 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,699 | mfu: 50.67 | epoch: 1 | total time: 83.93m | eta: 95.7m +step 07812/16704 (46.77%) | loss: 2.711607 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,858 | mfu: 50.74 | epoch: 1 | total time: 83.94m | eta: 95.7m +step 07813/16704 (46.77%) | loss: 2.710573 | lrm: 1.00 | dt: 649.52ms | tok/sec: 807,198 | mfu: 50.45 | epoch: 1 | total time: 83.95m | eta: 95.7m +step 07814/16704 (46.78%) | loss: 2.700950 | lrm: 1.00 | dt: 642.69ms | tok/sec: 815,768 | mfu: 50.99 | epoch: 1 | total time: 83.96m | eta: 95.6m +step 07815/16704 (46.79%) | loss: 2.694467 | lrm: 1.00 | dt: 645.64ms | tok/sec: 812,045 | mfu: 50.75 | epoch: 1 | total time: 83.97m | eta: 95.6m +step 07816/16704 (46.79%) | loss: 2.692825 | lrm: 1.00 | dt: 646.52ms | tok/sec: 810,938 | mfu: 50.68 | epoch: 1 | total time: 83.98m | eta: 95.6m +step 07817/16704 (46.80%) | loss: 2.712352 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,256 | mfu: 51.02 | epoch: 1 | total time: 83.99m | eta: 95.6m +step 07818/16704 (46.80%) | loss: 2.720715 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,518 | mfu: 50.72 | epoch: 1 | total time: 84.00m | eta: 95.6m +step 07819/16704 (46.81%) | loss: 2.716020 | lrm: 1.00 | dt: 647.19ms | tok/sec: 810,097 | mfu: 50.63 | epoch: 1 | total time: 84.02m | eta: 95.6m +step 07820/16704 (46.82%) | loss: 2.715054 | lrm: 1.00 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 1 | total time: 84.03m | eta: 95.6m +step 07821/16704 (46.82%) | loss: 2.721040 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,185 | mfu: 50.76 | epoch: 1 | total time: 84.04m | eta: 95.6m +step 07822/16704 (46.83%) | loss: 2.729959 | lrm: 1.00 | dt: 646.82ms | tok/sec: 810,561 | mfu: 50.66 | epoch: 1 | total time: 84.05m | eta: 95.6m +step 07823/16704 (46.83%) | loss: 2.728354 
| lrm: 1.00 | dt: 647.26ms | tok/sec: 810,005 | mfu: 50.63 | epoch: 1 | total time: 84.06m | eta: 95.5m +step 07824/16704 (46.84%) | loss: 2.725994 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,078 | mfu: 50.76 | epoch: 1 | total time: 84.07m | eta: 95.5m +step 07825/16704 (46.85%) | loss: 2.723771 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,848 | mfu: 50.80 | epoch: 1 | total time: 84.08m | eta: 95.5m +step 07826/16704 (46.85%) | loss: 2.720738 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 1 | total time: 84.09m | eta: 95.5m +step 07827/16704 (46.86%) | loss: 2.718696 | lrm: 1.00 | dt: 644.88ms | tok/sec: 812,996 | mfu: 50.81 | epoch: 1 | total time: 84.10m | eta: 95.5m +step 07828/16704 (46.86%) | loss: 2.705285 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,784 | mfu: 50.86 | epoch: 1 | total time: 84.11m | eta: 95.5m +step 07829/16704 (46.87%) | loss: 2.693349 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,731 | mfu: 50.86 | epoch: 1 | total time: 84.12m | eta: 95.5m +step 07830/16704 (46.88%) | loss: 2.694455 | lrm: 1.00 | dt: 646.13ms | tok/sec: 811,433 | mfu: 50.72 | epoch: 1 | total time: 84.13m | eta: 95.5m +step 07831/16704 (46.88%) | loss: 2.699697 | lrm: 1.00 | dt: 645.12ms | tok/sec: 812,694 | mfu: 50.79 | epoch: 1 | total time: 84.14m | eta: 95.5m +step 07832/16704 (46.89%) | loss: 2.697457 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,508 | mfu: 50.85 | epoch: 1 | total time: 84.15m | eta: 95.5m +step 07833/16704 (46.89%) | loss: 2.695978 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 1 | total time: 84.17m | eta: 95.4m +step 07834/16704 (46.90%) | loss: 2.705659 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,048 | mfu: 50.82 | epoch: 1 | total time: 84.18m | eta: 95.4m +step 07835/16704 (46.90%) | loss: 2.699021 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,852 | mfu: 50.80 | epoch: 1 | total time: 84.19m | eta: 95.4m +step 07836/16704 (46.91%) | loss: 2.711041 | lrm: 1.00 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 1 | 
total time: 84.20m | eta: 95.4m +step 07837/16704 (46.92%) | loss: 2.701191 | lrm: 1.00 | dt: 646.92ms | tok/sec: 810,433 | mfu: 50.65 | epoch: 1 | total time: 84.21m | eta: 95.4m +step 07838/16704 (46.92%) | loss: 2.702650 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 1 | total time: 84.22m | eta: 95.4m +step 07839/16704 (46.93%) | loss: 2.704051 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,723 | mfu: 50.86 | epoch: 1 | total time: 84.23m | eta: 95.4m +step 07840/16704 (46.93%) | loss: 2.688625 | lrm: 1.00 | dt: 645.81ms | tok/sec: 811,824 | mfu: 50.74 | epoch: 1 | total time: 84.24m | eta: 95.4m +step 07841/16704 (46.94%) | loss: 2.704719 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,010 | mfu: 50.75 | epoch: 1 | total time: 84.25m | eta: 95.4m +step 07842/16704 (46.95%) | loss: 2.706203 | lrm: 1.00 | dt: 650.36ms | tok/sec: 806,146 | mfu: 50.39 | epoch: 1 | total time: 84.26m | eta: 95.3m +step 07843/16704 (46.95%) | loss: 2.705925 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,996 | mfu: 50.88 | epoch: 1 | total time: 84.27m | eta: 95.3m +step 07844/16704 (46.96%) | loss: 2.699677 | lrm: 1.00 | dt: 644.17ms | tok/sec: 813,895 | mfu: 50.87 | epoch: 1 | total time: 84.28m | eta: 95.3m +step 07845/16704 (46.96%) | loss: 2.703129 | lrm: 1.00 | dt: 645.34ms | tok/sec: 812,416 | mfu: 50.78 | epoch: 1 | total time: 84.29m | eta: 95.3m +step 07846/16704 (46.97%) | loss: 2.706183 | lrm: 1.00 | dt: 646.81ms | tok/sec: 810,579 | mfu: 50.66 | epoch: 1 | total time: 84.31m | eta: 95.3m +step 07847/16704 (46.98%) | loss: 2.695846 | lrm: 1.00 | dt: 645.05ms | tok/sec: 812,787 | mfu: 50.80 | epoch: 1 | total time: 84.32m | eta: 95.3m +step 07848/16704 (46.98%) | loss: 2.694123 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,413 | mfu: 50.84 | epoch: 1 | total time: 84.33m | eta: 95.3m +step 07849/16704 (46.99%) | loss: 2.703513 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,409 | mfu: 50.71 | epoch: 1 | total time: 84.34m | eta: 95.3m +step 07850/16704 (46.99%) | loss: 2.700938 
| lrm: 1.00 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 1 | total time: 84.35m | eta: 95.3m +step 07851/16704 (47.00%) | loss: 2.709858 | lrm: 1.00 | dt: 645.89ms | tok/sec: 811,725 | mfu: 50.73 | epoch: 1 | total time: 84.36m | eta: 95.2m +step 07852/16704 (47.01%) | loss: 2.721844 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,921 | mfu: 50.75 | epoch: 1 | total time: 84.37m | eta: 95.2m +step 07853/16704 (47.01%) | loss: 2.713689 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,649 | mfu: 50.92 | epoch: 1 | total time: 84.38m | eta: 95.2m +step 07854/16704 (47.02%) | loss: 2.685116 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,701 | mfu: 50.86 | epoch: 1 | total time: 84.39m | eta: 95.2m +step 07855/16704 (47.02%) | loss: 2.678843 | lrm: 1.00 | dt: 646.27ms | tok/sec: 811,253 | mfu: 50.70 | epoch: 1 | total time: 84.40m | eta: 95.2m +step 07856/16704 (47.03%) | loss: 2.687222 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,008 | mfu: 50.81 | epoch: 1 | total time: 84.41m | eta: 95.2m +step 07857/16704 (47.04%) | loss: 2.693299 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,416 | mfu: 50.84 | epoch: 1 | total time: 84.42m | eta: 95.2m +step 07858/16704 (47.04%) | loss: 2.685600 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.85 | epoch: 1 | total time: 84.43m | eta: 95.2m +step 07859/16704 (47.05%) | loss: 2.686396 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 1 | total time: 84.45m | eta: 95.2m +step 07860/16704 (47.05%) | loss: 2.692930 | lrm: 1.00 | dt: 642.67ms | tok/sec: 815,794 | mfu: 50.99 | epoch: 1 | total time: 84.46m | eta: 95.2m +step 07861/16704 (47.06%) | loss: 2.696089 | lrm: 1.00 | dt: 645.30ms | tok/sec: 812,474 | mfu: 50.78 | epoch: 1 | total time: 84.47m | eta: 95.1m +step 07862/16704 (47.07%) | loss: 2.689418 | lrm: 1.00 | dt: 645.65ms | tok/sec: 812,025 | mfu: 50.75 | epoch: 1 | total time: 84.48m | eta: 95.1m +step 07863/16704 (47.07%) | loss: 2.688806 | lrm: 1.00 | dt: 643.59ms | tok/sec: 814,628 | mfu: 50.92 | epoch: 1 | 
total time: 84.49m | eta: 95.1m +step 07864/16704 (47.08%) | loss: 2.687983 | lrm: 1.00 | dt: 645.39ms | tok/sec: 812,356 | mfu: 50.77 | epoch: 1 | total time: 84.50m | eta: 95.1m +step 07865/16704 (47.08%) | loss: 2.699711 | lrm: 1.00 | dt: 646.51ms | tok/sec: 810,945 | mfu: 50.69 | epoch: 1 | total time: 84.51m | eta: 95.1m +step 07866/16704 (47.09%) | loss: 2.706712 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 1 | total time: 84.52m | eta: 95.1m +step 07867/16704 (47.10%) | loss: 2.713435 | lrm: 1.00 | dt: 643.12ms | tok/sec: 815,224 | mfu: 50.95 | epoch: 1 | total time: 84.53m | eta: 95.1m +step 07868/16704 (47.10%) | loss: 2.708140 | lrm: 1.00 | dt: 647.49ms | tok/sec: 809,723 | mfu: 50.61 | epoch: 1 | total time: 84.54m | eta: 95.1m +step 07869/16704 (47.11%) | loss: 2.710614 | lrm: 1.00 | dt: 644.54ms | tok/sec: 813,425 | mfu: 50.84 | epoch: 1 | total time: 84.55m | eta: 95.1m +step 07870/16704 (47.11%) | loss: 2.707823 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,502 | mfu: 50.72 | epoch: 1 | total time: 84.56m | eta: 95.0m +step 07871/16704 (47.12%) | loss: 2.717323 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,654 | mfu: 50.79 | epoch: 1 | total time: 84.57m | eta: 95.0m +step 07872/16704 (47.13%) | loss: 2.709801 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 1 | total time: 84.59m | eta: 95.0m +step 07873/16704 (47.13%) | loss: 2.719156 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,452 | mfu: 50.72 | epoch: 1 | total time: 84.60m | eta: 95.0m +step 07874/16704 (47.14%) | loss: 2.712253 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,668 | mfu: 50.92 | epoch: 1 | total time: 84.61m | eta: 95.0m +step 07875/16704 (47.14%) | loss: 2.710143 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,645 | mfu: 50.79 | epoch: 1 | total time: 84.62m | eta: 95.0m +step 07876/16704 (47.15%) | loss: 2.712295 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,695 | mfu: 50.98 | epoch: 1 | total time: 84.63m | eta: 95.0m +step 07877/16704 (47.16%) | loss: 2.712425 
| lrm: 1.00 | dt: 645.59ms | tok/sec: 812,105 | mfu: 50.76 | epoch: 1 | total time: 84.64m | eta: 95.0m +step 07878/16704 (47.16%) | loss: 2.716038 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,608 | mfu: 50.85 | epoch: 1 | total time: 84.65m | eta: 95.0m +step 07879/16704 (47.17%) | loss: 2.705592 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 1 | total time: 84.66m | eta: 94.9m +step 07880/16704 (47.17%) | loss: 2.708606 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,182 | mfu: 50.83 | epoch: 1 | total time: 84.67m | eta: 94.9m +step 07881/16704 (47.18%) | loss: 2.717879 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,062 | mfu: 50.88 | epoch: 1 | total time: 84.68m | eta: 94.9m +step 07882/16704 (47.19%) | loss: 2.717188 | lrm: 1.00 | dt: 645.63ms | tok/sec: 812,050 | mfu: 50.75 | epoch: 1 | total time: 84.69m | eta: 94.9m +step 07883/16704 (47.19%) | loss: 2.714934 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,317 | mfu: 50.83 | epoch: 1 | total time: 84.70m | eta: 94.9m +step 07884/16704 (47.20%) | loss: 2.704231 | lrm: 1.00 | dt: 644.99ms | tok/sec: 812,864 | mfu: 50.81 | epoch: 1 | total time: 84.71m | eta: 94.9m +step 07885/16704 (47.20%) | loss: 2.706284 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 1 | total time: 84.72m | eta: 94.9m +step 07886/16704 (47.21%) | loss: 2.708423 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,279 | mfu: 50.83 | epoch: 1 | total time: 84.74m | eta: 94.9m +step 07887/16704 (47.22%) | loss: 2.707084 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,084 | mfu: 50.88 | epoch: 1 | total time: 84.75m | eta: 94.9m +step 07888/16704 (47.22%) | loss: 2.718379 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,537 | mfu: 50.72 | epoch: 1 | total time: 84.76m | eta: 94.8m +step 07889/16704 (47.23%) | loss: 2.717534 | lrm: 1.00 | dt: 646.84ms | tok/sec: 810,531 | mfu: 50.66 | epoch: 1 | total time: 84.77m | eta: 94.8m +step 07890/16704 (47.23%) | loss: 2.716006 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,834 | mfu: 50.87 | epoch: 1 | 
total time: 84.78m | eta: 94.8m +step 07891/16704 (47.24%) | loss: 2.721355 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,058 | mfu: 50.82 | epoch: 1 | total time: 84.79m | eta: 94.8m +step 07892/16704 (47.25%) | loss: 2.725323 | lrm: 1.00 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 1 | total time: 84.80m | eta: 94.8m +step 07893/16704 (47.25%) | loss: 2.729776 | lrm: 1.00 | dt: 648.96ms | tok/sec: 807,885 | mfu: 50.49 | epoch: 1 | total time: 84.81m | eta: 94.8m +step 07894/16704 (47.26%) | loss: 2.744001 | lrm: 1.00 | dt: 643.81ms | tok/sec: 814,349 | mfu: 50.90 | epoch: 1 | total time: 84.82m | eta: 94.8m +step 07895/16704 (47.26%) | loss: 2.734668 | lrm: 1.00 | dt: 645.02ms | tok/sec: 812,819 | mfu: 50.80 | epoch: 1 | total time: 84.83m | eta: 94.8m +step 07896/16704 (47.27%) | loss: 2.733190 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,814 | mfu: 50.74 | epoch: 1 | total time: 84.84m | eta: 94.8m +step 07897/16704 (47.28%) | loss: 2.736719 | lrm: 1.00 | dt: 644.26ms | tok/sec: 813,782 | mfu: 50.86 | epoch: 1 | total time: 84.85m | eta: 94.8m +step 07898/16704 (47.28%) | loss: 2.727953 | lrm: 1.00 | dt: 645.49ms | tok/sec: 812,226 | mfu: 50.77 | epoch: 1 | total time: 84.86m | eta: 94.7m +step 07899/16704 (47.29%) | loss: 2.723281 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,168 | mfu: 50.76 | epoch: 1 | total time: 84.88m | eta: 94.7m +step 07900/16704 (47.29%) | loss: 2.724727 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,282 | mfu: 50.83 | epoch: 1 | total time: 84.89m | eta: 94.7m +step 07901/16704 (47.30%) | loss: 2.714196 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,716 | mfu: 50.92 | epoch: 1 | total time: 84.90m | eta: 94.7m +step 07902/16704 (47.31%) | loss: 2.722341 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,566 | mfu: 50.91 | epoch: 1 | total time: 84.91m | eta: 94.7m +step 07903/16704 (47.31%) | loss: 2.725758 | lrm: 1.00 | dt: 646.39ms | tok/sec: 811,102 | mfu: 50.70 | epoch: 1 | total time: 84.92m | eta: 94.7m +step 07904/16704 (47.32%) | loss: 2.721657 
| lrm: 1.00 | dt: 644.45ms | tok/sec: 813,545 | mfu: 50.85 | epoch: 1 | total time: 84.93m | eta: 94.7m +step 07905/16704 (47.32%) | loss: 2.728608 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 1 | total time: 84.94m | eta: 94.7m +step 07906/16704 (47.33%) | loss: 2.729915 | lrm: 1.00 | dt: 643.39ms | tok/sec: 814,887 | mfu: 50.93 | epoch: 1 | total time: 84.95m | eta: 94.7m +step 07907/16704 (47.34%) | loss: 2.734914 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,359 | mfu: 50.84 | epoch: 1 | total time: 84.96m | eta: 94.6m +step 07908/16704 (47.34%) | loss: 2.741585 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,139 | mfu: 50.88 | epoch: 1 | total time: 84.97m | eta: 94.6m +step 07909/16704 (47.35%) | loss: 2.744134 | lrm: 1.00 | dt: 642.16ms | tok/sec: 816,439 | mfu: 51.03 | epoch: 1 | total time: 84.98m | eta: 94.6m +step 07910/16704 (47.35%) | loss: 2.739536 | lrm: 1.00 | dt: 645.08ms | tok/sec: 812,747 | mfu: 50.80 | epoch: 1 | total time: 84.99m | eta: 94.6m +step 07911/16704 (47.36%) | loss: 2.728261 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,891 | mfu: 50.74 | epoch: 1 | total time: 85.00m | eta: 94.6m +step 07912/16704 (47.37%) | loss: 2.731114 | lrm: 1.00 | dt: 646.42ms | tok/sec: 811,064 | mfu: 50.69 | epoch: 1 | total time: 85.01m | eta: 94.6m +step 07913/16704 (47.37%) | loss: 2.728009 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,895 | mfu: 50.81 | epoch: 1 | total time: 85.03m | eta: 94.6m +step 07914/16704 (47.38%) | loss: 2.736310 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,409 | mfu: 50.90 | epoch: 1 | total time: 85.04m | eta: 94.6m +step 07915/16704 (47.38%) | loss: 2.721667 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,153 | mfu: 50.89 | epoch: 1 | total time: 85.05m | eta: 94.6m +step 07916/16704 (47.39%) | loss: 2.723473 | lrm: 1.00 | dt: 646.69ms | tok/sec: 810,727 | mfu: 50.67 | epoch: 1 | total time: 85.06m | eta: 94.5m +step 07917/16704 (47.40%) | loss: 2.721141 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,828 | mfu: 50.87 | epoch: 1 | 
total time: 85.07m | eta: 94.5m +step 07918/16704 (47.40%) | loss: 2.700356 | lrm: 1.00 | dt: 643.22ms | tok/sec: 815,099 | mfu: 50.94 | epoch: 1 | total time: 85.08m | eta: 94.5m +step 07919/16704 (47.41%) | loss: 2.704440 | lrm: 1.00 | dt: 645.18ms | tok/sec: 812,626 | mfu: 50.79 | epoch: 1 | total time: 85.09m | eta: 94.5m +step 07920/16704 (47.41%) | loss: 2.696741 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,569 | mfu: 50.85 | epoch: 1 | total time: 85.10m | eta: 94.5m +step 07921/16704 (47.42%) | loss: 2.700689 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,388 | mfu: 50.90 | epoch: 1 | total time: 85.11m | eta: 94.5m +step 07922/16704 (47.43%) | loss: 2.704818 | lrm: 1.00 | dt: 643.69ms | tok/sec: 814,497 | mfu: 50.91 | epoch: 1 | total time: 85.12m | eta: 94.5m +step 07923/16704 (47.43%) | loss: 2.720064 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,623 | mfu: 50.67 | epoch: 1 | total time: 85.13m | eta: 94.5m +step 07924/16704 (47.44%) | loss: 2.714935 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,224 | mfu: 50.77 | epoch: 1 | total time: 85.14m | eta: 94.5m +step 07925/16704 (47.44%) | loss: 2.700145 | lrm: 1.00 | dt: 646.21ms | tok/sec: 811,330 | mfu: 50.71 | epoch: 1 | total time: 85.15m | eta: 94.5m +step 07926/16704 (47.45%) | loss: 2.683925 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,162 | mfu: 50.95 | epoch: 1 | total time: 85.17m | eta: 94.4m +step 07927/16704 (47.46%) | loss: 2.662462 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,120 | mfu: 50.88 | epoch: 1 | total time: 85.18m | eta: 94.4m +step 07928/16704 (47.46%) | loss: 2.646191 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,321 | mfu: 50.90 | epoch: 1 | total time: 85.19m | eta: 94.4m +step 07929/16704 (47.47%) | loss: 2.653498 | lrm: 1.00 | dt: 645.41ms | tok/sec: 812,335 | mfu: 50.77 | epoch: 1 | total time: 85.20m | eta: 94.4m +step 07930/16704 (47.47%) | loss: 2.655453 | lrm: 1.00 | dt: 644.74ms | tok/sec: 813,180 | mfu: 50.82 | epoch: 1 | total time: 85.21m | eta: 94.4m +step 07931/16704 (47.48%) | loss: 2.674297 
| lrm: 1.00 | dt: 643.80ms | tok/sec: 814,364 | mfu: 50.90 | epoch: 1 | total time: 85.22m | eta: 94.4m +step 07932/16704 (47.49%) | loss: 2.678549 | lrm: 1.00 | dt: 645.28ms | tok/sec: 812,500 | mfu: 50.78 | epoch: 1 | total time: 85.23m | eta: 94.4m +step 07933/16704 (47.49%) | loss: 2.679907 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,311 | mfu: 50.96 | epoch: 1 | total time: 85.24m | eta: 94.4m +step 07934/16704 (47.50%) | loss: 2.681389 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,992 | mfu: 50.75 | epoch: 1 | total time: 85.25m | eta: 94.4m +step 07935/16704 (47.50%) | loss: 2.675927 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 1 | total time: 85.26m | eta: 94.3m +step 07936/16704 (47.51%) | loss: 2.680790 | lrm: 1.00 | dt: 643.54ms | tok/sec: 814,692 | mfu: 50.92 | epoch: 1 | total time: 85.27m | eta: 94.3m +step 07937/16704 (47.52%) | loss: 2.681145 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 1 | total time: 85.28m | eta: 94.3m +step 07938/16704 (47.52%) | loss: 2.686547 | lrm: 1.00 | dt: 645.83ms | tok/sec: 811,800 | mfu: 50.74 | epoch: 1 | total time: 85.29m | eta: 94.3m +step 07939/16704 (47.53%) | loss: 2.690644 | lrm: 1.00 | dt: 644.72ms | tok/sec: 813,202 | mfu: 50.83 | epoch: 1 | total time: 85.31m | eta: 94.3m +step 07940/16704 (47.53%) | loss: 2.685339 | lrm: 1.00 | dt: 642.04ms | tok/sec: 816,602 | mfu: 51.04 | epoch: 1 | total time: 85.32m | eta: 94.3m +step 07941/16704 (47.54%) | loss: 2.687037 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,979 | mfu: 50.81 | epoch: 1 | total time: 85.33m | eta: 94.3m +step 07942/16704 (47.55%) | loss: 2.690239 | lrm: 1.00 | dt: 648.89ms | tok/sec: 807,979 | mfu: 50.50 | epoch: 1 | total time: 85.34m | eta: 94.3m +step 07943/16704 (47.55%) | loss: 2.677761 | lrm: 1.00 | dt: 640.80ms | tok/sec: 818,181 | mfu: 51.14 | epoch: 1 | total time: 85.35m | eta: 94.3m +step 07944/16704 (47.56%) | loss: 2.671726 | lrm: 1.00 | dt: 645.27ms | tok/sec: 812,504 | mfu: 50.78 | epoch: 2 | 
total time: 85.36m | eta: 94.2m +step 07945/16704 (47.56%) | loss: 2.673520 | lrm: 1.00 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 2 | total time: 85.37m | eta: 94.2m +step 07946/16704 (47.57%) | loss: 2.676902 | lrm: 1.00 | dt: 644.95ms | tok/sec: 812,907 | mfu: 50.81 | epoch: 2 | total time: 85.38m | eta: 94.2m +step 07947/16704 (47.58%) | loss: 2.678749 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 2 | total time: 85.39m | eta: 94.2m +step 07948/16704 (47.58%) | loss: 2.689467 | lrm: 1.00 | dt: 643.30ms | tok/sec: 814,998 | mfu: 50.94 | epoch: 2 | total time: 85.40m | eta: 94.2m +step 07949/16704 (47.59%) | loss: 2.700292 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 85.41m | eta: 94.2m +step 07950/16704 (47.59%) | loss: 2.700726 | lrm: 1.00 | dt: 647.31ms | tok/sec: 809,944 | mfu: 50.62 | epoch: 2 | total time: 85.42m | eta: 94.2m +step 07951/16704 (47.60%) | loss: 2.698539 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,189 | mfu: 51.01 | epoch: 2 | total time: 85.43m | eta: 94.2m +step 07952/16704 (47.61%) | loss: 2.699846 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 2 | total time: 85.44m | eta: 94.2m +step 07953/16704 (47.61%) | loss: 2.708010 | lrm: 1.00 | dt: 646.25ms | tok/sec: 811,277 | mfu: 50.71 | epoch: 2 | total time: 85.46m | eta: 94.1m +step 07954/16704 (47.62%) | loss: 2.700837 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 2 | total time: 85.47m | eta: 94.1m +step 07955/16704 (47.62%) | loss: 2.683766 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,344 | mfu: 50.71 | epoch: 2 | total time: 85.48m | eta: 94.1m +step 07956/16704 (47.63%) | loss: 2.675582 | lrm: 1.00 | dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 2 | total time: 85.49m | eta: 94.1m +step 07957/16704 (47.64%) | loss: 2.678158 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,146 | mfu: 51.01 | epoch: 2 | total time: 85.50m | eta: 94.1m +step 07958/16704 (47.64%) | loss: 2.684204 
| lrm: 1.00 | dt: 646.41ms | tok/sec: 811,077 | mfu: 50.69 | epoch: 2 | total time: 85.51m | eta: 94.1m +step 07959/16704 (47.65%) | loss: 2.689748 | lrm: 1.00 | dt: 646.44ms | tok/sec: 811,038 | mfu: 50.69 | epoch: 2 | total time: 85.52m | eta: 94.1m +step 07960/16704 (47.65%) | loss: 2.692328 | lrm: 1.00 | dt: 642.40ms | tok/sec: 816,142 | mfu: 51.01 | epoch: 2 | total time: 85.53m | eta: 94.1m +step 07961/16704 (47.66%) | loss: 2.691720 | lrm: 1.00 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 2 | total time: 85.54m | eta: 94.1m +step 07962/16704 (47.67%) | loss: 2.686750 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,333 | mfu: 50.83 | epoch: 2 | total time: 85.55m | eta: 94.1m +step 07963/16704 (47.67%) | loss: 2.683145 | lrm: 1.00 | dt: 643.93ms | tok/sec: 814,200 | mfu: 50.89 | epoch: 2 | total time: 85.56m | eta: 94.0m +step 07964/16704 (47.68%) | loss: 2.683151 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 2 | total time: 85.57m | eta: 94.0m +step 07965/16704 (47.68%) | loss: 2.678375 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,114 | mfu: 50.88 | epoch: 2 | total time: 85.58m | eta: 94.0m +step 07966/16704 (47.69%) | loss: 2.685279 | lrm: 1.00 | dt: 645.93ms | tok/sec: 811,681 | mfu: 50.73 | epoch: 2 | total time: 85.60m | eta: 94.0m +step 07967/16704 (47.70%) | loss: 2.683006 | lrm: 1.00 | dt: 674.09ms | tok/sec: 777,772 | mfu: 48.61 | epoch: 2 | total time: 85.61m | eta: 94.0m +step 07968/16704 (47.70%) | loss: 2.683454 | lrm: 1.00 | dt: 636.12ms | tok/sec: 824,191 | mfu: 51.51 | epoch: 2 | total time: 85.62m | eta: 94.0m +step 07969/16704 (47.71%) | loss: 2.674220 | lrm: 1.00 | dt: 650.35ms | tok/sec: 806,160 | mfu: 50.39 | epoch: 2 | total time: 85.63m | eta: 94.0m +step 07970/16704 (47.71%) | loss: 2.680900 | lrm: 1.00 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 2 | total time: 85.64m | eta: 94.0m +step 07971/16704 (47.72%) | loss: 2.693533 | lrm: 1.00 | dt: 641.38ms | tok/sec: 817,440 | mfu: 51.09 | epoch: 2 | 
total time: 85.65m | eta: 94.0m +step 07972/16704 (47.73%) | loss: 2.706077 | lrm: 1.00 | dt: 648.17ms | tok/sec: 808,873 | mfu: 50.56 | epoch: 2 | total time: 85.66m | eta: 93.9m +step 07973/16704 (47.73%) | loss: 2.707369 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,769 | mfu: 50.92 | epoch: 2 | total time: 85.67m | eta: 93.9m +step 07974/16704 (47.74%) | loss: 2.695776 | lrm: 1.00 | dt: 645.66ms | tok/sec: 812,017 | mfu: 50.75 | epoch: 2 | total time: 85.68m | eta: 93.9m +step 07975/16704 (47.74%) | loss: 2.693531 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,920 | mfu: 50.75 | epoch: 2 | total time: 85.69m | eta: 93.9m +step 07976/16704 (47.75%) | loss: 2.708481 | lrm: 1.00 | dt: 640.98ms | tok/sec: 817,944 | mfu: 51.12 | epoch: 2 | total time: 85.70m | eta: 93.9m +step 07977/16704 (47.76%) | loss: 2.709623 | lrm: 1.00 | dt: 645.35ms | tok/sec: 812,405 | mfu: 50.78 | epoch: 2 | total time: 85.71m | eta: 93.9m +step 07978/16704 (47.76%) | loss: 2.710924 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,657 | mfu: 50.85 | epoch: 2 | total time: 85.72m | eta: 93.9m +step 07979/16704 (47.77%) | loss: 2.711575 | lrm: 1.00 | dt: 646.83ms | tok/sec: 810,547 | mfu: 50.66 | epoch: 2 | total time: 85.74m | eta: 93.9m +step 07980/16704 (47.77%) | loss: 2.709001 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,714 | mfu: 50.92 | epoch: 2 | total time: 85.75m | eta: 93.9m +step 07981/16704 (47.78%) | loss: 2.706060 | lrm: 1.00 | dt: 643.08ms | tok/sec: 815,282 | mfu: 50.96 | epoch: 2 | total time: 85.76m | eta: 93.8m +step 07982/16704 (47.78%) | loss: 2.700033 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,732 | mfu: 50.86 | epoch: 2 | total time: 85.77m | eta: 93.8m +step 07983/16704 (47.79%) | loss: 2.700135 | lrm: 1.00 | dt: 642.12ms | tok/sec: 816,497 | mfu: 51.03 | epoch: 2 | total time: 85.78m | eta: 93.8m +step 07984/16704 (47.80%) | loss: 2.711525 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,660 | mfu: 50.92 | epoch: 2 | total time: 85.79m | eta: 93.8m +step 07985/16704 (47.80%) | loss: 2.711995 
| lrm: 1.00 | dt: 644.91ms | tok/sec: 812,968 | mfu: 50.81 | epoch: 2 | total time: 85.80m | eta: 93.8m +step 07986/16704 (47.81%) | loss: 2.702361 | lrm: 1.00 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 2 | total time: 85.81m | eta: 93.8m +step 07987/16704 (47.81%) | loss: 2.697874 | lrm: 1.00 | dt: 646.34ms | tok/sec: 811,167 | mfu: 50.70 | epoch: 2 | total time: 85.82m | eta: 93.8m +step 07988/16704 (47.82%) | loss: 2.706695 | lrm: 1.00 | dt: 643.32ms | tok/sec: 814,974 | mfu: 50.94 | epoch: 2 | total time: 85.83m | eta: 93.8m +step 07989/16704 (47.83%) | loss: 2.701590 | lrm: 1.00 | dt: 643.61ms | tok/sec: 814,609 | mfu: 50.91 | epoch: 2 | total time: 85.84m | eta: 93.8m +step 07990/16704 (47.83%) | loss: 2.699858 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,995 | mfu: 50.75 | epoch: 2 | total time: 85.85m | eta: 93.8m +step 07991/16704 (47.84%) | loss: 2.683454 | lrm: 1.00 | dt: 642.68ms | tok/sec: 815,782 | mfu: 50.99 | epoch: 2 | total time: 85.86m | eta: 93.7m +step 07992/16704 (47.84%) | loss: 2.687075 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,186 | mfu: 50.89 | epoch: 2 | total time: 85.87m | eta: 93.7m +step 07993/16704 (47.85%) | loss: 2.676530 | lrm: 1.00 | dt: 641.73ms | tok/sec: 816,995 | mfu: 51.06 | epoch: 2 | total time: 85.89m | eta: 93.7m +step 07994/16704 (47.86%) | loss: 2.678412 | lrm: 1.00 | dt: 649.00ms | tok/sec: 807,837 | mfu: 50.49 | epoch: 2 | total time: 85.90m | eta: 93.7m +step 07995/16704 (47.86%) | loss: 2.666484 | lrm: 1.00 | dt: 642.59ms | tok/sec: 815,897 | mfu: 50.99 | epoch: 2 | total time: 85.91m | eta: 93.7m +step 07996/16704 (47.87%) | loss: 2.673826 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,061 | mfu: 50.82 | epoch: 2 | total time: 85.92m | eta: 93.7m +step 07997/16704 (47.87%) | loss: 2.672143 | lrm: 1.00 | dt: 641.65ms | tok/sec: 817,093 | mfu: 51.07 | epoch: 2 | total time: 85.93m | eta: 93.7m +step 07998/16704 (47.88%) | loss: 2.669526 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,324 | mfu: 50.90 | epoch: 2 | 
total time: 85.94m | eta: 93.7m +step 07999/16704 (47.89%) | loss: 2.662930 | lrm: 1.00 | dt: 646.07ms | tok/sec: 811,497 | mfu: 50.72 | epoch: 2 | total time: 85.95m | eta: 93.7m +[GC rank7] gen2: 186.6ms collected 90976 objects +[GC rank6] gen2: 191.7ms collected 90984 objects +[GC rank0] gen2: 193.5ms collected 91112 objects +[GC rank2] gen2: 197.1ms collected 91056 objects +[GC rank5] gen2: 242.9ms collected 91008 objects +[GC rank3] gen2: 250.5ms collected 91048 objects +[GC rank1] gen2: 317.4ms collected 91088 objects +[GC rank4] gen2: 372.6ms collected 91024 objects +Step 08000 | Validation bpb: 0.822244 +step 08000/16704 (47.89%) | loss: 2.649165 | lrm: 1.00 | dt: 643.29ms | tok/sec: 815,011 | mfu: 50.94 | epoch: 2 | total time: 85.96m | eta: 93.6m +step 08001/16704 (47.90%) | loss: 2.651423 | lrm: 1.00 | dt: 647.18ms | tok/sec: 810,105 | mfu: 50.63 | epoch: 2 | total time: 85.97m | eta: 93.6m +step 08002/16704 (47.90%) | loss: 2.648210 | lrm: 1.00 | dt: 646.32ms | tok/sec: 811,183 | mfu: 50.70 | epoch: 2 | total time: 85.98m | eta: 93.6m +step 08003/16704 (47.91%) | loss: 2.659334 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,388 | mfu: 50.90 | epoch: 2 | total time: 85.99m | eta: 93.6m +step 08004/16704 (47.92%) | loss: 2.656725 | lrm: 1.00 | dt: 641.95ms | tok/sec: 816,716 | mfu: 51.05 | epoch: 2 | total time: 86.00m | eta: 93.6m +step 08005/16704 (47.92%) | loss: 2.660026 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,009 | mfu: 50.88 | epoch: 2 | total time: 86.01m | eta: 93.6m +step 08006/16704 (47.93%) | loss: 2.674487 | lrm: 1.00 | dt: 642.09ms | tok/sec: 816,538 | mfu: 51.03 | epoch: 2 | total time: 86.03m | eta: 93.6m +step 08007/16704 (47.93%) | loss: 2.669757 | lrm: 1.00 | dt: 643.72ms | tok/sec: 814,468 | mfu: 50.91 | epoch: 2 | total time: 86.04m | eta: 93.6m +step 08008/16704 (47.94%) | loss: 2.666344 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,193 | mfu: 50.83 | epoch: 2 | total time: 86.05m | eta: 93.6m +step 08009/16704 (47.95%) | loss: 2.670895 | 
lrm: 1.00 | dt: 645.37ms | tok/sec: 812,387 | mfu: 50.78 | epoch: 2 | total time: 86.06m | eta: 93.5m +step 08010/16704 (47.95%) | loss: 2.686300 | lrm: 1.00 | dt: 643.17ms | tok/sec: 815,164 | mfu: 50.95 | epoch: 2 | total time: 86.07m | eta: 93.5m +step 08011/16704 (47.96%) | loss: 2.683487 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,314 | mfu: 50.83 | epoch: 2 | total time: 86.08m | eta: 93.5m +step 08012/16704 (47.96%) | loss: 2.682375 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,817 | mfu: 50.86 | epoch: 2 | total time: 86.09m | eta: 93.5m +step 08013/16704 (47.97%) | loss: 2.688325 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,556 | mfu: 50.85 | epoch: 2 | total time: 86.10m | eta: 93.5m +step 08014/16704 (47.98%) | loss: 2.693239 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 2 | total time: 86.11m | eta: 93.5m +step 08015/16704 (47.98%) | loss: 2.702308 | lrm: 1.00 | dt: 645.01ms | tok/sec: 812,835 | mfu: 50.80 | epoch: 2 | total time: 86.12m | eta: 93.5m +step 08016/16704 (47.99%) | loss: 2.691654 | lrm: 1.00 | dt: 645.04ms | tok/sec: 812,797 | mfu: 50.80 | epoch: 2 | total time: 86.13m | eta: 93.5m +step 08017/16704 (47.99%) | loss: 2.689376 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,577 | mfu: 50.85 | epoch: 2 | total time: 86.14m | eta: 93.5m +step 08018/16704 (48.00%) | loss: 2.689839 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,533 | mfu: 50.91 | epoch: 2 | total time: 86.15m | eta: 93.4m +step 08019/16704 (48.01%) | loss: 2.698647 | lrm: 1.00 | dt: 644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 2 | total time: 86.16m | eta: 93.4m +step 08020/16704 (48.01%) | loss: 2.696429 | lrm: 1.00 | dt: 644.83ms | tok/sec: 813,062 | mfu: 50.82 | epoch: 2 | total time: 86.18m | eta: 93.4m +step 08021/16704 (48.02%) | loss: 2.692416 | lrm: 1.00 | dt: 642.38ms | tok/sec: 816,160 | mfu: 51.01 | epoch: 2 | total time: 86.19m | eta: 93.4m +step 08022/16704 (48.02%) | loss: 2.693985 | lrm: 1.00 | dt: 644.85ms | tok/sec: 813,034 | mfu: 50.82 | epoch: 2 | total 
time: 86.20m | eta: 93.4m +step 08023/16704 (48.03%) | loss: 2.697805 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,769 | mfu: 50.92 | epoch: 2 | total time: 86.21m | eta: 93.4m +step 08024/16704 (48.04%) | loss: 2.693324 | lrm: 1.00 | dt: 641.83ms | tok/sec: 816,863 | mfu: 51.06 | epoch: 2 | total time: 86.22m | eta: 93.4m +step 08025/16704 (48.04%) | loss: 2.697086 | lrm: 1.00 | dt: 643.56ms | tok/sec: 814,663 | mfu: 50.92 | epoch: 2 | total time: 86.23m | eta: 93.4m +step 08026/16704 (48.05%) | loss: 2.699140 | lrm: 1.00 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 2 | total time: 86.24m | eta: 93.4m +step 08027/16704 (48.05%) | loss: 2.702320 | lrm: 1.00 | dt: 644.25ms | tok/sec: 813,797 | mfu: 50.86 | epoch: 2 | total time: 86.25m | eta: 93.4m +step 08028/16704 (48.06%) | loss: 2.702658 | lrm: 1.00 | dt: 645.78ms | tok/sec: 811,872 | mfu: 50.74 | epoch: 2 | total time: 86.26m | eta: 93.3m +step 08029/16704 (48.07%) | loss: 2.697394 | lrm: 1.00 | dt: 644.92ms | tok/sec: 812,946 | mfu: 50.81 | epoch: 2 | total time: 86.27m | eta: 93.3m +step 08030/16704 (48.07%) | loss: 2.702574 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,984 | mfu: 50.94 | epoch: 2 | total time: 86.28m | eta: 93.3m +step 08031/16704 (48.08%) | loss: 2.692559 | lrm: 1.00 | dt: 644.06ms | tok/sec: 814,036 | mfu: 50.88 | epoch: 2 | total time: 86.29m | eta: 93.3m +step 08032/16704 (48.08%) | loss: 2.700562 | lrm: 1.00 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 2 | total time: 86.30m | eta: 93.3m +step 08033/16704 (48.09%) | loss: 2.698284 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,497 | mfu: 50.91 | epoch: 2 | total time: 86.32m | eta: 93.3m +step 08034/16704 (48.10%) | loss: 2.694548 | lrm: 1.00 | dt: 643.05ms | tok/sec: 815,315 | mfu: 50.96 | epoch: 2 | total time: 86.33m | eta: 93.3m +step 08035/16704 (48.10%) | loss: 2.696966 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,632 | mfu: 50.79 | epoch: 2 | total time: 86.34m | eta: 93.3m +step 08036/16704 (48.11%) | loss: 2.702287 | 
lrm: 1.00 | dt: 643.54ms | tok/sec: 814,695 | mfu: 50.92 | epoch: 2 | total time: 86.35m | eta: 93.3m +step 08037/16704 (48.11%) | loss: 2.701267 | lrm: 1.00 | dt: 647.54ms | tok/sec: 809,655 | mfu: 50.60 | epoch: 2 | total time: 86.36m | eta: 93.2m +step 08038/16704 (48.12%) | loss: 2.699714 | lrm: 1.00 | dt: 644.42ms | tok/sec: 813,579 | mfu: 50.85 | epoch: 2 | total time: 86.37m | eta: 93.2m +step 08039/16704 (48.13%) | loss: 2.715136 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,593 | mfu: 50.79 | epoch: 2 | total time: 86.38m | eta: 93.2m +step 08040/16704 (48.13%) | loss: 2.702830 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,554 | mfu: 50.91 | epoch: 2 | total time: 86.39m | eta: 93.2m +step 08041/16704 (48.14%) | loss: 2.695793 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,934 | mfu: 50.87 | epoch: 2 | total time: 86.40m | eta: 93.2m +step 08042/16704 (48.14%) | loss: 2.697566 | lrm: 1.00 | dt: 645.16ms | tok/sec: 812,646 | mfu: 50.79 | epoch: 2 | total time: 86.41m | eta: 93.2m +step 08043/16704 (48.15%) | loss: 2.703261 | lrm: 1.00 | dt: 644.43ms | tok/sec: 813,574 | mfu: 50.85 | epoch: 2 | total time: 86.42m | eta: 93.2m +step 08044/16704 (48.16%) | loss: 2.699099 | lrm: 1.00 | dt: 646.10ms | tok/sec: 811,470 | mfu: 50.72 | epoch: 2 | total time: 86.43m | eta: 93.2m +step 08045/16704 (48.16%) | loss: 2.701689 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,599 | mfu: 50.79 | epoch: 2 | total time: 86.44m | eta: 93.2m +step 08046/16704 (48.17%) | loss: 2.686789 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,860 | mfu: 50.93 | epoch: 2 | total time: 86.45m | eta: 93.1m +step 08047/16704 (48.17%) | loss: 2.676366 | lrm: 1.00 | dt: 645.86ms | tok/sec: 811,767 | mfu: 50.74 | epoch: 2 | total time: 86.47m | eta: 93.1m +step 08048/16704 (48.18%) | loss: 2.681581 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 2 | total time: 86.48m | eta: 93.1m +step 08049/16704 (48.19%) | loss: 2.677932 | lrm: 1.00 | dt: 643.76ms | tok/sec: 814,410 | mfu: 50.90 | epoch: 2 | total 
time: 86.49m | eta: 93.1m +step 08050/16704 (48.19%) | loss: 2.677139 | lrm: 1.00 | dt: 644.88ms | tok/sec: 812,997 | mfu: 50.81 | epoch: 2 | total time: 86.50m | eta: 93.1m +step 08051/16704 (48.20%) | loss: 2.689602 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,583 | mfu: 50.98 | epoch: 2 | total time: 86.51m | eta: 93.1m +step 08052/16704 (48.20%) | loss: 2.686219 | lrm: 1.00 | dt: 642.91ms | tok/sec: 815,495 | mfu: 50.97 | epoch: 2 | total time: 86.52m | eta: 93.1m +step 08053/16704 (48.21%) | loss: 2.688338 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 2 | total time: 86.53m | eta: 93.1m +step 08054/16704 (48.22%) | loss: 2.683175 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,031 | mfu: 50.82 | epoch: 2 | total time: 86.54m | eta: 93.1m +step 08055/16704 (48.22%) | loss: 2.696512 | lrm: 1.00 | dt: 644.61ms | tok/sec: 813,346 | mfu: 50.84 | epoch: 2 | total time: 86.55m | eta: 93.0m +step 08056/16704 (48.23%) | loss: 2.693852 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,568 | mfu: 50.79 | epoch: 2 | total time: 86.56m | eta: 93.0m +step 08057/16704 (48.23%) | loss: 2.696720 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 2 | total time: 86.57m | eta: 93.0m +step 08058/16704 (48.24%) | loss: 2.690301 | lrm: 1.00 | dt: 642.42ms | tok/sec: 816,117 | mfu: 51.01 | epoch: 2 | total time: 86.58m | eta: 93.0m +step 08059/16704 (48.25%) | loss: 2.685074 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,292 | mfu: 50.64 | epoch: 2 | total time: 86.59m | eta: 93.0m +step 08060/16704 (48.25%) | loss: 2.678815 | lrm: 1.00 | dt: 643.16ms | tok/sec: 815,175 | mfu: 50.95 | epoch: 2 | total time: 86.61m | eta: 93.0m +step 08061/16704 (48.26%) | loss: 2.704046 | lrm: 1.00 | dt: 643.49ms | tok/sec: 814,757 | mfu: 50.92 | epoch: 2 | total time: 86.62m | eta: 93.0m +step 08062/16704 (48.26%) | loss: 2.717801 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 2 | total time: 86.63m | eta: 93.0m +step 08063/16704 (48.27%) | loss: 2.714951 | 
lrm: 1.00 | dt: 644.07ms | tok/sec: 814,027 | mfu: 50.88 | epoch: 2 | total time: 86.64m | eta: 93.0m +step 08064/16704 (48.28%) | loss: 2.712243 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,535 | mfu: 50.72 | epoch: 2 | total time: 86.65m | eta: 93.0m +step 08065/16704 (48.28%) | loss: 2.709520 | lrm: 1.00 | dt: 644.13ms | tok/sec: 813,942 | mfu: 50.87 | epoch: 2 | total time: 86.66m | eta: 92.9m +step 08066/16704 (48.29%) | loss: 2.698103 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,325 | mfu: 50.83 | epoch: 2 | total time: 86.67m | eta: 92.9m +step 08067/16704 (48.29%) | loss: 2.691002 | lrm: 1.00 | dt: 646.60ms | tok/sec: 810,839 | mfu: 50.68 | epoch: 2 | total time: 86.68m | eta: 92.9m +step 08068/16704 (48.30%) | loss: 2.683419 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,320 | mfu: 50.83 | epoch: 2 | total time: 86.69m | eta: 92.9m +step 08069/16704 (48.31%) | loss: 2.688883 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,874 | mfu: 50.93 | epoch: 2 | total time: 86.70m | eta: 92.9m +step 08070/16704 (48.31%) | loss: 2.676481 | lrm: 1.00 | dt: 646.94ms | tok/sec: 810,412 | mfu: 50.65 | epoch: 2 | total time: 86.71m | eta: 92.9m +step 08071/16704 (48.32%) | loss: 2.679025 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,743 | mfu: 50.74 | epoch: 2 | total time: 86.72m | eta: 92.9m +step 08072/16704 (48.32%) | loss: 2.677591 | lrm: 1.00 | dt: 643.65ms | tok/sec: 814,559 | mfu: 50.91 | epoch: 2 | total time: 86.73m | eta: 92.9m +step 08073/16704 (48.33%) | loss: 2.681358 | lrm: 1.00 | dt: 646.97ms | tok/sec: 810,371 | mfu: 50.65 | epoch: 2 | total time: 86.74m | eta: 92.9m +step 08074/16704 (48.34%) | loss: 2.679833 | lrm: 1.00 | dt: 645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 2 | total time: 86.76m | eta: 92.8m +step 08075/16704 (48.34%) | loss: 2.683777 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,196 | mfu: 50.76 | epoch: 2 | total time: 86.77m | eta: 92.8m +step 08076/16704 (48.35%) | loss: 2.680525 | lrm: 1.00 | dt: 644.86ms | tok/sec: 813,026 | mfu: 50.82 | epoch: 2 | total 
time: 86.78m | eta: 92.8m +step 08077/16704 (48.35%) | loss: 2.693964 | lrm: 1.00 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | epoch: 2 | total time: 86.79m | eta: 92.8m +step 08078/16704 (48.36%) | loss: 2.696559 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,459 | mfu: 50.72 | epoch: 2 | total time: 86.80m | eta: 92.8m +step 08079/16704 (48.37%) | loss: 2.699316 | lrm: 1.00 | dt: 646.08ms | tok/sec: 811,489 | mfu: 50.72 | epoch: 2 | total time: 86.81m | eta: 92.8m +step 08080/16704 (48.37%) | loss: 2.697153 | lrm: 1.00 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 2 | total time: 86.82m | eta: 92.8m +step 08081/16704 (48.38%) | loss: 2.688671 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,438 | mfu: 50.72 | epoch: 2 | total time: 86.83m | eta: 92.8m +step 08082/16704 (48.38%) | loss: 2.682344 | lrm: 1.00 | dt: 642.96ms | tok/sec: 815,425 | mfu: 50.97 | epoch: 2 | total time: 86.84m | eta: 92.8m +step 08083/16704 (48.39%) | loss: 2.681067 | lrm: 1.00 | dt: 645.25ms | tok/sec: 812,529 | mfu: 50.78 | epoch: 2 | total time: 86.85m | eta: 92.7m +step 08084/16704 (48.40%) | loss: 2.685511 | lrm: 1.00 | dt: 644.62ms | tok/sec: 813,333 | mfu: 50.83 | epoch: 2 | total time: 86.86m | eta: 92.7m +step 08085/16704 (48.40%) | loss: 2.683408 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 2 | total time: 86.87m | eta: 92.7m +step 08086/16704 (48.41%) | loss: 2.684634 | lrm: 1.00 | dt: 645.98ms | tok/sec: 811,610 | mfu: 50.73 | epoch: 2 | total time: 86.88m | eta: 92.7m +step 08087/16704 (48.41%) | loss: 2.695770 | lrm: 1.00 | dt: 644.30ms | tok/sec: 813,733 | mfu: 50.86 | epoch: 2 | total time: 86.90m | eta: 92.7m +step 08088/16704 (48.42%) | loss: 2.700264 | lrm: 1.00 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 2 | total time: 86.91m | eta: 92.7m +step 08089/16704 (48.43%) | loss: 2.695586 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 2 | total time: 86.92m | eta: 92.7m +step 08090/16704 (48.43%) | loss: 2.703873 | 
lrm: 1.00 | dt: 642.10ms | tok/sec: 816,516 | mfu: 51.03 | epoch: 2 | total time: 86.93m | eta: 92.7m +step 08091/16704 (48.44%) | loss: 2.699482 | lrm: 1.00 | dt: 644.53ms | tok/sec: 813,439 | mfu: 50.84 | epoch: 2 | total time: 86.94m | eta: 92.7m +step 08092/16704 (48.44%) | loss: 2.704316 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,283 | mfu: 50.64 | epoch: 2 | total time: 86.95m | eta: 92.7m +step 08093/16704 (48.45%) | loss: 2.704163 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,474 | mfu: 50.84 | epoch: 2 | total time: 86.96m | eta: 92.6m +step 08094/16704 (48.46%) | loss: 2.696180 | lrm: 1.00 | dt: 646.22ms | tok/sec: 811,308 | mfu: 50.71 | epoch: 2 | total time: 86.97m | eta: 92.6m +step 08095/16704 (48.46%) | loss: 2.707515 | lrm: 1.00 | dt: 644.00ms | tok/sec: 814,116 | mfu: 50.88 | epoch: 2 | total time: 86.98m | eta: 92.6m +step 08096/16704 (48.47%) | loss: 2.707903 | lrm: 1.00 | dt: 644.33ms | tok/sec: 813,690 | mfu: 50.86 | epoch: 2 | total time: 86.99m | eta: 92.6m +step 08097/16704 (48.47%) | loss: 2.719217 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,693 | mfu: 50.73 | epoch: 2 | total time: 87.00m | eta: 92.6m +step 08098/16704 (48.48%) | loss: 2.714424 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,279 | mfu: 50.58 | epoch: 2 | total time: 87.01m | eta: 92.6m +step 08099/16704 (48.49%) | loss: 2.716839 | lrm: 1.00 | dt: 644.70ms | tok/sec: 813,229 | mfu: 50.83 | epoch: 2 | total time: 87.02m | eta: 92.6m +step 08100/16704 (48.49%) | loss: 2.715923 | lrm: 1.00 | dt: 645.36ms | tok/sec: 812,401 | mfu: 50.78 | epoch: 2 | total time: 87.04m | eta: 92.6m +step 08101/16704 (48.50%) | loss: 2.730669 | lrm: 1.00 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 2 | total time: 87.05m | eta: 92.6m +step 08102/16704 (48.50%) | loss: 2.729383 | lrm: 1.00 | dt: 643.97ms | tok/sec: 814,143 | mfu: 50.89 | epoch: 2 | total time: 87.06m | eta: 92.5m +step 08103/16704 (48.51%) | loss: 2.721514 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,773 | mfu: 50.67 | epoch: 2 | total 
time: 87.07m | eta: 92.5m +step 08104/16704 (48.52%) | loss: 2.711599 | lrm: 1.00 | dt: 644.87ms | tok/sec: 813,013 | mfu: 50.81 | epoch: 2 | total time: 87.08m | eta: 92.5m +step 08105/16704 (48.52%) | loss: 2.703127 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,998 | mfu: 50.75 | epoch: 2 | total time: 87.09m | eta: 92.5m +step 08106/16704 (48.53%) | loss: 2.707047 | lrm: 1.00 | dt: 643.58ms | tok/sec: 814,648 | mfu: 50.92 | epoch: 2 | total time: 87.10m | eta: 92.5m +step 08107/16704 (48.53%) | loss: 2.693748 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,240 | mfu: 50.64 | epoch: 2 | total time: 87.11m | eta: 92.5m +step 08108/16704 (48.54%) | loss: 2.683448 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,226 | mfu: 50.70 | epoch: 2 | total time: 87.12m | eta: 92.5m +step 08109/16704 (48.55%) | loss: 2.683880 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,548 | mfu: 50.97 | epoch: 2 | total time: 87.13m | eta: 92.5m +step 08110/16704 (48.55%) | loss: 2.685119 | lrm: 1.00 | dt: 645.87ms | tok/sec: 811,759 | mfu: 50.74 | epoch: 2 | total time: 87.14m | eta: 92.5m +step 08111/16704 (48.56%) | loss: 2.685555 | lrm: 1.00 | dt: 646.65ms | tok/sec: 810,776 | mfu: 50.67 | epoch: 2 | total time: 87.15m | eta: 92.4m +step 08112/16704 (48.56%) | loss: 2.702045 | lrm: 1.00 | dt: 643.30ms | tok/sec: 814,994 | mfu: 50.94 | epoch: 2 | total time: 87.16m | eta: 92.4m +step 08113/16704 (48.57%) | loss: 2.711547 | lrm: 1.00 | dt: 646.28ms | tok/sec: 811,241 | mfu: 50.70 | epoch: 2 | total time: 87.17m | eta: 92.4m +step 08114/16704 (48.58%) | loss: 2.709314 | lrm: 1.00 | dt: 645.15ms | tok/sec: 812,656 | mfu: 50.79 | epoch: 2 | total time: 87.19m | eta: 92.4m +step 08115/16704 (48.58%) | loss: 2.713922 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,209 | mfu: 50.76 | epoch: 2 | total time: 87.20m | eta: 92.4m +step 08116/16704 (48.59%) | loss: 2.719841 | lrm: 1.00 | dt: 643.87ms | tok/sec: 814,275 | mfu: 50.89 | epoch: 2 | total time: 87.21m | eta: 92.4m +step 08117/16704 (48.59%) | loss: 2.715172 | 
lrm: 1.00 | dt: 644.89ms | tok/sec: 812,989 | mfu: 50.81 | epoch: 2 | total time: 87.22m | eta: 92.4m +step 08118/16704 (48.60%) | loss: 2.709932 | lrm: 1.00 | dt: 644.23ms | tok/sec: 813,819 | mfu: 50.86 | epoch: 2 | total time: 87.23m | eta: 92.4m +step 08119/16704 (48.61%) | loss: 2.718653 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,736 | mfu: 50.73 | epoch: 2 | total time: 87.24m | eta: 92.4m +step 08120/16704 (48.61%) | loss: 2.713207 | lrm: 1.00 | dt: 644.71ms | tok/sec: 813,215 | mfu: 50.83 | epoch: 2 | total time: 87.25m | eta: 92.3m +step 08121/16704 (48.62%) | loss: 2.710246 | lrm: 1.00 | dt: 644.63ms | tok/sec: 813,313 | mfu: 50.83 | epoch: 2 | total time: 87.26m | eta: 92.3m +step 08122/16704 (48.62%) | loss: 2.717266 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 2 | total time: 87.27m | eta: 92.3m +step 08123/16704 (48.63%) | loss: 2.727409 | lrm: 1.00 | dt: 644.82ms | tok/sec: 813,073 | mfu: 50.82 | epoch: 2 | total time: 87.28m | eta: 92.3m +step 08124/16704 (48.64%) | loss: 2.726735 | lrm: 1.00 | dt: 646.88ms | tok/sec: 810,488 | mfu: 50.66 | epoch: 2 | total time: 87.29m | eta: 92.3m +step 08125/16704 (48.64%) | loss: 2.723438 | lrm: 1.00 | dt: 643.48ms | tok/sec: 814,772 | mfu: 50.92 | epoch: 2 | total time: 87.30m | eta: 92.3m +step 08126/16704 (48.65%) | loss: 2.734814 | lrm: 1.00 | dt: 645.52ms | tok/sec: 812,189 | mfu: 50.76 | epoch: 2 | total time: 87.31m | eta: 92.3m +step 08127/16704 (48.65%) | loss: 2.738758 | lrm: 1.00 | dt: 644.77ms | tok/sec: 813,136 | mfu: 50.82 | epoch: 2 | total time: 87.33m | eta: 92.3m +step 08128/16704 (48.66%) | loss: 2.740472 | lrm: 1.00 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 2 | total time: 87.34m | eta: 92.3m +step 08129/16704 (48.66%) | loss: 2.714922 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 2 | total time: 87.35m | eta: 92.3m +step 08130/16704 (48.67%) | loss: 2.717574 | lrm: 1.00 | dt: 645.62ms | tok/sec: 812,074 | mfu: 50.76 | epoch: 2 | total 
time: 87.36m | eta: 92.2m +step 08131/16704 (48.68%) | loss: 2.711575 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 2 | total time: 87.37m | eta: 92.2m +step 08132/16704 (48.68%) | loss: 2.707452 | lrm: 1.00 | dt: 644.40ms | tok/sec: 813,608 | mfu: 50.85 | epoch: 2 | total time: 87.38m | eta: 92.2m +step 08133/16704 (48.69%) | loss: 2.701020 | lrm: 1.00 | dt: 642.84ms | tok/sec: 815,582 | mfu: 50.98 | epoch: 2 | total time: 87.39m | eta: 92.2m +step 08134/16704 (48.69%) | loss: 2.695490 | lrm: 1.00 | dt: 648.86ms | tok/sec: 808,012 | mfu: 50.50 | epoch: 2 | total time: 87.40m | eta: 92.2m +step 08135/16704 (48.70%) | loss: 2.697154 | lrm: 1.00 | dt: 643.45ms | tok/sec: 814,812 | mfu: 50.93 | epoch: 2 | total time: 87.41m | eta: 92.2m +step 08136/16704 (48.71%) | loss: 2.699227 | lrm: 1.00 | dt: 645.22ms | tok/sec: 812,575 | mfu: 50.79 | epoch: 2 | total time: 87.42m | eta: 92.2m +step 08137/16704 (48.71%) | loss: 2.700317 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,472 | mfu: 50.91 | epoch: 2 | total time: 87.43m | eta: 92.2m +step 08138/16704 (48.72%) | loss: 2.702475 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,559 | mfu: 50.72 | epoch: 2 | total time: 87.44m | eta: 92.2m +step 08139/16704 (48.72%) | loss: 2.711696 | lrm: 1.00 | dt: 642.73ms | tok/sec: 815,725 | mfu: 50.98 | epoch: 2 | total time: 87.45m | eta: 92.1m +step 08140/16704 (48.73%) | loss: 2.717760 | lrm: 1.00 | dt: 644.41ms | tok/sec: 813,591 | mfu: 50.85 | epoch: 2 | total time: 87.47m | eta: 92.1m +step 08141/16704 (48.74%) | loss: 2.713936 | lrm: 1.00 | dt: 646.04ms | tok/sec: 811,545 | mfu: 50.72 | epoch: 2 | total time: 87.48m | eta: 92.1m +step 08142/16704 (48.74%) | loss: 2.716881 | lrm: 1.00 | dt: 643.11ms | tok/sec: 815,237 | mfu: 50.95 | epoch: 2 | total time: 87.49m | eta: 92.1m +step 08143/16704 (48.75%) | loss: 2.704093 | lrm: 1.00 | dt: 645.84ms | tok/sec: 811,796 | mfu: 50.74 | epoch: 2 | total time: 87.50m | eta: 92.1m +step 08144/16704 (48.75%) | loss: 2.706885 | 
lrm: 1.00 | dt: 645.75ms | tok/sec: 811,902 | mfu: 50.75 | epoch: 2 | total time: 87.51m | eta: 92.1m +step 08145/16704 (48.76%) | loss: 2.708944 | lrm: 1.00 | dt: 645.60ms | tok/sec: 812,093 | mfu: 50.76 | epoch: 2 | total time: 87.52m | eta: 92.1m +step 08146/16704 (48.77%) | loss: 2.704915 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,989 | mfu: 50.94 | epoch: 2 | total time: 87.53m | eta: 92.1m +step 08147/16704 (48.77%) | loss: 2.691388 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,993 | mfu: 50.88 | epoch: 2 | total time: 87.54m | eta: 92.1m +step 08148/16704 (48.78%) | loss: 2.703278 | lrm: 1.00 | dt: 648.18ms | tok/sec: 808,862 | mfu: 50.56 | epoch: 2 | total time: 87.55m | eta: 92.0m +step 08149/16704 (48.78%) | loss: 2.689836 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,363 | mfu: 50.90 | epoch: 2 | total time: 87.56m | eta: 92.0m +step 08150/16704 (48.79%) | loss: 2.676581 | lrm: 1.00 | dt: 646.23ms | tok/sec: 811,297 | mfu: 50.71 | epoch: 2 | total time: 87.57m | eta: 92.0m +step 08151/16704 (48.80%) | loss: 2.681972 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,503 | mfu: 50.85 | epoch: 2 | total time: 87.58m | eta: 92.0m +step 08152/16704 (48.80%) | loss: 2.675393 | lrm: 1.00 | dt: 644.97ms | tok/sec: 812,881 | mfu: 50.81 | epoch: 2 | total time: 87.59m | eta: 92.0m +step 08153/16704 (48.81%) | loss: 2.691198 | lrm: 1.00 | dt: 646.49ms | tok/sec: 810,973 | mfu: 50.69 | epoch: 2 | total time: 87.60m | eta: 92.0m +step 08154/16704 (48.81%) | loss: 2.692328 | lrm: 1.00 | dt: 647.00ms | tok/sec: 810,342 | mfu: 50.65 | epoch: 2 | total time: 87.62m | eta: 92.0m +step 08155/16704 (48.82%) | loss: 2.694526 | lrm: 1.00 | dt: 646.02ms | tok/sec: 811,567 | mfu: 50.72 | epoch: 2 | total time: 87.63m | eta: 92.0m +step 08156/16704 (48.83%) | loss: 2.697978 | lrm: 1.00 | dt: 645.67ms | tok/sec: 812,007 | mfu: 50.75 | epoch: 2 | total time: 87.64m | eta: 92.0m +step 08157/16704 (48.83%) | loss: 2.692609 | lrm: 1.00 | dt: 647.27ms | tok/sec: 810,000 | mfu: 50.63 | epoch: 2 | total 
time: 87.65m | eta: 92.0m +step 08158/16704 (48.84%) | loss: 2.713433 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,609 | mfu: 50.98 | epoch: 2 | total time: 87.66m | eta: 91.9m +step 08159/16704 (48.84%) | loss: 2.707015 | lrm: 1.00 | dt: 646.12ms | tok/sec: 811,439 | mfu: 50.72 | epoch: 2 | total time: 87.67m | eta: 91.9m +step 08160/16704 (48.85%) | loss: 2.711068 | lrm: 1.00 | dt: 644.45ms | tok/sec: 813,545 | mfu: 50.85 | epoch: 2 | total time: 87.68m | eta: 91.9m +step 08161/16704 (48.86%) | loss: 2.724937 | lrm: 1.00 | dt: 646.20ms | tok/sec: 811,344 | mfu: 50.71 | epoch: 2 | total time: 87.69m | eta: 91.9m +step 08162/16704 (48.86%) | loss: 2.734711 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,049 | mfu: 50.69 | epoch: 2 | total time: 87.70m | eta: 91.9m +step 08163/16704 (48.87%) | loss: 2.728094 | lrm: 1.00 | dt: 643.47ms | tok/sec: 814,781 | mfu: 50.92 | epoch: 2 | total time: 87.71m | eta: 91.9m +step 08164/16704 (48.87%) | loss: 2.722368 | lrm: 1.00 | dt: 646.43ms | tok/sec: 811,050 | mfu: 50.69 | epoch: 2 | total time: 87.72m | eta: 91.9m +step 08165/16704 (48.88%) | loss: 2.727334 | lrm: 1.00 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 87.73m | eta: 91.9m +step 08166/16704 (48.89%) | loss: 2.723597 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,846 | mfu: 50.80 | epoch: 2 | total time: 87.74m | eta: 91.9m +step 08167/16704 (48.89%) | loss: 2.718749 | lrm: 1.00 | dt: 644.81ms | tok/sec: 813,090 | mfu: 50.82 | epoch: 2 | total time: 87.76m | eta: 91.8m +step 08168/16704 (48.90%) | loss: 2.718376 | lrm: 1.00 | dt: 643.99ms | tok/sec: 814,123 | mfu: 50.88 | epoch: 2 | total time: 87.77m | eta: 91.8m +step 08169/16704 (48.90%) | loss: 2.722402 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,355 | mfu: 50.84 | epoch: 2 | total time: 87.78m | eta: 91.8m +step 08170/16704 (48.91%) | loss: 2.710803 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,813 | mfu: 50.80 | epoch: 2 | total time: 87.79m | eta: 91.8m +step 08171/16704 (48.92%) | loss: 2.703649 | 
lrm: 1.00 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 2 | total time: 87.80m | eta: 91.8m +step 08172/16704 (48.92%) | loss: 2.711739 | lrm: 1.00 | dt: 648.09ms | tok/sec: 808,974 | mfu: 50.56 | epoch: 2 | total time: 87.81m | eta: 91.8m +step 08173/16704 (48.93%) | loss: 2.708341 | lrm: 1.00 | dt: 645.88ms | tok/sec: 811,743 | mfu: 50.74 | epoch: 2 | total time: 87.82m | eta: 91.8m +step 08174/16704 (48.93%) | loss: 2.714994 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 2 | total time: 87.83m | eta: 91.8m +step 08175/16704 (48.94%) | loss: 2.724505 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,449 | mfu: 50.72 | epoch: 2 | total time: 87.84m | eta: 91.8m +step 08176/16704 (48.95%) | loss: 2.726998 | lrm: 1.00 | dt: 643.28ms | tok/sec: 815,026 | mfu: 50.94 | epoch: 2 | total time: 87.85m | eta: 91.7m +step 08177/16704 (48.95%) | loss: 2.728560 | lrm: 1.00 | dt: 645.73ms | tok/sec: 811,928 | mfu: 50.75 | epoch: 2 | total time: 87.86m | eta: 91.7m +step 08178/16704 (48.96%) | loss: 2.706719 | lrm: 1.00 | dt: 644.73ms | tok/sec: 813,189 | mfu: 50.83 | epoch: 2 | total time: 87.87m | eta: 91.7m +step 08179/16704 (48.96%) | loss: 2.717886 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,102 | mfu: 50.76 | epoch: 2 | total time: 87.88m | eta: 91.7m +step 08180/16704 (48.97%) | loss: 2.717390 | lrm: 1.00 | dt: 646.74ms | tok/sec: 810,661 | mfu: 50.67 | epoch: 2 | total time: 87.90m | eta: 91.7m +step 08181/16704 (48.98%) | loss: 2.733381 | lrm: 1.00 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 2 | total time: 87.91m | eta: 91.7m +step 08182/16704 (48.98%) | loss: 2.732653 | lrm: 1.00 | dt: 648.13ms | tok/sec: 808,924 | mfu: 50.56 | epoch: 2 | total time: 87.92m | eta: 91.7m +step 08183/16704 (48.99%) | loss: 2.731034 | lrm: 1.00 | dt: 646.83ms | tok/sec: 810,546 | mfu: 50.66 | epoch: 2 | total time: 87.93m | eta: 91.7m +step 08184/16704 (48.99%) | loss: 2.721398 | lrm: 1.00 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 2 | total 
time: 87.94m | eta: 91.7m +step 08185/16704 (49.00%) | loss: 2.725268 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,512 | mfu: 50.72 | epoch: 2 | total time: 87.95m | eta: 91.7m +step 08186/16704 (49.01%) | loss: 2.709284 | lrm: 1.00 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 2 | total time: 87.96m | eta: 91.6m +step 08187/16704 (49.01%) | loss: 2.715274 | lrm: 1.00 | dt: 648.38ms | tok/sec: 808,618 | mfu: 50.54 | epoch: 2 | total time: 87.97m | eta: 91.6m +step 08188/16704 (49.02%) | loss: 2.722046 | lrm: 1.00 | dt: 645.37ms | tok/sec: 812,387 | mfu: 50.78 | epoch: 2 | total time: 87.98m | eta: 91.6m +step 08189/16704 (49.02%) | loss: 2.713678 | lrm: 1.00 | dt: 646.09ms | tok/sec: 811,480 | mfu: 50.72 | epoch: 2 | total time: 87.99m | eta: 91.6m +step 08190/16704 (49.03%) | loss: 2.690560 | lrm: 1.00 | dt: 645.20ms | tok/sec: 812,594 | mfu: 50.79 | epoch: 2 | total time: 88.00m | eta: 91.6m +step 08191/16704 (49.04%) | loss: 2.689503 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,901 | mfu: 50.81 | epoch: 2 | total time: 88.01m | eta: 91.6m +step 08192/16704 (49.04%) | loss: 2.705474 | lrm: 1.00 | dt: 644.55ms | tok/sec: 813,422 | mfu: 50.84 | epoch: 2 | total time: 88.02m | eta: 91.6m +step 08193/16704 (49.05%) | loss: 2.704732 | lrm: 1.00 | dt: 645.68ms | tok/sec: 811,988 | mfu: 50.75 | epoch: 2 | total time: 88.04m | eta: 91.6m +step 08194/16704 (49.05%) | loss: 2.692935 | lrm: 1.00 | dt: 645.92ms | tok/sec: 811,696 | mfu: 50.73 | epoch: 2 | total time: 88.05m | eta: 91.6m +step 08195/16704 (49.06%) | loss: 2.694656 | lrm: 1.00 | dt: 646.73ms | tok/sec: 810,676 | mfu: 50.67 | epoch: 2 | total time: 88.06m | eta: 91.5m +step 08196/16704 (49.07%) | loss: 2.706047 | lrm: 1.00 | dt: 647.08ms | tok/sec: 810,242 | mfu: 50.64 | epoch: 2 | total time: 88.07m | eta: 91.5m +step 08197/16704 (49.07%) | loss: 2.698749 | lrm: 1.00 | dt: 646.06ms | tok/sec: 811,512 | mfu: 50.72 | epoch: 2 | total time: 88.08m | eta: 91.5m +step 08198/16704 (49.08%) | loss: 2.684354 | 
lrm: 1.00 | dt: 645.58ms | tok/sec: 812,113 | mfu: 50.76 | epoch: 2 | total time: 88.09m | eta: 91.5m +step 08199/16704 (49.08%) | loss: 2.681522 | lrm: 1.00 | dt: 647.48ms | tok/sec: 809,740 | mfu: 50.61 | epoch: 2 | total time: 88.10m | eta: 91.5m +step 08200/16704 (49.09%) | loss: 2.690932 | lrm: 1.00 | dt: 646.11ms | tok/sec: 811,457 | mfu: 50.72 | epoch: 2 | total time: 88.11m | eta: 91.5m +step 08201/16704 (49.10%) | loss: 2.700048 | lrm: 1.00 | dt: 646.86ms | tok/sec: 810,510 | mfu: 50.66 | epoch: 2 | total time: 88.12m | eta: 91.5m +step 08202/16704 (49.10%) | loss: 2.690030 | lrm: 1.00 | dt: 647.25ms | tok/sec: 810,021 | mfu: 50.63 | epoch: 2 | total time: 88.13m | eta: 91.5m +step 08203/16704 (49.11%) | loss: 2.684814 | lrm: 1.00 | dt: 649.62ms | tok/sec: 807,064 | mfu: 50.44 | epoch: 2 | total time: 88.14m | eta: 91.5m +step 08204/16704 (49.11%) | loss: 2.680218 | lrm: 1.00 | dt: 645.79ms | tok/sec: 811,857 | mfu: 50.74 | epoch: 2 | total time: 88.15m | eta: 91.4m +step 08205/16704 (49.12%) | loss: 2.688412 | lrm: 1.00 | dt: 647.93ms | tok/sec: 809,177 | mfu: 50.57 | epoch: 2 | total time: 88.16m | eta: 91.4m +step 08206/16704 (49.13%) | loss: 2.679639 | lrm: 1.00 | dt: 648.13ms | tok/sec: 808,918 | mfu: 50.56 | epoch: 2 | total time: 88.18m | eta: 91.4m +step 08207/16704 (49.13%) | loss: 2.671581 | lrm: 1.00 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 2 | total time: 88.19m | eta: 91.4m +step 08208/16704 (49.14%) | loss: 2.683780 | lrm: 1.00 | dt: 648.06ms | tok/sec: 809,013 | mfu: 50.56 | epoch: 2 | total time: 88.20m | eta: 91.4m +step 08209/16704 (49.14%) | loss: 2.676498 | lrm: 1.00 | dt: 644.29ms | tok/sec: 813,750 | mfu: 50.86 | epoch: 2 | total time: 88.21m | eta: 91.4m +step 08210/16704 (49.15%) | loss: 2.679657 | lrm: 1.00 | dt: 647.55ms | tok/sec: 809,654 | mfu: 50.60 | epoch: 2 | total time: 88.22m | eta: 91.4m +step 08211/16704 (49.16%) | loss: 2.680115 | lrm: 1.00 | dt: 648.79ms | tok/sec: 808,098 | mfu: 50.51 | epoch: 2 | total 
time: 88.23m | eta: 91.4m +step 08212/16704 (49.16%) | loss: 2.686200 | lrm: 1.00 | dt: 645.80ms | tok/sec: 811,842 | mfu: 50.74 | epoch: 2 | total time: 88.24m | eta: 91.4m +step 08213/16704 (49.17%) | loss: 2.685104 | lrm: 1.00 | dt: 646.64ms | tok/sec: 810,790 | mfu: 50.68 | epoch: 2 | total time: 88.25m | eta: 91.3m +step 08214/16704 (49.17%) | loss: 2.682995 | lrm: 1.00 | dt: 649.63ms | tok/sec: 807,055 | mfu: 50.44 | epoch: 2 | total time: 88.26m | eta: 91.3m +step 08215/16704 (49.18%) | loss: 2.663696 | lrm: 1.00 | dt: 646.95ms | tok/sec: 810,394 | mfu: 50.65 | epoch: 2 | total time: 88.27m | eta: 91.3m +step 08216/16704 (49.19%) | loss: 2.663307 | lrm: 1.00 | dt: 647.33ms | tok/sec: 809,927 | mfu: 50.62 | epoch: 2 | total time: 88.28m | eta: 91.3m +step 08217/16704 (49.19%) | loss: 2.660844 | lrm: 1.00 | dt: 648.33ms | tok/sec: 808,680 | mfu: 50.54 | epoch: 2 | total time: 88.29m | eta: 91.3m +step 08218/16704 (49.20%) | loss: 2.665721 | lrm: 1.00 | dt: 646.87ms | tok/sec: 810,493 | mfu: 50.66 | epoch: 2 | total time: 88.30m | eta: 91.3m +step 08219/16704 (49.20%) | loss: 2.665175 | lrm: 1.00 | dt: 649.12ms | tok/sec: 807,684 | mfu: 50.48 | epoch: 2 | total time: 88.32m | eta: 91.3m +step 08220/16704 (49.21%) | loss: 2.668457 | lrm: 1.00 | dt: 645.74ms | tok/sec: 811,916 | mfu: 50.75 | epoch: 2 | total time: 88.33m | eta: 91.3m +step 08221/16704 (49.22%) | loss: 2.672897 | lrm: 1.00 | dt: 648.35ms | tok/sec: 808,650 | mfu: 50.54 | epoch: 2 | total time: 88.34m | eta: 91.3m +step 08222/16704 (49.22%) | loss: 2.685819 | lrm: 1.00 | dt: 648.82ms | tok/sec: 808,069 | mfu: 50.51 | epoch: 2 | total time: 88.35m | eta: 91.3m +step 08223/16704 (49.23%) | loss: 2.694532 | lrm: 1.00 | dt: 645.76ms | tok/sec: 811,887 | mfu: 50.74 | epoch: 2 | total time: 88.36m | eta: 91.2m +step 08224/16704 (49.23%) | loss: 2.706311 | lrm: 1.00 | dt: 647.32ms | tok/sec: 809,930 | mfu: 50.62 | epoch: 2 | total time: 88.37m | eta: 91.2m +step 08225/16704 (49.24%) | loss: 2.706999 | 
lrm: 1.00 | dt: 646.63ms | tok/sec: 810,800 | mfu: 50.68 | epoch: 2 | total time: 88.38m | eta: 91.2m +step 08226/16704 (49.25%) | loss: 2.693556 | lrm: 1.00 | dt: 648.30ms | tok/sec: 808,706 | mfu: 50.55 | epoch: 2 | total time: 88.39m | eta: 91.2m +step 08227/16704 (49.25%) | loss: 2.675639 | lrm: 1.00 | dt: 647.75ms | tok/sec: 809,398 | mfu: 50.59 | epoch: 2 | total time: 88.40m | eta: 91.2m +step 08228/16704 (49.26%) | loss: 2.666323 | lrm: 1.00 | dt: 645.82ms | tok/sec: 811,814 | mfu: 50.74 | epoch: 2 | total time: 88.41m | eta: 91.2m +step 08229/16704 (49.26%) | loss: 2.676090 | lrm: 1.00 | dt: 646.16ms | tok/sec: 811,392 | mfu: 50.71 | epoch: 2 | total time: 88.42m | eta: 91.2m +step 08230/16704 (49.27%) | loss: 2.677693 | lrm: 1.00 | dt: 649.08ms | tok/sec: 807,743 | mfu: 50.49 | epoch: 2 | total time: 88.43m | eta: 91.2m +step 08231/16704 (49.28%) | loss: 2.670106 | lrm: 1.00 | dt: 645.51ms | tok/sec: 812,202 | mfu: 50.76 | epoch: 2 | total time: 88.45m | eta: 91.2m +step 08232/16704 (49.28%) | loss: 2.676129 | lrm: 1.00 | dt: 650.18ms | tok/sec: 806,375 | mfu: 50.40 | epoch: 2 | total time: 88.46m | eta: 91.1m +step 08233/16704 (49.29%) | loss: 2.679097 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,829 | mfu: 50.55 | epoch: 2 | total time: 88.47m | eta: 91.1m +step 08234/16704 (49.29%) | loss: 2.673920 | lrm: 1.00 | dt: 646.29ms | tok/sec: 811,229 | mfu: 50.70 | epoch: 2 | total time: 88.48m | eta: 91.1m +step 08235/16704 (49.30%) | loss: 2.679351 | lrm: 1.00 | dt: 644.94ms | tok/sec: 812,928 | mfu: 50.81 | epoch: 2 | total time: 88.49m | eta: 91.1m +step 08236/16704 (49.31%) | loss: 2.683964 | lrm: 1.00 | dt: 647.85ms | tok/sec: 809,276 | mfu: 50.58 | epoch: 2 | total time: 88.50m | eta: 91.1m +step 08237/16704 (49.31%) | loss: 2.674374 | lrm: 1.00 | dt: 647.95ms | tok/sec: 809,154 | mfu: 50.57 | epoch: 2 | total time: 88.51m | eta: 91.1m +step 08238/16704 (49.32%) | loss: 2.672994 | lrm: 1.00 | dt: 646.71ms | tok/sec: 810,698 | mfu: 50.67 | epoch: 2 | total 
time: 88.52m | eta: 91.1m +step 08239/16704 (49.32%) | loss: 2.684055 | lrm: 1.00 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 2 | total time: 88.53m | eta: 91.1m +step 08240/16704 (49.33%) | loss: 2.688558 | lrm: 1.00 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 2 | total time: 88.54m | eta: 91.1m +step 08241/16704 (49.34%) | loss: 2.691669 | lrm: 1.00 | dt: 646.35ms | tok/sec: 811,154 | mfu: 50.70 | epoch: 2 | total time: 88.55m | eta: 91.0m +step 08242/16704 (49.34%) | loss: 2.702654 | lrm: 1.00 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 2 | total time: 88.56m | eta: 91.0m +step 08243/16704 (49.35%) | loss: 2.694986 | lrm: 1.00 | dt: 645.71ms | tok/sec: 811,961 | mfu: 50.75 | epoch: 2 | total time: 88.57m | eta: 91.0m +step 08244/16704 (49.35%) | loss: 2.693024 | lrm: 1.00 | dt: 648.21ms | tok/sec: 808,825 | mfu: 50.55 | epoch: 2 | total time: 88.59m | eta: 91.0m +step 08245/16704 (49.36%) | loss: 2.707470 | lrm: 1.00 | dt: 646.18ms | tok/sec: 811,359 | mfu: 50.71 | epoch: 2 | total time: 88.60m | eta: 91.0m +step 08246/16704 (49.37%) | loss: 2.707187 | lrm: 1.00 | dt: 643.78ms | tok/sec: 814,387 | mfu: 50.90 | epoch: 2 | total time: 88.61m | eta: 91.0m +step 08247/16704 (49.37%) | loss: 2.693634 | lrm: 1.00 | dt: 648.31ms | tok/sec: 808,705 | mfu: 50.55 | epoch: 2 | total time: 88.62m | eta: 91.0m +step 08248/16704 (49.38%) | loss: 2.671880 | lrm: 1.00 | dt: 645.06ms | tok/sec: 812,771 | mfu: 50.80 | epoch: 2 | total time: 88.63m | eta: 91.0m +step 08249/16704 (49.38%) | loss: 2.672160 | lrm: 1.00 | dt: 647.24ms | tok/sec: 810,033 | mfu: 50.63 | epoch: 2 | total time: 88.64m | eta: 91.0m +Step 08250 | Validation bpb: 0.820837 +step 08250/16704 (49.39%) | loss: 2.679210 | lrm: 1.00 | dt: 645.03ms | tok/sec: 812,809 | mfu: 50.80 | epoch: 2 | total time: 88.65m | eta: 91.0m +step 08251/16704 (49.40%) | loss: 2.686787 | lrm: 1.00 | dt: 641.71ms | tok/sec: 817,013 | mfu: 51.06 | epoch: 2 | total time: 88.66m | eta: 90.9m +step 
08252/16704 (49.40%) | loss: 2.678220 | lrm: 1.00 | dt: 648.41ms | tok/sec: 808,575 | mfu: 50.54 | epoch: 2 | total time: 88.67m | eta: 90.9m +step 08253/16704 (49.41%) | loss: 2.678704 | lrm: 1.00 | dt: 641.01ms | tok/sec: 817,907 | mfu: 51.12 | epoch: 2 | total time: 88.68m | eta: 90.9m +step 08254/16704 (49.41%) | loss: 2.680935 | lrm: 1.00 | dt: 644.52ms | tok/sec: 813,460 | mfu: 50.84 | epoch: 2 | total time: 88.69m | eta: 90.9m +step 08255/16704 (49.42%) | loss: 2.692281 | lrm: 1.00 | dt: 646.89ms | tok/sec: 810,479 | mfu: 50.66 | epoch: 2 | total time: 88.70m | eta: 90.9m +step 08256/16704 (49.43%) | loss: 2.690869 | lrm: 1.00 | dt: 643.18ms | tok/sec: 815,147 | mfu: 50.95 | epoch: 2 | total time: 88.71m | eta: 90.9m +step 08257/16704 (49.43%) | loss: 2.685332 | lrm: 1.00 | dt: 641.66ms | tok/sec: 817,074 | mfu: 51.07 | epoch: 2 | total time: 88.72m | eta: 90.9m +step 08258/16704 (49.44%) | loss: 2.705746 | lrm: 1.00 | dt: 645.54ms | tok/sec: 812,175 | mfu: 50.76 | epoch: 2 | total time: 88.74m | eta: 90.9m +step 08259/16704 (49.44%) | loss: 2.698192 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,187 | mfu: 51.01 | epoch: 2 | total time: 88.75m | eta: 90.9m +step 08260/16704 (49.45%) | loss: 2.704585 | lrm: 1.00 | dt: 643.38ms | tok/sec: 814,893 | mfu: 50.93 | epoch: 2 | total time: 88.76m | eta: 90.8m +step 08261/16704 (49.46%) | loss: 2.715095 | lrm: 1.00 | dt: 642.70ms | tok/sec: 815,761 | mfu: 50.99 | epoch: 2 | total time: 88.77m | eta: 90.8m +step 08262/16704 (49.46%) | loss: 2.732702 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,821 | mfu: 50.93 | epoch: 2 | total time: 88.78m | eta: 90.8m +step 08263/16704 (49.47%) | loss: 2.728046 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,183 | mfu: 50.95 | epoch: 2 | total time: 88.79m | eta: 90.8m +step 08264/16704 (49.47%) | loss: 2.737195 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,155 | mfu: 51.01 | epoch: 2 | total time: 88.80m | eta: 90.8m +step 08265/16704 (49.48%) | loss: 2.732115 | lrm: 1.00 | dt: 644.29ms | tok/sec: 
813,751 | mfu: 50.86 | epoch: 2 | total time: 88.81m | eta: 90.8m +step 08266/16704 (49.49%) | loss: 2.721044 | lrm: 1.00 | dt: 644.01ms | tok/sec: 814,105 | mfu: 50.88 | epoch: 2 | total time: 88.82m | eta: 90.8m +step 08267/16704 (49.49%) | loss: 2.722477 | lrm: 1.00 | dt: 645.75ms | tok/sec: 811,908 | mfu: 50.75 | epoch: 2 | total time: 88.83m | eta: 90.8m +step 08268/16704 (49.50%) | loss: 2.729627 | lrm: 1.00 | dt: 642.18ms | tok/sec: 816,418 | mfu: 51.03 | epoch: 2 | total time: 88.84m | eta: 90.8m +step 08269/16704 (49.50%) | loss: 2.716504 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,494 | mfu: 50.91 | epoch: 2 | total time: 88.85m | eta: 90.7m +step 08270/16704 (49.51%) | loss: 2.701370 | lrm: 1.00 | dt: 644.08ms | tok/sec: 814,012 | mfu: 50.88 | epoch: 2 | total time: 88.86m | eta: 90.7m +step 08271/16704 (49.52%) | loss: 2.704640 | lrm: 1.00 | dt: 643.67ms | tok/sec: 814,531 | mfu: 50.91 | epoch: 2 | total time: 88.88m | eta: 90.7m +step 08272/16704 (49.52%) | loss: 2.692292 | lrm: 1.00 | dt: 643.25ms | tok/sec: 815,062 | mfu: 50.94 | epoch: 2 | total time: 88.89m | eta: 90.7m +step 08273/16704 (49.53%) | loss: 2.682951 | lrm: 1.00 | dt: 642.65ms | tok/sec: 815,827 | mfu: 50.99 | epoch: 2 | total time: 88.90m | eta: 90.7m +step 08274/16704 (49.53%) | loss: 2.680748 | lrm: 1.00 | dt: 640.85ms | tok/sec: 818,118 | mfu: 51.13 | epoch: 2 | total time: 88.91m | eta: 90.7m +step 08275/16704 (49.54%) | loss: 2.680076 | lrm: 1.00 | dt: 642.39ms | tok/sec: 816,152 | mfu: 51.01 | epoch: 2 | total time: 88.92m | eta: 90.7m +step 08276/16704 (49.55%) | loss: 2.673768 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,895 | mfu: 50.81 | epoch: 2 | total time: 88.93m | eta: 90.7m +step 08277/16704 (49.55%) | loss: 2.681168 | lrm: 1.00 | dt: 643.26ms | tok/sec: 815,051 | mfu: 50.94 | epoch: 2 | total time: 88.94m | eta: 90.7m +step 08278/16704 (49.56%) | loss: 2.678643 | lrm: 1.00 | dt: 643.71ms | tok/sec: 814,473 | mfu: 50.91 | epoch: 2 | total time: 88.95m | eta: 90.6m +step 
08279/16704 (49.56%) | loss: 2.692244 | lrm: 1.00 | dt: 642.60ms | tok/sec: 815,882 | mfu: 50.99 | epoch: 2 | total time: 88.96m | eta: 90.6m +step 08280/16704 (49.57%) | loss: 2.671702 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,660 | mfu: 50.92 | epoch: 2 | total time: 88.97m | eta: 90.6m +step 08281/16704 (49.57%) | loss: 2.679266 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,121 | mfu: 50.82 | epoch: 2 | total time: 88.98m | eta: 90.6m +step 08282/16704 (49.58%) | loss: 2.677137 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,544 | mfu: 50.97 | epoch: 2 | total time: 88.99m | eta: 90.6m +step 08283/16704 (49.59%) | loss: 2.676132 | lrm: 1.00 | dt: 642.97ms | tok/sec: 815,419 | mfu: 50.96 | epoch: 2 | total time: 89.00m | eta: 90.6m +step 08284/16704 (49.59%) | loss: 2.676606 | lrm: 1.00 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 2 | total time: 89.01m | eta: 90.6m +step 08285/16704 (49.60%) | loss: 2.676714 | lrm: 1.00 | dt: 642.61ms | tok/sec: 815,872 | mfu: 50.99 | epoch: 2 | total time: 89.03m | eta: 90.6m +step 08286/16704 (49.60%) | loss: 2.682924 | lrm: 1.00 | dt: 641.93ms | tok/sec: 816,739 | mfu: 51.05 | epoch: 2 | total time: 89.04m | eta: 90.6m +step 08287/16704 (49.61%) | loss: 2.681424 | lrm: 1.00 | dt: 643.60ms | tok/sec: 814,618 | mfu: 50.91 | epoch: 2 | total time: 89.05m | eta: 90.6m +step 08288/16704 (49.62%) | loss: 2.683375 | lrm: 1.00 | dt: 642.23ms | tok/sec: 816,356 | mfu: 51.02 | epoch: 2 | total time: 89.06m | eta: 90.5m +step 08289/16704 (49.62%) | loss: 2.697027 | lrm: 1.00 | dt: 643.24ms | tok/sec: 815,067 | mfu: 50.94 | epoch: 2 | total time: 89.07m | eta: 90.5m +step 08290/16704 (49.63%) | loss: 2.691042 | lrm: 1.00 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 2 | total time: 89.08m | eta: 90.5m +step 08291/16704 (49.63%) | loss: 2.695329 | lrm: 1.00 | dt: 643.41ms | tok/sec: 814,854 | mfu: 50.93 | epoch: 2 | total time: 89.09m | eta: 90.5m +step 08292/16704 (49.64%) | loss: 2.695373 | lrm: 1.00 | dt: 644.63ms | tok/sec: 
813,310 | mfu: 50.83 | epoch: 2 | total time: 89.10m | eta: 90.5m +step 08293/16704 (49.65%) | loss: 2.695601 | lrm: 1.00 | dt: 641.74ms | tok/sec: 816,984 | mfu: 51.06 | epoch: 2 | total time: 89.11m | eta: 90.5m +step 08294/16704 (49.65%) | loss: 2.692607 | lrm: 1.00 | dt: 642.21ms | tok/sec: 816,384 | mfu: 51.03 | epoch: 2 | total time: 89.12m | eta: 90.5m +step 08295/16704 (49.66%) | loss: 2.691952 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,695 | mfu: 50.98 | epoch: 2 | total time: 89.13m | eta: 90.5m +step 08296/16704 (49.66%) | loss: 2.684344 | lrm: 1.00 | dt: 644.80ms | tok/sec: 813,107 | mfu: 50.82 | epoch: 2 | total time: 89.14m | eta: 90.5m +step 08297/16704 (49.67%) | loss: 2.680861 | lrm: 1.00 | dt: 643.02ms | tok/sec: 815,356 | mfu: 50.96 | epoch: 2 | total time: 89.15m | eta: 90.4m +step 08298/16704 (49.68%) | loss: 2.680271 | lrm: 1.00 | dt: 643.70ms | tok/sec: 814,496 | mfu: 50.91 | epoch: 2 | total time: 89.16m | eta: 90.4m +step 08299/16704 (49.68%) | loss: 2.685320 | lrm: 1.00 | dt: 644.19ms | tok/sec: 813,866 | mfu: 50.87 | epoch: 2 | total time: 89.18m | eta: 90.4m +step 08300/16704 (49.69%) | loss: 2.686718 | lrm: 1.00 | dt: 643.88ms | tok/sec: 814,265 | mfu: 50.89 | epoch: 2 | total time: 89.19m | eta: 90.4m +step 08301/16704 (49.69%) | loss: 2.681123 | lrm: 1.00 | dt: 645.70ms | tok/sec: 811,963 | mfu: 50.75 | epoch: 2 | total time: 89.20m | eta: 90.4m +step 08302/16704 (49.70%) | loss: 2.683547 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,938 | mfu: 50.87 | epoch: 2 | total time: 89.21m | eta: 90.4m +step 08303/16704 (49.71%) | loss: 2.673090 | lrm: 1.00 | dt: 642.37ms | tok/sec: 816,181 | mfu: 51.01 | epoch: 2 | total time: 89.22m | eta: 90.4m +step 08304/16704 (49.71%) | loss: 2.679240 | lrm: 1.00 | dt: 643.23ms | tok/sec: 815,088 | mfu: 50.94 | epoch: 2 | total time: 89.23m | eta: 90.4m +step 08305/16704 (49.72%) | loss: 2.686227 | lrm: 1.00 | dt: 644.57ms | tok/sec: 813,390 | mfu: 50.84 | epoch: 2 | total time: 89.24m | eta: 90.4m +step 
08306/16704 (49.72%) | loss: 2.699585 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,823 | mfu: 50.93 | epoch: 2 | total time: 89.25m | eta: 90.3m +step 08307/16704 (49.73%) | loss: 2.695960 | lrm: 1.00 | dt: 646.77ms | tok/sec: 810,626 | mfu: 50.67 | epoch: 2 | total time: 89.26m | eta: 90.3m +step 08308/16704 (49.74%) | loss: 2.713728 | lrm: 1.00 | dt: 641.40ms | tok/sec: 817,410 | mfu: 51.09 | epoch: 2 | total time: 89.27m | eta: 90.3m +step 08309/16704 (49.74%) | loss: 2.708266 | lrm: 1.00 | dt: 642.16ms | tok/sec: 816,446 | mfu: 51.03 | epoch: 2 | total time: 89.28m | eta: 90.3m +step 08310/16704 (49.75%) | loss: 2.706199 | lrm: 1.00 | dt: 644.44ms | tok/sec: 813,553 | mfu: 50.85 | epoch: 2 | total time: 89.29m | eta: 90.3m +step 08311/16704 (49.75%) | loss: 2.706203 | lrm: 1.00 | dt: 642.26ms | tok/sec: 816,312 | mfu: 51.02 | epoch: 2 | total time: 89.30m | eta: 90.3m +step 08312/16704 (49.76%) | loss: 2.712350 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,469 | mfu: 50.97 | epoch: 2 | total time: 89.31m | eta: 90.3m +step 08313/16704 (49.77%) | loss: 2.704895 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,122 | mfu: 50.95 | epoch: 2 | total time: 89.33m | eta: 90.3m +step 08314/16704 (49.77%) | loss: 2.694036 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,465 | mfu: 50.97 | epoch: 2 | total time: 89.34m | eta: 90.3m +step 08315/16704 (49.78%) | loss: 2.690235 | lrm: 1.00 | dt: 642.82ms | tok/sec: 815,606 | mfu: 50.98 | epoch: 2 | total time: 89.35m | eta: 90.3m +step 08316/16704 (49.78%) | loss: 2.690762 | lrm: 1.00 | dt: 642.36ms | tok/sec: 816,186 | mfu: 51.01 | epoch: 2 | total time: 89.36m | eta: 90.2m +step 08317/16704 (49.79%) | loss: 2.698839 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,239 | mfu: 50.83 | epoch: 2 | total time: 89.37m | eta: 90.2m +step 08318/16704 (49.80%) | loss: 2.693274 | lrm: 1.00 | dt: 643.64ms | tok/sec: 814,568 | mfu: 50.91 | epoch: 2 | total time: 89.38m | eta: 90.2m +step 08319/16704 (49.80%) | loss: 2.695601 | lrm: 1.00 | dt: 643.43ms | tok/sec: 
814,829 | mfu: 50.93 | epoch: 2 | total time: 89.39m | eta: 90.2m +step 08320/16704 (49.81%) | loss: 2.700901 | lrm: 1.00 | dt: 643.26ms | tok/sec: 815,048 | mfu: 50.94 | epoch: 2 | total time: 89.40m | eta: 90.2m +step 08321/16704 (49.81%) | loss: 2.698649 | lrm: 1.00 | dt: 641.26ms | tok/sec: 817,589 | mfu: 51.10 | epoch: 2 | total time: 89.41m | eta: 90.2m +step 08322/16704 (49.82%) | loss: 2.698701 | lrm: 1.00 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 2 | total time: 89.42m | eta: 90.2m +step 08323/16704 (49.83%) | loss: 2.695609 | lrm: 1.00 | dt: 644.02ms | tok/sec: 814,083 | mfu: 50.88 | epoch: 2 | total time: 89.43m | eta: 90.2m +step 08324/16704 (49.83%) | loss: 2.718071 | lrm: 1.00 | dt: 644.50ms | tok/sec: 813,484 | mfu: 50.84 | epoch: 2 | total time: 89.44m | eta: 90.2m +step 08325/16704 (49.84%) | loss: 2.722749 | lrm: 1.00 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 2 | total time: 89.45m | eta: 90.1m +step 08326/16704 (49.84%) | loss: 2.713834 | lrm: 1.00 | dt: 641.13ms | tok/sec: 817,751 | mfu: 51.11 | epoch: 2 | total time: 89.46m | eta: 90.1m +step 08327/16704 (49.85%) | loss: 2.695612 | lrm: 1.00 | dt: 642.71ms | tok/sec: 815,749 | mfu: 50.99 | epoch: 2 | total time: 89.48m | eta: 90.1m +step 08328/16704 (49.86%) | loss: 2.686754 | lrm: 1.00 | dt: 644.21ms | tok/sec: 813,849 | mfu: 50.87 | epoch: 2 | total time: 89.49m | eta: 90.1m +step 08329/16704 (49.86%) | loss: 2.691117 | lrm: 1.00 | dt: 640.97ms | tok/sec: 817,958 | mfu: 51.12 | epoch: 2 | total time: 89.50m | eta: 90.1m +step 08330/16704 (49.87%) | loss: 2.690046 | lrm: 1.00 | dt: 641.75ms | tok/sec: 816,960 | mfu: 51.06 | epoch: 2 | total time: 89.51m | eta: 90.1m +step 08331/16704 (49.87%) | loss: 2.701191 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,545 | mfu: 50.97 | epoch: 2 | total time: 89.52m | eta: 90.1m +step 08332/16704 (49.88%) | loss: 2.691463 | lrm: 1.00 | dt: 642.99ms | tok/sec: 815,392 | mfu: 50.96 | epoch: 2 | total time: 89.53m | eta: 90.1m +step 
08333/16704 (49.89%) | loss: 2.684811 | lrm: 1.00 | dt: 642.74ms | tok/sec: 815,708 | mfu: 50.98 | epoch: 2 | total time: 89.54m | eta: 90.1m +step 08334/16704 (49.89%) | loss: 2.707450 | lrm: 1.00 | dt: 643.31ms | tok/sec: 814,983 | mfu: 50.94 | epoch: 2 | total time: 89.55m | eta: 90.0m +step 08335/16704 (49.90%) | loss: 2.705349 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,726 | mfu: 50.92 | epoch: 2 | total time: 89.56m | eta: 90.0m +step 08336/16704 (49.90%) | loss: 2.706865 | lrm: 1.00 | dt: 644.15ms | tok/sec: 813,917 | mfu: 50.87 | epoch: 2 | total time: 89.57m | eta: 90.0m +step 08337/16704 (49.91%) | loss: 2.695296 | lrm: 1.00 | dt: 641.65ms | tok/sec: 817,092 | mfu: 51.07 | epoch: 2 | total time: 89.58m | eta: 90.0m +step 08338/16704 (49.92%) | loss: 2.689110 | lrm: 1.00 | dt: 643.40ms | tok/sec: 814,872 | mfu: 50.93 | epoch: 2 | total time: 89.59m | eta: 90.0m +step 08339/16704 (49.92%) | loss: 2.690321 | lrm: 1.00 | dt: 643.15ms | tok/sec: 815,185 | mfu: 50.95 | epoch: 2 | total time: 89.60m | eta: 90.0m +step 08340/16704 (49.93%) | loss: 2.709261 | lrm: 1.00 | dt: 643.68ms | tok/sec: 814,515 | mfu: 50.91 | epoch: 2 | total time: 89.61m | eta: 90.0m +step 08341/16704 (49.93%) | loss: 2.702046 | lrm: 1.00 | dt: 643.77ms | tok/sec: 814,397 | mfu: 50.90 | epoch: 2 | total time: 89.63m | eta: 90.0m +step 08342/16704 (49.94%) | loss: 2.698664 | lrm: 1.00 | dt: 643.94ms | tok/sec: 814,183 | mfu: 50.89 | epoch: 2 | total time: 89.64m | eta: 90.0m +step 08343/16704 (49.95%) | loss: 2.704839 | lrm: 1.00 | dt: 644.69ms | tok/sec: 813,244 | mfu: 50.83 | epoch: 2 | total time: 89.65m | eta: 89.9m +step 08344/16704 (49.95%) | loss: 2.702613 | lrm: 1.00 | dt: 640.83ms | tok/sec: 818,134 | mfu: 51.13 | epoch: 2 | total time: 89.66m | eta: 89.9m +step 08345/16704 (49.96%) | loss: 2.698695 | lrm: 1.00 | dt: 642.93ms | tok/sec: 815,471 | mfu: 50.97 | epoch: 2 | total time: 89.67m | eta: 89.9m +step 08346/16704 (49.96%) | loss: 2.702961 | lrm: 1.00 | dt: 644.49ms | tok/sec: 
813,493 | mfu: 50.84 | epoch: 2 | total time: 89.68m | eta: 89.9m +step 08347/16704 (49.97%) | loss: 2.697148 | lrm: 1.00 | dt: 642.75ms | tok/sec: 815,690 | mfu: 50.98 | epoch: 2 | total time: 89.69m | eta: 89.9m +step 08348/16704 (49.98%) | loss: 2.708488 | lrm: 1.00 | dt: 644.78ms | tok/sec: 813,128 | mfu: 50.82 | epoch: 2 | total time: 89.70m | eta: 89.9m +step 08349/16704 (49.98%) | loss: 2.689639 | lrm: 1.00 | dt: 641.06ms | tok/sec: 817,841 | mfu: 51.12 | epoch: 2 | total time: 89.71m | eta: 89.9m +step 08350/16704 (49.99%) | loss: 2.689945 | lrm: 1.00 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 2 | total time: 89.72m | eta: 89.9m +step 08351/16704 (49.99%) | loss: 2.688905 | lrm: 1.00 | dt: 643.57ms | tok/sec: 814,650 | mfu: 50.92 | epoch: 2 | total time: 89.73m | eta: 89.9m +step 08352/16704 (50.00%) | loss: 2.684999 | lrm: 1.00 | dt: 640.90ms | tok/sec: 818,045 | mfu: 51.13 | epoch: 2 | total time: 89.74m | eta: 89.9m +step 08353/16704 (50.01%) | loss: 2.679088 | lrm: 1.00 | dt: 643.83ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 2 | total time: 89.75m | eta: 89.8m +step 08354/16704 (50.01%) | loss: 2.671168 | lrm: 1.00 | dt: 643.51ms | tok/sec: 814,730 | mfu: 50.92 | epoch: 2 | total time: 89.76m | eta: 89.8m +step 08355/16704 (50.02%) | loss: 2.664481 | lrm: 1.00 | dt: 642.87ms | tok/sec: 815,545 | mfu: 50.97 | epoch: 2 | total time: 89.78m | eta: 89.8m +step 08356/16704 (50.02%) | loss: 2.677040 | lrm: 1.00 | dt: 643.20ms | tok/sec: 815,128 | mfu: 50.95 | epoch: 2 | total time: 89.79m | eta: 89.8m +step 08357/16704 (50.03%) | loss: 2.686158 | lrm: 1.00 | dt: 644.03ms | tok/sec: 814,068 | mfu: 50.88 | epoch: 2 | total time: 89.80m | eta: 89.8m +step 08358/16704 (50.04%) | loss: 2.679145 | lrm: 1.00 | dt: 644.04ms | tok/sec: 814,066 | mfu: 50.88 | epoch: 2 | total time: 89.81m | eta: 89.8m +step 08359/16704 (50.04%) | loss: 2.687608 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,713 | mfu: 50.92 | epoch: 2 | total time: 89.82m | eta: 89.8m +step 
08360/16704 (50.05%) | loss: 2.689371 | lrm: 1.00 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 2 | total time: 89.83m | eta: 89.8m +step 08361/16704 (50.05%) | loss: 2.687880 | lrm: 1.00 | dt: 644.09ms | tok/sec: 813,992 | mfu: 50.88 | epoch: 2 | total time: 89.84m | eta: 89.8m +step 08362/16704 (50.06%) | loss: 2.684518 | lrm: 1.00 | dt: 641.78ms | tok/sec: 816,932 | mfu: 51.06 | epoch: 2 | total time: 89.85m | eta: 89.7m +step 08363/16704 (50.07%) | loss: 2.683502 | lrm: 1.00 | dt: 644.60ms | tok/sec: 813,359 | mfu: 50.84 | epoch: 2 | total time: 89.86m | eta: 89.7m +step 08364/16704 (50.07%) | loss: 2.674353 | lrm: 1.00 | dt: 643.44ms | tok/sec: 814,815 | mfu: 50.93 | epoch: 2 | total time: 89.87m | eta: 89.7m +step 08365/16704 (50.08%) | loss: 2.690890 | lrm: 1.00 | dt: 643.96ms | tok/sec: 814,159 | mfu: 50.89 | epoch: 2 | total time: 89.88m | eta: 89.7m +step 08366/16704 (50.08%) | loss: 2.697225 | lrm: 1.00 | dt: 644.66ms | tok/sec: 813,280 | mfu: 50.83 | epoch: 2 | total time: 89.89m | eta: 89.7m +step 08367/16704 (50.09%) | loss: 2.701419 | lrm: 1.00 | dt: 641.72ms | tok/sec: 817,003 | mfu: 51.06 | epoch: 2 | total time: 89.90m | eta: 89.7m +step 08368/16704 (50.10%) | loss: 2.699376 | lrm: 1.00 | dt: 645.59ms | tok/sec: 812,106 | mfu: 50.76 | epoch: 2 | total time: 89.92m | eta: 89.7m +step 08369/16704 (50.10%) | loss: 2.702041 | lrm: 1.00 | dt: 642.20ms | tok/sec: 816,399 | mfu: 51.03 | epoch: 2 | total time: 89.93m | eta: 89.7m +step 08370/16704 (50.11%) | loss: 2.690543 | lrm: 1.00 | dt: 644.07ms | tok/sec: 814,027 | mfu: 50.88 | epoch: 2 | total time: 89.94m | eta: 89.7m +step 08371/16704 (50.11%) | loss: 2.673807 | lrm: 1.00 | dt: 644.14ms | tok/sec: 813,929 | mfu: 50.87 | epoch: 2 | total time: 89.95m | eta: 89.6m +step 08372/16704 (50.12%) | loss: 2.680693 | lrm: 1.00 | dt: 642.48ms | tok/sec: 816,040 | mfu: 51.00 | epoch: 2 | total time: 89.96m | eta: 89.6m +step 08373/16704 (50.13%) | loss: 2.675240 | lrm: 1.00 | dt: 644.31ms | tok/sec: 
813,716 | mfu: 50.86 | epoch: 2 | total time: 89.97m | eta: 89.6m +step 08374/16704 (50.13%) | loss: 2.670410 | lrm: 1.00 | dt: 646.17ms | tok/sec: 811,383 | mfu: 50.71 | epoch: 2 | total time: 89.98m | eta: 89.6m +step 08375/16704 (50.14%) | loss: 2.666588 | lrm: 1.00 | dt: 643.52ms | tok/sec: 814,715 | mfu: 50.92 | epoch: 2 | total time: 89.99m | eta: 89.6m +step 08376/16704 (50.14%) | loss: 2.678674 | lrm: 1.00 | dt: 642.31ms | tok/sec: 816,250 | mfu: 51.02 | epoch: 2 | total time: 90.00m | eta: 89.6m +step 08377/16704 (50.15%) | loss: 2.684639 | lrm: 1.00 | dt: 645.61ms | tok/sec: 812,080 | mfu: 50.76 | epoch: 2 | total time: 90.01m | eta: 89.6m +step 08378/16704 (50.16%) | loss: 2.684412 | lrm: 1.00 | dt: 643.80ms | tok/sec: 814,369 | mfu: 50.90 | epoch: 2 | total time: 90.02m | eta: 89.6m +step 08379/16704 (50.16%) | loss: 2.674102 | lrm: 1.00 | dt: 643.42ms | tok/sec: 814,843 | mfu: 50.93 | epoch: 2 | total time: 90.03m | eta: 89.6m +step 08380/16704 (50.17%) | loss: 2.674319 | lrm: 1.00 | dt: 644.96ms | tok/sec: 812,905 | mfu: 50.81 | epoch: 2 | total time: 90.04m | eta: 89.5m +step 08381/16704 (50.17%) | loss: 2.679977 | lrm: 1.00 | dt: 641.62ms | tok/sec: 817,128 | mfu: 51.07 | epoch: 2 | total time: 90.05m | eta: 89.5m +step 08382/16704 (50.18%) | loss: 2.682410 | lrm: 1.00 | dt: 644.65ms | tok/sec: 813,296 | mfu: 50.83 | epoch: 2 | total time: 90.07m | eta: 89.5m +step 08383/16704 (50.19%) | loss: 2.686618 | lrm: 1.00 | dt: 644.49ms | tok/sec: 813,486 | mfu: 50.84 | epoch: 2 | total time: 90.08m | eta: 89.5m +step 08384/16704 (50.19%) | loss: 2.692354 | lrm: 1.00 | dt: 643.50ms | tok/sec: 814,740 | mfu: 50.92 | epoch: 2 | total time: 90.09m | eta: 89.5m +step 08385/16704 (50.20%) | loss: 2.693573 | lrm: 1.00 | dt: 647.04ms | tok/sec: 810,282 | mfu: 50.64 | epoch: 2 | total time: 90.10m | eta: 89.5m +step 08386/16704 (50.20%) | loss: 2.702995 | lrm: 1.00 | dt: 641.00ms | tok/sec: 817,924 | mfu: 51.12 | epoch: 2 | total time: 90.11m | eta: 89.5m +step 
08387/16704 (50.21%) | loss: 2.701141 | lrm: 1.00 | dt: 647.28ms | tok/sec: 809,987 | mfu: 50.63 | epoch: 2 | total time: 90.12m | eta: 89.5m +step 08388/16704 (50.22%) | loss: 2.691643 | lrm: 1.00 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 2 | total time: 90.13m | eta: 89.5m +step 08389/16704 (50.22%) | loss: 2.705722 | lrm: 1.00 | dt: 646.01ms | tok/sec: 811,572 | mfu: 50.72 | epoch: 2 | total time: 90.14m | eta: 89.5m +step 08390/16704 (50.23%) | loss: 2.701338 | lrm: 1.00 | dt: 640.99ms | tok/sec: 817,931 | mfu: 51.12 | epoch: 2 | total time: 90.15m | eta: 89.4m +step 08391/16704 (50.23%) | loss: 2.707488 | lrm: 1.00 | dt: 644.35ms | tok/sec: 813,669 | mfu: 50.86 | epoch: 2 | total time: 90.16m | eta: 89.4m +step 08392/16704 (50.24%) | loss: 2.712454 | lrm: 1.00 | dt: 642.32ms | tok/sec: 816,239 | mfu: 51.02 | epoch: 2 | total time: 90.17m | eta: 89.4m +step 08393/16704 (50.25%) | loss: 2.718019 | lrm: 1.00 | dt: 642.08ms | tok/sec: 816,548 | mfu: 51.04 | epoch: 2 | total time: 90.18m | eta: 89.4m +step 08394/16704 (50.25%) | loss: 2.716814 | lrm: 0.99 | dt: 646.80ms | tok/sec: 810,589 | mfu: 50.66 | epoch: 2 | total time: 90.19m | eta: 89.4m +step 08395/16704 (50.26%) | loss: 2.700594 | lrm: 0.99 | dt: 642.35ms | tok/sec: 816,207 | mfu: 51.01 | epoch: 2 | total time: 90.20m | eta: 89.4m +step 08396/16704 (50.26%) | loss: 2.683514 | lrm: 0.99 | dt: 644.41ms | tok/sec: 813,592 | mfu: 50.85 | epoch: 2 | total time: 90.22m | eta: 89.4m +step 08397/16704 (50.27%) | loss: 2.683841 | lrm: 0.99 | dt: 643.94ms | tok/sec: 814,192 | mfu: 50.89 | epoch: 2 | total time: 90.23m | eta: 89.4m +step 08398/16704 (50.28%) | loss: 2.704260 | lrm: 0.99 | dt: 645.56ms | tok/sec: 812,142 | mfu: 50.76 | epoch: 2 | total time: 90.24m | eta: 89.4m +step 08399/16704 (50.28%) | loss: 2.716356 | lrm: 0.99 | dt: 645.82ms | tok/sec: 811,817 | mfu: 50.74 | epoch: 2 | total time: 90.25m | eta: 89.3m +step 08400/16704 (50.29%) | loss: 2.709904 | lrm: 0.99 | dt: 642.48ms | tok/sec: 
816,032 | mfu: 51.00 | epoch: 2 | total time: 90.26m | eta: 89.3m +step 08401/16704 (50.29%) | loss: 2.723273 | lrm: 0.99 | dt: 645.72ms | tok/sec: 811,942 | mfu: 50.75 | epoch: 2 | total time: 90.27m | eta: 89.3m +step 08402/16704 (50.30%) | loss: 2.720275 | lrm: 0.99 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 2 | total time: 90.28m | eta: 89.3m +step 08403/16704 (50.31%) | loss: 2.710660 | lrm: 0.99 | dt: 646.44ms | tok/sec: 811,042 | mfu: 50.69 | epoch: 2 | total time: 90.29m | eta: 89.3m +step 08404/16704 (50.31%) | loss: 2.710825 | lrm: 0.99 | dt: 646.72ms | tok/sec: 810,691 | mfu: 50.67 | epoch: 2 | total time: 90.30m | eta: 89.3m +step 08405/16704 (50.32%) | loss: 2.698844 | lrm: 0.99 | dt: 643.31ms | tok/sec: 814,982 | mfu: 50.94 | epoch: 2 | total time: 90.31m | eta: 89.3m +step 08406/16704 (50.32%) | loss: 2.694529 | lrm: 0.99 | dt: 642.72ms | tok/sec: 815,727 | mfu: 50.98 | epoch: 2 | total time: 90.32m | eta: 89.3m +step 08407/16704 (50.33%) | loss: 2.693943 | lrm: 0.99 | dt: 645.52ms | tok/sec: 812,194 | mfu: 50.76 | epoch: 2 | total time: 90.33m | eta: 89.3m +step 08408/16704 (50.34%) | loss: 2.686645 | lrm: 0.99 | dt: 645.79ms | tok/sec: 811,854 | mfu: 50.74 | epoch: 2 | total time: 90.34m | eta: 89.2m +step 08409/16704 (50.34%) | loss: 2.690124 | lrm: 0.99 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 2 | total time: 90.36m | eta: 89.2m +step 08410/16704 (50.35%) | loss: 2.695594 | lrm: 0.99 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 2 | total time: 90.37m | eta: 89.2m +step 08411/16704 (50.35%) | loss: 2.702447 | lrm: 0.99 | dt: 643.71ms | tok/sec: 814,476 | mfu: 50.91 | epoch: 2 | total time: 90.38m | eta: 89.2m +step 08412/16704 (50.36%) | loss: 2.718138 | lrm: 0.99 | dt: 643.72ms | tok/sec: 814,468 | mfu: 50.91 | epoch: 2 | total time: 90.39m | eta: 89.2m +step 08413/16704 (50.37%) | loss: 2.715260 | lrm: 0.99 | dt: 644.19ms | tok/sec: 813,875 | mfu: 50.87 | epoch: 2 | total time: 90.40m | eta: 89.2m +step 
08414/16704 (50.37%) | loss: 2.715699 | lrm: 0.99 | dt: 643.61ms | tok/sec: 814,607 | mfu: 50.91 | epoch: 2 | total time: 90.41m | eta: 89.2m +step 08415/16704 (50.38%) | loss: 2.692532 | lrm: 0.99 | dt: 643.94ms | tok/sec: 814,190 | mfu: 50.89 | epoch: 2 | total time: 90.42m | eta: 89.2m +step 08416/16704 (50.38%) | loss: 2.689103 | lrm: 0.99 | dt: 644.17ms | tok/sec: 813,894 | mfu: 50.87 | epoch: 2 | total time: 90.43m | eta: 89.2m +step 08417/16704 (50.39%) | loss: 2.681571 | lrm: 0.99 | dt: 646.47ms | tok/sec: 810,996 | mfu: 50.69 | epoch: 2 | total time: 90.44m | eta: 89.2m +step 08418/16704 (50.40%) | loss: 2.684443 | lrm: 0.99 | dt: 643.25ms | tok/sec: 815,061 | mfu: 50.94 | epoch: 2 | total time: 90.45m | eta: 89.1m +step 08419/16704 (50.40%) | loss: 2.679670 | lrm: 0.99 | dt: 644.88ms | tok/sec: 813,002 | mfu: 50.81 | epoch: 2 | total time: 90.46m | eta: 89.1m +step 08420/16704 (50.41%) | loss: 2.675802 | lrm: 0.99 | dt: 645.41ms | tok/sec: 812,331 | mfu: 50.77 | epoch: 2 | total time: 90.47m | eta: 89.1m +step 08421/16704 (50.41%) | loss: 2.682518 | lrm: 0.99 | dt: 644.32ms | tok/sec: 813,702 | mfu: 50.86 | epoch: 2 | total time: 90.48m | eta: 89.1m +step 08422/16704 (50.42%) | loss: 2.681490 | lrm: 0.99 | dt: 643.26ms | tok/sec: 815,054 | mfu: 50.94 | epoch: 2 | total time: 90.49m | eta: 89.1m +step 08423/16704 (50.43%) | loss: 2.681415 | lrm: 0.99 | dt: 644.82ms | tok/sec: 813,081 | mfu: 50.82 | epoch: 2 | total time: 90.51m | eta: 89.1m +step 08424/16704 (50.43%) | loss: 2.666885 | lrm: 0.99 | dt: 642.14ms | tok/sec: 816,473 | mfu: 51.03 | epoch: 2 | total time: 90.52m | eta: 89.1m +step 08425/16704 (50.44%) | loss: 2.659658 | lrm: 0.99 | dt: 645.89ms | tok/sec: 811,728 | mfu: 50.73 | epoch: 2 | total time: 90.53m | eta: 89.1m +step 08426/16704 (50.44%) | loss: 2.669872 | lrm: 0.99 | dt: 644.51ms | tok/sec: 813,465 | mfu: 50.84 | epoch: 2 | total time: 90.54m | eta: 89.1m +step 08427/16704 (50.45%) | loss: 2.680679 | lrm: 0.99 | dt: 642.35ms | tok/sec: 
816,197 | mfu: 51.01 | epoch: 2 | total time: 90.55m | eta: 89.0m +step 08428/16704 (50.45%) | loss: 2.690010 | lrm: 0.99 | dt: 646.46ms | tok/sec: 811,011 | mfu: 50.69 | epoch: 2 | total time: 90.56m | eta: 89.0m +step 08429/16704 (50.46%) | loss: 2.693313 | lrm: 0.99 | dt: 644.17ms | tok/sec: 813,902 | mfu: 50.87 | epoch: 2 | total time: 90.57m | eta: 89.0m +step 08430/16704 (50.47%) | loss: 2.689445 | lrm: 0.99 | dt: 643.78ms | tok/sec: 814,391 | mfu: 50.90 | epoch: 2 | total time: 90.58m | eta: 89.0m +step 08431/16704 (50.47%) | loss: 2.690754 | lrm: 0.99 | dt: 645.48ms | tok/sec: 812,243 | mfu: 50.77 | epoch: 2 | total time: 90.59m | eta: 89.0m +step 08432/16704 (50.48%) | loss: 2.698291 | lrm: 0.99 | dt: 641.57ms | tok/sec: 817,189 | mfu: 51.08 | epoch: 2 | total time: 90.60m | eta: 89.0m +step 08433/16704 (50.48%) | loss: 2.690868 | lrm: 0.99 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 2 | total time: 90.61m | eta: 89.0m +step 08434/16704 (50.49%) | loss: 2.673448 | lrm: 0.99 | dt: 646.16ms | tok/sec: 811,389 | mfu: 50.71 | epoch: 2 | total time: 90.62m | eta: 89.0m +step 08435/16704 (50.50%) | loss: 2.687396 | lrm: 0.99 | dt: 643.39ms | tok/sec: 814,877 | mfu: 50.93 | epoch: 2 | total time: 90.63m | eta: 89.0m +step 08436/16704 (50.50%) | loss: 2.675992 | lrm: 0.99 | dt: 645.14ms | tok/sec: 812,668 | mfu: 50.79 | epoch: 2 | total time: 90.65m | eta: 88.9m +step 08437/16704 (50.51%) | loss: 2.688393 | lrm: 0.99 | dt: 644.43ms | tok/sec: 813,565 | mfu: 50.85 | epoch: 2 | total time: 90.66m | eta: 88.9m +step 08438/16704 (50.51%) | loss: 2.695940 | lrm: 0.99 | dt: 643.78ms | tok/sec: 814,387 | mfu: 50.90 | epoch: 2 | total time: 90.67m | eta: 88.9m +step 08439/16704 (50.52%) | loss: 2.697255 | lrm: 0.99 | dt: 645.19ms | tok/sec: 812,609 | mfu: 50.79 | epoch: 2 | total time: 90.68m | eta: 88.9m +step 08440/16704 (50.53%) | loss: 2.702022 | lrm: 0.99 | dt: 646.59ms | tok/sec: 810,846 | mfu: 50.68 | epoch: 2 | total time: 90.69m | eta: 88.9m +step 
08441/16704 (50.53%) | loss: 2.710594 | lrm: 0.99 | dt: 642.95ms | tok/sec: 815,438 | mfu: 50.97 | epoch: 2 | total time: 90.70m | eta: 88.9m +step 08442/16704 (50.54%) | loss: 2.713021 | lrm: 0.99 | dt: 643.28ms | tok/sec: 815,020 | mfu: 50.94 | epoch: 2 | total time: 90.71m | eta: 88.9m +step 08443/16704 (50.54%) | loss: 2.710957 | lrm: 0.99 | dt: 645.23ms | tok/sec: 812,557 | mfu: 50.79 | epoch: 2 | total time: 90.72m | eta: 88.9m +step 08444/16704 (50.55%) | loss: 2.711380 | lrm: 0.99 | dt: 641.73ms | tok/sec: 816,988 | mfu: 51.06 | epoch: 2 | total time: 90.73m | eta: 88.9m +step 08445/16704 (50.56%) | loss: 2.713808 | lrm: 0.99 | dt: 645.94ms | tok/sec: 811,662 | mfu: 50.73 | epoch: 2 | total time: 90.74m | eta: 88.8m +step 08446/16704 (50.56%) | loss: 2.718210 | lrm: 0.99 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 2 | total time: 90.75m | eta: 88.8m +step 08447/16704 (50.57%) | loss: 2.725376 | lrm: 0.99 | dt: 643.03ms | tok/sec: 815,340 | mfu: 50.96 | epoch: 2 | total time: 90.76m | eta: 88.8m +step 08448/16704 (50.57%) | loss: 2.716650 | lrm: 0.99 | dt: 644.38ms | tok/sec: 813,630 | mfu: 50.85 | epoch: 2 | total time: 90.77m | eta: 88.8m +step 08449/16704 (50.58%) | loss: 2.714803 | lrm: 0.99 | dt: 642.95ms | tok/sec: 815,439 | mfu: 50.97 | epoch: 2 | total time: 90.78m | eta: 88.8m +step 08450/16704 (50.59%) | loss: 2.707132 | lrm: 0.99 | dt: 645.86ms | tok/sec: 811,770 | mfu: 50.74 | epoch: 2 | total time: 90.80m | eta: 88.8m +step 08451/16704 (50.59%) | loss: 2.705541 | lrm: 0.99 | dt: 643.31ms | tok/sec: 814,979 | mfu: 50.94 | epoch: 2 | total time: 90.81m | eta: 88.8m +step 08452/16704 (50.60%) | loss: 2.720864 | lrm: 0.99 | dt: 642.67ms | tok/sec: 815,795 | mfu: 50.99 | epoch: 2 | total time: 90.82m | eta: 88.8m +step 08453/16704 (50.60%) | loss: 2.703400 | lrm: 0.99 | dt: 645.38ms | tok/sec: 812,367 | mfu: 50.77 | epoch: 2 | total time: 90.83m | eta: 88.8m +step 08454/16704 (50.61%) | loss: 2.695209 | lrm: 0.99 | dt: 643.82ms | tok/sec: 
814,341 | mfu: 50.90 | epoch: 2 | total time: 90.84m | eta: 88.8m +step 08455/16704 (50.62%) | loss: 2.692593 | lrm: 0.99 | dt: 642.71ms | tok/sec: 815,751 | mfu: 50.99 | epoch: 2 | total time: 90.85m | eta: 88.7m +step 08456/16704 (50.62%) | loss: 2.688050 | lrm: 0.99 | dt: 641.31ms | tok/sec: 817,528 | mfu: 51.10 | epoch: 2 | total time: 90.86m | eta: 88.7m +step 08457/16704 (50.63%) | loss: 2.696673 | lrm: 0.99 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 2 | total time: 90.87m | eta: 88.7m +step 08458/16704 (50.63%) | loss: 2.694049 | lrm: 0.99 | dt: 642.66ms | tok/sec: 815,809 | mfu: 50.99 | epoch: 2 | total time: 90.88m | eta: 88.7m +step 08459/16704 (50.64%) | loss: 2.705539 | lrm: 0.99 | dt: 645.04ms | tok/sec: 812,796 | mfu: 50.80 | epoch: 2 | total time: 90.89m | eta: 88.7m +step 08460/16704 (50.65%) | loss: 2.699809 | lrm: 0.99 | dt: 643.66ms | tok/sec: 814,541 | mfu: 50.91 | epoch: 2 | total time: 90.90m | eta: 88.7m +step 08461/16704 (50.65%) | loss: 2.704171 | lrm: 0.99 | dt: 643.32ms | tok/sec: 814,970 | mfu: 50.94 | epoch: 2 | total time: 90.91m | eta: 88.7m +step 08462/16704 (50.66%) | loss: 2.701419 | lrm: 0.99 | dt: 642.08ms | tok/sec: 816,549 | mfu: 51.04 | epoch: 2 | total time: 90.92m | eta: 88.7m +step 08463/16704 (50.66%) | loss: 2.704454 | lrm: 0.99 | dt: 644.86ms | tok/sec: 813,024 | mfu: 50.82 | epoch: 2 | total time: 90.94m | eta: 88.7m +step 08464/16704 (50.67%) | loss: 2.696114 | lrm: 0.99 | dt: 643.15ms | tok/sec: 815,190 | mfu: 50.95 | epoch: 2 | total time: 90.95m | eta: 88.6m +step 08465/16704 (50.68%) | loss: 2.698067 | lrm: 0.99 | dt: 643.24ms | tok/sec: 815,077 | mfu: 50.94 | epoch: 2 | total time: 90.96m | eta: 88.6m +step 08466/16704 (50.68%) | loss: 2.715148 | lrm: 0.99 | dt: 644.60ms | tok/sec: 813,354 | mfu: 50.84 | epoch: 2 | total time: 90.97m | eta: 88.6m +step 08467/16704 (50.69%) | loss: 2.701909 | lrm: 0.99 | dt: 641.65ms | tok/sec: 817,091 | mfu: 51.07 | epoch: 2 | total time: 90.98m | eta: 88.6m +step 
08468/16704 (50.69%) | loss: 2.711876 | lrm: 0.99 | dt: 643.28ms | tok/sec: 815,027 | mfu: 50.94 | epoch: 2 | total time: 90.99m | eta: 88.6m +step 08469/16704 (50.70%) | loss: 2.724436 | lrm: 0.99 | dt: 644.83ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 2 | total time: 91.00m | eta: 88.6m +step 08470/16704 (50.71%) | loss: 2.732241 | lrm: 0.99 | dt: 643.13ms | tok/sec: 815,214 | mfu: 50.95 | epoch: 2 | total time: 91.01m | eta: 88.6m +step 08471/16704 (50.71%) | loss: 2.715689 | lrm: 0.99 | dt: 644.06ms | tok/sec: 814,032 | mfu: 50.88 | epoch: 2 | total time: 91.02m | eta: 88.6m +step 08472/16704 (50.72%) | loss: 2.707229 | lrm: 0.99 | dt: 643.85ms | tok/sec: 814,305 | mfu: 50.90 | epoch: 2 | total time: 91.03m | eta: 88.6m +step 08473/16704 (50.72%) | loss: 2.701324 | lrm: 0.99 | dt: 642.92ms | tok/sec: 815,477 | mfu: 50.97 | epoch: 2 | total time: 91.04m | eta: 88.5m +step 08474/16704 (50.73%) | loss: 2.697920 | lrm: 0.99 | dt: 643.37ms | tok/sec: 814,907 | mfu: 50.93 | epoch: 2 | total time: 91.05m | eta: 88.5m +step 08475/16704 (50.74%) | loss: 2.695603 | lrm: 0.99 | dt: 642.42ms | tok/sec: 816,116 | mfu: 51.01 | epoch: 2 | total time: 91.06m | eta: 88.5m +step 08476/16704 (50.74%) | loss: 2.701604 | lrm: 0.99 | dt: 644.30ms | tok/sec: 813,732 | mfu: 50.86 | epoch: 2 | total time: 91.07m | eta: 88.5m +step 08477/16704 (50.75%) | loss: 2.697350 | lrm: 0.99 | dt: 642.60ms | tok/sec: 815,879 | mfu: 50.99 | epoch: 2 | total time: 91.09m | eta: 88.5m +step 08478/16704 (50.75%) | loss: 2.703248 | lrm: 0.98 | dt: 643.33ms | tok/sec: 814,957 | mfu: 50.94 | epoch: 2 | total time: 91.10m | eta: 88.5m +step 08479/16704 (50.76%) | loss: 2.698837 | lrm: 0.98 | dt: 643.91ms | tok/sec: 814,227 | mfu: 50.89 | epoch: 2 | total time: 91.11m | eta: 88.5m +step 08480/16704 (50.77%) | loss: 2.693999 | lrm: 0.98 | dt: 645.64ms | tok/sec: 812,040 | mfu: 50.75 | epoch: 2 | total time: 91.12m | eta: 88.5m +step 08481/16704 (50.77%) | loss: 2.685627 | lrm: 0.98 | dt: 642.83ms | tok/sec: 
815,597 | mfu: 50.98 | epoch: 2 | total time: 91.13m | eta: 88.5m +step 08482/16704 (50.78%) | loss: 2.690777 | lrm: 0.98 | dt: 645.61ms | tok/sec: 812,079 | mfu: 50.76 | epoch: 2 | total time: 91.14m | eta: 88.4m +step 08483/16704 (50.78%) | loss: 2.699739 | lrm: 0.98 | dt: 642.20ms | tok/sec: 816,394 | mfu: 51.03 | epoch: 2 | total time: 91.15m | eta: 88.4m +step 08484/16704 (50.79%) | loss: 2.697812 | lrm: 0.98 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 2 | total time: 91.16m | eta: 88.4m +step 08485/16704 (50.80%) | loss: 2.691934 | lrm: 0.98 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 2 | total time: 91.17m | eta: 88.4m +step 08486/16704 (50.80%) | loss: 2.704304 | lrm: 0.98 | dt: 643.98ms | tok/sec: 814,132 | mfu: 50.88 | epoch: 2 | total time: 91.18m | eta: 88.4m +step 08487/16704 (50.81%) | loss: 2.707536 | lrm: 0.98 | dt: 644.75ms | tok/sec: 813,166 | mfu: 50.82 | epoch: 2 | total time: 91.19m | eta: 88.4m +step 08488/16704 (50.81%) | loss: 2.696138 | lrm: 0.98 | dt: 642.60ms | tok/sec: 815,891 | mfu: 50.99 | epoch: 2 | total time: 91.20m | eta: 88.4m +step 08489/16704 (50.82%) | loss: 2.684563 | lrm: 0.98 | dt: 642.71ms | tok/sec: 815,743 | mfu: 50.99 | epoch: 2 | total time: 91.21m | eta: 88.4m +step 08490/16704 (50.83%) | loss: 2.688034 | lrm: 0.98 | dt: 644.14ms | tok/sec: 813,932 | mfu: 50.87 | epoch: 2 | total time: 91.22m | eta: 88.4m +step 08491/16704 (50.83%) | loss: 2.680026 | lrm: 0.98 | dt: 643.35ms | tok/sec: 814,932 | mfu: 50.93 | epoch: 2 | total time: 91.24m | eta: 88.4m +step 08492/16704 (50.84%) | loss: 2.683232 | lrm: 0.98 | dt: 643.77ms | tok/sec: 814,407 | mfu: 50.90 | epoch: 2 | total time: 91.25m | eta: 88.3m +step 08493/16704 (50.84%) | loss: 2.700042 | lrm: 0.98 | dt: 644.53ms | tok/sec: 813,448 | mfu: 50.84 | epoch: 2 | total time: 91.26m | eta: 88.3m +step 08494/16704 (50.85%) | loss: 2.706708 | lrm: 0.98 | dt: 642.87ms | tok/sec: 815,540 | mfu: 50.97 | epoch: 2 | total time: 91.27m | eta: 88.3m +step 
08495/16704 (50.86%) | loss: 2.696870 | lrm: 0.98 | dt: 644.85ms | tok/sec: 813,033 | mfu: 50.82 | epoch: 2 | total time: 91.28m | eta: 88.3m +step 08496/16704 (50.86%) | loss: 2.704395 | lrm: 0.98 | dt: 643.91ms | tok/sec: 814,221 | mfu: 50.89 | epoch: 2 | total time: 91.29m | eta: 88.3m +step 08497/16704 (50.87%) | loss: 2.687792 | lrm: 0.98 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 2 | total time: 91.30m | eta: 88.3m +step 08498/16704 (50.87%) | loss: 2.685531 | lrm: 0.98 | dt: 644.59ms | tok/sec: 813,362 | mfu: 50.84 | epoch: 2 | total time: 91.31m | eta: 88.3m +step 08499/16704 (50.88%) | loss: 2.679456 | lrm: 0.98 | dt: 642.59ms | tok/sec: 815,896 | mfu: 50.99 | epoch: 2 | total time: 91.32m | eta: 88.3m +Step 08500 | Validation bpb: 0.819368 +step 08500/16704 (50.89%) | loss: 2.664082 | lrm: 0.98 | dt: 648.46ms | tok/sec: 808,507 | mfu: 50.53 | epoch: 2 | total time: 91.33m | eta: 88.3m +step 08501/16704 (50.89%) | loss: 2.678926 | lrm: 0.98 | dt: 645.85ms | tok/sec: 811,774 | mfu: 50.74 | epoch: 2 | total time: 91.34m | eta: 88.2m +step 08502/16704 (50.90%) | loss: 2.691730 | lrm: 0.98 | dt: 642.74ms | tok/sec: 815,712 | mfu: 50.98 | epoch: 2 | total time: 91.35m | eta: 88.2m +step 08503/16704 (50.90%) | loss: 2.693252 | lrm: 0.98 | dt: 644.92ms | tok/sec: 812,955 | mfu: 50.81 | epoch: 2 | total time: 91.36m | eta: 88.2m +step 08504/16704 (50.91%) | loss: 2.699394 | lrm: 0.98 | dt: 648.33ms | tok/sec: 808,669 | mfu: 50.54 | epoch: 2 | total time: 91.38m | eta: 88.2m +step 08505/16704 (50.92%) | loss: 2.695130 | lrm: 0.98 | dt: 642.71ms | tok/sec: 815,750 | mfu: 50.99 | epoch: 2 | total time: 91.39m | eta: 88.2m +step 08506/16704 (50.92%) | loss: 2.694128 | lrm: 0.98 | dt: 645.41ms | tok/sec: 812,332 | mfu: 50.77 | epoch: 2 | total time: 91.40m | eta: 88.2m +step 08507/16704 (50.93%) | loss: 2.701289 | lrm: 0.98 | dt: 647.15ms | tok/sec: 810,146 | mfu: 50.64 | epoch: 2 | total time: 91.41m | eta: 88.2m +step 08508/16704 (50.93%) | loss: 
2.706921 | lrm: 0.98 | dt: 643.18ms | tok/sec: 815,144 | mfu: 50.95 | epoch: 2 | total time: 91.42m | eta: 88.2m +step 08509/16704 (50.94%) | loss: 2.708485 | lrm: 0.98 | dt: 644.42ms | tok/sec: 813,578 | mfu: 50.85 | epoch: 2 | total time: 91.43m | eta: 88.2m +step 08510/16704 (50.95%) | loss: 2.704089 | lrm: 0.98 | dt: 644.08ms | tok/sec: 814,006 | mfu: 50.88 | epoch: 2 | total time: 91.44m | eta: 88.1m +step 08511/16704 (50.95%) | loss: 2.709928 | lrm: 0.98 | dt: 663.84ms | tok/sec: 789,780 | mfu: 49.36 | epoch: 2 | total time: 91.45m | eta: 88.1m +step 08512/16704 (50.96%) | loss: 2.709437 | lrm: 0.98 | dt: 638.57ms | tok/sec: 821,031 | mfu: 51.32 | epoch: 2 | total time: 91.46m | eta: 88.1m +step 08513/16704 (50.96%) | loss: 2.700645 | lrm: 0.98 | dt: 646.88ms | tok/sec: 810,488 | mfu: 50.66 | epoch: 2 | total time: 91.47m | eta: 88.1m +step 08514/16704 (50.97%) | loss: 2.712047 | lrm: 0.98 | dt: 642.53ms | tok/sec: 815,968 | mfu: 51.00 | epoch: 2 | total time: 91.48m | eta: 88.1m +step 08515/16704 (50.98%) | loss: 2.718416 | lrm: 0.98 | dt: 645.51ms | tok/sec: 812,211 | mfu: 50.76 | epoch: 2 | total time: 91.49m | eta: 88.1m +step 08516/16704 (50.98%) | loss: 2.708584 | lrm: 0.98 | dt: 644.56ms | tok/sec: 813,410 | mfu: 50.84 | epoch: 2 | total time: 91.50m | eta: 88.1m +step 08517/16704 (50.99%) | loss: 2.711462 | lrm: 0.98 | dt: 643.50ms | tok/sec: 814,741 | mfu: 50.92 | epoch: 2 | total time: 91.51m | eta: 88.1m +step 08518/16704 (50.99%) | loss: 2.705959 | lrm: 0.98 | dt: 642.05ms | tok/sec: 816,580 | mfu: 51.04 | epoch: 2 | total time: 91.53m | eta: 88.1m +step 08519/16704 (51.00%) | loss: 2.708059 | lrm: 0.98 | dt: 645.05ms | tok/sec: 812,789 | mfu: 50.80 | epoch: 2 | total time: 91.54m | eta: 88.1m +step 08520/16704 (51.01%) | loss: 2.703120 | lrm: 0.98 | dt: 642.12ms | tok/sec: 816,500 | mfu: 51.03 | epoch: 2 | total time: 91.55m | eta: 88.0m +step 08521/16704 (51.01%) | loss: 2.699120 | lrm: 0.98 | dt: 644.30ms | tok/sec: 813,738 | mfu: 50.86 | 
epoch: 2 | total time: 91.56m | eta: 88.0m +step 08522/16704 (51.02%) | loss: 2.686117 | lrm: 0.98 | dt: 642.71ms | tok/sec: 815,742 | mfu: 50.99 | epoch: 2 | total time: 91.57m | eta: 88.0m +step 08523/16704 (51.02%) | loss: 2.687279 | lrm: 0.98 | dt: 643.87ms | tok/sec: 814,270 | mfu: 50.89 | epoch: 2 | total time: 91.58m | eta: 88.0m +step 08524/16704 (51.03%) | loss: 2.684354 | lrm: 0.98 | dt: 645.56ms | tok/sec: 812,141 | mfu: 50.76 | epoch: 2 | total time: 91.59m | eta: 88.0m +step 08525/16704 (51.04%) | loss: 2.677137 | lrm: 0.98 | dt: 643.24ms | tok/sec: 815,075 | mfu: 50.94 | epoch: 2 | total time: 91.60m | eta: 88.0m +step 08526/16704 (51.04%) | loss: 2.666689 | lrm: 0.98 | dt: 645.05ms | tok/sec: 812,786 | mfu: 50.80 | epoch: 2 | total time: 91.61m | eta: 88.0m +step 08527/16704 (51.05%) | loss: 2.661939 | lrm: 0.98 | dt: 643.84ms | tok/sec: 814,318 | mfu: 50.90 | epoch: 2 | total time: 91.62m | eta: 88.0m +step 08528/16704 (51.05%) | loss: 2.661583 | lrm: 0.98 | dt: 648.18ms | tok/sec: 808,866 | mfu: 50.56 | epoch: 2 | total time: 91.63m | eta: 88.0m +step 08529/16704 (51.06%) | loss: 2.666342 | lrm: 0.98 | dt: 643.02ms | tok/sec: 815,349 | mfu: 50.96 | epoch: 2 | total time: 91.64m | eta: 87.9m +step 08530/16704 (51.07%) | loss: 2.653354 | lrm: 0.98 | dt: 646.65ms | tok/sec: 810,775 | mfu: 50.67 | epoch: 2 | total time: 91.65m | eta: 87.9m +step 08531/16704 (51.07%) | loss: 2.651221 | lrm: 0.98 | dt: 643.20ms | tok/sec: 815,127 | mfu: 50.95 | epoch: 2 | total time: 91.67m | eta: 87.9m +step 08532/16704 (51.08%) | loss: 2.653627 | lrm: 0.98 | dt: 642.17ms | tok/sec: 816,430 | mfu: 51.03 | epoch: 2 | total time: 91.68m | eta: 87.9m +step 08533/16704 (51.08%) | loss: 2.665498 | lrm: 0.98 | dt: 643.40ms | tok/sec: 814,865 | mfu: 50.93 | epoch: 2 | total time: 91.69m | eta: 87.9m +step 08534/16704 (51.09%) | loss: 2.665650 | lrm: 0.98 | dt: 645.38ms | tok/sec: 812,370 | mfu: 50.77 | epoch: 2 | total time: 91.70m | eta: 87.9m +step 08535/16704 (51.10%) | 
loss: 2.660385 | lrm: 0.98 | dt: 642.74ms | tok/sec: 815,707 | mfu: 50.98 | epoch: 2 | total time: 91.71m | eta: 87.9m +step 08536/16704 (51.10%) | loss: 2.658345 | lrm: 0.98 | dt: 644.24ms | tok/sec: 813,808 | mfu: 50.86 | epoch: 2 | total time: 91.72m | eta: 87.9m +step 08537/16704 (51.11%) | loss: 2.667331 | lrm: 0.98 | dt: 643.03ms | tok/sec: 815,339 | mfu: 50.96 | epoch: 2 | total time: 91.73m | eta: 87.9m +step 08538/16704 (51.11%) | loss: 2.670529 | lrm: 0.98 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 2 | total time: 91.74m | eta: 87.8m +step 08539/16704 (51.12%) | loss: 2.690504 | lrm: 0.98 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 2 | total time: 91.75m | eta: 87.8m +step 08540/16704 (51.13%) | loss: 2.694917 | lrm: 0.98 | dt: 644.63ms | tok/sec: 813,311 | mfu: 50.83 | epoch: 2 | total time: 91.76m | eta: 87.8m +step 08541/16704 (51.13%) | loss: 2.685774 | lrm: 0.98 | dt: 643.40ms | tok/sec: 814,865 | mfu: 50.93 | epoch: 2 | total time: 91.77m | eta: 87.8m +step 08542/16704 (51.14%) | loss: 2.688568 | lrm: 0.98 | dt: 645.07ms | tok/sec: 812,766 | mfu: 50.80 | epoch: 2 | total time: 91.78m | eta: 87.8m +step 08543/16704 (51.14%) | loss: 2.690791 | lrm: 0.98 | dt: 641.18ms | tok/sec: 817,697 | mfu: 51.11 | epoch: 2 | total time: 91.79m | eta: 87.8m +step 08544/16704 (51.15%) | loss: 2.683596 | lrm: 0.98 | dt: 645.21ms | tok/sec: 812,585 | mfu: 50.79 | epoch: 2 | total time: 91.80m | eta: 87.8m +step 08545/16704 (51.16%) | loss: 2.685317 | lrm: 0.98 | dt: 644.24ms | tok/sec: 813,814 | mfu: 50.86 | epoch: 2 | total time: 91.82m | eta: 87.8m +step 08546/16704 (51.16%) | loss: 2.683911 | lrm: 0.98 | dt: 643.37ms | tok/sec: 814,906 | mfu: 50.93 | epoch: 2 | total time: 91.83m | eta: 87.8m +step 08547/16704 (51.17%) | loss: 2.682779 | lrm: 0.98 | dt: 646.57ms | tok/sec: 810,878 | mfu: 50.68 | epoch: 2 | total time: 91.84m | eta: 87.7m +step 08548/16704 (51.17%) | loss: 2.677324 | lrm: 0.98 | dt: 642.00ms | tok/sec: 816,641 | mfu: 51.04 | 
epoch: 2 | total time: 91.85m | eta: 87.7m +step 08549/16704 (51.18%) | loss: 2.679935 | lrm: 0.98 | dt: 643.26ms | tok/sec: 815,043 | mfu: 50.94 | epoch: 2 | total time: 91.86m | eta: 87.7m +step 08550/16704 (51.19%) | loss: 2.661443 | lrm: 0.98 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 2 | total time: 91.87m | eta: 87.7m +step 08551/16704 (51.19%) | loss: 2.663173 | lrm: 0.98 | dt: 644.19ms | tok/sec: 813,875 | mfu: 50.87 | epoch: 2 | total time: 91.88m | eta: 87.7m +step 08552/16704 (51.20%) | loss: 2.653025 | lrm: 0.98 | dt: 643.61ms | tok/sec: 814,609 | mfu: 50.91 | epoch: 2 | total time: 91.89m | eta: 87.7m +step 08553/16704 (51.20%) | loss: 2.652498 | lrm: 0.98 | dt: 644.82ms | tok/sec: 813,078 | mfu: 50.82 | epoch: 2 | total time: 91.90m | eta: 87.7m +step 08554/16704 (51.21%) | loss: 2.654621 | lrm: 0.98 | dt: 643.27ms | tok/sec: 815,033 | mfu: 50.94 | epoch: 2 | total time: 91.91m | eta: 87.7m +step 08555/16704 (51.22%) | loss: 2.644621 | lrm: 0.98 | dt: 644.39ms | tok/sec: 813,623 | mfu: 50.85 | epoch: 2 | total time: 91.92m | eta: 87.7m +step 08556/16704 (51.22%) | loss: 2.646379 | lrm: 0.98 | dt: 645.35ms | tok/sec: 812,408 | mfu: 50.78 | epoch: 2 | total time: 91.93m | eta: 87.7m +step 08557/16704 (51.23%) | loss: 2.651208 | lrm: 0.98 | dt: 644.12ms | tok/sec: 813,966 | mfu: 50.87 | epoch: 2 | total time: 91.94m | eta: 87.6m +step 08558/16704 (51.23%) | loss: 2.659949 | lrm: 0.98 | dt: 645.18ms | tok/sec: 812,627 | mfu: 50.79 | epoch: 2 | total time: 91.96m | eta: 87.6m +step 08559/16704 (51.24%) | loss: 2.658767 | lrm: 0.98 | dt: 644.80ms | tok/sec: 813,099 | mfu: 50.82 | epoch: 2 | total time: 91.97m | eta: 87.6m +step 08560/16704 (51.25%) | loss: 2.648555 | lrm: 0.98 | dt: 647.40ms | tok/sec: 809,842 | mfu: 50.62 | epoch: 2 | total time: 91.98m | eta: 87.6m +step 08561/16704 (51.25%) | loss: 2.639027 | lrm: 0.97 | dt: 642.17ms | tok/sec: 816,427 | mfu: 51.03 | epoch: 2 | total time: 91.99m | eta: 87.6m +step 08562/16704 (51.26%) | 
loss: 2.643696 | lrm: 0.97 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 2 | total time: 92.00m | eta: 87.6m +step 08563/16704 (51.26%) | loss: 2.646022 | lrm: 0.97 | dt: 645.07ms | tok/sec: 812,763 | mfu: 50.80 | epoch: 2 | total time: 92.01m | eta: 87.6m +step 08564/16704 (51.27%) | loss: 2.642875 | lrm: 0.97 | dt: 645.58ms | tok/sec: 812,116 | mfu: 50.76 | epoch: 2 | total time: 92.02m | eta: 87.6m +step 08565/16704 (51.28%) | loss: 2.641029 | lrm: 0.97 | dt: 645.12ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 2 | total time: 92.03m | eta: 87.6m +step 08566/16704 (51.28%) | loss: 2.634908 | lrm: 0.97 | dt: 644.25ms | tok/sec: 813,797 | mfu: 50.86 | epoch: 2 | total time: 92.04m | eta: 87.5m +step 08567/16704 (51.29%) | loss: 2.634964 | lrm: 0.97 | dt: 644.42ms | tok/sec: 813,580 | mfu: 50.85 | epoch: 2 | total time: 92.05m | eta: 87.5m +step 08568/16704 (51.29%) | loss: 2.638047 | lrm: 0.97 | dt: 645.31ms | tok/sec: 812,462 | mfu: 50.78 | epoch: 2 | total time: 92.06m | eta: 87.5m +step 08569/16704 (51.30%) | loss: 2.643293 | lrm: 0.97 | dt: 645.05ms | tok/sec: 812,781 | mfu: 50.80 | epoch: 2 | total time: 92.07m | eta: 87.5m +step 08570/16704 (51.31%) | loss: 2.652695 | lrm: 0.97 | dt: 643.55ms | tok/sec: 814,684 | mfu: 50.92 | epoch: 2 | total time: 92.08m | eta: 87.5m +step 08571/16704 (51.31%) | loss: 2.655029 | lrm: 0.97 | dt: 646.62ms | tok/sec: 810,818 | mfu: 50.68 | epoch: 2 | total time: 92.09m | eta: 87.5m +step 08572/16704 (51.32%) | loss: 2.661210 | lrm: 0.97 | dt: 646.38ms | tok/sec: 811,110 | mfu: 50.70 | epoch: 2 | total time: 92.11m | eta: 87.5m +step 08573/16704 (51.32%) | loss: 2.651535 | lrm: 0.97 | dt: 644.67ms | tok/sec: 813,264 | mfu: 50.83 | epoch: 2 | total time: 92.12m | eta: 87.5m +step 08574/16704 (51.33%) | loss: 2.660378 | lrm: 0.97 | dt: 645.36ms | tok/sec: 812,398 | mfu: 50.78 | epoch: 2 | total time: 92.13m | eta: 87.5m +step 08575/16704 (51.34%) | loss: 2.652146 | lrm: 0.97 | dt: 645.52ms | tok/sec: 812,199 | mfu: 50.76 | 
epoch: 2 | total time: 92.14m | eta: 87.4m +step 08576/16704 (51.34%) | loss: 2.657308 | lrm: 0.97 | dt: 644.15ms | tok/sec: 813,917 | mfu: 50.87 | epoch: 2 | total time: 92.15m | eta: 87.4m +step 08577/16704 (51.35%) | loss: 2.657593 | lrm: 0.97 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 2 | total time: 92.16m | eta: 87.4m +step 08578/16704 (51.35%) | loss: 2.654334 | lrm: 0.97 | dt: 643.80ms | tok/sec: 814,367 | mfu: 50.90 | epoch: 2 | total time: 92.17m | eta: 87.4m +step 08579/16704 (51.36%) | loss: 2.644812 | lrm: 0.97 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 2 | total time: 92.18m | eta: 87.4m +step 08580/16704 (51.36%) | loss: 2.642078 | lrm: 0.97 | dt: 645.12ms | tok/sec: 812,703 | mfu: 50.80 | epoch: 2 | total time: 92.19m | eta: 87.4m +step 08581/16704 (51.37%) | loss: 2.633693 | lrm: 0.97 | dt: 643.93ms | tok/sec: 814,205 | mfu: 50.89 | epoch: 2 | total time: 92.20m | eta: 87.4m +step 08582/16704 (51.38%) | loss: 2.633155 | lrm: 0.97 | dt: 646.02ms | tok/sec: 811,568 | mfu: 50.72 | epoch: 2 | total time: 92.21m | eta: 87.4m +step 08583/16704 (51.38%) | loss: 2.640842 | lrm: 0.97 | dt: 644.15ms | tok/sec: 813,927 | mfu: 50.87 | epoch: 2 | total time: 92.22m | eta: 87.4m +step 08584/16704 (51.39%) | loss: 2.642452 | lrm: 0.97 | dt: 645.65ms | tok/sec: 812,029 | mfu: 50.75 | epoch: 2 | total time: 92.23m | eta: 87.4m +step 08585/16704 (51.39%) | loss: 2.655658 | lrm: 0.97 | dt: 647.71ms | tok/sec: 809,444 | mfu: 50.59 | epoch: 2 | total time: 92.25m | eta: 87.3m +step 08586/16704 (51.40%) | loss: 2.661231 | lrm: 0.97 | dt: 643.37ms | tok/sec: 814,904 | mfu: 50.93 | epoch: 2 | total time: 92.26m | eta: 87.3m +step 08587/16704 (51.41%) | loss: 2.663365 | lrm: 0.97 | dt: 646.91ms | tok/sec: 810,454 | mfu: 50.65 | epoch: 2 | total time: 92.27m | eta: 87.3m +step 08588/16704 (51.41%) | loss: 2.670506 | lrm: 0.97 | dt: 643.08ms | tok/sec: 815,271 | mfu: 50.96 | epoch: 2 | total time: 92.28m | eta: 87.3m +step 08589/16704 (51.42%) | 
loss: 2.667284 | lrm: 0.97 | dt: 647.27ms | tok/sec: 809,996 | mfu: 50.63 | epoch: 2 | total time: 92.29m | eta: 87.3m +step 08590/16704 (51.42%) | loss: 2.664062 | lrm: 0.97 | dt: 645.22ms | tok/sec: 812,572 | mfu: 50.79 | epoch: 2 | total time: 92.30m | eta: 87.3m +step 08591/16704 (51.43%) | loss: 2.658696 | lrm: 0.97 | dt: 644.99ms | tok/sec: 812,863 | mfu: 50.81 | epoch: 2 | total time: 92.31m | eta: 87.3m +step 08592/16704 (51.44%) | loss: 2.662338 | lrm: 0.97 | dt: 644.63ms | tok/sec: 813,312 | mfu: 50.83 | epoch: 2 | total time: 92.32m | eta: 87.3m +step 08593/16704 (51.44%) | loss: 2.661567 | lrm: 0.97 | dt: 644.23ms | tok/sec: 813,822 | mfu: 50.87 | epoch: 2 | total time: 92.33m | eta: 87.3m +step 08594/16704 (51.45%) | loss: 2.652418 | lrm: 0.97 | dt: 642.81ms | tok/sec: 815,615 | mfu: 50.98 | epoch: 2 | total time: 92.34m | eta: 87.2m +step 08595/16704 (51.45%) | loss: 2.657099 | lrm: 0.97 | dt: 644.68ms | tok/sec: 813,256 | mfu: 50.83 | epoch: 2 | total time: 92.35m | eta: 87.2m +step 08596/16704 (51.46%) | loss: 2.659760 | lrm: 0.97 | dt: 643.37ms | tok/sec: 814,914 | mfu: 50.93 | epoch: 2 | total time: 92.36m | eta: 87.2m +step 08597/16704 (51.47%) | loss: 2.643164 | lrm: 0.97 | dt: 644.22ms | tok/sec: 813,835 | mfu: 50.87 | epoch: 2 | total time: 92.37m | eta: 87.2m +step 08598/16704 (51.47%) | loss: 2.650084 | lrm: 0.97 | dt: 644.43ms | tok/sec: 813,562 | mfu: 50.85 | epoch: 2 | total time: 92.38m | eta: 87.2m +step 08599/16704 (51.48%) | loss: 2.660626 | lrm: 0.97 | dt: 647.75ms | tok/sec: 809,402 | mfu: 50.59 | epoch: 2 | total time: 92.40m | eta: 87.2m +step 08600/16704 (51.48%) | loss: 2.657044 | lrm: 0.97 | dt: 646.17ms | tok/sec: 811,381 | mfu: 50.71 | epoch: 2 | total time: 92.41m | eta: 87.2m +step 08601/16704 (51.49%) | loss: 2.663426 | lrm: 0.97 | dt: 644.57ms | tok/sec: 813,387 | mfu: 50.84 | epoch: 2 | total time: 92.42m | eta: 87.2m +step 08602/16704 (51.50%) | loss: 2.668508 | lrm: 0.97 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | 
epoch: 2 | total time: 92.43m | eta: 87.2m +step 08603/16704 (51.50%) | loss: 2.673690 | lrm: 0.97 | dt: 644.44ms | tok/sec: 813,559 | mfu: 50.85 | epoch: 2 | total time: 92.44m | eta: 87.1m +step 08604/16704 (51.51%) | loss: 2.690487 | lrm: 0.97 | dt: 646.11ms | tok/sec: 811,451 | mfu: 50.72 | epoch: 2 | total time: 92.45m | eta: 87.1m +step 08605/16704 (51.51%) | loss: 2.696398 | lrm: 0.97 | dt: 643.60ms | tok/sec: 814,621 | mfu: 50.92 | epoch: 2 | total time: 92.46m | eta: 87.1m +step 08606/16704 (51.52%) | loss: 2.699342 | lrm: 0.97 | dt: 648.61ms | tok/sec: 808,322 | mfu: 50.52 | epoch: 2 | total time: 92.47m | eta: 87.1m +step 08607/16704 (51.53%) | loss: 2.700413 | lrm: 0.97 | dt: 646.42ms | tok/sec: 811,063 | mfu: 50.69 | epoch: 2 | total time: 92.48m | eta: 87.1m +step 08608/16704 (51.53%) | loss: 2.696818 | lrm: 0.97 | dt: 646.49ms | tok/sec: 810,971 | mfu: 50.69 | epoch: 2 | total time: 92.49m | eta: 87.1m +step 08609/16704 (51.54%) | loss: 2.694070 | lrm: 0.97 | dt: 643.95ms | tok/sec: 814,179 | mfu: 50.89 | epoch: 2 | total time: 92.50m | eta: 87.1m +step 08610/16704 (51.54%) | loss: 2.697179 | lrm: 0.97 | dt: 646.96ms | tok/sec: 810,387 | mfu: 50.65 | epoch: 2 | total time: 92.51m | eta: 87.1m +step 08611/16704 (51.55%) | loss: 2.686852 | lrm: 0.97 | dt: 645.20ms | tok/sec: 812,597 | mfu: 50.79 | epoch: 2 | total time: 92.52m | eta: 87.1m +step 08612/16704 (51.56%) | loss: 2.691325 | lrm: 0.97 | dt: 645.98ms | tok/sec: 811,615 | mfu: 50.73 | epoch: 2 | total time: 92.54m | eta: 87.0m +step 08613/16704 (51.56%) | loss: 2.685923 | lrm: 0.97 | dt: 645.79ms | tok/sec: 811,861 | mfu: 50.74 | epoch: 2 | total time: 92.55m | eta: 87.0m +step 08614/16704 (51.57%) | loss: 2.690819 | lrm: 0.97 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 2 | total time: 92.56m | eta: 87.0m +step 08615/16704 (51.57%) | loss: 2.687313 | lrm: 0.97 | dt: 644.93ms | tok/sec: 812,931 | mfu: 50.81 | epoch: 2 | total time: 92.57m | eta: 87.0m +step 08616/16704 (51.58%) | 
loss: 2.693172 | lrm: 0.97 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 2 | total time: 92.58m | eta: 87.0m +step 08617/16704 (51.59%) | loss: 2.697782 | lrm: 0.97 | dt: 643.84ms | tok/sec: 814,317 | mfu: 50.90 | epoch: 2 | total time: 92.59m | eta: 87.0m +step 08618/16704 (51.59%) | loss: 2.706838 | lrm: 0.97 | dt: 646.05ms | tok/sec: 811,528 | mfu: 50.72 | epoch: 2 | total time: 92.60m | eta: 87.0m +step 08619/16704 (51.60%) | loss: 2.706635 | lrm: 0.97 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 2 | total time: 92.61m | eta: 87.0m +step 08620/16704 (51.60%) | loss: 2.704360 | lrm: 0.97 | dt: 645.94ms | tok/sec: 811,666 | mfu: 50.73 | epoch: 2 | total time: 92.62m | eta: 87.0m +step 08621/16704 (51.61%) | loss: 2.685987 | lrm: 0.97 | dt: 644.15ms | tok/sec: 813,921 | mfu: 50.87 | epoch: 2 | total time: 92.63m | eta: 87.0m +step 08622/16704 (51.62%) | loss: 2.695209 | lrm: 0.97 | dt: 643.98ms | tok/sec: 814,131 | mfu: 50.88 | epoch: 2 | total time: 92.64m | eta: 86.9m +step 08623/16704 (51.62%) | loss: 2.695100 | lrm: 0.97 | dt: 646.94ms | tok/sec: 810,410 | mfu: 50.65 | epoch: 2 | total time: 92.65m | eta: 86.9m +step 08624/16704 (51.63%) | loss: 2.701501 | lrm: 0.97 | dt: 645.25ms | tok/sec: 812,531 | mfu: 50.78 | epoch: 2 | total time: 92.66m | eta: 86.9m +step 08625/16704 (51.63%) | loss: 2.693900 | lrm: 0.97 | dt: 646.51ms | tok/sec: 810,944 | mfu: 50.69 | epoch: 2 | total time: 92.68m | eta: 86.9m +step 08626/16704 (51.64%) | loss: 2.696621 | lrm: 0.97 | dt: 644.50ms | tok/sec: 813,479 | mfu: 50.84 | epoch: 2 | total time: 92.69m | eta: 86.9m +step 08627/16704 (51.65%) | loss: 2.704191 | lrm: 0.97 | dt: 644.99ms | tok/sec: 812,861 | mfu: 50.80 | epoch: 2 | total time: 92.70m | eta: 86.9m +step 08628/16704 (51.65%) | loss: 2.700394 | lrm: 0.97 | dt: 642.99ms | tok/sec: 815,385 | mfu: 50.96 | epoch: 2 | total time: 92.71m | eta: 86.9m +step 08629/16704 (51.66%) | loss: 2.698424 | lrm: 0.97 | dt: 646.79ms | tok/sec: 810,594 | mfu: 50.66 | 
epoch: 2 | total time: 92.72m | eta: 86.9m +step 08630/16704 (51.66%) | loss: 2.697623 | lrm: 0.97 | dt: 643.11ms | tok/sec: 815,243 | mfu: 50.95 | epoch: 2 | total time: 92.73m | eta: 86.9m +step 08631/16704 (51.67%) | loss: 2.702748 | lrm: 0.97 | dt: 648.25ms | tok/sec: 808,768 | mfu: 50.55 | epoch: 2 | total time: 92.74m | eta: 86.8m +step 08632/16704 (51.68%) | loss: 2.705234 | lrm: 0.97 | dt: 644.93ms | tok/sec: 812,934 | mfu: 50.81 | epoch: 2 | total time: 92.75m | eta: 86.8m +step 08633/16704 (51.68%) | loss: 2.701565 | lrm: 0.97 | dt: 643.68ms | tok/sec: 814,512 | mfu: 50.91 | epoch: 2 | total time: 92.76m | eta: 86.8m +step 08634/16704 (51.69%) | loss: 2.695601 | lrm: 0.97 | dt: 647.99ms | tok/sec: 809,103 | mfu: 50.57 | epoch: 2 | total time: 92.77m | eta: 86.8m +step 08635/16704 (51.69%) | loss: 2.685195 | lrm: 0.97 | dt: 643.93ms | tok/sec: 814,206 | mfu: 50.89 | epoch: 2 | total time: 92.78m | eta: 86.8m +step 08636/16704 (51.70%) | loss: 2.686929 | lrm: 0.97 | dt: 645.00ms | tok/sec: 812,849 | mfu: 50.80 | epoch: 2 | total time: 92.79m | eta: 86.8m +step 08637/16704 (51.71%) | loss: 2.685584 | lrm: 0.97 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 2 | total time: 92.80m | eta: 86.8m +step 08638/16704 (51.71%) | loss: 2.677157 | lrm: 0.97 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 2 | total time: 92.82m | eta: 86.8m +step 08639/16704 (51.72%) | loss: 2.687239 | lrm: 0.97 | dt: 643.89ms | tok/sec: 814,253 | mfu: 50.89 | epoch: 2 | total time: 92.83m | eta: 86.8m +step 08640/16704 (51.72%) | loss: 2.687834 | lrm: 0.97 | dt: 645.58ms | tok/sec: 812,117 | mfu: 50.76 | epoch: 2 | total time: 92.84m | eta: 86.7m +step 08641/16704 (51.73%) | loss: 2.678919 | lrm: 0.97 | dt: 644.18ms | tok/sec: 813,888 | mfu: 50.87 | epoch: 2 | total time: 92.85m | eta: 86.7m +step 08642/16704 (51.74%) | loss: 2.689003 | lrm: 0.97 | dt: 645.83ms | tok/sec: 811,810 | mfu: 50.74 | epoch: 2 | total time: 92.86m | eta: 86.7m +step 08643/16704 (51.74%) | 
loss: 2.684287 | lrm: 0.97 | dt: 645.62ms | tok/sec: 812,066 | mfu: 50.76 | epoch: 2 | total time: 92.87m | eta: 86.7m +step 08644/16704 (51.75%) | loss: 2.687829 | lrm: 0.97 | dt: 643.59ms | tok/sec: 814,624 | mfu: 50.92 | epoch: 2 | total time: 92.88m | eta: 86.7m +step 08645/16704 (51.75%) | loss: 2.678045 | lrm: 0.96 | dt: 648.02ms | tok/sec: 809,064 | mfu: 50.57 | epoch: 2 | total time: 92.89m | eta: 86.7m +step 08646/16704 (51.76%) | loss: 2.675851 | lrm: 0.96 | dt: 644.60ms | tok/sec: 813,350 | mfu: 50.84 | epoch: 2 | total time: 92.90m | eta: 86.7m +step 08647/16704 (51.77%) | loss: 2.691694 | lrm: 0.96 | dt: 646.85ms | tok/sec: 810,521 | mfu: 50.66 | epoch: 2 | total time: 92.91m | eta: 86.7m +step 08648/16704 (51.77%) | loss: 2.681402 | lrm: 0.96 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 2 | total time: 92.92m | eta: 86.7m +step 08649/16704 (51.78%) | loss: 2.681448 | lrm: 0.96 | dt: 645.89ms | tok/sec: 811,733 | mfu: 50.73 | epoch: 2 | total time: 92.93m | eta: 86.7m +step 08650/16704 (51.78%) | loss: 2.668958 | lrm: 0.96 | dt: 645.63ms | tok/sec: 812,054 | mfu: 50.75 | epoch: 2 | total time: 92.94m | eta: 86.6m +step 08651/16704 (51.79%) | loss: 2.670873 | lrm: 0.96 | dt: 645.07ms | tok/sec: 812,760 | mfu: 50.80 | epoch: 2 | total time: 92.95m | eta: 86.6m +step 08652/16704 (51.80%) | loss: 2.667788 | lrm: 0.96 | dt: 644.24ms | tok/sec: 813,814 | mfu: 50.86 | epoch: 2 | total time: 92.97m | eta: 86.6m +step 08653/16704 (51.80%) | loss: 2.679200 | lrm: 0.96 | dt: 643.45ms | tok/sec: 814,804 | mfu: 50.93 | epoch: 2 | total time: 92.98m | eta: 86.6m +step 08654/16704 (51.81%) | loss: 2.672106 | lrm: 0.96 | dt: 643.52ms | tok/sec: 814,717 | mfu: 50.92 | epoch: 2 | total time: 92.99m | eta: 86.6m +step 08655/16704 (51.81%) | loss: 2.672746 | lrm: 0.96 | dt: 647.68ms | tok/sec: 809,487 | mfu: 50.59 | epoch: 2 | total time: 93.00m | eta: 86.6m +step 08656/16704 (51.82%) | loss: 2.671584 | lrm: 0.96 | dt: 646.20ms | tok/sec: 811,337 | mfu: 50.71 | 
epoch: 2 | total time: 93.01m | eta: 86.6m +step 08657/16704 (51.83%) | loss: 2.660467 | lrm: 0.96 | dt: 644.26ms | tok/sec: 813,777 | mfu: 50.86 | epoch: 2 | total time: 93.02m | eta: 86.6m +step 08658/16704 (51.83%) | loss: 2.649162 | lrm: 0.96 | dt: 646.52ms | tok/sec: 810,934 | mfu: 50.68 | epoch: 2 | total time: 93.03m | eta: 86.6m +step 08659/16704 (51.84%) | loss: 2.661087 | lrm: 0.96 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 2 | total time: 93.04m | eta: 86.5m +step 08660/16704 (51.84%) | loss: 2.657832 | lrm: 0.96 | dt: 649.19ms | tok/sec: 807,601 | mfu: 50.48 | epoch: 2 | total time: 93.05m | eta: 86.5m +step 08661/16704 (51.85%) | loss: 2.648943 | lrm: 0.96 | dt: 644.29ms | tok/sec: 813,740 | mfu: 50.86 | epoch: 2 | total time: 93.06m | eta: 86.5m +step 08662/16704 (51.86%) | loss: 2.658394 | lrm: 0.96 | dt: 644.76ms | tok/sec: 813,150 | mfu: 50.82 | epoch: 2 | total time: 93.07m | eta: 86.5m +step 08663/16704 (51.86%) | loss: 2.669843 | lrm: 0.96 | dt: 645.09ms | tok/sec: 812,733 | mfu: 50.80 | epoch: 2 | total time: 93.08m | eta: 86.5m +step 08664/16704 (51.87%) | loss: 2.674936 | lrm: 0.96 | dt: 644.10ms | tok/sec: 813,979 | mfu: 50.87 | epoch: 2 | total time: 93.09m | eta: 86.5m +step 08665/16704 (51.87%) | loss: 2.666741 | lrm: 0.96 | dt: 648.34ms | tok/sec: 808,667 | mfu: 50.54 | epoch: 2 | total time: 93.11m | eta: 86.5m +step 08666/16704 (51.88%) | loss: 2.654993 | lrm: 0.96 | dt: 645.09ms | tok/sec: 812,731 | mfu: 50.80 | epoch: 2 | total time: 93.12m | eta: 86.5m +step 08667/16704 (51.89%) | loss: 2.648848 | lrm: 0.96 | dt: 646.25ms | tok/sec: 811,272 | mfu: 50.71 | epoch: 2 | total time: 93.13m | eta: 86.5m +step 08668/16704 (51.89%) | loss: 2.659477 | lrm: 0.96 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 2 | total time: 93.14m | eta: 86.4m +step 08669/16704 (51.90%) | loss: 2.665195 | lrm: 0.96 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 2 | total time: 93.15m | eta: 86.4m +step 08670/16704 (51.90%) | 
loss: 2.669924 | lrm: 0.96 | dt: 645.20ms | tok/sec: 812,602 | mfu: 50.79 | epoch: 2 | total time: 93.16m | eta: 86.4m +step 08671/16704 (51.91%) | loss: 2.674111 | lrm: 0.96 | dt: 647.29ms | tok/sec: 809,973 | mfu: 50.62 | epoch: 2 | total time: 93.17m | eta: 86.4m +step 08672/16704 (51.92%) | loss: 2.673657 | lrm: 0.96 | dt: 643.60ms | tok/sec: 814,612 | mfu: 50.91 | epoch: 2 | total time: 93.18m | eta: 86.4m +step 08673/16704 (51.92%) | loss: 2.674785 | lrm: 0.96 | dt: 647.05ms | tok/sec: 810,268 | mfu: 50.64 | epoch: 2 | total time: 93.19m | eta: 86.4m +step 08674/16704 (51.93%) | loss: 2.659975 | lrm: 0.96 | dt: 644.00ms | tok/sec: 814,106 | mfu: 50.88 | epoch: 2 | total time: 93.20m | eta: 86.4m +step 08675/16704 (51.93%) | loss: 2.670116 | lrm: 0.96 | dt: 643.63ms | tok/sec: 814,582 | mfu: 50.91 | epoch: 2 | total time: 93.21m | eta: 86.4m +step 08676/16704 (51.94%) | loss: 2.664523 | lrm: 0.96 | dt: 645.63ms | tok/sec: 812,057 | mfu: 50.75 | epoch: 2 | total time: 93.22m | eta: 86.4m +step 08677/16704 (51.95%) | loss: 2.671688 | lrm: 0.96 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 2 | total time: 93.23m | eta: 86.3m +step 08678/16704 (51.95%) | loss: 2.665921 | lrm: 0.96 | dt: 647.67ms | tok/sec: 809,493 | mfu: 50.59 | epoch: 2 | total time: 93.25m | eta: 86.3m +step 08679/16704 (51.96%) | loss: 2.669137 | lrm: 0.96 | dt: 644.13ms | tok/sec: 813,950 | mfu: 50.87 | epoch: 2 | total time: 93.26m | eta: 86.3m +step 08680/16704 (51.96%) | loss: 2.665316 | lrm: 0.96 | dt: 643.86ms | tok/sec: 814,289 | mfu: 50.89 | epoch: 2 | total time: 93.27m | eta: 86.3m +step 08681/16704 (51.97%) | loss: 2.669852 | lrm: 0.96 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 2 | total time: 93.28m | eta: 86.3m +step 08682/16704 (51.98%) | loss: 2.665985 | lrm: 0.96 | dt: 643.38ms | tok/sec: 814,893 | mfu: 50.93 | epoch: 2 | total time: 93.29m | eta: 86.3m +step 08683/16704 (51.98%) | loss: 2.662043 | lrm: 0.96 | dt: 646.08ms | tok/sec: 811,487 | mfu: 50.72 | 
epoch: 2 | total time: 93.30m | eta: 86.3m +step 08684/16704 (51.99%) | loss: 2.654100 | lrm: 0.96 | dt: 642.97ms | tok/sec: 815,411 | mfu: 50.96 | epoch: 2 | total time: 93.31m | eta: 86.3m +step 08685/16704 (51.99%) | loss: 2.653850 | lrm: 0.96 | dt: 644.01ms | tok/sec: 814,102 | mfu: 50.88 | epoch: 2 | total time: 93.32m | eta: 86.3m +step 08686/16704 (52.00%) | loss: 2.674786 | lrm: 0.96 | dt: 646.24ms | tok/sec: 811,295 | mfu: 50.71 | epoch: 2 | total time: 93.33m | eta: 86.3m +step 08687/16704 (52.01%) | loss: 2.665627 | lrm: 0.96 | dt: 642.49ms | tok/sec: 816,021 | mfu: 51.00 | epoch: 2 | total time: 93.34m | eta: 86.2m +step 08688/16704 (52.01%) | loss: 2.672169 | lrm: 0.96 | dt: 646.63ms | tok/sec: 810,806 | mfu: 50.68 | epoch: 2 | total time: 93.35m | eta: 86.2m +step 08689/16704 (52.02%) | loss: 2.681220 | lrm: 0.96 | dt: 644.99ms | tok/sec: 812,867 | mfu: 50.81 | epoch: 2 | total time: 93.36m | eta: 86.2m +step 08690/16704 (52.02%) | loss: 2.677310 | lrm: 0.96 | dt: 644.77ms | tok/sec: 813,136 | mfu: 50.82 | epoch: 2 | total time: 93.37m | eta: 86.2m +step 08691/16704 (52.03%) | loss: 2.679944 | lrm: 0.96 | dt: 645.54ms | tok/sec: 812,165 | mfu: 50.76 | epoch: 2 | total time: 93.39m | eta: 86.2m +step 08692/16704 (52.04%) | loss: 2.696065 | lrm: 0.96 | dt: 643.26ms | tok/sec: 815,051 | mfu: 50.94 | epoch: 2 | total time: 93.40m | eta: 86.2m +step 08693/16704 (52.04%) | loss: 2.699646 | lrm: 0.96 | dt: 679.19ms | tok/sec: 771,936 | mfu: 48.25 | epoch: 2 | total time: 93.41m | eta: 86.2m +step 08694/16704 (52.05%) | loss: 2.699342 | lrm: 0.96 | dt: 635.73ms | tok/sec: 824,703 | mfu: 51.55 | epoch: 2 | total time: 93.42m | eta: 86.2m +step 08695/16704 (52.05%) | loss: 2.696272 | lrm: 0.96 | dt: 651.93ms | tok/sec: 804,213 | mfu: 50.26 | epoch: 2 | total time: 93.43m | eta: 86.2m +step 08696/16704 (52.06%) | loss: 2.696358 | lrm: 0.96 | dt: 642.46ms | tok/sec: 816,059 | mfu: 51.00 | epoch: 2 | total time: 93.44m | eta: 86.1m +step 08697/16704 (52.07%) | 
loss: 2.691843 | lrm: 0.96 | dt: 643.73ms | tok/sec: 814,457 | mfu: 50.90 | epoch: 2 | total time: 93.45m | eta: 86.1m +step 08698/16704 (52.07%) | loss: 2.700337 | lrm: 0.96 | dt: 648.64ms | tok/sec: 808,293 | mfu: 50.52 | epoch: 2 | total time: 93.46m | eta: 86.1m +step 08699/16704 (52.08%) | loss: 2.702656 | lrm: 0.96 | dt: 642.47ms | tok/sec: 816,045 | mfu: 51.00 | epoch: 2 | total time: 93.47m | eta: 86.1m +step 08700/16704 (52.08%) | loss: 2.695948 | lrm: 0.96 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 2 | total time: 93.48m | eta: 86.1m +step 08701/16704 (52.09%) | loss: 2.700624 | lrm: 0.96 | dt: 645.83ms | tok/sec: 811,800 | mfu: 50.74 | epoch: 2 | total time: 93.49m | eta: 86.1m +step 08702/16704 (52.10%) | loss: 2.713717 | lrm: 0.96 | dt: 641.61ms | tok/sec: 817,139 | mfu: 51.07 | epoch: 2 | total time: 93.50m | eta: 86.1m +step 08703/16704 (52.10%) | loss: 2.712582 | lrm: 0.96 | dt: 647.53ms | tok/sec: 809,679 | mfu: 50.61 | epoch: 2 | total time: 93.51m | eta: 86.1m +step 08704/16704 (52.11%) | loss: 2.719998 | lrm: 0.96 | dt: 646.21ms | tok/sec: 811,324 | mfu: 50.71 | epoch: 2 | total time: 93.53m | eta: 86.1m +step 08705/16704 (52.11%) | loss: 2.731649 | lrm: 0.96 | dt: 642.98ms | tok/sec: 815,403 | mfu: 50.96 | epoch: 2 | total time: 93.54m | eta: 86.0m +step 08706/16704 (52.12%) | loss: 2.732393 | lrm: 0.96 | dt: 645.41ms | tok/sec: 812,329 | mfu: 50.77 | epoch: 2 | total time: 93.55m | eta: 86.0m +step 08707/16704 (52.13%) | loss: 2.720547 | lrm: 0.96 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 2 | total time: 93.56m | eta: 86.0m +step 08708/16704 (52.13%) | loss: 2.713430 | lrm: 0.96 | dt: 646.00ms | tok/sec: 811,594 | mfu: 50.73 | epoch: 2 | total time: 93.57m | eta: 86.0m +step 08709/16704 (52.14%) | loss: 2.730019 | lrm: 0.96 | dt: 645.48ms | tok/sec: 812,245 | mfu: 50.77 | epoch: 2 | total time: 93.58m | eta: 86.0m +step 08710/16704 (52.14%) | loss: 2.722934 | lrm: 0.96 | dt: 645.67ms | tok/sec: 812,002 | mfu: 50.75 | 
epoch: 2 | total time: 93.59m | eta: 86.0m +step 08711/16704 (52.15%) | loss: 2.714662 | lrm: 0.96 | dt: 645.19ms | tok/sec: 812,606 | mfu: 50.79 | epoch: 2 | total time: 93.60m | eta: 86.0m +step 08712/16704 (52.16%) | loss: 2.709637 | lrm: 0.96 | dt: 646.23ms | tok/sec: 811,296 | mfu: 50.71 | epoch: 2 | total time: 93.61m | eta: 86.0m +step 08713/16704 (52.16%) | loss: 2.710190 | lrm: 0.96 | dt: 642.83ms | tok/sec: 815,587 | mfu: 50.98 | epoch: 2 | total time: 93.62m | eta: 86.0m +step 08714/16704 (52.17%) | loss: 2.712792 | lrm: 0.96 | dt: 646.31ms | tok/sec: 811,201 | mfu: 50.70 | epoch: 2 | total time: 93.63m | eta: 86.0m +step 08715/16704 (52.17%) | loss: 2.718243 | lrm: 0.96 | dt: 643.96ms | tok/sec: 814,160 | mfu: 50.89 | epoch: 2 | total time: 93.64m | eta: 85.9m +step 08716/16704 (52.18%) | loss: 2.713642 | lrm: 0.96 | dt: 644.89ms | tok/sec: 812,993 | mfu: 50.81 | epoch: 2 | total time: 93.65m | eta: 85.9m +step 08717/16704 (52.19%) | loss: 2.713636 | lrm: 0.96 | dt: 647.24ms | tok/sec: 810,042 | mfu: 50.63 | epoch: 2 | total time: 93.67m | eta: 85.9m +step 08718/16704 (52.19%) | loss: 2.712064 | lrm: 0.96 | dt: 643.08ms | tok/sec: 815,273 | mfu: 50.96 | epoch: 2 | total time: 93.68m | eta: 85.9m +step 08719/16704 (52.20%) | loss: 2.702950 | lrm: 0.96 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 2 | total time: 93.69m | eta: 85.9m +step 08720/16704 (52.20%) | loss: 2.698574 | lrm: 0.96 | dt: 644.71ms | tok/sec: 813,214 | mfu: 50.83 | epoch: 2 | total time: 93.70m | eta: 85.9m +step 08721/16704 (52.21%) | loss: 2.701653 | lrm: 0.96 | dt: 644.91ms | tok/sec: 812,964 | mfu: 50.81 | epoch: 2 | total time: 93.71m | eta: 85.9m +step 08722/16704 (52.22%) | loss: 2.687440 | lrm: 0.96 | dt: 644.10ms | tok/sec: 813,990 | mfu: 50.88 | epoch: 2 | total time: 93.72m | eta: 85.9m +step 08723/16704 (52.22%) | loss: 2.681389 | lrm: 0.96 | dt: 645.14ms | tok/sec: 812,669 | mfu: 50.79 | epoch: 2 | total time: 93.73m | eta: 85.9m +step 08724/16704 (52.23%) | 
loss: 2.682716 | lrm: 0.96 | dt: 644.07ms | tok/sec: 814,026 | mfu: 50.88 | epoch: 2 | total time: 93.74m | eta: 85.8m +step 08725/16704 (52.23%) | loss: 2.688352 | lrm: 0.96 | dt: 644.85ms | tok/sec: 813,044 | mfu: 50.82 | epoch: 2 | total time: 93.75m | eta: 85.8m +step 08726/16704 (52.24%) | loss: 2.692909 | lrm: 0.96 | dt: 644.73ms | tok/sec: 813,193 | mfu: 50.83 | epoch: 2 | total time: 93.76m | eta: 85.8m +step 08727/16704 (52.24%) | loss: 2.713285 | lrm: 0.96 | dt: 645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 2 | total time: 93.77m | eta: 85.8m +step 08728/16704 (52.25%) | loss: 2.705125 | lrm: 0.95 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 2 | total time: 93.78m | eta: 85.8m +step 08729/16704 (52.26%) | loss: 2.697480 | lrm: 0.95 | dt: 643.53ms | tok/sec: 814,703 | mfu: 50.92 | epoch: 2 | total time: 93.79m | eta: 85.8m +step 08730/16704 (52.26%) | loss: 2.692354 | lrm: 0.95 | dt: 643.59ms | tok/sec: 814,631 | mfu: 50.92 | epoch: 2 | total time: 93.80m | eta: 85.8m +step 08731/16704 (52.27%) | loss: 2.704051 | lrm: 0.95 | dt: 646.95ms | tok/sec: 810,395 | mfu: 50.65 | epoch: 2 | total time: 93.82m | eta: 85.8m +step 08732/16704 (52.27%) | loss: 2.695996 | lrm: 0.95 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 2 | total time: 93.83m | eta: 85.8m +step 08733/16704 (52.28%) | loss: 2.701127 | lrm: 0.95 | dt: 646.08ms | tok/sec: 811,490 | mfu: 50.72 | epoch: 2 | total time: 93.84m | eta: 85.7m +step 08734/16704 (52.29%) | loss: 2.699650 | lrm: 0.95 | dt: 645.52ms | tok/sec: 812,192 | mfu: 50.76 | epoch: 2 | total time: 93.85m | eta: 85.7m +step 08735/16704 (52.29%) | loss: 2.702197 | lrm: 0.95 | dt: 643.76ms | tok/sec: 814,409 | mfu: 50.90 | epoch: 2 | total time: 93.86m | eta: 85.7m +step 08736/16704 (52.30%) | loss: 2.695092 | lrm: 0.95 | dt: 646.82ms | tok/sec: 810,563 | mfu: 50.66 | epoch: 2 | total time: 93.87m | eta: 85.7m +step 08737/16704 (52.30%) | loss: 2.689439 | lrm: 0.95 | dt: 643.28ms | tok/sec: 815,026 | mfu: 50.94 | 
epoch: 2 | total time: 93.88m | eta: 85.7m +step 08738/16704 (52.31%) | loss: 2.688898 | lrm: 0.95 | dt: 645.09ms | tok/sec: 812,736 | mfu: 50.80 | epoch: 2 | total time: 93.89m | eta: 85.7m +step 08739/16704 (52.32%) | loss: 2.692690 | lrm: 0.95 | dt: 646.15ms | tok/sec: 811,408 | mfu: 50.71 | epoch: 2 | total time: 93.90m | eta: 85.7m +step 08740/16704 (52.32%) | loss: 2.675832 | lrm: 0.95 | dt: 642.80ms | tok/sec: 815,627 | mfu: 50.98 | epoch: 2 | total time: 93.91m | eta: 85.7m +step 08741/16704 (52.33%) | loss: 2.688286 | lrm: 0.95 | dt: 645.28ms | tok/sec: 812,497 | mfu: 50.78 | epoch: 2 | total time: 93.92m | eta: 85.7m +step 08742/16704 (52.33%) | loss: 2.684086 | lrm: 0.95 | dt: 644.75ms | tok/sec: 813,158 | mfu: 50.82 | epoch: 2 | total time: 93.93m | eta: 85.7m +step 08743/16704 (52.34%) | loss: 2.681494 | lrm: 0.95 | dt: 643.94ms | tok/sec: 814,186 | mfu: 50.89 | epoch: 2 | total time: 93.94m | eta: 85.6m +step 08744/16704 (52.35%) | loss: 2.669899 | lrm: 0.95 | dt: 645.57ms | tok/sec: 812,130 | mfu: 50.76 | epoch: 2 | total time: 93.96m | eta: 85.6m +step 08745/16704 (52.35%) | loss: 2.681447 | lrm: 0.95 | dt: 643.76ms | tok/sec: 814,421 | mfu: 50.90 | epoch: 2 | total time: 93.97m | eta: 85.6m +step 08746/16704 (52.36%) | loss: 2.674184 | lrm: 0.95 | dt: 645.26ms | tok/sec: 812,520 | mfu: 50.78 | epoch: 2 | total time: 93.98m | eta: 85.6m +step 08747/16704 (52.36%) | loss: 2.678986 | lrm: 0.95 | dt: 645.06ms | tok/sec: 812,778 | mfu: 50.80 | epoch: 2 | total time: 93.99m | eta: 85.6m +step 08748/16704 (52.37%) | loss: 2.679670 | lrm: 0.95 | dt: 644.28ms | tok/sec: 813,757 | mfu: 50.86 | epoch: 2 | total time: 94.00m | eta: 85.6m +step 08749/16704 (52.38%) | loss: 2.677663 | lrm: 0.95 | dt: 646.08ms | tok/sec: 811,489 | mfu: 50.72 | epoch: 2 | total time: 94.01m | eta: 85.6m +Step 08750 | Validation bpb: 0.816891 +step 08750/16704 (52.38%) | loss: 2.685914 | lrm: 0.95 | dt: 647.02ms | tok/sec: 810,310 | mfu: 50.65 | epoch: 2 | total time: 94.02m | eta: 
85.6m +step 08751/16704 (52.39%) | loss: 2.683031 | lrm: 0.95 | dt: 646.46ms | tok/sec: 811,016 | mfu: 50.69 | epoch: 2 | total time: 94.03m | eta: 85.6m +step 08752/16704 (52.39%) | loss: 2.696906 | lrm: 0.95 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total time: 94.04m | eta: 85.5m +step 08753/16704 (52.40%) | loss: 2.694967 | lrm: 0.95 | dt: 641.34ms | tok/sec: 817,489 | mfu: 51.09 | epoch: 2 | total time: 94.05m | eta: 85.5m +step 08754/16704 (52.41%) | loss: 2.694090 | lrm: 0.95 | dt: 647.86ms | tok/sec: 809,256 | mfu: 50.58 | epoch: 2 | total time: 94.06m | eta: 85.5m +step 08755/16704 (52.41%) | loss: 2.691878 | lrm: 0.95 | dt: 643.48ms | tok/sec: 814,775 | mfu: 50.92 | epoch: 2 | total time: 94.07m | eta: 85.5m +step 08756/16704 (52.42%) | loss: 2.688236 | lrm: 0.95 | dt: 642.51ms | tok/sec: 816,002 | mfu: 51.00 | epoch: 2 | total time: 94.08m | eta: 85.5m +step 08757/16704 (52.42%) | loss: 2.690561 | lrm: 0.95 | dt: 645.44ms | tok/sec: 812,293 | mfu: 50.77 | epoch: 2 | total time: 94.09m | eta: 85.5m +step 08758/16704 (52.43%) | loss: 2.695900 | lrm: 0.95 | dt: 641.84ms | tok/sec: 816,856 | mfu: 51.05 | epoch: 2 | total time: 94.11m | eta: 85.5m +step 08759/16704 (52.44%) | loss: 2.694613 | lrm: 0.95 | dt: 642.75ms | tok/sec: 815,689 | mfu: 50.98 | epoch: 2 | total time: 94.12m | eta: 85.5m +step 08760/16704 (52.44%) | loss: 2.683791 | lrm: 0.95 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 2 | total time: 94.13m | eta: 85.5m +step 08761/16704 (52.45%) | loss: 2.686022 | lrm: 0.95 | dt: 641.94ms | tok/sec: 816,720 | mfu: 51.05 | epoch: 2 | total time: 94.14m | eta: 85.4m +step 08762/16704 (52.45%) | loss: 2.686361 | lrm: 0.95 | dt: 645.52ms | tok/sec: 812,192 | mfu: 50.76 | epoch: 2 | total time: 94.15m | eta: 85.4m +step 08763/16704 (52.46%) | loss: 2.678861 | lrm: 0.95 | dt: 644.04ms | tok/sec: 814,056 | mfu: 50.88 | epoch: 2 | total time: 94.16m | eta: 85.4m +step 08764/16704 (52.47%) | loss: 2.665296 | lrm: 0.95 | dt: 
642.75ms | tok/sec: 815,699 | mfu: 50.98 | epoch: 2 | total time: 94.17m | eta: 85.4m +step 08765/16704 (52.47%) | loss: 2.659595 | lrm: 0.95 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 2 | total time: 94.18m | eta: 85.4m +step 08766/16704 (52.48%) | loss: 2.660249 | lrm: 0.95 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 2 | total time: 94.19m | eta: 85.4m +step 08767/16704 (52.48%) | loss: 2.656613 | lrm: 0.95 | dt: 642.55ms | tok/sec: 815,953 | mfu: 51.00 | epoch: 2 | total time: 94.20m | eta: 85.4m +step 08768/16704 (52.49%) | loss: 2.654647 | lrm: 0.95 | dt: 644.58ms | tok/sec: 813,375 | mfu: 50.84 | epoch: 2 | total time: 94.21m | eta: 85.4m +step 08769/16704 (52.50%) | loss: 2.666490 | lrm: 0.95 | dt: 643.67ms | tok/sec: 814,524 | mfu: 50.91 | epoch: 2 | total time: 94.22m | eta: 85.4m +step 08770/16704 (52.50%) | loss: 2.672567 | lrm: 0.95 | dt: 645.35ms | tok/sec: 812,406 | mfu: 50.78 | epoch: 2 | total time: 94.23m | eta: 85.3m +step 08771/16704 (52.51%) | loss: 2.676187 | lrm: 0.95 | dt: 643.21ms | tok/sec: 815,107 | mfu: 50.95 | epoch: 2 | total time: 94.25m | eta: 85.3m +step 08772/16704 (52.51%) | loss: 2.667862 | lrm: 0.95 | dt: 643.79ms | tok/sec: 814,382 | mfu: 50.90 | epoch: 2 | total time: 94.26m | eta: 85.3m +step 08773/16704 (52.52%) | loss: 2.670194 | lrm: 0.95 | dt: 642.82ms | tok/sec: 815,601 | mfu: 50.98 | epoch: 2 | total time: 94.27m | eta: 85.3m +step 08774/16704 (52.53%) | loss: 2.664316 | lrm: 0.95 | dt: 646.65ms | tok/sec: 810,777 | mfu: 50.67 | epoch: 2 | total time: 94.28m | eta: 85.3m +step 08775/16704 (52.53%) | loss: 2.663260 | lrm: 0.95 | dt: 643.78ms | tok/sec: 814,389 | mfu: 50.90 | epoch: 2 | total time: 94.29m | eta: 85.3m +step 08776/16704 (52.54%) | loss: 2.664830 | lrm: 0.95 | dt: 644.08ms | tok/sec: 814,007 | mfu: 50.88 | epoch: 2 | total time: 94.30m | eta: 85.3m +step 08777/16704 (52.54%) | loss: 2.657875 | lrm: 0.95 | dt: 644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 94.31m | 
eta: 85.3m +step 08778/16704 (52.55%) | loss: 2.674516 | lrm: 0.95 | dt: 645.63ms | tok/sec: 812,060 | mfu: 50.75 | epoch: 2 | total time: 94.32m | eta: 85.3m +step 08779/16704 (52.56%) | loss: 2.673974 | lrm: 0.95 | dt: 642.51ms | tok/sec: 816,004 | mfu: 51.00 | epoch: 2 | total time: 94.33m | eta: 85.3m +step 08780/16704 (52.56%) | loss: 2.662049 | lrm: 0.95 | dt: 645.96ms | tok/sec: 811,646 | mfu: 50.73 | epoch: 2 | total time: 94.34m | eta: 85.2m +step 08781/16704 (52.57%) | loss: 2.679270 | lrm: 0.95 | dt: 645.53ms | tok/sec: 812,184 | mfu: 50.76 | epoch: 2 | total time: 94.35m | eta: 85.2m +step 08782/16704 (52.57%) | loss: 2.683733 | lrm: 0.95 | dt: 643.94ms | tok/sec: 814,184 | mfu: 50.89 | epoch: 2 | total time: 94.36m | eta: 85.2m +step 08783/16704 (52.58%) | loss: 2.683051 | lrm: 0.95 | dt: 644.50ms | tok/sec: 813,481 | mfu: 50.84 | epoch: 2 | total time: 94.37m | eta: 85.2m +step 08784/16704 (52.59%) | loss: 2.668592 | lrm: 0.95 | dt: 643.78ms | tok/sec: 814,392 | mfu: 50.90 | epoch: 2 | total time: 94.38m | eta: 85.2m +step 08785/16704 (52.59%) | loss: 2.681583 | lrm: 0.95 | dt: 663.36ms | tok/sec: 790,354 | mfu: 49.40 | epoch: 2 | total time: 94.40m | eta: 85.2m +step 08786/16704 (52.60%) | loss: 2.672706 | lrm: 0.95 | dt: 637.99ms | tok/sec: 821,784 | mfu: 51.36 | epoch: 2 | total time: 94.41m | eta: 85.2m +step 08787/16704 (52.60%) | loss: 2.679701 | lrm: 0.95 | dt: 648.05ms | tok/sec: 809,018 | mfu: 50.56 | epoch: 2 | total time: 94.42m | eta: 85.2m +step 08788/16704 (52.61%) | loss: 2.672745 | lrm: 0.95 | dt: 641.38ms | tok/sec: 817,434 | mfu: 51.09 | epoch: 2 | total time: 94.43m | eta: 85.2m +step 08789/16704 (52.62%) | loss: 2.688461 | lrm: 0.95 | dt: 643.78ms | tok/sec: 814,383 | mfu: 50.90 | epoch: 2 | total time: 94.44m | eta: 85.1m +step 08790/16704 (52.62%) | loss: 2.686657 | lrm: 0.95 | dt: 649.47ms | tok/sec: 807,254 | mfu: 50.45 | epoch: 2 | total time: 94.45m | eta: 85.1m +step 08791/16704 (52.63%) | loss: 2.691405 | lrm: 0.95 | dt: 
640.59ms | tok/sec: 818,448 | mfu: 51.15 | epoch: 2 | total time: 94.46m | eta: 85.1m +step 08792/16704 (52.63%) | loss: 2.691098 | lrm: 0.95 | dt: 644.46ms | tok/sec: 813,525 | mfu: 50.85 | epoch: 2 | total time: 94.47m | eta: 85.1m +step 08793/16704 (52.64%) | loss: 2.688931 | lrm: 0.95 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 2 | total time: 94.48m | eta: 85.1m +step 08794/16704 (52.65%) | loss: 2.693729 | lrm: 0.95 | dt: 642.74ms | tok/sec: 815,707 | mfu: 50.98 | epoch: 2 | total time: 94.49m | eta: 85.1m +step 08795/16704 (52.65%) | loss: 2.704341 | lrm: 0.95 | dt: 648.17ms | tok/sec: 808,870 | mfu: 50.56 | epoch: 2 | total time: 94.50m | eta: 85.1m +step 08796/16704 (52.66%) | loss: 2.687787 | lrm: 0.95 | dt: 642.67ms | tok/sec: 815,793 | mfu: 50.99 | epoch: 2 | total time: 94.51m | eta: 85.1m +step 08797/16704 (52.66%) | loss: 2.696169 | lrm: 0.95 | dt: 643.93ms | tok/sec: 814,195 | mfu: 50.89 | epoch: 2 | total time: 94.52m | eta: 85.1m +step 08798/16704 (52.67%) | loss: 2.693273 | lrm: 0.95 | dt: 646.03ms | tok/sec: 811,549 | mfu: 50.72 | epoch: 2 | total time: 94.54m | eta: 85.0m +step 08799/16704 (52.68%) | loss: 2.696010 | lrm: 0.95 | dt: 641.10ms | tok/sec: 817,797 | mfu: 51.11 | epoch: 2 | total time: 94.55m | eta: 85.0m +step 08800/16704 (52.68%) | loss: 2.693280 | lrm: 0.95 | dt: 644.07ms | tok/sec: 814,026 | mfu: 50.88 | epoch: 2 | total time: 94.56m | eta: 85.0m +step 08801/16704 (52.69%) | loss: 2.688704 | lrm: 0.95 | dt: 643.53ms | tok/sec: 814,705 | mfu: 50.92 | epoch: 2 | total time: 94.57m | eta: 85.0m +step 08802/16704 (52.69%) | loss: 2.692566 | lrm: 0.95 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 2 | total time: 94.58m | eta: 85.0m +step 08803/16704 (52.70%) | loss: 2.698438 | lrm: 0.95 | dt: 643.81ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 2 | total time: 94.59m | eta: 85.0m +step 08804/16704 (52.71%) | loss: 2.693017 | lrm: 0.95 | dt: 643.96ms | tok/sec: 814,167 | mfu: 50.89 | epoch: 2 | total time: 94.60m | 
eta: 85.0m +step 08805/16704 (52.71%) | loss: 2.689971 | lrm: 0.95 | dt: 645.39ms | tok/sec: 812,353 | mfu: 50.77 | epoch: 2 | total time: 94.61m | eta: 85.0m +step 08806/16704 (52.72%) | loss: 2.691024 | lrm: 0.95 | dt: 642.73ms | tok/sec: 815,721 | mfu: 50.98 | epoch: 2 | total time: 94.62m | eta: 85.0m +step 08807/16704 (52.72%) | loss: 2.667811 | lrm: 0.95 | dt: 644.99ms | tok/sec: 812,866 | mfu: 50.81 | epoch: 2 | total time: 94.63m | eta: 85.0m +step 08808/16704 (52.73%) | loss: 2.663765 | lrm: 0.95 | dt: 644.71ms | tok/sec: 813,215 | mfu: 50.83 | epoch: 2 | total time: 94.64m | eta: 84.9m +step 08809/16704 (52.74%) | loss: 2.653092 | lrm: 0.95 | dt: 642.92ms | tok/sec: 815,476 | mfu: 50.97 | epoch: 2 | total time: 94.65m | eta: 84.9m +step 08810/16704 (52.74%) | loss: 2.652615 | lrm: 0.95 | dt: 645.70ms | tok/sec: 811,964 | mfu: 50.75 | epoch: 2 | total time: 94.66m | eta: 84.9m +step 08811/16704 (52.75%) | loss: 2.665107 | lrm: 0.95 | dt: 644.08ms | tok/sec: 814,015 | mfu: 50.88 | epoch: 2 | total time: 94.67m | eta: 84.9m +step 08812/16704 (52.75%) | loss: 2.675186 | lrm: 0.94 | dt: 644.40ms | tok/sec: 813,603 | mfu: 50.85 | epoch: 2 | total time: 94.69m | eta: 84.9m +step 08813/16704 (52.76%) | loss: 2.676883 | lrm: 0.94 | dt: 646.10ms | tok/sec: 811,463 | mfu: 50.72 | epoch: 2 | total time: 94.70m | eta: 84.9m +step 08814/16704 (52.77%) | loss: 2.680748 | lrm: 0.94 | dt: 644.66ms | tok/sec: 813,274 | mfu: 50.83 | epoch: 2 | total time: 94.71m | eta: 84.9m +step 08815/16704 (52.77%) | loss: 2.689332 | lrm: 0.94 | dt: 643.94ms | tok/sec: 814,183 | mfu: 50.89 | epoch: 2 | total time: 94.72m | eta: 84.9m +step 08816/16704 (52.78%) | loss: 2.711042 | lrm: 0.94 | dt: 644.84ms | tok/sec: 813,047 | mfu: 50.82 | epoch: 2 | total time: 94.73m | eta: 84.9m +step 08817/16704 (52.78%) | loss: 2.703833 | lrm: 0.94 | dt: 645.23ms | tok/sec: 812,563 | mfu: 50.79 | epoch: 2 | total time: 94.74m | eta: 84.8m +step 08818/16704 (52.79%) | loss: 2.696469 | lrm: 0.94 | dt: 
642.96ms | tok/sec: 815,434 | mfu: 50.97 | epoch: 2 | total time: 94.75m | eta: 84.8m +step 08819/16704 (52.80%) | loss: 2.698343 | lrm: 0.94 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 2 | total time: 94.76m | eta: 84.8m +step 08820/16704 (52.80%) | loss: 2.707702 | lrm: 0.94 | dt: 642.33ms | tok/sec: 816,231 | mfu: 51.02 | epoch: 2 | total time: 94.77m | eta: 84.8m +step 08821/16704 (52.81%) | loss: 2.708345 | lrm: 0.94 | dt: 646.12ms | tok/sec: 811,443 | mfu: 50.72 | epoch: 2 | total time: 94.78m | eta: 84.8m +step 08822/16704 (52.81%) | loss: 2.711262 | lrm: 0.94 | dt: 643.71ms | tok/sec: 814,482 | mfu: 50.91 | epoch: 2 | total time: 94.79m | eta: 84.8m +step 08823/16704 (52.82%) | loss: 2.691641 | lrm: 0.94 | dt: 642.67ms | tok/sec: 815,792 | mfu: 50.99 | epoch: 2 | total time: 94.80m | eta: 84.8m +step 08824/16704 (52.83%) | loss: 2.689185 | lrm: 0.94 | dt: 645.98ms | tok/sec: 811,618 | mfu: 50.73 | epoch: 2 | total time: 94.81m | eta: 84.8m +step 08825/16704 (52.83%) | loss: 2.701743 | lrm: 0.94 | dt: 642.38ms | tok/sec: 816,165 | mfu: 51.01 | epoch: 2 | total time: 94.83m | eta: 84.8m +step 08826/16704 (52.84%) | loss: 2.700597 | lrm: 0.94 | dt: 644.92ms | tok/sec: 812,950 | mfu: 50.81 | epoch: 2 | total time: 94.84m | eta: 84.7m +step 08827/16704 (52.84%) | loss: 2.698460 | lrm: 0.94 | dt: 646.05ms | tok/sec: 811,534 | mfu: 50.72 | epoch: 2 | total time: 94.85m | eta: 84.7m +step 08828/16704 (52.85%) | loss: 2.695364 | lrm: 0.94 | dt: 644.40ms | tok/sec: 813,610 | mfu: 50.85 | epoch: 2 | total time: 94.86m | eta: 84.7m +step 08829/16704 (52.86%) | loss: 2.685620 | lrm: 0.94 | dt: 646.43ms | tok/sec: 811,057 | mfu: 50.69 | epoch: 2 | total time: 94.87m | eta: 84.7m +step 08830/16704 (52.86%) | loss: 2.683010 | lrm: 0.94 | dt: 645.14ms | tok/sec: 812,673 | mfu: 50.79 | epoch: 2 | total time: 94.88m | eta: 84.7m +step 08831/16704 (52.87%) | loss: 2.676943 | lrm: 0.94 | dt: 645.05ms | tok/sec: 812,780 | mfu: 50.80 | epoch: 2 | total time: 94.89m | 
eta: 84.7m +step 08832/16704 (52.87%) | loss: 2.671872 | lrm: 0.94 | dt: 642.05ms | tok/sec: 816,581 | mfu: 51.04 | epoch: 2 | total time: 94.90m | eta: 84.7m +step 08833/16704 (52.88%) | loss: 2.683578 | lrm: 0.94 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 2 | total time: 94.91m | eta: 84.7m +step 08834/16704 (52.89%) | loss: 2.688340 | lrm: 0.94 | dt: 643.51ms | tok/sec: 814,732 | mfu: 50.92 | epoch: 2 | total time: 94.92m | eta: 84.7m +step 08835/16704 (52.89%) | loss: 2.679142 | lrm: 0.94 | dt: 647.45ms | tok/sec: 809,770 | mfu: 50.61 | epoch: 2 | total time: 94.93m | eta: 84.6m +step 08836/16704 (52.90%) | loss: 2.675523 | lrm: 0.94 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 94.94m | eta: 84.6m +step 08837/16704 (52.90%) | loss: 2.667153 | lrm: 0.94 | dt: 644.34ms | tok/sec: 813,676 | mfu: 50.86 | epoch: 2 | total time: 94.95m | eta: 84.6m +step 08838/16704 (52.91%) | loss: 2.672826 | lrm: 0.94 | dt: 643.70ms | tok/sec: 814,492 | mfu: 50.91 | epoch: 2 | total time: 94.96m | eta: 84.6m +step 08839/16704 (52.92%) | loss: 2.687023 | lrm: 0.94 | dt: 643.96ms | tok/sec: 814,162 | mfu: 50.89 | epoch: 2 | total time: 94.98m | eta: 84.6m +step 08840/16704 (52.92%) | loss: 2.679398 | lrm: 0.94 | dt: 645.69ms | tok/sec: 811,977 | mfu: 50.75 | epoch: 2 | total time: 94.99m | eta: 84.6m +step 08841/16704 (52.93%) | loss: 2.678413 | lrm: 0.94 | dt: 645.53ms | tok/sec: 812,177 | mfu: 50.76 | epoch: 2 | total time: 95.00m | eta: 84.6m +step 08842/16704 (52.93%) | loss: 2.684666 | lrm: 0.94 | dt: 643.84ms | tok/sec: 814,312 | mfu: 50.90 | epoch: 2 | total time: 95.01m | eta: 84.6m +step 08843/16704 (52.94%) | loss: 2.685774 | lrm: 0.94 | dt: 643.29ms | tok/sec: 815,010 | mfu: 50.94 | epoch: 2 | total time: 95.02m | eta: 84.6m +step 08844/16704 (52.95%) | loss: 2.672168 | lrm: 0.94 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 2 | total time: 95.03m | eta: 84.6m +step 08845/16704 (52.95%) | loss: 2.669421 | lrm: 0.94 | dt: 
644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 2 | total time: 95.04m | eta: 84.5m +step 08846/16704 (52.96%) | loss: 2.667829 | lrm: 0.94 | dt: 643.55ms | tok/sec: 814,675 | mfu: 50.92 | epoch: 2 | total time: 95.05m | eta: 84.5m +step 08847/16704 (52.96%) | loss: 2.680697 | lrm: 0.94 | dt: 644.95ms | tok/sec: 812,912 | mfu: 50.81 | epoch: 2 | total time: 95.06m | eta: 84.5m +step 08848/16704 (52.97%) | loss: 2.669440 | lrm: 0.94 | dt: 644.16ms | tok/sec: 813,905 | mfu: 50.87 | epoch: 2 | total time: 95.07m | eta: 84.5m +step 08849/16704 (52.98%) | loss: 2.677926 | lrm: 0.94 | dt: 646.96ms | tok/sec: 810,390 | mfu: 50.65 | epoch: 2 | total time: 95.08m | eta: 84.5m +step 08850/16704 (52.98%) | loss: 2.665228 | lrm: 0.94 | dt: 643.79ms | tok/sec: 814,371 | mfu: 50.90 | epoch: 2 | total time: 95.09m | eta: 84.5m +step 08851/16704 (52.99%) | loss: 2.671344 | lrm: 0.94 | dt: 643.51ms | tok/sec: 814,731 | mfu: 50.92 | epoch: 2 | total time: 95.10m | eta: 84.5m +step 08852/16704 (52.99%) | loss: 2.679899 | lrm: 0.94 | dt: 644.92ms | tok/sec: 812,952 | mfu: 50.81 | epoch: 2 | total time: 95.12m | eta: 84.5m +step 08853/16704 (53.00%) | loss: 2.694212 | lrm: 0.94 | dt: 643.47ms | tok/sec: 814,786 | mfu: 50.93 | epoch: 2 | total time: 95.13m | eta: 84.5m +step 08854/16704 (53.01%) | loss: 2.691781 | lrm: 0.94 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 2 | total time: 95.14m | eta: 84.4m +step 08855/16704 (53.01%) | loss: 2.691129 | lrm: 0.94 | dt: 645.44ms | tok/sec: 812,293 | mfu: 50.77 | epoch: 2 | total time: 95.15m | eta: 84.4m +step 08856/16704 (53.02%) | loss: 2.683448 | lrm: 0.94 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 2 | total time: 95.16m | eta: 84.4m +step 08857/16704 (53.02%) | loss: 2.682041 | lrm: 0.94 | dt: 646.91ms | tok/sec: 810,449 | mfu: 50.65 | epoch: 2 | total time: 95.17m | eta: 84.4m +step 08858/16704 (53.03%) | loss: 2.692384 | lrm: 0.94 | dt: 643.81ms | tok/sec: 814,351 | mfu: 50.90 | epoch: 2 | total time: 95.18m | 
eta: 84.4m +step 08859/16704 (53.04%) | loss: 2.702385 | lrm: 0.94 | dt: 642.10ms | tok/sec: 816,521 | mfu: 51.03 | epoch: 2 | total time: 95.19m | eta: 84.4m +step 08860/16704 (53.04%) | loss: 2.690923 | lrm: 0.94 | dt: 646.86ms | tok/sec: 810,508 | mfu: 50.66 | epoch: 2 | total time: 95.20m | eta: 84.4m +step 08861/16704 (53.05%) | loss: 2.689416 | lrm: 0.94 | dt: 643.61ms | tok/sec: 814,603 | mfu: 50.91 | epoch: 2 | total time: 95.21m | eta: 84.4m +step 08862/16704 (53.05%) | loss: 2.679854 | lrm: 0.94 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 2 | total time: 95.22m | eta: 84.4m +step 08863/16704 (53.06%) | loss: 2.678301 | lrm: 0.94 | dt: 645.65ms | tok/sec: 812,035 | mfu: 50.75 | epoch: 2 | total time: 95.23m | eta: 84.3m +step 08864/16704 (53.07%) | loss: 2.662709 | lrm: 0.94 | dt: 645.38ms | tok/sec: 812,376 | mfu: 50.77 | epoch: 2 | total time: 95.24m | eta: 84.3m +step 08865/16704 (53.07%) | loss: 2.655478 | lrm: 0.94 | dt: 646.02ms | tok/sec: 811,568 | mfu: 50.72 | epoch: 2 | total time: 95.26m | eta: 84.3m +step 08866/16704 (53.08%) | loss: 2.674010 | lrm: 0.94 | dt: 644.29ms | tok/sec: 813,740 | mfu: 50.86 | epoch: 2 | total time: 95.27m | eta: 84.3m +step 08867/16704 (53.08%) | loss: 2.665238 | lrm: 0.94 | dt: 645.65ms | tok/sec: 812,031 | mfu: 50.75 | epoch: 2 | total time: 95.28m | eta: 84.3m +step 08868/16704 (53.09%) | loss: 2.655939 | lrm: 0.94 | dt: 643.16ms | tok/sec: 815,180 | mfu: 50.95 | epoch: 2 | total time: 95.29m | eta: 84.3m +step 08869/16704 (53.10%) | loss: 2.666551 | lrm: 0.94 | dt: 643.17ms | tok/sec: 815,163 | mfu: 50.95 | epoch: 2 | total time: 95.30m | eta: 84.3m +step 08870/16704 (53.10%) | loss: 2.649721 | lrm: 0.94 | dt: 643.78ms | tok/sec: 814,395 | mfu: 50.90 | epoch: 2 | total time: 95.31m | eta: 84.3m +step 08871/16704 (53.11%) | loss: 2.655770 | lrm: 0.94 | dt: 644.93ms | tok/sec: 812,943 | mfu: 50.81 | epoch: 2 | total time: 95.32m | eta: 84.3m +step 08872/16704 (53.11%) | loss: 2.660684 | lrm: 0.94 | dt: 
645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 2 | total time: 95.33m | eta: 84.3m +step 08873/16704 (53.12%) | loss: 2.657372 | lrm: 0.94 | dt: 643.41ms | tok/sec: 814,859 | mfu: 50.93 | epoch: 2 | total time: 95.34m | eta: 84.2m +step 08874/16704 (53.12%) | loss: 2.655618 | lrm: 0.94 | dt: 644.32ms | tok/sec: 813,713 | mfu: 50.86 | epoch: 2 | total time: 95.35m | eta: 84.2m +step 08875/16704 (53.13%) | loss: 2.642726 | lrm: 0.94 | dt: 641.81ms | tok/sec: 816,893 | mfu: 51.06 | epoch: 2 | total time: 95.36m | eta: 84.2m +step 08876/16704 (53.14%) | loss: 2.632600 | lrm: 0.94 | dt: 645.47ms | tok/sec: 812,258 | mfu: 50.77 | epoch: 2 | total time: 95.37m | eta: 84.2m +step 08877/16704 (53.14%) | loss: 2.630558 | lrm: 0.94 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 2 | total time: 95.38m | eta: 84.2m +step 08878/16704 (53.15%) | loss: 2.630583 | lrm: 0.94 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 2 | total time: 95.39m | eta: 84.2m +step 08879/16704 (53.15%) | loss: 2.638412 | lrm: 0.94 | dt: 644.00ms | tok/sec: 814,109 | mfu: 50.88 | epoch: 2 | total time: 95.41m | eta: 84.2m +step 08880/16704 (53.16%) | loss: 2.639635 | lrm: 0.94 | dt: 643.79ms | tok/sec: 814,375 | mfu: 50.90 | epoch: 2 | total time: 95.42m | eta: 84.2m +step 08881/16704 (53.17%) | loss: 2.651998 | lrm: 0.94 | dt: 645.09ms | tok/sec: 812,742 | mfu: 50.80 | epoch: 2 | total time: 95.43m | eta: 84.2m +step 08882/16704 (53.17%) | loss: 2.652115 | lrm: 0.94 | dt: 643.21ms | tok/sec: 815,107 | mfu: 50.95 | epoch: 2 | total time: 95.44m | eta: 84.1m +step 08883/16704 (53.18%) | loss: 2.657903 | lrm: 0.94 | dt: 644.88ms | tok/sec: 812,995 | mfu: 50.81 | epoch: 2 | total time: 95.45m | eta: 84.1m +step 08884/16704 (53.18%) | loss: 2.653066 | lrm: 0.94 | dt: 644.93ms | tok/sec: 812,942 | mfu: 50.81 | epoch: 2 | total time: 95.46m | eta: 84.1m +step 08885/16704 (53.19%) | loss: 2.652352 | lrm: 0.94 | dt: 640.79ms | tok/sec: 818,190 | mfu: 51.14 | epoch: 2 | total time: 95.47m | 
eta: 84.1m +step 08886/16704 (53.20%) | loss: 2.659466 | lrm: 0.94 | dt: 642.54ms | tok/sec: 815,962 | mfu: 51.00 | epoch: 2 | total time: 95.48m | eta: 84.1m +step 08887/16704 (53.20%) | loss: 2.666682 | lrm: 0.94 | dt: 643.80ms | tok/sec: 814,359 | mfu: 50.90 | epoch: 2 | total time: 95.49m | eta: 84.1m +step 08888/16704 (53.21%) | loss: 2.679485 | lrm: 0.94 | dt: 677.24ms | tok/sec: 774,158 | mfu: 48.39 | epoch: 2 | total time: 95.50m | eta: 84.1m +step 08889/16704 (53.21%) | loss: 2.673441 | lrm: 0.94 | dt: 636.51ms | tok/sec: 823,692 | mfu: 51.48 | epoch: 2 | total time: 95.51m | eta: 84.1m +step 08890/16704 (53.22%) | loss: 2.666766 | lrm: 0.94 | dt: 649.40ms | tok/sec: 807,343 | mfu: 50.46 | epoch: 2 | total time: 95.52m | eta: 84.1m +step 08891/16704 (53.23%) | loss: 2.660892 | lrm: 0.94 | dt: 645.55ms | tok/sec: 812,161 | mfu: 50.76 | epoch: 2 | total time: 95.53m | eta: 84.0m +step 08892/16704 (53.23%) | loss: 2.661886 | lrm: 0.94 | dt: 641.29ms | tok/sec: 817,555 | mfu: 51.10 | epoch: 2 | total time: 95.55m | eta: 84.0m +step 08893/16704 (53.24%) | loss: 2.652363 | lrm: 0.94 | dt: 647.54ms | tok/sec: 809,666 | mfu: 50.61 | epoch: 2 | total time: 95.56m | eta: 84.0m +step 08894/16704 (53.24%) | loss: 2.657743 | lrm: 0.94 | dt: 693.31ms | tok/sec: 756,204 | mfu: 47.26 | epoch: 2 | total time: 95.57m | eta: 84.0m +step 08895/16704 (53.25%) | loss: 2.650295 | lrm: 0.93 | dt: 631.63ms | tok/sec: 830,061 | mfu: 51.88 | epoch: 2 | total time: 95.58m | eta: 84.0m +step 08896/16704 (53.26%) | loss: 2.663238 | lrm: 0.93 | dt: 653.74ms | tok/sec: 801,980 | mfu: 50.12 | epoch: 2 | total time: 95.59m | eta: 84.0m +step 08897/16704 (53.26%) | loss: 2.662701 | lrm: 0.93 | dt: 643.25ms | tok/sec: 815,060 | mfu: 50.94 | epoch: 2 | total time: 95.60m | eta: 84.0m +step 08898/16704 (53.27%) | loss: 2.657973 | lrm: 0.93 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 2 | total time: 95.61m | eta: 84.0m +step 08899/16704 (53.27%) | loss: 2.675177 | lrm: 0.93 | dt: 
648.85ms | tok/sec: 808,025 | mfu: 50.50 | epoch: 2 | total time: 95.62m | eta: 84.0m +step 08900/16704 (53.28%) | loss: 2.686449 | lrm: 0.93 | dt: 642.58ms | tok/sec: 815,909 | mfu: 51.00 | epoch: 2 | total time: 95.63m | eta: 83.9m +step 08901/16704 (53.29%) | loss: 2.697664 | lrm: 0.93 | dt: 647.91ms | tok/sec: 809,197 | mfu: 50.58 | epoch: 2 | total time: 95.64m | eta: 83.9m +step 08902/16704 (53.29%) | loss: 2.692810 | lrm: 0.93 | dt: 646.64ms | tok/sec: 810,790 | mfu: 50.68 | epoch: 2 | total time: 95.65m | eta: 83.9m +step 08903/16704 (53.30%) | loss: 2.691957 | lrm: 0.93 | dt: 641.91ms | tok/sec: 816,762 | mfu: 51.05 | epoch: 2 | total time: 95.66m | eta: 83.9m +step 08904/16704 (53.30%) | loss: 2.681036 | lrm: 0.93 | dt: 645.65ms | tok/sec: 812,035 | mfu: 50.75 | epoch: 2 | total time: 95.68m | eta: 83.9m +step 08905/16704 (53.31%) | loss: 2.676908 | lrm: 0.93 | dt: 644.28ms | tok/sec: 813,757 | mfu: 50.86 | epoch: 2 | total time: 95.69m | eta: 83.9m +step 08906/16704 (53.32%) | loss: 2.669340 | lrm: 0.93 | dt: 643.42ms | tok/sec: 814,847 | mfu: 50.93 | epoch: 2 | total time: 95.70m | eta: 83.9m +step 08907/16704 (53.32%) | loss: 2.669199 | lrm: 0.93 | dt: 644.42ms | tok/sec: 813,576 | mfu: 50.85 | epoch: 2 | total time: 95.71m | eta: 83.9m +step 08908/16704 (53.33%) | loss: 2.655932 | lrm: 0.93 | dt: 646.32ms | tok/sec: 811,186 | mfu: 50.70 | epoch: 2 | total time: 95.72m | eta: 83.9m +step 08909/16704 (53.33%) | loss: 2.643435 | lrm: 0.93 | dt: 644.09ms | tok/sec: 814,003 | mfu: 50.88 | epoch: 2 | total time: 95.73m | eta: 83.9m +step 08910/16704 (53.34%) | loss: 2.651039 | lrm: 0.93 | dt: 642.16ms | tok/sec: 816,446 | mfu: 51.03 | epoch: 2 | total time: 95.74m | eta: 83.8m +step 08911/16704 (53.35%) | loss: 2.641299 | lrm: 0.93 | dt: 643.70ms | tok/sec: 814,489 | mfu: 50.91 | epoch: 2 | total time: 95.75m | eta: 83.8m +step 08912/16704 (53.35%) | loss: 2.649846 | lrm: 0.93 | dt: 646.44ms | tok/sec: 811,036 | mfu: 50.69 | epoch: 2 | total time: 95.76m | 
eta: 83.8m +step 08913/16704 (53.36%) | loss: 2.651248 | lrm: 0.93 | dt: 643.70ms | tok/sec: 814,488 | mfu: 50.91 | epoch: 2 | total time: 95.77m | eta: 83.8m +step 08914/16704 (53.36%) | loss: 2.652988 | lrm: 0.93 | dt: 645.38ms | tok/sec: 812,368 | mfu: 50.77 | epoch: 2 | total time: 95.78m | eta: 83.8m +step 08915/16704 (53.37%) | loss: 2.651023 | lrm: 0.93 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 2 | total time: 95.79m | eta: 83.8m +step 08916/16704 (53.38%) | loss: 2.640509 | lrm: 0.93 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 2 | total time: 95.80m | eta: 83.8m +step 08917/16704 (53.38%) | loss: 2.655596 | lrm: 0.93 | dt: 645.39ms | tok/sec: 812,363 | mfu: 50.77 | epoch: 2 | total time: 95.81m | eta: 83.8m +step 08918/16704 (53.39%) | loss: 2.662381 | lrm: 0.93 | dt: 645.79ms | tok/sec: 811,855 | mfu: 50.74 | epoch: 2 | total time: 95.83m | eta: 83.8m +step 08919/16704 (53.39%) | loss: 2.659523 | lrm: 0.93 | dt: 643.70ms | tok/sec: 814,488 | mfu: 50.91 | epoch: 2 | total time: 95.84m | eta: 83.7m +step 08920/16704 (53.40%) | loss: 2.675948 | lrm: 0.93 | dt: 645.96ms | tok/sec: 811,647 | mfu: 50.73 | epoch: 2 | total time: 95.85m | eta: 83.7m +step 08921/16704 (53.41%) | loss: 2.684513 | lrm: 0.93 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 2 | total time: 95.86m | eta: 83.7m +step 08922/16704 (53.41%) | loss: 2.678744 | lrm: 0.93 | dt: 644.65ms | tok/sec: 813,286 | mfu: 50.83 | epoch: 2 | total time: 95.87m | eta: 83.7m +step 08923/16704 (53.42%) | loss: 2.681836 | lrm: 0.93 | dt: 647.81ms | tok/sec: 809,327 | mfu: 50.58 | epoch: 2 | total time: 95.88m | eta: 83.7m +step 08924/16704 (53.42%) | loss: 2.680381 | lrm: 0.93 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 2 | total time: 95.89m | eta: 83.7m +step 08925/16704 (53.43%) | loss: 2.682510 | lrm: 0.93 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 2 | total time: 95.90m | eta: 83.7m +step 08926/16704 (53.44%) | loss: 2.689057 | lrm: 0.93 | dt: 
645.91ms | tok/sec: 811,701 | mfu: 50.73 | epoch: 2 | total time: 95.91m | eta: 83.7m +step 08927/16704 (53.44%) | loss: 2.686371 | lrm: 0.93 | dt: 642.79ms | tok/sec: 815,641 | mfu: 50.98 | epoch: 2 | total time: 95.92m | eta: 83.7m +step 08928/16704 (53.45%) | loss: 2.681199 | lrm: 0.93 | dt: 645.82ms | tok/sec: 811,813 | mfu: 50.74 | epoch: 2 | total time: 95.93m | eta: 83.6m +step 08929/16704 (53.45%) | loss: 2.682950 | lrm: 0.93 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | epoch: 2 | total time: 95.94m | eta: 83.6m +step 08930/16704 (53.46%) | loss: 2.681813 | lrm: 0.93 | dt: 643.94ms | tok/sec: 814,183 | mfu: 50.89 | epoch: 2 | total time: 95.95m | eta: 83.6m +step 08931/16704 (53.47%) | loss: 2.683425 | lrm: 0.93 | dt: 643.68ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 2 | total time: 95.97m | eta: 83.6m +step 08932/16704 (53.47%) | loss: 2.677368 | lrm: 0.93 | dt: 644.87ms | tok/sec: 813,015 | mfu: 50.81 | epoch: 2 | total time: 95.98m | eta: 83.6m +step 08933/16704 (53.48%) | loss: 2.677667 | lrm: 0.93 | dt: 643.65ms | tok/sec: 814,558 | mfu: 50.91 | epoch: 2 | total time: 95.99m | eta: 83.6m +step 08934/16704 (53.48%) | loss: 2.672734 | lrm: 0.93 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 2 | total time: 96.00m | eta: 83.6m +step 08935/16704 (53.49%) | loss: 2.661632 | lrm: 0.93 | dt: 644.78ms | tok/sec: 813,129 | mfu: 50.82 | epoch: 2 | total time: 96.01m | eta: 83.6m +step 08936/16704 (53.50%) | loss: 2.674270 | lrm: 0.93 | dt: 643.24ms | tok/sec: 815,079 | mfu: 50.94 | epoch: 2 | total time: 96.02m | eta: 83.6m +step 08937/16704 (53.50%) | loss: 2.684604 | lrm: 0.93 | dt: 645.40ms | tok/sec: 812,347 | mfu: 50.77 | epoch: 2 | total time: 96.03m | eta: 83.6m +step 08938/16704 (53.51%) | loss: 2.690998 | lrm: 0.93 | dt: 644.30ms | tok/sec: 813,734 | mfu: 50.86 | epoch: 2 | total time: 96.04m | eta: 83.5m +step 08939/16704 (53.51%) | loss: 2.707502 | lrm: 0.93 | dt: 644.90ms | tok/sec: 812,971 | mfu: 50.81 | epoch: 2 | total time: 96.05m | 
eta: 83.5m +step 08940/16704 (53.52%) | loss: 2.708317 | lrm: 0.93 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 2 | total time: 96.06m | eta: 83.5m +step 08941/16704 (53.53%) | loss: 2.705133 | lrm: 0.93 | dt: 643.22ms | tok/sec: 815,098 | mfu: 50.94 | epoch: 2 | total time: 96.07m | eta: 83.5m +step 08942/16704 (53.53%) | loss: 2.715446 | lrm: 0.93 | dt: 646.56ms | tok/sec: 810,894 | mfu: 50.68 | epoch: 2 | total time: 96.08m | eta: 83.5m +step 08943/16704 (53.54%) | loss: 2.720050 | lrm: 0.93 | dt: 644.53ms | tok/sec: 813,438 | mfu: 50.84 | epoch: 2 | total time: 96.09m | eta: 83.5m +step 08944/16704 (53.54%) | loss: 2.727826 | lrm: 0.93 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 2 | total time: 96.10m | eta: 83.5m +step 08945/16704 (53.55%) | loss: 2.714316 | lrm: 0.93 | dt: 645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 2 | total time: 96.12m | eta: 83.5m +step 08946/16704 (53.56%) | loss: 2.714934 | lrm: 0.93 | dt: 643.65ms | tok/sec: 814,548 | mfu: 50.91 | epoch: 2 | total time: 96.13m | eta: 83.5m +step 08947/16704 (53.56%) | loss: 2.718826 | lrm: 0.93 | dt: 642.27ms | tok/sec: 816,307 | mfu: 51.02 | epoch: 2 | total time: 96.14m | eta: 83.4m +step 08948/16704 (53.57%) | loss: 2.702575 | lrm: 0.93 | dt: 641.79ms | tok/sec: 816,918 | mfu: 51.06 | epoch: 2 | total time: 96.15m | eta: 83.4m +step 08949/16704 (53.57%) | loss: 2.689540 | lrm: 0.93 | dt: 642.49ms | tok/sec: 816,022 | mfu: 51.00 | epoch: 2 | total time: 96.16m | eta: 83.4m +step 08950/16704 (53.58%) | loss: 2.689786 | lrm: 0.93 | dt: 644.63ms | tok/sec: 813,310 | mfu: 50.83 | epoch: 2 | total time: 96.17m | eta: 83.4m +step 08951/16704 (53.59%) | loss: 2.691154 | lrm: 0.93 | dt: 645.37ms | tok/sec: 812,388 | mfu: 50.78 | epoch: 2 | total time: 96.18m | eta: 83.4m +step 08952/16704 (53.59%) | loss: 2.686987 | lrm: 0.93 | dt: 644.11ms | tok/sec: 813,969 | mfu: 50.87 | epoch: 2 | total time: 96.19m | eta: 83.4m +step 08953/16704 (53.60%) | loss: 2.682409 | lrm: 0.93 | dt: 
642.96ms | tok/sec: 815,424 | mfu: 50.97 | epoch: 2 | total time: 96.20m | eta: 83.4m +step 08954/16704 (53.60%) | loss: 2.679275 | lrm: 0.93 | dt: 646.48ms | tok/sec: 810,992 | mfu: 50.69 | epoch: 2 | total time: 96.21m | eta: 83.4m +step 08955/16704 (53.61%) | loss: 2.679166 | lrm: 0.93 | dt: 645.06ms | tok/sec: 812,778 | mfu: 50.80 | epoch: 2 | total time: 96.22m | eta: 83.4m +step 08956/16704 (53.62%) | loss: 2.696448 | lrm: 0.93 | dt: 644.25ms | tok/sec: 813,791 | mfu: 50.86 | epoch: 2 | total time: 96.23m | eta: 83.3m +step 08957/16704 (53.62%) | loss: 2.707847 | lrm: 0.93 | dt: 644.49ms | tok/sec: 813,487 | mfu: 50.84 | epoch: 2 | total time: 96.24m | eta: 83.3m +step 08958/16704 (53.63%) | loss: 2.699085 | lrm: 0.93 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 2 | total time: 96.26m | eta: 83.3m +step 08959/16704 (53.63%) | loss: 2.707979 | lrm: 0.93 | dt: 644.95ms | tok/sec: 812,914 | mfu: 50.81 | epoch: 2 | total time: 96.27m | eta: 83.3m +step 08960/16704 (53.64%) | loss: 2.713020 | lrm: 0.93 | dt: 643.20ms | tok/sec: 815,119 | mfu: 50.95 | epoch: 2 | total time: 96.28m | eta: 83.3m +step 08961/16704 (53.65%) | loss: 2.707296 | lrm: 0.93 | dt: 646.86ms | tok/sec: 810,508 | mfu: 50.66 | epoch: 2 | total time: 96.29m | eta: 83.3m +step 08962/16704 (53.65%) | loss: 2.714355 | lrm: 0.93 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 2 | total time: 96.30m | eta: 83.3m +step 08963/16704 (53.66%) | loss: 2.703347 | lrm: 0.93 | dt: 645.87ms | tok/sec: 811,754 | mfu: 50.74 | epoch: 2 | total time: 96.31m | eta: 83.3m +step 08964/16704 (53.66%) | loss: 2.702335 | lrm: 0.93 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 2 | total time: 96.32m | eta: 83.3m +step 08965/16704 (53.67%) | loss: 2.690628 | lrm: 0.93 | dt: 641.87ms | tok/sec: 816,816 | mfu: 51.05 | epoch: 2 | total time: 96.33m | eta: 83.2m +step 08966/16704 (53.68%) | loss: 2.711495 | lrm: 0.93 | dt: 643.39ms | tok/sec: 814,881 | mfu: 50.93 | epoch: 2 | total time: 96.34m | 
eta: 83.2m +step 08967/16704 (53.68%) | loss: 2.699818 | lrm: 0.93 | dt: 643.08ms | tok/sec: 815,278 | mfu: 50.96 | epoch: 2 | total time: 96.35m | eta: 83.2m +step 08968/16704 (53.69%) | loss: 2.695059 | lrm: 0.93 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 2 | total time: 96.36m | eta: 83.2m +step 08969/16704 (53.69%) | loss: 2.696798 | lrm: 0.93 | dt: 643.49ms | tok/sec: 814,753 | mfu: 50.92 | epoch: 2 | total time: 96.37m | eta: 83.2m +step 08970/16704 (53.70%) | loss: 2.694611 | lrm: 0.93 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 2 | total time: 96.38m | eta: 83.2m +step 08971/16704 (53.71%) | loss: 2.692365 | lrm: 0.93 | dt: 643.47ms | tok/sec: 814,782 | mfu: 50.93 | epoch: 2 | total time: 96.39m | eta: 83.2m +step 08972/16704 (53.71%) | loss: 2.683768 | lrm: 0.93 | dt: 643.88ms | tok/sec: 814,264 | mfu: 50.89 | epoch: 2 | total time: 96.41m | eta: 83.2m +step 08973/16704 (53.72%) | loss: 2.683448 | lrm: 0.93 | dt: 643.55ms | tok/sec: 814,680 | mfu: 50.92 | epoch: 2 | total time: 96.42m | eta: 83.2m +step 08974/16704 (53.72%) | loss: 2.671746 | lrm: 0.93 | dt: 644.65ms | tok/sec: 813,295 | mfu: 50.83 | epoch: 2 | total time: 96.43m | eta: 83.2m +step 08975/16704 (53.73%) | loss: 2.677525 | lrm: 0.93 | dt: 645.78ms | tok/sec: 811,873 | mfu: 50.74 | epoch: 2 | total time: 96.44m | eta: 83.1m +step 08976/16704 (53.74%) | loss: 2.686528 | lrm: 0.93 | dt: 644.55ms | tok/sec: 813,416 | mfu: 50.84 | epoch: 2 | total time: 96.45m | eta: 83.1m +step 08977/16704 (53.74%) | loss: 2.680020 | lrm: 0.93 | dt: 645.50ms | tok/sec: 812,218 | mfu: 50.76 | epoch: 2 | total time: 96.46m | eta: 83.1m +step 08978/16704 (53.75%) | loss: 2.691961 | lrm: 0.93 | dt: 645.64ms | tok/sec: 812,040 | mfu: 50.75 | epoch: 2 | total time: 96.47m | eta: 83.1m +step 08979/16704 (53.75%) | loss: 2.686131 | lrm: 0.92 | dt: 643.68ms | tok/sec: 814,521 | mfu: 50.91 | epoch: 2 | total time: 96.48m | eta: 83.1m +step 08980/16704 (53.76%) | loss: 2.690124 | lrm: 0.92 | dt: 
645.19ms | tok/sec: 812,609 | mfu: 50.79 | epoch: 2 | total time: 96.49m | eta: 83.1m +step 08981/16704 (53.77%) | loss: 2.697339 | lrm: 0.92 | dt: 647.40ms | tok/sec: 809,837 | mfu: 50.62 | epoch: 2 | total time: 96.50m | eta: 83.1m +step 08982/16704 (53.77%) | loss: 2.695218 | lrm: 0.92 | dt: 645.14ms | tok/sec: 812,666 | mfu: 50.79 | epoch: 2 | total time: 96.51m | eta: 83.1m +step 08983/16704 (53.78%) | loss: 2.661483 | lrm: 0.92 | dt: 644.22ms | tok/sec: 813,838 | mfu: 50.87 | epoch: 2 | total time: 96.52m | eta: 83.1m +step 08984/16704 (53.78%) | loss: 2.645610 | lrm: 0.92 | dt: 644.02ms | tok/sec: 814,085 | mfu: 50.88 | epoch: 2 | total time: 96.53m | eta: 83.0m +step 08985/16704 (53.79%) | loss: 2.649288 | lrm: 0.92 | dt: 643.88ms | tok/sec: 814,265 | mfu: 50.89 | epoch: 2 | total time: 96.55m | eta: 83.0m +step 08986/16704 (53.80%) | loss: 2.660699 | lrm: 0.92 | dt: 644.95ms | tok/sec: 812,906 | mfu: 50.81 | epoch: 2 | total time: 96.56m | eta: 83.0m +step 08987/16704 (53.80%) | loss: 2.660557 | lrm: 0.92 | dt: 643.19ms | tok/sec: 815,136 | mfu: 50.95 | epoch: 2 | total time: 96.57m | eta: 83.0m +step 08988/16704 (53.81%) | loss: 2.670834 | lrm: 0.92 | dt: 644.77ms | tok/sec: 813,141 | mfu: 50.82 | epoch: 2 | total time: 96.58m | eta: 83.0m +step 08989/16704 (53.81%) | loss: 2.688974 | lrm: 0.92 | dt: 645.56ms | tok/sec: 812,149 | mfu: 50.76 | epoch: 2 | total time: 96.59m | eta: 83.0m +step 08990/16704 (53.82%) | loss: 2.689196 | lrm: 0.92 | dt: 645.57ms | tok/sec: 812,126 | mfu: 50.76 | epoch: 2 | total time: 96.60m | eta: 83.0m +step 08991/16704 (53.83%) | loss: 2.676799 | lrm: 0.92 | dt: 645.62ms | tok/sec: 812,064 | mfu: 50.76 | epoch: 2 | total time: 96.61m | eta: 83.0m +step 08992/16704 (53.83%) | loss: 2.662218 | lrm: 0.92 | dt: 643.84ms | tok/sec: 814,318 | mfu: 50.90 | epoch: 2 | total time: 96.62m | eta: 83.0m +step 08993/16704 (53.84%) | loss: 2.658848 | lrm: 0.92 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 2 | total time: 96.63m | 
eta: 82.9m +step 08994/16704 (53.84%) | loss: 2.661298 | lrm: 0.92 | dt: 645.53ms | tok/sec: 812,186 | mfu: 50.76 | epoch: 2 | total time: 96.64m | eta: 82.9m +step 08995/16704 (53.85%) | loss: 2.663668 | lrm: 0.92 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 2 | total time: 96.65m | eta: 82.9m +step 08996/16704 (53.86%) | loss: 2.655229 | lrm: 0.92 | dt: 644.74ms | tok/sec: 813,175 | mfu: 50.82 | epoch: 2 | total time: 96.66m | eta: 82.9m +step 08997/16704 (53.86%) | loss: 2.637449 | lrm: 0.92 | dt: 644.36ms | tok/sec: 813,657 | mfu: 50.85 | epoch: 2 | total time: 96.67m | eta: 82.9m +step 08998/16704 (53.87%) | loss: 2.637206 | lrm: 0.92 | dt: 644.90ms | tok/sec: 812,969 | mfu: 50.81 | epoch: 2 | total time: 96.68m | eta: 82.9m +step 08999/16704 (53.87%) | loss: 2.629414 | lrm: 0.92 | dt: 647.88ms | tok/sec: 809,230 | mfu: 50.58 | epoch: 2 | total time: 96.70m | eta: 82.9m +Step 09000 | Validation bpb: 0.814414 +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.4409 | centered: 0.2546 | time: 22.89s +Evaluating: jeopardy (10-shot, type: language_modeling)... accuracy: 0.0713 | centered: 0.0713 | time: 4.89s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.4376 | centered: 0.4376 | time: 47.99s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.6263 | centered: 0.5017 | time: 5.98s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.3276 | centered: 0.1035 | time: 2.98s +Evaluating: copa (0-shot, type: multiple_choice)... accuracy: 0.6400 | centered: 0.2800 | time: 0.24s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.2662 | centered: 0.0827 | time: 3.15s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.6877 | centered: 0.3754 | time: 4.45s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... 
accuracy: 0.3540 | centered: 0.1387 | time: 1.15s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.3538 | centered: 0.3538 | time: 11.59s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.4369 | centered: 0.2492 | time: 35.47s +Evaluating: winograd (0-shot, type: schema)... accuracy: 0.6484 | centered: 0.2967 | time: 0.62s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5430 | centered: 0.0860 | time: 2.83s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.1380 | centered: 0.1380 | time: 2.42s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.2261 | centered: 0.0326 | time: 0.79s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.4227 | centered: 0.4227 | time: 3.11s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... accuracy: 0.1429 | centered: 0.1429 | time: 0.51s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0000 | centered: 0.0000 | time: 0.08s +Evaluating: squad (10-shot, type: language_modeling)... accuracy: 0.2577 | centered: 0.2577 | time: 28.74s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.1859 | centered: 0.1859 | time: 18.66s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.6202 | centered: 0.0005 | time: 10.69s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... 
accuracy: 0.2491 | centered: 0.1739 | time: 59.19s +Step 09000 | CORE metric: 0.2084 +step 09000/16704 (53.88%) | loss: 2.640356 | lrm: 0.92 | dt: 628.59ms | tok/sec: 834,072 | mfu: 52.13 | epoch: 2 | total time: 96.71m | eta: 82.9m +step 09001/16704 (53.89%) | loss: 2.652126 | lrm: 0.92 | dt: 650.64ms | tok/sec: 805,806 | mfu: 50.36 | epoch: 2 | total time: 96.72m | eta: 82.9m +step 09002/16704 (53.89%) | loss: 2.642936 | lrm: 0.92 | dt: 640.37ms | tok/sec: 818,728 | mfu: 51.17 | epoch: 2 | total time: 96.73m | eta: 82.9m +step 09003/16704 (53.90%) | loss: 2.638851 | lrm: 0.92 | dt: 647.09ms | tok/sec: 810,224 | mfu: 50.64 | epoch: 2 | total time: 96.74m | eta: 82.8m +step 09004/16704 (53.90%) | loss: 2.650603 | lrm: 0.92 | dt: 643.12ms | tok/sec: 815,223 | mfu: 50.95 | epoch: 2 | total time: 96.75m | eta: 82.8m +step 09005/16704 (53.91%) | loss: 2.653900 | lrm: 0.92 | dt: 641.52ms | tok/sec: 817,253 | mfu: 51.08 | epoch: 2 | total time: 96.76m | eta: 82.8m +step 09006/16704 (53.92%) | loss: 2.668232 | lrm: 0.92 | dt: 642.28ms | tok/sec: 816,295 | mfu: 51.02 | epoch: 2 | total time: 96.77m | eta: 82.8m +step 09007/16704 (53.92%) | loss: 2.670442 | lrm: 0.92 | dt: 643.01ms | tok/sec: 815,366 | mfu: 50.96 | epoch: 2 | total time: 96.78m | eta: 82.8m +step 09008/16704 (53.93%) | loss: 2.656278 | lrm: 0.92 | dt: 641.81ms | tok/sec: 816,891 | mfu: 51.06 | epoch: 2 | total time: 96.79m | eta: 82.8m +step 09009/16704 (53.93%) | loss: 2.652279 | lrm: 0.92 | dt: 645.64ms | tok/sec: 812,040 | mfu: 50.75 | epoch: 2 | total time: 96.80m | eta: 82.8m +step 09010/16704 (53.94%) | loss: 2.669584 | lrm: 0.92 | dt: 646.64ms | tok/sec: 810,789 | mfu: 50.68 | epoch: 2 | total time: 96.81m | eta: 82.8m +step 09011/16704 (53.95%) | loss: 2.666508 | lrm: 0.92 | dt: 640.85ms | tok/sec: 818,107 | mfu: 51.13 | epoch: 2 | total time: 96.82m | eta: 82.8m +step 09012/16704 (53.95%) | loss: 2.658096 | lrm: 0.92 | dt: 642.31ms | tok/sec: 816,258 | mfu: 51.02 | epoch: 2 | total time: 96.84m | 
eta: 82.7m +step 09013/16704 (53.96%) | loss: 2.658442 | lrm: 0.92 | dt: 648.37ms | tok/sec: 808,627 | mfu: 50.54 | epoch: 2 | total time: 96.85m | eta: 82.7m +step 09014/16704 (53.96%) | loss: 2.655483 | lrm: 0.92 | dt: 645.15ms | tok/sec: 812,656 | mfu: 50.79 | epoch: 2 | total time: 96.86m | eta: 82.7m +step 09015/16704 (53.97%) | loss: 2.651330 | lrm: 0.92 | dt: 644.10ms | tok/sec: 813,982 | mfu: 50.88 | epoch: 2 | total time: 96.87m | eta: 82.7m +step 09016/16704 (53.98%) | loss: 2.663461 | lrm: 0.92 | dt: 643.99ms | tok/sec: 814,123 | mfu: 50.88 | epoch: 2 | total time: 96.88m | eta: 82.7m +step 09017/16704 (53.98%) | loss: 2.665877 | lrm: 0.92 | dt: 643.99ms | tok/sec: 814,121 | mfu: 50.88 | epoch: 2 | total time: 96.89m | eta: 82.7m +step 09018/16704 (53.99%) | loss: 2.662614 | lrm: 0.92 | dt: 640.42ms | tok/sec: 818,668 | mfu: 51.17 | epoch: 2 | total time: 96.90m | eta: 82.7m +step 09019/16704 (53.99%) | loss: 2.663961 | lrm: 0.92 | dt: 641.20ms | tok/sec: 817,671 | mfu: 51.11 | epoch: 2 | total time: 96.91m | eta: 82.7m +step 09020/16704 (54.00%) | loss: 2.665891 | lrm: 0.92 | dt: 641.90ms | tok/sec: 816,777 | mfu: 51.05 | epoch: 2 | total time: 96.92m | eta: 82.7m +step 09021/16704 (54.01%) | loss: 2.670181 | lrm: 0.92 | dt: 642.74ms | tok/sec: 815,713 | mfu: 50.98 | epoch: 2 | total time: 96.93m | eta: 82.6m +step 09022/16704 (54.01%) | loss: 2.691989 | lrm: 0.92 | dt: 641.25ms | tok/sec: 817,603 | mfu: 51.10 | epoch: 2 | total time: 96.94m | eta: 82.6m +step 09023/16704 (54.02%) | loss: 2.693531 | lrm: 0.92 | dt: 643.46ms | tok/sec: 814,789 | mfu: 50.93 | epoch: 2 | total time: 96.95m | eta: 82.6m +step 09024/16704 (54.02%) | loss: 2.700727 | lrm: 0.92 | dt: 641.22ms | tok/sec: 817,641 | mfu: 51.10 | epoch: 2 | total time: 96.96m | eta: 82.6m +step 09025/16704 (54.03%) | loss: 2.701763 | lrm: 0.92 | dt: 641.02ms | tok/sec: 817,892 | mfu: 51.12 | epoch: 2 | total time: 96.97m | eta: 82.6m +step 09026/16704 (54.03%) | loss: 2.696116 | lrm: 0.92 | dt: 
640.38ms | tok/sec: 818,715 | mfu: 51.17 | epoch: 2 | total time: 96.98m | eta: 82.6m +step 09027/16704 (54.04%) | loss: 2.696584 | lrm: 0.92 | dt: 640.61ms | tok/sec: 818,419 | mfu: 51.15 | epoch: 2 | total time: 97.00m | eta: 82.6m +step 09028/16704 (54.05%) | loss: 2.697641 | lrm: 0.92 | dt: 641.00ms | tok/sec: 817,927 | mfu: 51.12 | epoch: 2 | total time: 97.01m | eta: 82.6m +step 09029/16704 (54.05%) | loss: 2.685377 | lrm: 0.92 | dt: 641.18ms | tok/sec: 817,698 | mfu: 51.11 | epoch: 2 | total time: 97.02m | eta: 82.6m +step 09030/16704 (54.06%) | loss: 2.683076 | lrm: 0.92 | dt: 642.12ms | tok/sec: 816,489 | mfu: 51.03 | epoch: 2 | total time: 97.03m | eta: 82.5m +step 09031/16704 (54.06%) | loss: 2.686862 | lrm: 0.92 | dt: 641.12ms | tok/sec: 817,772 | mfu: 51.11 | epoch: 2 | total time: 97.04m | eta: 82.5m +step 09032/16704 (54.07%) | loss: 2.677171 | lrm: 0.92 | dt: 641.91ms | tok/sec: 816,767 | mfu: 51.05 | epoch: 2 | total time: 97.05m | eta: 82.5m +step 09033/16704 (54.08%) | loss: 2.677146 | lrm: 0.92 | dt: 642.86ms | tok/sec: 815,552 | mfu: 50.97 | epoch: 2 | total time: 97.06m | eta: 82.5m +step 09034/16704 (54.08%) | loss: 2.677662 | lrm: 0.92 | dt: 641.69ms | tok/sec: 817,038 | mfu: 51.07 | epoch: 2 | total time: 97.07m | eta: 82.5m +step 09035/16704 (54.09%) | loss: 2.682106 | lrm: 0.92 | dt: 641.86ms | tok/sec: 816,823 | mfu: 51.05 | epoch: 2 | total time: 97.08m | eta: 82.5m +step 09036/16704 (54.09%) | loss: 2.672441 | lrm: 0.92 | dt: 642.26ms | tok/sec: 816,316 | mfu: 51.02 | epoch: 2 | total time: 97.09m | eta: 82.5m +step 09037/16704 (54.10%) | loss: 2.665359 | lrm: 0.92 | dt: 643.42ms | tok/sec: 814,846 | mfu: 50.93 | epoch: 2 | total time: 97.10m | eta: 82.5m +step 09038/16704 (54.11%) | loss: 2.672550 | lrm: 0.92 | dt: 640.78ms | tok/sec: 818,206 | mfu: 51.14 | epoch: 2 | total time: 97.11m | eta: 82.5m +step 09039/16704 (54.11%) | loss: 2.666504 | lrm: 0.92 | dt: 642.70ms | tok/sec: 815,759 | mfu: 50.99 | epoch: 2 | total time: 97.12m | 
eta: 82.5m +step 09040/16704 (54.12%) | loss: 2.670515 | lrm: 0.92 | dt: 643.29ms | tok/sec: 815,015 | mfu: 50.94 | epoch: 2 | total time: 97.13m | eta: 82.4m +step 09041/16704 (54.12%) | loss: 2.663937 | lrm: 0.92 | dt: 642.23ms | tok/sec: 816,358 | mfu: 51.02 | epoch: 2 | total time: 97.15m | eta: 82.4m +step 09042/16704 (54.13%) | loss: 2.660833 | lrm: 0.92 | dt: 643.18ms | tok/sec: 815,148 | mfu: 50.95 | epoch: 2 | total time: 97.16m | eta: 82.4m +step 09043/16704 (54.14%) | loss: 2.661304 | lrm: 0.92 | dt: 641.92ms | tok/sec: 816,744 | mfu: 51.05 | epoch: 2 | total time: 97.17m | eta: 82.4m +step 09044/16704 (54.14%) | loss: 2.653692 | lrm: 0.92 | dt: 643.27ms | tok/sec: 815,031 | mfu: 50.94 | epoch: 2 | total time: 97.18m | eta: 82.4m +step 09045/16704 (54.15%) | loss: 2.639930 | lrm: 0.92 | dt: 641.04ms | tok/sec: 817,868 | mfu: 51.12 | epoch: 2 | total time: 97.19m | eta: 82.4m +step 09046/16704 (54.15%) | loss: 2.622670 | lrm: 0.92 | dt: 641.38ms | tok/sec: 817,439 | mfu: 51.09 | epoch: 2 | total time: 97.20m | eta: 82.4m +step 09047/16704 (54.16%) | loss: 2.640585 | lrm: 0.92 | dt: 643.52ms | tok/sec: 814,714 | mfu: 50.92 | epoch: 2 | total time: 97.21m | eta: 82.4m +step 09048/16704 (54.17%) | loss: 2.645317 | lrm: 0.92 | dt: 639.94ms | tok/sec: 819,279 | mfu: 51.21 | epoch: 2 | total time: 97.22m | eta: 82.4m +step 09049/16704 (54.17%) | loss: 2.643744 | lrm: 0.92 | dt: 643.17ms | tok/sec: 815,164 | mfu: 50.95 | epoch: 2 | total time: 97.23m | eta: 82.3m +step 09050/16704 (54.18%) | loss: 2.646120 | lrm: 0.92 | dt: 641.08ms | tok/sec: 817,826 | mfu: 51.12 | epoch: 2 | total time: 97.24m | eta: 82.3m +step 09051/16704 (54.18%) | loss: 2.637434 | lrm: 0.92 | dt: 644.38ms | tok/sec: 813,636 | mfu: 50.85 | epoch: 2 | total time: 97.25m | eta: 82.3m +step 09052/16704 (54.19%) | loss: 2.631511 | lrm: 0.92 | dt: 641.98ms | tok/sec: 816,670 | mfu: 51.04 | epoch: 2 | total time: 97.26m | eta: 82.3m +step 09053/16704 (54.20%) | loss: 2.639834 | lrm: 0.92 | dt: 
643.06ms | tok/sec: 815,307 | mfu: 50.96 | epoch: 2 | total time: 97.27m | eta: 82.3m +step 09054/16704 (54.20%) | loss: 2.648665 | lrm: 0.92 | dt: 642.42ms | tok/sec: 816,107 | mfu: 51.01 | epoch: 2 | total time: 97.28m | eta: 82.3m +step 09055/16704 (54.21%) | loss: 2.666538 | lrm: 0.92 | dt: 643.06ms | tok/sec: 815,303 | mfu: 50.96 | epoch: 2 | total time: 97.30m | eta: 82.3m +step 09056/16704 (54.21%) | loss: 2.649511 | lrm: 0.92 | dt: 642.17ms | tok/sec: 816,433 | mfu: 51.03 | epoch: 2 | total time: 97.31m | eta: 82.3m +step 09057/16704 (54.22%) | loss: 2.654581 | lrm: 0.92 | dt: 643.73ms | tok/sec: 814,449 | mfu: 50.90 | epoch: 2 | total time: 97.32m | eta: 82.3m +step 09058/16704 (54.23%) | loss: 2.663279 | lrm: 0.92 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 97.33m | eta: 82.2m +step 09059/16704 (54.23%) | loss: 2.670057 | lrm: 0.92 | dt: 641.35ms | tok/sec: 817,478 | mfu: 51.09 | epoch: 2 | total time: 97.34m | eta: 82.2m +step 09060/16704 (54.24%) | loss: 2.676284 | lrm: 0.92 | dt: 642.77ms | tok/sec: 815,674 | mfu: 50.98 | epoch: 2 | total time: 97.35m | eta: 82.2m +step 09061/16704 (54.24%) | loss: 2.695411 | lrm: 0.92 | dt: 642.58ms | tok/sec: 815,914 | mfu: 51.00 | epoch: 2 | total time: 97.36m | eta: 82.2m +step 09062/16704 (54.25%) | loss: 2.697565 | lrm: 0.91 | dt: 641.22ms | tok/sec: 817,647 | mfu: 51.10 | epoch: 2 | total time: 97.37m | eta: 82.2m +step 09063/16704 (54.26%) | loss: 2.680426 | lrm: 0.91 | dt: 642.69ms | tok/sec: 815,767 | mfu: 50.99 | epoch: 2 | total time: 97.38m | eta: 82.2m +step 09064/16704 (54.26%) | loss: 2.670323 | lrm: 0.91 | dt: 641.53ms | tok/sec: 817,241 | mfu: 51.08 | epoch: 2 | total time: 97.39m | eta: 82.2m +step 09065/16704 (54.27%) | loss: 2.658167 | lrm: 0.91 | dt: 641.83ms | tok/sec: 816,864 | mfu: 51.06 | epoch: 2 | total time: 97.40m | eta: 82.2m +step 09066/16704 (54.27%) | loss: 2.631787 | lrm: 0.91 | dt: 643.24ms | tok/sec: 815,069 | mfu: 50.94 | epoch: 2 | total time: 97.41m | 
eta: 82.2m +step 09067/16704 (54.28%) | loss: 2.618762 | lrm: 0.91 | dt: 643.34ms | tok/sec: 814,952 | mfu: 50.94 | epoch: 2 | total time: 97.42m | eta: 82.1m +step 09068/16704 (54.29%) | loss: 2.620813 | lrm: 0.91 | dt: 641.45ms | tok/sec: 817,354 | mfu: 51.09 | epoch: 2 | total time: 97.43m | eta: 82.1m +step 09069/16704 (54.29%) | loss: 2.624055 | lrm: 0.91 | dt: 642.59ms | tok/sec: 815,893 | mfu: 50.99 | epoch: 2 | total time: 97.45m | eta: 82.1m +step 09070/16704 (54.30%) | loss: 2.647534 | lrm: 0.91 | dt: 641.74ms | tok/sec: 816,978 | mfu: 51.06 | epoch: 2 | total time: 97.46m | eta: 82.1m +step 09071/16704 (54.30%) | loss: 2.645717 | lrm: 0.91 | dt: 642.14ms | tok/sec: 816,466 | mfu: 51.03 | epoch: 2 | total time: 97.47m | eta: 82.1m +step 09072/16704 (54.31%) | loss: 2.640512 | lrm: 0.91 | dt: 641.95ms | tok/sec: 816,714 | mfu: 51.05 | epoch: 2 | total time: 97.48m | eta: 82.1m +step 09073/16704 (54.32%) | loss: 2.639633 | lrm: 0.91 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 2 | total time: 97.49m | eta: 82.1m +step 09074/16704 (54.32%) | loss: 2.645522 | lrm: 0.91 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 2 | total time: 97.50m | eta: 82.1m +step 09075/16704 (54.33%) | loss: 2.642422 | lrm: 0.91 | dt: 643.30ms | tok/sec: 814,994 | mfu: 50.94 | epoch: 2 | total time: 97.51m | eta: 82.1m +step 09076/16704 (54.33%) | loss: 2.647179 | lrm: 0.91 | dt: 641.63ms | tok/sec: 817,114 | mfu: 51.07 | epoch: 2 | total time: 97.52m | eta: 82.1m +step 09077/16704 (54.34%) | loss: 2.629627 | lrm: 0.91 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 2 | total time: 97.53m | eta: 82.0m +step 09078/16704 (54.35%) | loss: 2.628936 | lrm: 0.91 | dt: 640.63ms | tok/sec: 818,400 | mfu: 51.15 | epoch: 2 | total time: 97.54m | eta: 82.0m +step 09079/16704 (54.35%) | loss: 2.633548 | lrm: 0.91 | dt: 645.53ms | tok/sec: 812,184 | mfu: 50.76 | epoch: 2 | total time: 97.55m | eta: 82.0m +step 09080/16704 (54.36%) | loss: 2.615694 | lrm: 0.91 | dt: 
641.56ms | tok/sec: 817,204 | mfu: 51.08 | epoch: 2 | total time: 97.56m | eta: 82.0m +step 09081/16704 (54.36%) | loss: 2.626072 | lrm: 0.91 | dt: 643.65ms | tok/sec: 814,553 | mfu: 50.91 | epoch: 2 | total time: 97.57m | eta: 82.0m +step 09082/16704 (54.37%) | loss: 2.635487 | lrm: 0.91 | dt: 643.44ms | tok/sec: 814,822 | mfu: 50.93 | epoch: 2 | total time: 97.58m | eta: 82.0m +step 09083/16704 (54.38%) | loss: 2.642604 | lrm: 0.91 | dt: 642.20ms | tok/sec: 816,398 | mfu: 51.03 | epoch: 2 | total time: 97.60m | eta: 82.0m +step 09084/16704 (54.38%) | loss: 2.651896 | lrm: 0.91 | dt: 641.85ms | tok/sec: 816,844 | mfu: 51.05 | epoch: 2 | total time: 97.61m | eta: 82.0m +step 09085/16704 (54.39%) | loss: 2.644916 | lrm: 0.91 | dt: 641.35ms | tok/sec: 817,476 | mfu: 51.09 | epoch: 2 | total time: 97.62m | eta: 82.0m +step 09086/16704 (54.39%) | loss: 2.650243 | lrm: 0.91 | dt: 643.08ms | tok/sec: 815,281 | mfu: 50.96 | epoch: 2 | total time: 97.63m | eta: 81.9m +step 09087/16704 (54.40%) | loss: 2.655798 | lrm: 0.91 | dt: 643.41ms | tok/sec: 814,862 | mfu: 50.93 | epoch: 2 | total time: 97.64m | eta: 81.9m +step 09088/16704 (54.41%) | loss: 2.666695 | lrm: 0.91 | dt: 640.71ms | tok/sec: 818,294 | mfu: 51.14 | epoch: 2 | total time: 97.65m | eta: 81.9m +step 09089/16704 (54.41%) | loss: 2.655966 | lrm: 0.91 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 2 | total time: 97.66m | eta: 81.9m +step 09090/16704 (54.42%) | loss: 2.657112 | lrm: 0.91 | dt: 639.67ms | tok/sec: 819,616 | mfu: 51.23 | epoch: 2 | total time: 97.67m | eta: 81.9m +step 09091/16704 (54.42%) | loss: 2.661214 | lrm: 0.91 | dt: 645.69ms | tok/sec: 811,975 | mfu: 50.75 | epoch: 2 | total time: 97.68m | eta: 81.9m +step 09092/16704 (54.43%) | loss: 2.660352 | lrm: 0.91 | dt: 643.08ms | tok/sec: 815,278 | mfu: 50.96 | epoch: 2 | total time: 97.69m | eta: 81.9m +step 09093/16704 (54.44%) | loss: 2.648377 | lrm: 0.91 | dt: 641.98ms | tok/sec: 816,671 | mfu: 51.04 | epoch: 2 | total time: 97.70m | 
eta: 81.9m +step 09094/16704 (54.44%) | loss: 2.648862 | lrm: 0.91 | dt: 644.16ms | tok/sec: 813,914 | mfu: 50.87 | epoch: 2 | total time: 97.71m | eta: 81.9m +step 09095/16704 (54.45%) | loss: 2.648524 | lrm: 0.91 | dt: 641.16ms | tok/sec: 817,722 | mfu: 51.11 | epoch: 2 | total time: 97.72m | eta: 81.8m +step 09096/16704 (54.45%) | loss: 2.635680 | lrm: 0.91 | dt: 645.70ms | tok/sec: 811,970 | mfu: 50.75 | epoch: 2 | total time: 97.73m | eta: 81.8m +step 09097/16704 (54.46%) | loss: 2.641121 | lrm: 0.91 | dt: 642.48ms | tok/sec: 816,043 | mfu: 51.00 | epoch: 2 | total time: 97.75m | eta: 81.8m +step 09098/16704 (54.47%) | loss: 2.646531 | lrm: 0.91 | dt: 640.02ms | tok/sec: 819,175 | mfu: 51.20 | epoch: 2 | total time: 97.76m | eta: 81.8m +step 09099/16704 (54.47%) | loss: 2.655881 | lrm: 0.91 | dt: 642.12ms | tok/sec: 816,499 | mfu: 51.03 | epoch: 2 | total time: 97.77m | eta: 81.8m +step 09100/16704 (54.48%) | loss: 2.644954 | lrm: 0.91 | dt: 642.16ms | tok/sec: 816,443 | mfu: 51.03 | epoch: 2 | total time: 97.78m | eta: 81.8m +step 09101/16704 (54.48%) | loss: 2.657452 | lrm: 0.91 | dt: 641.47ms | tok/sec: 817,317 | mfu: 51.08 | epoch: 2 | total time: 97.79m | eta: 81.8m +step 09102/16704 (54.49%) | loss: 2.623574 | lrm: 0.91 | dt: 642.01ms | tok/sec: 816,639 | mfu: 51.04 | epoch: 2 | total time: 97.80m | eta: 81.8m +step 09103/16704 (54.50%) | loss: 2.628538 | lrm: 0.91 | dt: 643.12ms | tok/sec: 815,229 | mfu: 50.95 | epoch: 2 | total time: 97.81m | eta: 81.8m +step 09104/16704 (54.50%) | loss: 2.636654 | lrm: 0.91 | dt: 641.89ms | tok/sec: 816,785 | mfu: 51.05 | epoch: 2 | total time: 97.82m | eta: 81.7m +step 09105/16704 (54.51%) | loss: 2.627935 | lrm: 0.91 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 2 | total time: 97.83m | eta: 81.7m +step 09106/16704 (54.51%) | loss: 2.639778 | lrm: 0.91 | dt: 640.31ms | tok/sec: 818,803 | mfu: 51.18 | epoch: 2 | total time: 97.84m | eta: 81.7m +step 09107/16704 (54.52%) | loss: 2.628635 | lrm: 0.91 | dt: 
644.22ms | tok/sec: 813,834 | mfu: 50.87 | epoch: 2 | total time: 97.85m | eta: 81.7m +step 09108/16704 (54.53%) | loss: 2.627162 | lrm: 0.91 | dt: 644.02ms | tok/sec: 814,086 | mfu: 50.88 | epoch: 2 | total time: 97.86m | eta: 81.7m +step 09109/16704 (54.53%) | loss: 2.626481 | lrm: 0.91 | dt: 641.88ms | tok/sec: 816,795 | mfu: 51.05 | epoch: 2 | total time: 97.87m | eta: 81.7m +step 09110/16704 (54.54%) | loss: 2.639893 | lrm: 0.91 | dt: 644.78ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 2 | total time: 97.88m | eta: 81.7m +step 09111/16704 (54.54%) | loss: 2.636580 | lrm: 0.91 | dt: 642.12ms | tok/sec: 816,489 | mfu: 51.03 | epoch: 2 | total time: 97.90m | eta: 81.7m +step 09112/16704 (54.55%) | loss: 2.638451 | lrm: 0.91 | dt: 641.16ms | tok/sec: 817,723 | mfu: 51.11 | epoch: 2 | total time: 97.91m | eta: 81.7m +step 09113/16704 (54.56%) | loss: 2.643354 | lrm: 0.91 | dt: 642.87ms | tok/sec: 815,543 | mfu: 50.97 | epoch: 2 | total time: 97.92m | eta: 81.7m +step 09114/16704 (54.56%) | loss: 2.657118 | lrm: 0.91 | dt: 641.77ms | tok/sec: 816,938 | mfu: 51.06 | epoch: 2 | total time: 97.93m | eta: 81.6m +step 09115/16704 (54.57%) | loss: 2.655852 | lrm: 0.91 | dt: 645.57ms | tok/sec: 812,127 | mfu: 50.76 | epoch: 2 | total time: 97.94m | eta: 81.6m +step 09116/16704 (54.57%) | loss: 2.660517 | lrm: 0.91 | dt: 641.65ms | tok/sec: 817,098 | mfu: 51.07 | epoch: 2 | total time: 97.95m | eta: 81.6m +step 09117/16704 (54.58%) | loss: 2.653562 | lrm: 0.91 | dt: 643.72ms | tok/sec: 814,469 | mfu: 50.91 | epoch: 2 | total time: 97.96m | eta: 81.6m +step 09118/16704 (54.59%) | loss: 2.653780 | lrm: 0.91 | dt: 644.15ms | tok/sec: 813,927 | mfu: 50.87 | epoch: 2 | total time: 97.97m | eta: 81.6m +step 09119/16704 (54.59%) | loss: 2.650415 | lrm: 0.91 | dt: 643.94ms | tok/sec: 814,184 | mfu: 50.89 | epoch: 2 | total time: 97.98m | eta: 81.6m +step 09120/16704 (54.60%) | loss: 2.654667 | lrm: 0.91 | dt: 642.24ms | tok/sec: 816,336 | mfu: 51.02 | epoch: 2 | total time: 97.99m | 
eta: 81.6m +step 09121/16704 (54.60%) | loss: 2.649741 | lrm: 0.91 | dt: 642.90ms | tok/sec: 815,508 | mfu: 50.97 | epoch: 2 | total time: 98.00m | eta: 81.6m +step 09122/16704 (54.61%) | loss: 2.645415 | lrm: 0.91 | dt: 643.41ms | tok/sec: 814,860 | mfu: 50.93 | epoch: 2 | total time: 98.01m | eta: 81.6m +step 09123/16704 (54.62%) | loss: 2.648214 | lrm: 0.91 | dt: 641.72ms | tok/sec: 817,003 | mfu: 51.06 | epoch: 2 | total time: 98.02m | eta: 81.5m +step 09124/16704 (54.62%) | loss: 2.657749 | lrm: 0.91 | dt: 644.82ms | tok/sec: 813,074 | mfu: 50.82 | epoch: 2 | total time: 98.03m | eta: 81.5m +step 09125/16704 (54.63%) | loss: 2.642895 | lrm: 0.91 | dt: 641.16ms | tok/sec: 817,713 | mfu: 51.11 | epoch: 2 | total time: 98.05m | eta: 81.5m +step 09126/16704 (54.63%) | loss: 2.647275 | lrm: 0.91 | dt: 642.76ms | tok/sec: 815,679 | mfu: 50.98 | epoch: 2 | total time: 98.06m | eta: 81.5m +step 09127/16704 (54.64%) | loss: 2.651931 | lrm: 0.91 | dt: 641.83ms | tok/sec: 816,861 | mfu: 51.05 | epoch: 2 | total time: 98.07m | eta: 81.5m +step 09128/16704 (54.65%) | loss: 2.653664 | lrm: 0.91 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 2 | total time: 98.08m | eta: 81.5m +step 09129/16704 (54.65%) | loss: 2.645606 | lrm: 0.91 | dt: 643.97ms | tok/sec: 814,144 | mfu: 50.89 | epoch: 2 | total time: 98.09m | eta: 81.5m +step 09130/16704 (54.66%) | loss: 2.651927 | lrm: 0.91 | dt: 642.22ms | tok/sec: 816,362 | mfu: 51.02 | epoch: 2 | total time: 98.10m | eta: 81.5m +step 09131/16704 (54.66%) | loss: 2.661276 | lrm: 0.91 | dt: 644.24ms | tok/sec: 813,812 | mfu: 50.86 | epoch: 2 | total time: 98.11m | eta: 81.5m +step 09132/16704 (54.67%) | loss: 2.652724 | lrm: 0.91 | dt: 641.83ms | tok/sec: 816,859 | mfu: 51.05 | epoch: 2 | total time: 98.12m | eta: 81.4m +step 09133/16704 (54.68%) | loss: 2.649248 | lrm: 0.91 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 2 | total time: 98.13m | eta: 81.4m +step 09134/16704 (54.68%) | loss: 2.643488 | lrm: 0.91 | dt: 
644.51ms | tok/sec: 813,463 | mfu: 50.84 | epoch: 2 | total time: 98.14m | eta: 81.4m +step 09135/16704 (54.69%) | loss: 2.644365 | lrm: 0.91 | dt: 642.89ms | tok/sec: 815,519 | mfu: 50.97 | epoch: 2 | total time: 98.15m | eta: 81.4m +step 09136/16704 (54.69%) | loss: 2.647964 | lrm: 0.91 | dt: 644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 2 | total time: 98.16m | eta: 81.4m +step 09137/16704 (54.70%) | loss: 2.659071 | lrm: 0.91 | dt: 644.03ms | tok/sec: 814,071 | mfu: 50.88 | epoch: 2 | total time: 98.17m | eta: 81.4m +step 09138/16704 (54.71%) | loss: 2.663003 | lrm: 0.91 | dt: 642.96ms | tok/sec: 815,429 | mfu: 50.97 | epoch: 2 | total time: 98.18m | eta: 81.4m +step 09139/16704 (54.71%) | loss: 2.652952 | lrm: 0.91 | dt: 644.26ms | tok/sec: 813,784 | mfu: 50.86 | epoch: 2 | total time: 98.20m | eta: 81.4m +step 09140/16704 (54.72%) | loss: 2.655889 | lrm: 0.91 | dt: 645.26ms | tok/sec: 812,519 | mfu: 50.78 | epoch: 2 | total time: 98.21m | eta: 81.4m +step 09141/16704 (54.72%) | loss: 2.652038 | lrm: 0.91 | dt: 643.10ms | tok/sec: 815,251 | mfu: 50.95 | epoch: 2 | total time: 98.22m | eta: 81.4m +step 09142/16704 (54.73%) | loss: 2.660763 | lrm: 0.91 | dt: 640.58ms | tok/sec: 818,459 | mfu: 51.15 | epoch: 2 | total time: 98.23m | eta: 81.3m +step 09143/16704 (54.74%) | loss: 2.658473 | lrm: 0.91 | dt: 645.20ms | tok/sec: 812,592 | mfu: 50.79 | epoch: 2 | total time: 98.24m | eta: 81.3m +step 09144/16704 (54.74%) | loss: 2.657566 | lrm: 0.91 | dt: 642.78ms | tok/sec: 815,655 | mfu: 50.98 | epoch: 2 | total time: 98.25m | eta: 81.3m +step 09145/16704 (54.75%) | loss: 2.655730 | lrm: 0.91 | dt: 644.24ms | tok/sec: 813,805 | mfu: 50.86 | epoch: 2 | total time: 98.26m | eta: 81.3m +step 09146/16704 (54.75%) | loss: 2.657273 | lrm: 0.90 | dt: 643.92ms | tok/sec: 814,210 | mfu: 50.89 | epoch: 2 | total time: 98.27m | eta: 81.3m +step 09147/16704 (54.76%) | loss: 2.669887 | lrm: 0.90 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 2 | total time: 98.28m | 
eta: 81.3m +step 09148/16704 (54.77%) | loss: 2.663924 | lrm: 0.90 | dt: 639.84ms | tok/sec: 819,398 | mfu: 51.21 | epoch: 2 | total time: 98.29m | eta: 81.3m +step 09149/16704 (54.77%) | loss: 2.660793 | lrm: 0.90 | dt: 642.23ms | tok/sec: 816,356 | mfu: 51.02 | epoch: 2 | total time: 98.30m | eta: 81.3m +step 09150/16704 (54.78%) | loss: 2.651782 | lrm: 0.90 | dt: 643.45ms | tok/sec: 814,809 | mfu: 50.93 | epoch: 2 | total time: 98.31m | eta: 81.3m +step 09151/16704 (54.78%) | loss: 2.650003 | lrm: 0.90 | dt: 641.77ms | tok/sec: 816,944 | mfu: 51.06 | epoch: 2 | total time: 98.32m | eta: 81.2m +step 09152/16704 (54.79%) | loss: 2.656348 | lrm: 0.90 | dt: 645.93ms | tok/sec: 811,680 | mfu: 50.73 | epoch: 2 | total time: 98.33m | eta: 81.2m +step 09153/16704 (54.80%) | loss: 2.646248 | lrm: 0.90 | dt: 642.51ms | tok/sec: 816,001 | mfu: 51.00 | epoch: 2 | total time: 98.35m | eta: 81.2m +step 09154/16704 (54.80%) | loss: 2.648976 | lrm: 0.90 | dt: 642.80ms | tok/sec: 815,625 | mfu: 50.98 | epoch: 2 | total time: 98.36m | eta: 81.2m +step 09155/16704 (54.81%) | loss: 2.648384 | lrm: 0.90 | dt: 642.21ms | tok/sec: 816,380 | mfu: 51.02 | epoch: 2 | total time: 98.37m | eta: 81.2m +step 09156/16704 (54.81%) | loss: 2.660009 | lrm: 0.90 | dt: 642.99ms | tok/sec: 815,393 | mfu: 50.96 | epoch: 2 | total time: 98.38m | eta: 81.2m +step 09157/16704 (54.82%) | loss: 2.680275 | lrm: 0.90 | dt: 643.54ms | tok/sec: 814,697 | mfu: 50.92 | epoch: 2 | total time: 98.39m | eta: 81.2m +step 09158/16704 (54.83%) | loss: 2.669860 | lrm: 0.90 | dt: 642.53ms | tok/sec: 815,978 | mfu: 51.00 | epoch: 2 | total time: 98.40m | eta: 81.2m +step 09159/16704 (54.83%) | loss: 2.669987 | lrm: 0.90 | dt: 642.81ms | tok/sec: 815,618 | mfu: 50.98 | epoch: 2 | total time: 98.41m | eta: 81.2m +step 09160/16704 (54.84%) | loss: 2.673077 | lrm: 0.90 | dt: 643.83ms | tok/sec: 814,326 | mfu: 50.90 | epoch: 2 | total time: 98.42m | eta: 81.1m +step 09161/16704 (54.84%) | loss: 2.675248 | lrm: 0.90 | dt: 
642.49ms | tok/sec: 816,023 | mfu: 51.00 | epoch: 2 | total time: 98.43m | eta: 81.1m +step 09162/16704 (54.85%) | loss: 2.678650 | lrm: 0.90 | dt: 644.02ms | tok/sec: 814,081 | mfu: 50.88 | epoch: 2 | total time: 98.44m | eta: 81.1m +step 09163/16704 (54.86%) | loss: 2.690201 | lrm: 0.90 | dt: 644.66ms | tok/sec: 813,275 | mfu: 50.83 | epoch: 2 | total time: 98.45m | eta: 81.1m +step 09164/16704 (54.86%) | loss: 2.693873 | lrm: 0.90 | dt: 643.34ms | tok/sec: 814,944 | mfu: 50.94 | epoch: 2 | total time: 98.46m | eta: 81.1m +step 09165/16704 (54.87%) | loss: 2.686084 | lrm: 0.90 | dt: 644.10ms | tok/sec: 813,983 | mfu: 50.88 | epoch: 2 | total time: 98.47m | eta: 81.1m +step 09166/16704 (54.87%) | loss: 2.692320 | lrm: 0.90 | dt: 643.74ms | tok/sec: 814,446 | mfu: 50.90 | epoch: 2 | total time: 98.48m | eta: 81.1m +step 09167/16704 (54.88%) | loss: 2.687525 | lrm: 0.90 | dt: 643.53ms | tok/sec: 814,700 | mfu: 50.92 | epoch: 2 | total time: 98.50m | eta: 81.1m +step 09168/16704 (54.89%) | loss: 2.682830 | lrm: 0.90 | dt: 642.92ms | tok/sec: 815,476 | mfu: 50.97 | epoch: 2 | total time: 98.51m | eta: 81.1m +step 09169/16704 (54.89%) | loss: 2.666925 | lrm: 0.90 | dt: 646.00ms | tok/sec: 811,597 | mfu: 50.73 | epoch: 2 | total time: 98.52m | eta: 81.0m +step 09170/16704 (54.90%) | loss: 2.681311 | lrm: 0.90 | dt: 641.47ms | tok/sec: 817,323 | mfu: 51.08 | epoch: 2 | total time: 98.53m | eta: 81.0m +step 09171/16704 (54.90%) | loss: 2.670916 | lrm: 0.90 | dt: 643.74ms | tok/sec: 814,439 | mfu: 50.90 | epoch: 2 | total time: 98.54m | eta: 81.0m +step 09172/16704 (54.91%) | loss: 2.685452 | lrm: 0.90 | dt: 645.12ms | tok/sec: 812,695 | mfu: 50.79 | epoch: 2 | total time: 98.55m | eta: 81.0m +step 09173/16704 (54.91%) | loss: 2.685671 | lrm: 0.90 | dt: 641.78ms | tok/sec: 816,931 | mfu: 51.06 | epoch: 2 | total time: 98.56m | eta: 81.0m +step 09174/16704 (54.92%) | loss: 2.672154 | lrm: 0.90 | dt: 643.17ms | tok/sec: 815,161 | mfu: 50.95 | epoch: 2 | total time: 98.57m | 
eta: 81.0m +step 09175/16704 (54.93%) | loss: 2.672619 | lrm: 0.90 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 2 | total time: 98.58m | eta: 81.0m +step 09176/16704 (54.93%) | loss: 2.695290 | lrm: 0.90 | dt: 643.54ms | tok/sec: 814,695 | mfu: 50.92 | epoch: 2 | total time: 98.59m | eta: 81.0m +step 09177/16704 (54.94%) | loss: 2.696015 | lrm: 0.90 | dt: 643.50ms | tok/sec: 814,738 | mfu: 50.92 | epoch: 2 | total time: 98.60m | eta: 81.0m +step 09178/16704 (54.94%) | loss: 2.689088 | lrm: 0.90 | dt: 642.09ms | tok/sec: 816,534 | mfu: 51.03 | epoch: 2 | total time: 98.61m | eta: 81.0m +step 09179/16704 (54.95%) | loss: 2.693987 | lrm: 0.90 | dt: 643.24ms | tok/sec: 815,074 | mfu: 50.94 | epoch: 2 | total time: 98.62m | eta: 80.9m +step 09180/16704 (54.96%) | loss: 2.701212 | lrm: 0.90 | dt: 643.54ms | tok/sec: 814,689 | mfu: 50.92 | epoch: 2 | total time: 98.63m | eta: 80.9m +step 09181/16704 (54.96%) | loss: 2.681828 | lrm: 0.90 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 2 | total time: 98.65m | eta: 80.9m +step 09182/16704 (54.97%) | loss: 2.680170 | lrm: 0.90 | dt: 644.65ms | tok/sec: 813,291 | mfu: 50.83 | epoch: 2 | total time: 98.66m | eta: 80.9m +step 09183/16704 (54.97%) | loss: 2.687481 | lrm: 0.90 | dt: 642.65ms | tok/sec: 815,827 | mfu: 50.99 | epoch: 2 | total time: 98.67m | eta: 80.9m +step 09184/16704 (54.98%) | loss: 2.677033 | lrm: 0.90 | dt: 643.16ms | tok/sec: 815,180 | mfu: 50.95 | epoch: 2 | total time: 98.68m | eta: 80.9m +step 09185/16704 (54.99%) | loss: 2.690217 | lrm: 0.90 | dt: 642.30ms | tok/sec: 816,269 | mfu: 51.02 | epoch: 2 | total time: 98.69m | eta: 80.9m +step 09186/16704 (54.99%) | loss: 2.698409 | lrm: 0.90 | dt: 645.92ms | tok/sec: 811,686 | mfu: 50.73 | epoch: 2 | total time: 98.70m | eta: 80.9m +step 09187/16704 (55.00%) | loss: 2.689569 | lrm: 0.90 | dt: 641.60ms | tok/sec: 817,153 | mfu: 51.07 | epoch: 2 | total time: 98.71m | eta: 80.9m +step 09188/16704 (55.00%) | loss: 2.679235 | lrm: 0.90 | dt: 
644.40ms | tok/sec: 813,608 | mfu: 50.85 | epoch: 2 | total time: 98.72m | eta: 80.8m +step 09189/16704 (55.01%) | loss: 2.684950 | lrm: 0.90 | dt: 642.57ms | tok/sec: 815,918 | mfu: 51.00 | epoch: 2 | total time: 98.73m | eta: 80.8m +step 09190/16704 (55.02%) | loss: 2.681250 | lrm: 0.90 | dt: 644.74ms | tok/sec: 813,177 | mfu: 50.82 | epoch: 2 | total time: 98.74m | eta: 80.8m +step 09191/16704 (55.02%) | loss: 2.669137 | lrm: 0.90 | dt: 641.76ms | tok/sec: 816,952 | mfu: 51.06 | epoch: 2 | total time: 98.75m | eta: 80.8m +step 09192/16704 (55.03%) | loss: 2.664166 | lrm: 0.90 | dt: 643.20ms | tok/sec: 815,128 | mfu: 50.95 | epoch: 2 | total time: 98.76m | eta: 80.8m +step 09193/16704 (55.03%) | loss: 2.667220 | lrm: 0.90 | dt: 645.55ms | tok/sec: 812,152 | mfu: 50.76 | epoch: 2 | total time: 98.77m | eta: 80.8m +step 09194/16704 (55.04%) | loss: 2.658119 | lrm: 0.90 | dt: 641.87ms | tok/sec: 816,812 | mfu: 51.05 | epoch: 2 | total time: 98.79m | eta: 80.8m +step 09195/16704 (55.05%) | loss: 2.660215 | lrm: 0.90 | dt: 643.50ms | tok/sec: 814,749 | mfu: 50.92 | epoch: 2 | total time: 98.80m | eta: 80.8m +step 09196/16704 (55.05%) | loss: 2.662940 | lrm: 0.90 | dt: 641.77ms | tok/sec: 816,945 | mfu: 51.06 | epoch: 2 | total time: 98.81m | eta: 80.8m +step 09197/16704 (55.06%) | loss: 2.664577 | lrm: 0.90 | dt: 642.46ms | tok/sec: 816,066 | mfu: 51.01 | epoch: 2 | total time: 98.82m | eta: 80.7m +step 09198/16704 (55.06%) | loss: 2.673510 | lrm: 0.90 | dt: 643.18ms | tok/sec: 815,151 | mfu: 50.95 | epoch: 2 | total time: 98.83m | eta: 80.7m +step 09199/16704 (55.07%) | loss: 2.684871 | lrm: 0.90 | dt: 642.98ms | tok/sec: 815,404 | mfu: 50.96 | epoch: 2 | total time: 98.84m | eta: 80.7m +step 09200/16704 (55.08%) | loss: 2.665546 | lrm: 0.90 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 2 | total time: 98.85m | eta: 80.7m +step 09201/16704 (55.08%) | loss: 2.666540 | lrm: 0.90 | dt: 642.75ms | tok/sec: 815,690 | mfu: 50.98 | epoch: 2 | total time: 98.86m | 
eta: 80.7m +step 09202/16704 (55.09%) | loss: 2.661024 | lrm: 0.90 | dt: 642.12ms | tok/sec: 816,495 | mfu: 51.03 | epoch: 2 | total time: 98.87m | eta: 80.7m +step 09203/16704 (55.09%) | loss: 2.673637 | lrm: 0.90 | dt: 642.53ms | tok/sec: 815,973 | mfu: 51.00 | epoch: 2 | total time: 98.88m | eta: 80.7m +step 09204/16704 (55.10%) | loss: 2.669731 | lrm: 0.90 | dt: 643.45ms | tok/sec: 814,801 | mfu: 50.93 | epoch: 2 | total time: 98.89m | eta: 80.7m +step 09205/16704 (55.11%) | loss: 2.658201 | lrm: 0.90 | dt: 641.94ms | tok/sec: 816,730 | mfu: 51.05 | epoch: 2 | total time: 98.90m | eta: 80.7m +step 09206/16704 (55.11%) | loss: 2.634565 | lrm: 0.90 | dt: 642.67ms | tok/sec: 815,798 | mfu: 50.99 | epoch: 2 | total time: 98.91m | eta: 80.6m +step 09207/16704 (55.12%) | loss: 2.658776 | lrm: 0.90 | dt: 643.18ms | tok/sec: 815,153 | mfu: 50.95 | epoch: 2 | total time: 98.92m | eta: 80.6m +step 09208/16704 (55.12%) | loss: 2.669011 | lrm: 0.90 | dt: 641.79ms | tok/sec: 816,913 | mfu: 51.06 | epoch: 2 | total time: 98.94m | eta: 80.6m +step 09209/16704 (55.13%) | loss: 2.682027 | lrm: 0.90 | dt: 642.65ms | tok/sec: 815,827 | mfu: 50.99 | epoch: 2 | total time: 98.95m | eta: 80.6m +step 09210/16704 (55.14%) | loss: 2.677318 | lrm: 0.90 | dt: 644.27ms | tok/sec: 813,770 | mfu: 50.86 | epoch: 2 | total time: 98.96m | eta: 80.6m +step 09211/16704 (55.14%) | loss: 2.684550 | lrm: 0.90 | dt: 645.21ms | tok/sec: 812,587 | mfu: 50.79 | epoch: 2 | total time: 98.97m | eta: 80.6m +step 09212/16704 (55.15%) | loss: 2.669451 | lrm: 0.90 | dt: 643.36ms | tok/sec: 814,927 | mfu: 50.93 | epoch: 2 | total time: 98.98m | eta: 80.6m +step 09213/16704 (55.15%) | loss: 2.686288 | lrm: 0.90 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 98.99m | eta: 80.6m +step 09214/16704 (55.16%) | loss: 2.671514 | lrm: 0.90 | dt: 644.13ms | tok/sec: 813,944 | mfu: 50.87 | epoch: 2 | total time: 99.00m | eta: 80.6m +step 09215/16704 (55.17%) | loss: 2.680795 | lrm: 0.90 | dt: 
641.73ms | tok/sec: 816,986 | mfu: 51.06 | epoch: 2 | total time: 99.01m | eta: 80.6m +step 09216/16704 (55.17%) | loss: 2.685956 | lrm: 0.90 | dt: 643.69ms | tok/sec: 814,509 | mfu: 50.91 | epoch: 2 | total time: 99.02m | eta: 80.5m +step 09217/16704 (55.18%) | loss: 2.679490 | lrm: 0.90 | dt: 645.01ms | tok/sec: 812,836 | mfu: 50.80 | epoch: 2 | total time: 99.03m | eta: 80.5m +step 09218/16704 (55.18%) | loss: 2.675943 | lrm: 0.90 | dt: 640.44ms | tok/sec: 818,635 | mfu: 51.17 | epoch: 2 | total time: 99.04m | eta: 80.5m +step 09219/16704 (55.19%) | loss: 2.653057 | lrm: 0.90 | dt: 643.09ms | tok/sec: 815,266 | mfu: 50.96 | epoch: 2 | total time: 99.05m | eta: 80.5m +step 09220/16704 (55.20%) | loss: 2.649511 | lrm: 0.90 | dt: 641.04ms | tok/sec: 817,874 | mfu: 51.12 | epoch: 2 | total time: 99.06m | eta: 80.5m +step 09221/16704 (55.20%) | loss: 2.656082 | lrm: 0.90 | dt: 642.72ms | tok/sec: 815,737 | mfu: 50.98 | epoch: 2 | total time: 99.07m | eta: 80.5m +step 09222/16704 (55.21%) | loss: 2.653953 | lrm: 0.90 | dt: 644.44ms | tok/sec: 813,550 | mfu: 50.85 | epoch: 2 | total time: 99.09m | eta: 80.5m +step 09223/16704 (55.21%) | loss: 2.638766 | lrm: 0.90 | dt: 642.69ms | tok/sec: 815,774 | mfu: 50.99 | epoch: 2 | total time: 99.10m | eta: 80.5m +step 09224/16704 (55.22%) | loss: 2.644698 | lrm: 0.90 | dt: 644.86ms | tok/sec: 813,023 | mfu: 50.82 | epoch: 2 | total time: 99.11m | eta: 80.5m +step 09225/16704 (55.23%) | loss: 2.642908 | lrm: 0.90 | dt: 643.30ms | tok/sec: 814,996 | mfu: 50.94 | epoch: 2 | total time: 99.12m | eta: 80.4m +step 09226/16704 (55.23%) | loss: 2.633370 | lrm: 0.90 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 2 | total time: 99.13m | eta: 80.4m +step 09227/16704 (55.24%) | loss: 2.631088 | lrm: 0.90 | dt: 642.50ms | tok/sec: 816,006 | mfu: 51.00 | epoch: 2 | total time: 99.14m | eta: 80.4m +step 09228/16704 (55.24%) | loss: 2.641262 | lrm: 0.90 | dt: 641.98ms | tok/sec: 816,667 | mfu: 51.04 | epoch: 2 | total time: 99.15m | 
eta: 80.4m +step 09229/16704 (55.25%) | loss: 2.642539 | lrm: 0.89 | dt: 644.73ms | tok/sec: 813,193 | mfu: 50.83 | epoch: 2 | total time: 99.16m | eta: 80.4m +step 09230/16704 (55.26%) | loss: 2.645087 | lrm: 0.89 | dt: 642.26ms | tok/sec: 816,322 | mfu: 51.02 | epoch: 2 | total time: 99.17m | eta: 80.4m +step 09231/16704 (55.26%) | loss: 2.640966 | lrm: 0.89 | dt: 644.35ms | tok/sec: 813,663 | mfu: 50.86 | epoch: 2 | total time: 99.18m | eta: 80.4m +step 09232/16704 (55.27%) | loss: 2.650957 | lrm: 0.89 | dt: 643.10ms | tok/sec: 815,253 | mfu: 50.95 | epoch: 2 | total time: 99.19m | eta: 80.4m +step 09233/16704 (55.27%) | loss: 2.659502 | lrm: 0.89 | dt: 643.96ms | tok/sec: 814,161 | mfu: 50.89 | epoch: 2 | total time: 99.20m | eta: 80.4m +step 09234/16704 (55.28%) | loss: 2.661725 | lrm: 0.89 | dt: 643.87ms | tok/sec: 814,272 | mfu: 50.89 | epoch: 2 | total time: 99.21m | eta: 80.3m +step 09235/16704 (55.29%) | loss: 2.661297 | lrm: 0.89 | dt: 642.22ms | tok/sec: 816,371 | mfu: 51.02 | epoch: 2 | total time: 99.22m | eta: 80.3m +step 09236/16704 (55.29%) | loss: 2.669428 | lrm: 0.89 | dt: 642.56ms | tok/sec: 815,941 | mfu: 51.00 | epoch: 2 | total time: 99.24m | eta: 80.3m +step 09237/16704 (55.30%) | loss: 2.664969 | lrm: 0.89 | dt: 643.01ms | tok/sec: 815,362 | mfu: 50.96 | epoch: 2 | total time: 99.25m | eta: 80.3m +step 09238/16704 (55.30%) | loss: 2.659304 | lrm: 0.89 | dt: 640.03ms | tok/sec: 819,155 | mfu: 51.20 | epoch: 2 | total time: 99.26m | eta: 80.3m +step 09239/16704 (55.31%) | loss: 2.646114 | lrm: 0.89 | dt: 643.35ms | tok/sec: 814,936 | mfu: 50.93 | epoch: 2 | total time: 99.27m | eta: 80.3m +step 09240/16704 (55.32%) | loss: 2.641613 | lrm: 0.89 | dt: 643.04ms | tok/sec: 815,330 | mfu: 50.96 | epoch: 2 | total time: 99.28m | eta: 80.3m +step 09241/16704 (55.32%) | loss: 2.643457 | lrm: 0.89 | dt: 643.10ms | tok/sec: 815,247 | mfu: 50.95 | epoch: 2 | total time: 99.29m | eta: 80.3m +step 09242/16704 (55.33%) | loss: 2.641440 | lrm: 0.89 | dt: 
643.68ms | tok/sec: 814,519 | mfu: 50.91 | epoch: 2 | total time: 99.30m | eta: 80.3m +step 09243/16704 (55.33%) | loss: 2.656240 | lrm: 0.89 | dt: 643.53ms | tok/sec: 814,707 | mfu: 50.92 | epoch: 2 | total time: 99.31m | eta: 80.3m +step 09244/16704 (55.34%) | loss: 2.654002 | lrm: 0.89 | dt: 641.23ms | tok/sec: 817,625 | mfu: 51.10 | epoch: 2 | total time: 99.32m | eta: 80.2m +step 09245/16704 (55.35%) | loss: 2.655549 | lrm: 0.89 | dt: 642.45ms | tok/sec: 816,079 | mfu: 51.01 | epoch: 2 | total time: 99.33m | eta: 80.2m +step 09246/16704 (55.35%) | loss: 2.657395 | lrm: 0.89 | dt: 641.83ms | tok/sec: 816,866 | mfu: 51.06 | epoch: 2 | total time: 99.34m | eta: 80.2m +step 09247/16704 (55.36%) | loss: 2.645177 | lrm: 0.89 | dt: 642.52ms | tok/sec: 815,983 | mfu: 51.00 | epoch: 2 | total time: 99.35m | eta: 80.2m +step 09248/16704 (55.36%) | loss: 2.650243 | lrm: 0.89 | dt: 644.62ms | tok/sec: 813,330 | mfu: 50.83 | epoch: 2 | total time: 99.36m | eta: 80.2m +step 09249/16704 (55.37%) | loss: 2.647451 | lrm: 0.89 | dt: 641.11ms | tok/sec: 817,779 | mfu: 51.11 | epoch: 2 | total time: 99.37m | eta: 80.2m +Step 09250 | Validation bpb: 0.812113 +step 09250/16704 (55.38%) | loss: 2.652038 | lrm: 0.89 | dt: 643.40ms | tok/sec: 814,873 | mfu: 50.93 | epoch: 2 | total time: 99.39m | eta: 80.2m +step 09251/16704 (55.38%) | loss: 2.650729 | lrm: 0.89 | dt: 647.89ms | tok/sec: 809,220 | mfu: 50.58 | epoch: 2 | total time: 99.40m | eta: 80.2m +step 09252/16704 (55.39%) | loss: 2.647357 | lrm: 0.89 | dt: 641.01ms | tok/sec: 817,911 | mfu: 51.12 | epoch: 2 | total time: 99.41m | eta: 80.2m +step 09253/16704 (55.39%) | loss: 2.647807 | lrm: 0.89 | dt: 640.88ms | tok/sec: 818,076 | mfu: 51.13 | epoch: 2 | total time: 99.42m | eta: 80.1m +step 09254/16704 (55.40%) | loss: 2.643176 | lrm: 0.89 | dt: 644.96ms | tok/sec: 812,897 | mfu: 50.81 | epoch: 2 | total time: 99.43m | eta: 80.1m +step 09255/16704 (55.41%) | loss: 2.651142 | lrm: 0.89 | dt: 640.65ms | tok/sec: 818,367 | mfu: 
51.15 | epoch: 2 | total time: 99.44m | eta: 80.1m +step 09256/16704 (55.41%) | loss: 2.650403 | lrm: 0.89 | dt: 642.89ms | tok/sec: 815,521 | mfu: 50.97 | epoch: 2 | total time: 99.45m | eta: 80.1m +step 09257/16704 (55.42%) | loss: 2.640124 | lrm: 0.89 | dt: 644.29ms | tok/sec: 813,746 | mfu: 50.86 | epoch: 2 | total time: 99.46m | eta: 80.1m +step 09258/16704 (55.42%) | loss: 2.646816 | lrm: 0.89 | dt: 640.58ms | tok/sec: 818,462 | mfu: 51.16 | epoch: 2 | total time: 99.47m | eta: 80.1m +step 09259/16704 (55.43%) | loss: 2.642633 | lrm: 0.89 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 2 | total time: 99.48m | eta: 80.1m +step 09260/16704 (55.44%) | loss: 2.656607 | lrm: 0.89 | dt: 642.14ms | tok/sec: 816,467 | mfu: 51.03 | epoch: 2 | total time: 99.49m | eta: 80.1m +step 09261/16704 (55.44%) | loss: 2.667849 | lrm: 0.89 | dt: 642.91ms | tok/sec: 815,492 | mfu: 50.97 | epoch: 2 | total time: 99.50m | eta: 80.1m +step 09262/16704 (55.45%) | loss: 2.664711 | lrm: 0.89 | dt: 643.15ms | tok/sec: 815,182 | mfu: 50.95 | epoch: 2 | total time: 99.51m | eta: 80.0m +step 09263/16704 (55.45%) | loss: 2.667737 | lrm: 0.89 | dt: 642.26ms | tok/sec: 816,318 | mfu: 51.02 | epoch: 2 | total time: 99.52m | eta: 80.0m +step 09264/16704 (55.46%) | loss: 2.654695 | lrm: 0.89 | dt: 644.92ms | tok/sec: 812,951 | mfu: 50.81 | epoch: 2 | total time: 99.54m | eta: 80.0m +step 09265/16704 (55.47%) | loss: 2.658920 | lrm: 0.89 | dt: 640.23ms | tok/sec: 818,908 | mfu: 51.18 | epoch: 2 | total time: 99.55m | eta: 80.0m +step 09266/16704 (55.47%) | loss: 2.659872 | lrm: 0.89 | dt: 642.81ms | tok/sec: 815,614 | mfu: 50.98 | epoch: 2 | total time: 99.56m | eta: 80.0m +step 09267/16704 (55.48%) | loss: 2.655330 | lrm: 0.89 | dt: 641.30ms | tok/sec: 817,542 | mfu: 51.10 | epoch: 2 | total time: 99.57m | eta: 80.0m +step 09268/16704 (55.48%) | loss: 2.647664 | lrm: 0.89 | dt: 643.86ms | tok/sec: 814,290 | mfu: 50.89 | epoch: 2 | total time: 99.58m | eta: 80.0m +step 09269/16704 
(55.49%) | loss: 2.647334 | lrm: 0.89 | dt: 642.94ms | tok/sec: 815,456 | mfu: 50.97 | epoch: 2 | total time: 99.59m | eta: 80.0m +step 09270/16704 (55.50%) | loss: 2.656412 | lrm: 0.89 | dt: 643.58ms | tok/sec: 814,644 | mfu: 50.92 | epoch: 2 | total time: 99.60m | eta: 80.0m +step 09271/16704 (55.50%) | loss: 2.649935 | lrm: 0.89 | dt: 645.09ms | tok/sec: 812,740 | mfu: 50.80 | epoch: 2 | total time: 99.61m | eta: 79.9m +step 09272/16704 (55.51%) | loss: 2.655190 | lrm: 0.89 | dt: 641.43ms | tok/sec: 817,372 | mfu: 51.09 | epoch: 2 | total time: 99.62m | eta: 79.9m +step 09273/16704 (55.51%) | loss: 2.667974 | lrm: 0.89 | dt: 643.03ms | tok/sec: 815,333 | mfu: 50.96 | epoch: 2 | total time: 99.63m | eta: 79.9m +step 09274/16704 (55.52%) | loss: 2.655115 | lrm: 0.89 | dt: 643.79ms | tok/sec: 814,374 | mfu: 50.90 | epoch: 2 | total time: 99.64m | eta: 79.9m +step 09275/16704 (55.53%) | loss: 2.669160 | lrm: 0.89 | dt: 642.25ms | tok/sec: 816,330 | mfu: 51.02 | epoch: 2 | total time: 99.65m | eta: 79.9m +step 09276/16704 (55.53%) | loss: 2.686236 | lrm: 0.89 | dt: 642.92ms | tok/sec: 815,473 | mfu: 50.97 | epoch: 2 | total time: 99.66m | eta: 79.9m +step 09277/16704 (55.54%) | loss: 2.689341 | lrm: 0.89 | dt: 643.26ms | tok/sec: 815,051 | mfu: 50.94 | epoch: 2 | total time: 99.67m | eta: 79.9m +step 09278/16704 (55.54%) | loss: 2.681380 | lrm: 0.89 | dt: 643.16ms | tok/sec: 815,175 | mfu: 50.95 | epoch: 2 | total time: 99.69m | eta: 79.9m +step 09279/16704 (55.55%) | loss: 2.689305 | lrm: 0.89 | dt: 641.39ms | tok/sec: 817,420 | mfu: 51.09 | epoch: 2 | total time: 99.70m | eta: 79.9m +step 09280/16704 (55.56%) | loss: 2.683940 | lrm: 0.89 | dt: 643.74ms | tok/sec: 814,434 | mfu: 50.90 | epoch: 2 | total time: 99.71m | eta: 79.9m +step 09281/16704 (55.56%) | loss: 2.671672 | lrm: 0.89 | dt: 641.65ms | tok/sec: 817,087 | mfu: 51.07 | epoch: 2 | total time: 99.72m | eta: 79.8m +step 09282/16704 (55.57%) | loss: 2.681558 | lrm: 0.89 | dt: 645.13ms | tok/sec: 812,688 | 
mfu: 50.79 | epoch: 2 | total time: 99.73m | eta: 79.8m +step 09283/16704 (55.57%) | loss: 2.678694 | lrm: 0.89 | dt: 642.00ms | tok/sec: 816,643 | mfu: 51.04 | epoch: 2 | total time: 99.74m | eta: 79.8m +step 09284/16704 (55.58%) | loss: 2.671823 | lrm: 0.89 | dt: 642.84ms | tok/sec: 815,579 | mfu: 50.97 | epoch: 2 | total time: 99.75m | eta: 79.8m +step 09285/16704 (55.59%) | loss: 2.671173 | lrm: 0.89 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 2 | total time: 99.76m | eta: 79.8m +step 09286/16704 (55.59%) | loss: 2.676545 | lrm: 0.89 | dt: 642.49ms | tok/sec: 816,022 | mfu: 51.00 | epoch: 2 | total time: 99.77m | eta: 79.8m +step 09287/16704 (55.60%) | loss: 2.672813 | lrm: 0.89 | dt: 642.26ms | tok/sec: 816,323 | mfu: 51.02 | epoch: 2 | total time: 99.78m | eta: 79.8m +step 09288/16704 (55.60%) | loss: 2.666182 | lrm: 0.89 | dt: 642.21ms | tok/sec: 816,381 | mfu: 51.02 | epoch: 2 | total time: 99.79m | eta: 79.8m +step 09289/16704 (55.61%) | loss: 2.685610 | lrm: 0.89 | dt: 643.75ms | tok/sec: 814,430 | mfu: 50.90 | epoch: 2 | total time: 99.80m | eta: 79.8m +step 09290/16704 (55.62%) | loss: 2.683351 | lrm: 0.89 | dt: 641.95ms | tok/sec: 816,712 | mfu: 51.05 | epoch: 2 | total time: 99.81m | eta: 79.7m +step 09291/16704 (55.62%) | loss: 2.665851 | lrm: 0.89 | dt: 645.41ms | tok/sec: 812,334 | mfu: 50.77 | epoch: 2 | total time: 99.82m | eta: 79.7m +step 09292/16704 (55.63%) | loss: 2.674168 | lrm: 0.89 | dt: 641.69ms | tok/sec: 817,040 | mfu: 51.07 | epoch: 2 | total time: 99.84m | eta: 79.7m +step 09293/16704 (55.63%) | loss: 2.654640 | lrm: 0.89 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 2 | total time: 99.85m | eta: 79.7m +step 09294/16704 (55.64%) | loss: 2.654127 | lrm: 0.89 | dt: 645.11ms | tok/sec: 812,714 | mfu: 50.80 | epoch: 2 | total time: 99.86m | eta: 79.7m +step 09295/16704 (55.65%) | loss: 2.653498 | lrm: 0.89 | dt: 641.78ms | tok/sec: 816,932 | mfu: 51.06 | epoch: 2 | total time: 99.87m | eta: 79.7m +step 09296/16704 
(55.65%) | loss: 2.656387 | lrm: 0.89 | dt: 642.15ms | tok/sec: 816,458 | mfu: 51.03 | epoch: 2 | total time: 99.88m | eta: 79.7m +step 09297/16704 (55.66%) | loss: 2.650583 | lrm: 0.89 | dt: 644.64ms | tok/sec: 813,302 | mfu: 50.83 | epoch: 2 | total time: 99.89m | eta: 79.7m +step 09298/16704 (55.66%) | loss: 2.661035 | lrm: 0.89 | dt: 641.50ms | tok/sec: 817,287 | mfu: 51.08 | epoch: 2 | total time: 99.90m | eta: 79.7m +step 09299/16704 (55.67%) | loss: 2.653043 | lrm: 0.89 | dt: 642.64ms | tok/sec: 815,829 | mfu: 50.99 | epoch: 2 | total time: 99.91m | eta: 79.6m +step 09300/16704 (55.68%) | loss: 2.649480 | lrm: 0.89 | dt: 643.36ms | tok/sec: 814,918 | mfu: 50.93 | epoch: 2 | total time: 99.92m | eta: 79.6m +step 09301/16704 (55.68%) | loss: 2.654665 | lrm: 0.89 | dt: 643.38ms | tok/sec: 814,897 | mfu: 50.93 | epoch: 2 | total time: 99.93m | eta: 79.6m +step 09302/16704 (55.69%) | loss: 2.651597 | lrm: 0.89 | dt: 643.50ms | tok/sec: 814,745 | mfu: 50.92 | epoch: 2 | total time: 99.94m | eta: 79.6m +step 09303/16704 (55.69%) | loss: 2.664147 | lrm: 0.89 | dt: 644.90ms | tok/sec: 812,973 | mfu: 50.81 | epoch: 2 | total time: 99.95m | eta: 79.6m +step 09304/16704 (55.70%) | loss: 2.668830 | lrm: 0.89 | dt: 641.83ms | tok/sec: 816,868 | mfu: 51.06 | epoch: 2 | total time: 99.96m | eta: 79.6m +step 09305/16704 (55.71%) | loss: 2.669429 | lrm: 0.89 | dt: 643.80ms | tok/sec: 814,365 | mfu: 50.90 | epoch: 2 | total time: 99.97m | eta: 79.6m +step 09306/16704 (55.71%) | loss: 2.669059 | lrm: 0.89 | dt: 645.40ms | tok/sec: 812,347 | mfu: 50.77 | epoch: 2 | total time: 99.99m | eta: 79.6m +step 09307/16704 (55.72%) | loss: 2.650898 | lrm: 0.89 | dt: 644.89ms | tok/sec: 812,990 | mfu: 50.81 | epoch: 2 | total time: 100.00m | eta: 79.6m +step 09308/16704 (55.72%) | loss: 2.658780 | lrm: 0.89 | dt: 645.05ms | tok/sec: 812,786 | mfu: 50.80 | epoch: 2 | total time: 100.01m | eta: 79.5m +step 09309/16704 (55.73%) | loss: 2.668483 | lrm: 0.89 | dt: 640.80ms | tok/sec: 818,183 | 
mfu: 51.14 | epoch: 2 | total time: 100.02m | eta: 79.5m +step 09310/16704 (55.74%) | loss: 2.666825 | lrm: 0.89 | dt: 645.10ms | tok/sec: 812,721 | mfu: 50.80 | epoch: 2 | total time: 100.03m | eta: 79.5m +step 09311/16704 (55.74%) | loss: 2.648806 | lrm: 0.89 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 2 | total time: 100.04m | eta: 79.5m +step 09312/16704 (55.75%) | loss: 2.658838 | lrm: 0.89 | dt: 644.25ms | tok/sec: 813,796 | mfu: 50.86 | epoch: 2 | total time: 100.05m | eta: 79.5m +step 09313/16704 (55.75%) | loss: 2.649234 | lrm: 0.88 | dt: 644.69ms | tok/sec: 813,236 | mfu: 50.83 | epoch: 2 | total time: 100.06m | eta: 79.5m +step 09314/16704 (55.76%) | loss: 2.643761 | lrm: 0.88 | dt: 642.02ms | tok/sec: 816,628 | mfu: 51.04 | epoch: 2 | total time: 100.07m | eta: 79.5m +step 09315/16704 (55.77%) | loss: 2.655450 | lrm: 0.88 | dt: 644.12ms | tok/sec: 813,954 | mfu: 50.87 | epoch: 2 | total time: 100.08m | eta: 79.5m +step 09316/16704 (55.77%) | loss: 2.647234 | lrm: 0.88 | dt: 641.67ms | tok/sec: 817,067 | mfu: 51.07 | epoch: 2 | total time: 100.09m | eta: 79.5m +step 09317/16704 (55.78%) | loss: 2.651172 | lrm: 0.88 | dt: 643.56ms | tok/sec: 814,662 | mfu: 50.92 | epoch: 2 | total time: 100.10m | eta: 79.5m +step 09318/16704 (55.78%) | loss: 2.661929 | lrm: 0.88 | dt: 645.02ms | tok/sec: 812,829 | mfu: 50.80 | epoch: 2 | total time: 100.11m | eta: 79.4m +step 09319/16704 (55.79%) | loss: 2.663422 | lrm: 0.88 | dt: 642.25ms | tok/sec: 816,333 | mfu: 51.02 | epoch: 2 | total time: 100.12m | eta: 79.4m +step 09320/16704 (55.80%) | loss: 2.656332 | lrm: 0.88 | dt: 643.13ms | tok/sec: 815,209 | mfu: 50.95 | epoch: 2 | total time: 100.14m | eta: 79.4m +step 09321/16704 (55.80%) | loss: 2.671422 | lrm: 0.88 | dt: 642.25ms | tok/sec: 816,336 | mfu: 51.02 | epoch: 2 | total time: 100.15m | eta: 79.4m +step 09322/16704 (55.81%) | loss: 2.672427 | lrm: 0.88 | dt: 641.78ms | tok/sec: 816,931 | mfu: 51.06 | epoch: 2 | total time: 100.16m | eta: 79.4m +step 
09323/16704 (55.81%) | loss: 2.670662 | lrm: 0.88 | dt: 643.24ms | tok/sec: 815,076 | mfu: 50.94 | epoch: 2 | total time: 100.17m | eta: 79.4m +step 09324/16704 (55.82%) | loss: 2.677515 | lrm: 0.88 | dt: 643.28ms | tok/sec: 815,028 | mfu: 50.94 | epoch: 2 | total time: 100.18m | eta: 79.4m +step 09325/16704 (55.82%) | loss: 2.675978 | lrm: 0.88 | dt: 642.68ms | tok/sec: 815,789 | mfu: 50.99 | epoch: 2 | total time: 100.19m | eta: 79.4m +step 09326/16704 (55.83%) | loss: 2.662646 | lrm: 0.88 | dt: 643.81ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 2 | total time: 100.20m | eta: 79.4m +step 09327/16704 (55.84%) | loss: 2.658930 | lrm: 0.88 | dt: 640.23ms | tok/sec: 818,904 | mfu: 51.18 | epoch: 2 | total time: 100.21m | eta: 79.3m +step 09328/16704 (55.84%) | loss: 2.664650 | lrm: 0.88 | dt: 644.94ms | tok/sec: 812,920 | mfu: 50.81 | epoch: 2 | total time: 100.22m | eta: 79.3m +step 09329/16704 (55.85%) | loss: 2.657894 | lrm: 0.88 | dt: 640.60ms | tok/sec: 818,433 | mfu: 51.15 | epoch: 2 | total time: 100.23m | eta: 79.3m +step 09330/16704 (55.85%) | loss: 2.661876 | lrm: 0.88 | dt: 644.43ms | tok/sec: 813,570 | mfu: 50.85 | epoch: 2 | total time: 100.24m | eta: 79.3m +step 09331/16704 (55.86%) | loss: 2.649424 | lrm: 0.88 | dt: 644.60ms | tok/sec: 813,350 | mfu: 50.84 | epoch: 2 | total time: 100.25m | eta: 79.3m +step 09332/16704 (55.87%) | loss: 2.644943 | lrm: 0.88 | dt: 642.83ms | tok/sec: 815,599 | mfu: 50.98 | epoch: 2 | total time: 100.26m | eta: 79.3m +step 09333/16704 (55.87%) | loss: 2.650418 | lrm: 0.88 | dt: 642.68ms | tok/sec: 815,779 | mfu: 50.99 | epoch: 2 | total time: 100.27m | eta: 79.3m +step 09334/16704 (55.88%) | loss: 2.657258 | lrm: 0.88 | dt: 645.76ms | tok/sec: 811,888 | mfu: 50.74 | epoch: 2 | total time: 100.29m | eta: 79.3m +step 09335/16704 (55.88%) | loss: 2.644246 | lrm: 0.88 | dt: 641.11ms | tok/sec: 817,785 | mfu: 51.11 | epoch: 2 | total time: 100.30m | eta: 79.3m +step 09336/16704 (55.89%) | loss: 2.642852 | lrm: 0.88 | dt: 
643.92ms | tok/sec: 814,212 | mfu: 50.89 | epoch: 2 | total time: 100.31m | eta: 79.2m +step 09337/16704 (55.90%) | loss: 2.656186 | lrm: 0.88 | dt: 643.41ms | tok/sec: 814,856 | mfu: 50.93 | epoch: 2 | total time: 100.32m | eta: 79.2m +step 09338/16704 (55.90%) | loss: 2.662133 | lrm: 0.88 | dt: 645.84ms | tok/sec: 811,793 | mfu: 50.74 | epoch: 2 | total time: 100.33m | eta: 79.2m +step 09339/16704 (55.91%) | loss: 2.650290 | lrm: 0.88 | dt: 640.02ms | tok/sec: 819,170 | mfu: 51.20 | epoch: 2 | total time: 100.34m | eta: 79.2m +step 09340/16704 (55.91%) | loss: 2.671443 | lrm: 0.88 | dt: 641.24ms | tok/sec: 817,621 | mfu: 51.10 | epoch: 2 | total time: 100.35m | eta: 79.2m +step 09341/16704 (55.92%) | loss: 2.667979 | lrm: 0.88 | dt: 643.30ms | tok/sec: 814,996 | mfu: 50.94 | epoch: 2 | total time: 100.36m | eta: 79.2m +step 09342/16704 (55.93%) | loss: 2.670028 | lrm: 0.88 | dt: 643.15ms | tok/sec: 815,190 | mfu: 50.95 | epoch: 2 | total time: 100.37m | eta: 79.2m +step 09343/16704 (55.93%) | loss: 2.685714 | lrm: 0.88 | dt: 645.70ms | tok/sec: 811,972 | mfu: 50.75 | epoch: 2 | total time: 100.38m | eta: 79.2m +step 09344/16704 (55.94%) | loss: 2.682653 | lrm: 0.88 | dt: 642.84ms | tok/sec: 815,581 | mfu: 50.98 | epoch: 2 | total time: 100.39m | eta: 79.2m +step 09345/16704 (55.94%) | loss: 2.661998 | lrm: 0.88 | dt: 642.25ms | tok/sec: 816,332 | mfu: 51.02 | epoch: 2 | total time: 100.40m | eta: 79.2m +step 09346/16704 (55.95%) | loss: 2.663147 | lrm: 0.88 | dt: 644.12ms | tok/sec: 813,956 | mfu: 50.87 | epoch: 2 | total time: 100.41m | eta: 79.1m +step 09347/16704 (55.96%) | loss: 2.652504 | lrm: 0.88 | dt: 640.62ms | tok/sec: 818,405 | mfu: 51.15 | epoch: 2 | total time: 100.42m | eta: 79.1m +step 09348/16704 (55.96%) | loss: 2.656231 | lrm: 0.88 | dt: 645.64ms | tok/sec: 812,048 | mfu: 50.75 | epoch: 2 | total time: 100.44m | eta: 79.1m +step 09349/16704 (55.97%) | loss: 2.651181 | lrm: 0.88 | dt: 642.02ms | tok/sec: 816,616 | mfu: 51.04 | epoch: 2 | total 
time: 100.45m | eta: 79.1m +step 09350/16704 (55.97%) | loss: 2.652172 | lrm: 0.88 | dt: 643.11ms | tok/sec: 815,244 | mfu: 50.95 | epoch: 2 | total time: 100.46m | eta: 79.1m +step 09351/16704 (55.98%) | loss: 2.660446 | lrm: 0.88 | dt: 642.47ms | tok/sec: 816,056 | mfu: 51.00 | epoch: 2 | total time: 100.47m | eta: 79.1m +step 09352/16704 (55.99%) | loss: 2.659943 | lrm: 0.88 | dt: 642.01ms | tok/sec: 816,633 | mfu: 51.04 | epoch: 2 | total time: 100.48m | eta: 79.1m +step 09353/16704 (55.99%) | loss: 2.669908 | lrm: 0.88 | dt: 642.73ms | tok/sec: 815,714 | mfu: 50.98 | epoch: 2 | total time: 100.49m | eta: 79.1m +step 09354/16704 (56.00%) | loss: 2.671382 | lrm: 0.88 | dt: 642.74ms | tok/sec: 815,707 | mfu: 50.98 | epoch: 2 | total time: 100.50m | eta: 79.1m +step 09355/16704 (56.00%) | loss: 2.668179 | lrm: 0.88 | dt: 643.19ms | tok/sec: 815,139 | mfu: 50.95 | epoch: 2 | total time: 100.51m | eta: 79.0m +step 09356/16704 (56.01%) | loss: 2.670727 | lrm: 0.88 | dt: 644.48ms | tok/sec: 813,499 | mfu: 50.84 | epoch: 2 | total time: 100.52m | eta: 79.0m +step 09357/16704 (56.02%) | loss: 2.668421 | lrm: 0.88 | dt: 643.82ms | tok/sec: 814,338 | mfu: 50.90 | epoch: 2 | total time: 100.53m | eta: 79.0m +step 09358/16704 (56.02%) | loss: 2.648124 | lrm: 0.88 | dt: 642.09ms | tok/sec: 816,536 | mfu: 51.03 | epoch: 2 | total time: 100.54m | eta: 79.0m +step 09359/16704 (56.03%) | loss: 2.659555 | lrm: 0.88 | dt: 641.83ms | tok/sec: 816,863 | mfu: 51.06 | epoch: 2 | total time: 100.55m | eta: 79.0m +step 09360/16704 (56.03%) | loss: 2.649262 | lrm: 0.88 | dt: 641.78ms | tok/sec: 816,925 | mfu: 51.06 | epoch: 2 | total time: 100.56m | eta: 79.0m +step 09361/16704 (56.04%) | loss: 2.662108 | lrm: 0.88 | dt: 643.79ms | tok/sec: 814,373 | mfu: 50.90 | epoch: 2 | total time: 100.57m | eta: 79.0m +step 09362/16704 (56.05%) | loss: 2.666447 | lrm: 0.88 | dt: 642.69ms | tok/sec: 815,776 | mfu: 50.99 | epoch: 2 | total time: 100.59m | eta: 79.0m +step 09363/16704 (56.05%) | loss: 
2.672171 | lrm: 0.88 | dt: 643.68ms | tok/sec: 814,513 | mfu: 50.91 | epoch: 2 | total time: 100.60m | eta: 79.0m +step 09364/16704 (56.06%) | loss: 2.688485 | lrm: 0.88 | dt: 643.15ms | tok/sec: 815,187 | mfu: 50.95 | epoch: 2 | total time: 100.61m | eta: 78.9m +step 09365/16704 (56.06%) | loss: 2.685847 | lrm: 0.88 | dt: 642.59ms | tok/sec: 815,901 | mfu: 50.99 | epoch: 2 | total time: 100.62m | eta: 78.9m +step 09366/16704 (56.07%) | loss: 2.685446 | lrm: 0.88 | dt: 641.65ms | tok/sec: 817,093 | mfu: 51.07 | epoch: 2 | total time: 100.63m | eta: 78.9m +step 09367/16704 (56.08%) | loss: 2.679892 | lrm: 0.88 | dt: 643.45ms | tok/sec: 814,807 | mfu: 50.93 | epoch: 2 | total time: 100.64m | eta: 78.9m +step 09368/16704 (56.08%) | loss: 2.687963 | lrm: 0.88 | dt: 642.63ms | tok/sec: 815,849 | mfu: 50.99 | epoch: 2 | total time: 100.65m | eta: 78.9m +step 09369/16704 (56.09%) | loss: 2.696943 | lrm: 0.88 | dt: 643.19ms | tok/sec: 815,132 | mfu: 50.95 | epoch: 2 | total time: 100.66m | eta: 78.9m +step 09370/16704 (56.09%) | loss: 2.685662 | lrm: 0.88 | dt: 644.34ms | tok/sec: 813,678 | mfu: 50.86 | epoch: 2 | total time: 100.67m | eta: 78.9m +step 09371/16704 (56.10%) | loss: 2.669972 | lrm: 0.88 | dt: 647.33ms | tok/sec: 809,919 | mfu: 50.62 | epoch: 2 | total time: 100.68m | eta: 78.9m +step 09372/16704 (56.11%) | loss: 2.670889 | lrm: 0.88 | dt: 644.64ms | tok/sec: 813,303 | mfu: 50.83 | epoch: 2 | total time: 100.69m | eta: 78.9m +step 09373/16704 (56.11%) | loss: 2.675721 | lrm: 0.88 | dt: 643.79ms | tok/sec: 814,381 | mfu: 50.90 | epoch: 2 | total time: 100.70m | eta: 78.8m +step 09374/16704 (56.12%) | loss: 2.693927 | lrm: 0.88 | dt: 643.90ms | tok/sec: 814,241 | mfu: 50.89 | epoch: 2 | total time: 100.71m | eta: 78.8m +step 09375/16704 (56.12%) | loss: 2.702202 | lrm: 0.88 | dt: 641.63ms | tok/sec: 817,120 | mfu: 51.07 | epoch: 2 | total time: 100.72m | eta: 78.8m +step 09376/16704 (56.13%) | loss: 2.707134 | lrm: 0.88 | dt: 641.94ms | tok/sec: 816,718 | mfu: 
51.05 | epoch: 2 | total time: 100.74m | eta: 78.8m +step 09377/16704 (56.14%) | loss: 2.700848 | lrm: 0.88 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 2 | total time: 100.75m | eta: 78.8m +step 09378/16704 (56.14%) | loss: 2.685660 | lrm: 0.88 | dt: 644.25ms | tok/sec: 813,790 | mfu: 50.86 | epoch: 2 | total time: 100.76m | eta: 78.8m +step 09379/16704 (56.15%) | loss: 2.673349 | lrm: 0.88 | dt: 642.89ms | tok/sec: 815,515 | mfu: 50.97 | epoch: 2 | total time: 100.77m | eta: 78.8m +step 09380/16704 (56.15%) | loss: 2.671805 | lrm: 0.88 | dt: 644.92ms | tok/sec: 812,954 | mfu: 50.81 | epoch: 2 | total time: 100.78m | eta: 78.8m +step 09381/16704 (56.16%) | loss: 2.676230 | lrm: 0.88 | dt: 642.87ms | tok/sec: 815,546 | mfu: 50.97 | epoch: 2 | total time: 100.79m | eta: 78.8m +step 09382/16704 (56.17%) | loss: 2.691554 | lrm: 0.88 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 2 | total time: 100.80m | eta: 78.8m +step 09383/16704 (56.17%) | loss: 2.697612 | lrm: 0.88 | dt: 641.93ms | tok/sec: 816,740 | mfu: 51.05 | epoch: 2 | total time: 100.81m | eta: 78.7m +step 09384/16704 (56.18%) | loss: 2.690067 | lrm: 0.88 | dt: 642.68ms | tok/sec: 815,782 | mfu: 50.99 | epoch: 2 | total time: 100.82m | eta: 78.7m +step 09385/16704 (56.18%) | loss: 2.688514 | lrm: 0.88 | dt: 644.92ms | tok/sec: 812,953 | mfu: 50.81 | epoch: 2 | total time: 100.83m | eta: 78.7m +step 09386/16704 (56.19%) | loss: 2.692150 | lrm: 0.88 | dt: 642.29ms | tok/sec: 816,275 | mfu: 51.02 | epoch: 2 | total time: 100.84m | eta: 78.7m +step 09387/16704 (56.20%) | loss: 2.694402 | lrm: 0.88 | dt: 644.26ms | tok/sec: 813,783 | mfu: 50.86 | epoch: 2 | total time: 100.85m | eta: 78.7m +step 09388/16704 (56.20%) | loss: 2.676186 | lrm: 0.88 | dt: 642.96ms | tok/sec: 815,428 | mfu: 50.97 | epoch: 2 | total time: 100.86m | eta: 78.7m +step 09389/16704 (56.21%) | loss: 2.666958 | lrm: 0.88 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 2 | total time: 100.88m | eta: 78.7m +step 
09390/16704 (56.21%) | loss: 2.672896 | lrm: 0.88 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 2 | total time: 100.89m | eta: 78.7m +step 09391/16704 (56.22%) | loss: 2.674999 | lrm: 0.88 | dt: 658.32ms | tok/sec: 796,401 | mfu: 49.78 | epoch: 2 | total time: 100.90m | eta: 78.7m +step 09392/16704 (56.23%) | loss: 2.668806 | lrm: 0.88 | dt: 647.03ms | tok/sec: 810,299 | mfu: 50.64 | epoch: 2 | total time: 100.91m | eta: 78.6m +step 09393/16704 (56.23%) | loss: 2.654971 | lrm: 0.88 | dt: 641.84ms | tok/sec: 816,849 | mfu: 51.05 | epoch: 2 | total time: 100.92m | eta: 78.6m +step 09394/16704 (56.24%) | loss: 2.660650 | lrm: 0.88 | dt: 646.31ms | tok/sec: 811,204 | mfu: 50.70 | epoch: 2 | total time: 100.93m | eta: 78.6m +step 09395/16704 (56.24%) | loss: 2.666909 | lrm: 0.88 | dt: 641.28ms | tok/sec: 817,564 | mfu: 51.10 | epoch: 2 | total time: 100.94m | eta: 78.6m +step 09396/16704 (56.25%) | loss: 2.666191 | lrm: 0.88 | dt: 646.22ms | tok/sec: 811,315 | mfu: 50.71 | epoch: 2 | total time: 100.95m | eta: 78.6m +step 09397/16704 (56.26%) | loss: 2.664156 | lrm: 0.87 | dt: 641.90ms | tok/sec: 816,779 | mfu: 51.05 | epoch: 2 | total time: 100.96m | eta: 78.6m +step 09398/16704 (56.26%) | loss: 2.670855 | lrm: 0.87 | dt: 643.36ms | tok/sec: 814,926 | mfu: 50.93 | epoch: 2 | total time: 100.97m | eta: 78.6m +step 09399/16704 (56.27%) | loss: 2.664623 | lrm: 0.87 | dt: 643.38ms | tok/sec: 814,890 | mfu: 50.93 | epoch: 2 | total time: 100.98m | eta: 78.6m +step 09400/16704 (56.27%) | loss: 2.652572 | lrm: 0.87 | dt: 640.78ms | tok/sec: 818,196 | mfu: 51.14 | epoch: 2 | total time: 100.99m | eta: 78.6m +step 09401/16704 (56.28%) | loss: 2.644381 | lrm: 0.87 | dt: 643.43ms | tok/sec: 814,838 | mfu: 50.93 | epoch: 2 | total time: 101.00m | eta: 78.5m +step 09402/16704 (56.29%) | loss: 2.641279 | lrm: 0.87 | dt: 641.84ms | tok/sec: 816,856 | mfu: 51.05 | epoch: 2 | total time: 101.01m | eta: 78.5m +step 09403/16704 (56.29%) | loss: 2.637662 | lrm: 0.87 | dt: 
643.42ms | tok/sec: 814,840 | mfu: 50.93 | epoch: 2 | total time: 101.03m | eta: 78.5m +step 09404/16704 (56.30%) | loss: 2.634126 | lrm: 0.87 | dt: 644.26ms | tok/sec: 813,777 | mfu: 50.86 | epoch: 2 | total time: 101.04m | eta: 78.5m +step 09405/16704 (56.30%) | loss: 2.627877 | lrm: 0.87 | dt: 640.63ms | tok/sec: 818,391 | mfu: 51.15 | epoch: 2 | total time: 101.05m | eta: 78.5m +step 09406/16704 (56.31%) | loss: 2.628243 | lrm: 0.87 | dt: 643.05ms | tok/sec: 815,308 | mfu: 50.96 | epoch: 2 | total time: 101.06m | eta: 78.5m +step 09407/16704 (56.32%) | loss: 2.637805 | lrm: 0.87 | dt: 642.92ms | tok/sec: 815,476 | mfu: 50.97 | epoch: 2 | total time: 101.07m | eta: 78.5m +step 09408/16704 (56.32%) | loss: 2.640964 | lrm: 0.87 | dt: 642.54ms | tok/sec: 815,962 | mfu: 51.00 | epoch: 2 | total time: 101.08m | eta: 78.5m +step 09409/16704 (56.33%) | loss: 2.628911 | lrm: 0.87 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 2 | total time: 101.09m | eta: 78.5m +step 09410/16704 (56.33%) | loss: 2.642060 | lrm: 0.87 | dt: 643.95ms | tok/sec: 814,172 | mfu: 50.89 | epoch: 2 | total time: 101.10m | eta: 78.4m +step 09411/16704 (56.34%) | loss: 2.648366 | lrm: 0.87 | dt: 644.83ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 2 | total time: 101.11m | eta: 78.4m +step 09412/16704 (56.35%) | loss: 2.635192 | lrm: 0.87 | dt: 641.63ms | tok/sec: 817,120 | mfu: 51.07 | epoch: 2 | total time: 101.12m | eta: 78.4m +step 09413/16704 (56.35%) | loss: 2.632044 | lrm: 0.87 | dt: 644.14ms | tok/sec: 813,939 | mfu: 50.87 | epoch: 2 | total time: 101.13m | eta: 78.4m +step 09414/16704 (56.36%) | loss: 2.625228 | lrm: 0.87 | dt: 643.71ms | tok/sec: 814,472 | mfu: 50.91 | epoch: 2 | total time: 101.14m | eta: 78.4m +step 09415/16704 (56.36%) | loss: 2.622353 | lrm: 0.87 | dt: 642.47ms | tok/sec: 816,052 | mfu: 51.00 | epoch: 2 | total time: 101.15m | eta: 78.4m +step 09416/16704 (56.37%) | loss: 2.620844 | lrm: 0.87 | dt: 645.43ms | tok/sec: 812,313 | mfu: 50.77 | epoch: 2 | total 
time: 101.16m | eta: 78.4m +step 09417/16704 (56.38%) | loss: 2.617124 | lrm: 0.87 | dt: 641.39ms | tok/sec: 817,430 | mfu: 51.09 | epoch: 2 | total time: 101.18m | eta: 78.4m +step 09418/16704 (56.38%) | loss: 2.622902 | lrm: 0.87 | dt: 642.33ms | tok/sec: 816,226 | mfu: 51.02 | epoch: 2 | total time: 101.19m | eta: 78.4m +step 09419/16704 (56.39%) | loss: 2.618842 | lrm: 0.87 | dt: 645.21ms | tok/sec: 812,583 | mfu: 50.79 | epoch: 2 | total time: 101.20m | eta: 78.4m +step 09420/16704 (56.39%) | loss: 2.626190 | lrm: 0.87 | dt: 641.47ms | tok/sec: 817,318 | mfu: 51.08 | epoch: 2 | total time: 101.21m | eta: 78.3m +step 09421/16704 (56.40%) | loss: 2.641836 | lrm: 0.87 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 2 | total time: 101.22m | eta: 78.3m +step 09422/16704 (56.41%) | loss: 2.643609 | lrm: 0.87 | dt: 643.42ms | tok/sec: 814,843 | mfu: 50.93 | epoch: 2 | total time: 101.23m | eta: 78.3m +step 09423/16704 (56.41%) | loss: 2.650260 | lrm: 0.87 | dt: 641.99ms | tok/sec: 816,655 | mfu: 51.04 | epoch: 2 | total time: 101.24m | eta: 78.3m +step 09424/16704 (56.42%) | loss: 2.642920 | lrm: 0.87 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 2 | total time: 101.25m | eta: 78.3m +step 09425/16704 (56.42%) | loss: 2.646859 | lrm: 0.87 | dt: 639.81ms | tok/sec: 819,439 | mfu: 51.22 | epoch: 2 | total time: 101.26m | eta: 78.3m +step 09426/16704 (56.43%) | loss: 2.660547 | lrm: 0.87 | dt: 643.81ms | tok/sec: 814,354 | mfu: 50.90 | epoch: 2 | total time: 101.27m | eta: 78.3m +step 09427/16704 (56.44%) | loss: 2.655300 | lrm: 0.87 | dt: 644.50ms | tok/sec: 813,485 | mfu: 50.84 | epoch: 2 | total time: 101.28m | eta: 78.3m +step 09428/16704 (56.44%) | loss: 2.653387 | lrm: 0.87 | dt: 641.58ms | tok/sec: 817,188 | mfu: 51.08 | epoch: 2 | total time: 101.29m | eta: 78.3m +step 09429/16704 (56.45%) | loss: 2.659336 | lrm: 0.87 | dt: 644.80ms | tok/sec: 813,098 | mfu: 50.82 | epoch: 2 | total time: 101.30m | eta: 78.2m +step 09430/16704 (56.45%) | loss: 
2.666893 | lrm: 0.87 | dt: 642.83ms | tok/sec: 815,591 | mfu: 50.98 | epoch: 2 | total time: 101.31m | eta: 78.2m +step 09431/16704 (56.46%) | loss: 2.674481 | lrm: 0.87 | dt: 642.38ms | tok/sec: 816,170 | mfu: 51.01 | epoch: 2 | total time: 101.33m | eta: 78.2m +step 09432/16704 (56.47%) | loss: 2.667909 | lrm: 0.87 | dt: 643.47ms | tok/sec: 814,781 | mfu: 50.92 | epoch: 2 | total time: 101.34m | eta: 78.2m +step 09433/16704 (56.47%) | loss: 2.669913 | lrm: 0.87 | dt: 643.35ms | tok/sec: 814,928 | mfu: 50.93 | epoch: 2 | total time: 101.35m | eta: 78.2m +step 09434/16704 (56.48%) | loss: 2.679797 | lrm: 0.87 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 2 | total time: 101.36m | eta: 78.2m +step 09435/16704 (56.48%) | loss: 2.658148 | lrm: 0.87 | dt: 643.14ms | tok/sec: 815,200 | mfu: 50.95 | epoch: 2 | total time: 101.37m | eta: 78.2m +step 09436/16704 (56.49%) | loss: 2.650680 | lrm: 0.87 | dt: 642.94ms | tok/sec: 815,458 | mfu: 50.97 | epoch: 2 | total time: 101.38m | eta: 78.2m +step 09437/16704 (56.50%) | loss: 2.656587 | lrm: 0.87 | dt: 643.83ms | tok/sec: 814,326 | mfu: 50.90 | epoch: 2 | total time: 101.39m | eta: 78.2m +step 09438/16704 (56.50%) | loss: 2.667943 | lrm: 0.87 | dt: 643.29ms | tok/sec: 815,015 | mfu: 50.94 | epoch: 2 | total time: 101.40m | eta: 78.1m +step 09439/16704 (56.51%) | loss: 2.663190 | lrm: 0.87 | dt: 643.73ms | tok/sec: 814,449 | mfu: 50.90 | epoch: 2 | total time: 101.41m | eta: 78.1m +step 09440/16704 (56.51%) | loss: 2.663784 | lrm: 0.87 | dt: 641.65ms | tok/sec: 817,098 | mfu: 51.07 | epoch: 2 | total time: 101.42m | eta: 78.1m +step 09441/16704 (56.52%) | loss: 2.664471 | lrm: 0.87 | dt: 643.81ms | tok/sec: 814,353 | mfu: 50.90 | epoch: 2 | total time: 101.43m | eta: 78.1m +step 09442/16704 (56.53%) | loss: 2.670409 | lrm: 0.87 | dt: 643.58ms | tok/sec: 814,639 | mfu: 50.92 | epoch: 2 | total time: 101.44m | eta: 78.1m +step 09443/16704 (56.53%) | loss: 2.673769 | lrm: 0.87 | dt: 644.14ms | tok/sec: 813,934 | mfu: 
50.87 | epoch: 2 | total time: 101.45m | eta: 78.1m +step 09444/16704 (56.54%) | loss: 2.672673 | lrm: 0.87 | dt: 644.14ms | tok/sec: 813,935 | mfu: 50.87 | epoch: 2 | total time: 101.47m | eta: 78.1m +step 09445/16704 (56.54%) | loss: 2.676935 | lrm: 0.87 | dt: 643.70ms | tok/sec: 814,494 | mfu: 50.91 | epoch: 2 | total time: 101.48m | eta: 78.1m +step 09446/16704 (56.55%) | loss: 2.681863 | lrm: 0.87 | dt: 643.84ms | tok/sec: 814,312 | mfu: 50.90 | epoch: 2 | total time: 101.49m | eta: 78.1m +step 09447/16704 (56.56%) | loss: 2.678433 | lrm: 0.87 | dt: 641.06ms | tok/sec: 817,843 | mfu: 51.12 | epoch: 2 | total time: 101.50m | eta: 78.1m +step 09448/16704 (56.56%) | loss: 2.688702 | lrm: 0.87 | dt: 643.14ms | tok/sec: 815,199 | mfu: 50.95 | epoch: 2 | total time: 101.51m | eta: 78.0m +step 09449/16704 (56.57%) | loss: 2.687384 | lrm: 0.87 | dt: 641.64ms | tok/sec: 817,100 | mfu: 51.07 | epoch: 2 | total time: 101.52m | eta: 78.0m +step 09450/16704 (56.57%) | loss: 2.700841 | lrm: 0.87 | dt: 645.83ms | tok/sec: 811,799 | mfu: 50.74 | epoch: 2 | total time: 101.53m | eta: 78.0m +step 09451/16704 (56.58%) | loss: 2.696319 | lrm: 0.87 | dt: 641.74ms | tok/sec: 816,974 | mfu: 51.06 | epoch: 2 | total time: 101.54m | eta: 78.0m +step 09452/16704 (56.59%) | loss: 2.696549 | lrm: 0.87 | dt: 643.37ms | tok/sec: 814,913 | mfu: 50.93 | epoch: 2 | total time: 101.55m | eta: 78.0m +step 09453/16704 (56.59%) | loss: 2.693426 | lrm: 0.87 | dt: 645.93ms | tok/sec: 811,676 | mfu: 50.73 | epoch: 2 | total time: 101.56m | eta: 78.0m +step 09454/16704 (56.60%) | loss: 2.692957 | lrm: 0.87 | dt: 643.77ms | tok/sec: 814,401 | mfu: 50.90 | epoch: 2 | total time: 101.57m | eta: 78.0m +step 09455/16704 (56.60%) | loss: 2.682637 | lrm: 0.87 | dt: 642.74ms | tok/sec: 815,713 | mfu: 50.98 | epoch: 2 | total time: 101.58m | eta: 78.0m +step 09456/16704 (56.61%) | loss: 2.683994 | lrm: 0.87 | dt: 643.56ms | tok/sec: 814,668 | mfu: 50.92 | epoch: 2 | total time: 101.59m | eta: 78.0m +step 
09457/16704 (56.62%) | loss: 2.679658 | lrm: 0.87 | dt: 643.35ms | tok/sec: 814,932 | mfu: 50.93 | epoch: 2 | total time: 101.60m | eta: 77.9m +step 09458/16704 (56.62%) | loss: 2.676817 | lrm: 0.87 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 2 | total time: 101.62m | eta: 77.9m +step 09459/16704 (56.63%) | loss: 2.677455 | lrm: 0.87 | dt: 641.23ms | tok/sec: 817,632 | mfu: 51.10 | epoch: 2 | total time: 101.63m | eta: 77.9m +step 09460/16704 (56.63%) | loss: 2.666851 | lrm: 0.87 | dt: 642.71ms | tok/sec: 815,743 | mfu: 50.99 | epoch: 2 | total time: 101.64m | eta: 77.9m +step 09461/16704 (56.64%) | loss: 2.672816 | lrm: 0.87 | dt: 644.18ms | tok/sec: 813,889 | mfu: 50.87 | epoch: 2 | total time: 101.65m | eta: 77.9m +step 09462/16704 (56.65%) | loss: 2.669210 | lrm: 0.87 | dt: 645.09ms | tok/sec: 812,736 | mfu: 50.80 | epoch: 2 | total time: 101.66m | eta: 77.9m +step 09463/16704 (56.65%) | loss: 2.656000 | lrm: 0.87 | dt: 644.44ms | tok/sec: 813,552 | mfu: 50.85 | epoch: 2 | total time: 101.67m | eta: 77.9m +step 09464/16704 (56.66%) | loss: 2.644987 | lrm: 0.87 | dt: 642.58ms | tok/sec: 815,909 | mfu: 51.00 | epoch: 2 | total time: 101.68m | eta: 77.9m +step 09465/16704 (56.66%) | loss: 2.650384 | lrm: 0.87 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 2 | total time: 101.69m | eta: 77.9m +step 09466/16704 (56.67%) | loss: 2.639803 | lrm: 0.87 | dt: 643.56ms | tok/sec: 814,673 | mfu: 50.92 | epoch: 2 | total time: 101.70m | eta: 77.8m +step 09467/16704 (56.68%) | loss: 2.632846 | lrm: 0.87 | dt: 642.41ms | tok/sec: 816,123 | mfu: 51.01 | epoch: 2 | total time: 101.71m | eta: 77.8m +step 09468/16704 (56.68%) | loss: 2.641955 | lrm: 0.87 | dt: 643.43ms | tok/sec: 814,836 | mfu: 50.93 | epoch: 2 | total time: 101.72m | eta: 77.8m +step 09469/16704 (56.69%) | loss: 2.639121 | lrm: 0.87 | dt: 648.21ms | tok/sec: 808,820 | mfu: 50.55 | epoch: 2 | total time: 101.73m | eta: 77.8m +step 09470/16704 (56.69%) | loss: 2.641000 | lrm: 0.87 | dt: 
642.01ms | tok/sec: 816,632 | mfu: 51.04 | epoch: 2 | total time: 101.74m | eta: 77.8m +step 09471/16704 (56.70%) | loss: 2.647079 | lrm: 0.87 | dt: 643.47ms | tok/sec: 814,786 | mfu: 50.93 | epoch: 2 | total time: 101.75m | eta: 77.8m +step 09472/16704 (56.70%) | loss: 2.651813 | lrm: 0.87 | dt: 643.48ms | tok/sec: 814,772 | mfu: 50.92 | epoch: 2 | total time: 101.77m | eta: 77.8m +step 09473/16704 (56.71%) | loss: 2.646119 | lrm: 0.87 | dt: 645.70ms | tok/sec: 811,966 | mfu: 50.75 | epoch: 2 | total time: 101.78m | eta: 77.8m +step 09474/16704 (56.72%) | loss: 2.636801 | lrm: 0.87 | dt: 644.44ms | tok/sec: 813,558 | mfu: 50.85 | epoch: 2 | total time: 101.79m | eta: 77.8m +step 09475/16704 (56.72%) | loss: 2.635646 | lrm: 0.87 | dt: 641.17ms | tok/sec: 817,700 | mfu: 51.11 | epoch: 2 | total time: 101.80m | eta: 77.7m +step 09476/16704 (56.73%) | loss: 2.638668 | lrm: 0.87 | dt: 643.37ms | tok/sec: 814,910 | mfu: 50.93 | epoch: 2 | total time: 101.81m | eta: 77.7m +step 09477/16704 (56.73%) | loss: 2.635496 | lrm: 0.87 | dt: 643.91ms | tok/sec: 814,228 | mfu: 50.89 | epoch: 2 | total time: 101.82m | eta: 77.7m +step 09478/16704 (56.74%) | loss: 2.644218 | lrm: 0.87 | dt: 643.69ms | tok/sec: 814,504 | mfu: 50.91 | epoch: 2 | total time: 101.83m | eta: 77.7m +step 09479/16704 (56.75%) | loss: 2.652176 | lrm: 0.87 | dt: 643.37ms | tok/sec: 814,907 | mfu: 50.93 | epoch: 2 | total time: 101.84m | eta: 77.7m +step 09480/16704 (56.75%) | loss: 2.672786 | lrm: 0.86 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 2 | total time: 101.85m | eta: 77.7m +step 09481/16704 (56.76%) | loss: 2.671419 | lrm: 0.86 | dt: 643.68ms | tok/sec: 814,521 | mfu: 50.91 | epoch: 2 | total time: 101.86m | eta: 77.7m +step 09482/16704 (56.76%) | loss: 2.681287 | lrm: 0.86 | dt: 643.89ms | tok/sec: 814,256 | mfu: 50.89 | epoch: 2 | total time: 101.87m | eta: 77.7m +step 09483/16704 (56.77%) | loss: 2.679349 | lrm: 0.86 | dt: 644.53ms | tok/sec: 813,444 | mfu: 50.84 | epoch: 2 | total 
time: 101.88m | eta: 77.7m +step 09484/16704 (56.78%) | loss: 2.672609 | lrm: 0.86 | dt: 645.38ms | tok/sec: 812,375 | mfu: 50.77 | epoch: 2 | total time: 101.89m | eta: 77.7m +step 09485/16704 (56.78%) | loss: 2.672298 | lrm: 0.86 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 2 | total time: 101.90m | eta: 77.6m +step 09486/16704 (56.79%) | loss: 2.671121 | lrm: 0.86 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 2 | total time: 101.92m | eta: 77.6m +step 09487/16704 (56.79%) | loss: 2.682039 | lrm: 0.86 | dt: 644.63ms | tok/sec: 813,321 | mfu: 50.83 | epoch: 2 | total time: 101.93m | eta: 77.6m +step 09488/16704 (56.80%) | loss: 2.678816 | lrm: 0.86 | dt: 643.39ms | tok/sec: 814,881 | mfu: 50.93 | epoch: 2 | total time: 101.94m | eta: 77.6m +step 09489/16704 (56.81%) | loss: 2.691024 | lrm: 0.86 | dt: 642.48ms | tok/sec: 816,038 | mfu: 51.00 | epoch: 2 | total time: 101.95m | eta: 77.6m +step 09490/16704 (56.81%) | loss: 2.685197 | lrm: 0.86 | dt: 641.98ms | tok/sec: 816,675 | mfu: 51.04 | epoch: 2 | total time: 101.96m | eta: 77.6m +step 09491/16704 (56.82%) | loss: 2.669680 | lrm: 0.86 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 2 | total time: 101.97m | eta: 77.6m +step 09492/16704 (56.82%) | loss: 2.670580 | lrm: 0.86 | dt: 644.92ms | tok/sec: 812,946 | mfu: 50.81 | epoch: 2 | total time: 101.98m | eta: 77.6m +step 09493/16704 (56.83%) | loss: 2.667076 | lrm: 0.86 | dt: 642.53ms | tok/sec: 815,971 | mfu: 51.00 | epoch: 2 | total time: 101.99m | eta: 77.6m +step 09494/16704 (56.84%) | loss: 2.658765 | lrm: 0.86 | dt: 643.94ms | tok/sec: 814,184 | mfu: 50.89 | epoch: 2 | total time: 102.00m | eta: 77.5m +step 09495/16704 (56.84%) | loss: 2.666189 | lrm: 0.86 | dt: 640.15ms | tok/sec: 819,012 | mfu: 51.19 | epoch: 2 | total time: 102.01m | eta: 77.5m +step 09496/16704 (56.85%) | loss: 2.668723 | lrm: 0.86 | dt: 643.87ms | tok/sec: 814,274 | mfu: 50.89 | epoch: 2 | total time: 102.02m | eta: 77.5m +step 09497/16704 (56.85%) | loss: 
2.675435 | lrm: 0.86 | dt: 643.51ms | tok/sec: 814,729 | mfu: 50.92 | epoch: 2 | total time: 102.03m | eta: 77.5m +step 09498/16704 (56.86%) | loss: 2.670055 | lrm: 0.86 | dt: 642.09ms | tok/sec: 816,538 | mfu: 51.03 | epoch: 2 | total time: 102.04m | eta: 77.5m +step 09499/16704 (56.87%) | loss: 2.659731 | lrm: 0.86 | dt: 645.04ms | tok/sec: 812,795 | mfu: 50.80 | epoch: 2 | total time: 102.05m | eta: 77.5m +Step 09500 | Validation bpb: 0.809589 +step 09500/16704 (56.87%) | loss: 2.654821 | lrm: 0.86 | dt: 645.79ms | tok/sec: 811,857 | mfu: 50.74 | epoch: 2 | total time: 102.07m | eta: 77.5m +step 09501/16704 (56.88%) | loss: 2.664990 | lrm: 0.86 | dt: 648.93ms | tok/sec: 807,921 | mfu: 50.50 | epoch: 2 | total time: 102.08m | eta: 77.5m +step 09502/16704 (56.88%) | loss: 2.654229 | lrm: 0.86 | dt: 644.68ms | tok/sec: 813,254 | mfu: 50.83 | epoch: 2 | total time: 102.09m | eta: 77.5m +step 09503/16704 (56.89%) | loss: 2.660314 | lrm: 0.86 | dt: 643.68ms | tok/sec: 814,516 | mfu: 50.91 | epoch: 2 | total time: 102.10m | eta: 77.4m +step 09504/16704 (56.90%) | loss: 2.643559 | lrm: 0.86 | dt: 648.69ms | tok/sec: 808,221 | mfu: 50.52 | epoch: 2 | total time: 102.11m | eta: 77.4m +step 09505/16704 (56.90%) | loss: 2.639409 | lrm: 0.86 | dt: 641.39ms | tok/sec: 817,426 | mfu: 51.09 | epoch: 2 | total time: 102.12m | eta: 77.4m +step 09506/16704 (56.91%) | loss: 2.641547 | lrm: 0.86 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 2 | total time: 102.13m | eta: 77.4m +step 09507/16704 (56.91%) | loss: 2.635706 | lrm: 0.86 | dt: 645.68ms | tok/sec: 811,996 | mfu: 50.75 | epoch: 2 | total time: 102.14m | eta: 77.4m +step 09508/16704 (56.92%) | loss: 2.636742 | lrm: 0.86 | dt: 643.22ms | tok/sec: 815,100 | mfu: 50.94 | epoch: 2 | total time: 102.15m | eta: 77.4m +step 09509/16704 (56.93%) | loss: 2.627743 | lrm: 0.86 | dt: 643.85ms | tok/sec: 814,298 | mfu: 50.89 | epoch: 2 | total time: 102.16m | eta: 77.4m +step 09510/16704 (56.93%) | loss: 2.626826 | lrm: 0.86 | 
dt: 642.88ms | tok/sec: 815,535 | mfu: 50.97 | epoch: 2 | total time: 102.17m | eta: 77.4m +step 09511/16704 (56.94%) | loss: 2.629202 | lrm: 0.86 | dt: 644.83ms | tok/sec: 813,061 | mfu: 50.82 | epoch: 2 | total time: 102.18m | eta: 77.4m +step 09512/16704 (56.94%) | loss: 2.635338 | lrm: 0.86 | dt: 645.07ms | tok/sec: 812,759 | mfu: 50.80 | epoch: 2 | total time: 102.19m | eta: 77.4m +step 09513/16704 (56.95%) | loss: 2.635246 | lrm: 0.86 | dt: 643.81ms | tok/sec: 814,357 | mfu: 50.90 | epoch: 2 | total time: 102.21m | eta: 77.3m +step 09514/16704 (56.96%) | loss: 2.642252 | lrm: 0.86 | dt: 642.99ms | tok/sec: 815,393 | mfu: 50.96 | epoch: 2 | total time: 102.22m | eta: 77.3m +step 09515/16704 (56.96%) | loss: 2.658137 | lrm: 0.86 | dt: 643.31ms | tok/sec: 814,988 | mfu: 50.94 | epoch: 2 | total time: 102.23m | eta: 77.3m +step 09516/16704 (56.97%) | loss: 2.653171 | lrm: 0.86 | dt: 642.90ms | tok/sec: 815,505 | mfu: 50.97 | epoch: 2 | total time: 102.24m | eta: 77.3m +step 09517/16704 (56.97%) | loss: 2.640058 | lrm: 0.86 | dt: 645.63ms | tok/sec: 812,054 | mfu: 50.75 | epoch: 2 | total time: 102.25m | eta: 77.3m +step 09518/16704 (56.98%) | loss: 2.639905 | lrm: 0.86 | dt: 644.26ms | tok/sec: 813,778 | mfu: 50.86 | epoch: 2 | total time: 102.26m | eta: 77.3m +step 09519/16704 (56.99%) | loss: 2.641150 | lrm: 0.86 | dt: 643.16ms | tok/sec: 815,173 | mfu: 50.95 | epoch: 2 | total time: 102.27m | eta: 77.3m +step 09520/16704 (56.99%) | loss: 2.631336 | lrm: 0.86 | dt: 644.60ms | tok/sec: 813,348 | mfu: 50.84 | epoch: 2 | total time: 102.28m | eta: 77.3m +step 09521/16704 (57.00%) | loss: 2.635498 | lrm: 0.86 | dt: 644.47ms | tok/sec: 813,512 | mfu: 50.85 | epoch: 2 | total time: 102.29m | eta: 77.3m +step 09522/16704 (57.00%) | loss: 2.639690 | lrm: 0.86 | dt: 645.33ms | tok/sec: 812,438 | mfu: 50.78 | epoch: 2 | total time: 102.30m | eta: 77.2m +step 09523/16704 (57.01%) | loss: 2.645795 | lrm: 0.86 | dt: 645.34ms | tok/sec: 812,422 | mfu: 50.78 | epoch: 2 | 
total time: 102.31m | eta: 77.2m +step 09524/16704 (57.02%) | loss: 2.646500 | lrm: 0.86 | dt: 642.56ms | tok/sec: 815,939 | mfu: 51.00 | epoch: 2 | total time: 102.32m | eta: 77.2m +step 09525/16704 (57.02%) | loss: 2.650891 | lrm: 0.86 | dt: 647.48ms | tok/sec: 809,739 | mfu: 50.61 | epoch: 2 | total time: 102.33m | eta: 77.2m +step 09526/16704 (57.03%) | loss: 2.655041 | lrm: 0.86 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 2 | total time: 102.35m | eta: 77.2m +step 09527/16704 (57.03%) | loss: 2.667492 | lrm: 0.86 | dt: 644.96ms | tok/sec: 812,901 | mfu: 50.81 | epoch: 2 | total time: 102.36m | eta: 77.2m +step 09528/16704 (57.04%) | loss: 2.662487 | lrm: 0.86 | dt: 643.61ms | tok/sec: 814,599 | mfu: 50.91 | epoch: 2 | total time: 102.37m | eta: 77.2m +step 09529/16704 (57.05%) | loss: 2.661702 | lrm: 0.86 | dt: 644.39ms | tok/sec: 813,616 | mfu: 50.85 | epoch: 2 | total time: 102.38m | eta: 77.2m +step 09530/16704 (57.05%) | loss: 2.665388 | lrm: 0.86 | dt: 643.00ms | tok/sec: 815,380 | mfu: 50.96 | epoch: 2 | total time: 102.39m | eta: 77.2m +step 09531/16704 (57.06%) | loss: 2.663353 | lrm: 0.86 | dt: 643.50ms | tok/sec: 814,746 | mfu: 50.92 | epoch: 2 | total time: 102.40m | eta: 77.1m +step 09532/16704 (57.06%) | loss: 2.660773 | lrm: 0.86 | dt: 644.98ms | tok/sec: 812,878 | mfu: 50.81 | epoch: 2 | total time: 102.41m | eta: 77.1m +step 09533/16704 (57.07%) | loss: 2.661321 | lrm: 0.86 | dt: 644.16ms | tok/sec: 813,906 | mfu: 50.87 | epoch: 2 | total time: 102.42m | eta: 77.1m +step 09534/16704 (57.08%) | loss: 2.646237 | lrm: 0.86 | dt: 642.75ms | tok/sec: 815,693 | mfu: 50.98 | epoch: 2 | total time: 102.43m | eta: 77.1m +step 09535/16704 (57.08%) | loss: 2.648012 | lrm: 0.86 | dt: 644.03ms | tok/sec: 814,072 | mfu: 50.88 | epoch: 2 | total time: 102.44m | eta: 77.1m +step 09536/16704 (57.09%) | loss: 2.650111 | lrm: 0.86 | dt: 645.98ms | tok/sec: 811,622 | mfu: 50.73 | epoch: 2 | total time: 102.45m | eta: 77.1m +step 09537/16704 (57.09%) | 
loss: 2.648208 | lrm: 0.86 | dt: 642.20ms | tok/sec: 816,393 | mfu: 51.03 | epoch: 2 | total time: 102.46m | eta: 77.1m +step 09538/16704 (57.10%) | loss: 2.636971 | lrm: 0.86 | dt: 644.56ms | tok/sec: 813,402 | mfu: 50.84 | epoch: 2 | total time: 102.47m | eta: 77.1m +step 09539/16704 (57.11%) | loss: 2.629640 | lrm: 0.86 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 2 | total time: 102.48m | eta: 77.1m +step 09540/16704 (57.11%) | loss: 2.644788 | lrm: 0.86 | dt: 643.70ms | tok/sec: 814,485 | mfu: 50.91 | epoch: 2 | total time: 102.50m | eta: 77.0m +step 09541/16704 (57.12%) | loss: 2.662137 | lrm: 0.86 | dt: 644.33ms | tok/sec: 813,695 | mfu: 50.86 | epoch: 2 | total time: 102.51m | eta: 77.0m +step 09542/16704 (57.12%) | loss: 2.670706 | lrm: 0.86 | dt: 642.63ms | tok/sec: 815,845 | mfu: 50.99 | epoch: 2 | total time: 102.52m | eta: 77.0m +step 09543/16704 (57.13%) | loss: 2.657891 | lrm: 0.86 | dt: 644.78ms | tok/sec: 813,127 | mfu: 50.82 | epoch: 2 | total time: 102.53m | eta: 77.0m +step 09544/16704 (57.14%) | loss: 2.649029 | lrm: 0.86 | dt: 644.36ms | tok/sec: 813,657 | mfu: 50.85 | epoch: 2 | total time: 102.54m | eta: 77.0m +step 09545/16704 (57.14%) | loss: 2.635340 | lrm: 0.86 | dt: 644.07ms | tok/sec: 814,017 | mfu: 50.88 | epoch: 2 | total time: 102.55m | eta: 77.0m +step 09546/16704 (57.15%) | loss: 2.624557 | lrm: 0.86 | dt: 644.00ms | tok/sec: 814,113 | mfu: 50.88 | epoch: 2 | total time: 102.56m | eta: 77.0m +step 09547/16704 (57.15%) | loss: 2.624670 | lrm: 0.86 | dt: 644.11ms | tok/sec: 813,966 | mfu: 50.87 | epoch: 2 | total time: 102.57m | eta: 77.0m +step 09548/16704 (57.16%) | loss: 2.622999 | lrm: 0.86 | dt: 643.56ms | tok/sec: 814,663 | mfu: 50.92 | epoch: 2 | total time: 102.58m | eta: 77.0m +step 09549/16704 (57.17%) | loss: 2.641004 | lrm: 0.86 | dt: 645.09ms | tok/sec: 812,735 | mfu: 50.80 | epoch: 2 | total time: 102.59m | eta: 77.0m +step 09550/16704 (57.17%) | loss: 2.650641 | lrm: 0.86 | dt: 643.24ms | tok/sec: 815,070 | 
mfu: 50.94 | epoch: 2 | total time: 102.60m | eta: 76.9m +step 09551/16704 (57.18%) | loss: 2.652366 | lrm: 0.86 | dt: 643.69ms | tok/sec: 814,499 | mfu: 50.91 | epoch: 2 | total time: 102.61m | eta: 76.9m +step 09552/16704 (57.18%) | loss: 2.649262 | lrm: 0.86 | dt: 644.09ms | tok/sec: 813,996 | mfu: 50.88 | epoch: 2 | total time: 102.62m | eta: 76.9m +step 09553/16704 (57.19%) | loss: 2.654800 | lrm: 0.86 | dt: 643.52ms | tok/sec: 814,720 | mfu: 50.92 | epoch: 2 | total time: 102.63m | eta: 76.9m +step 09554/16704 (57.20%) | loss: 2.660193 | lrm: 0.86 | dt: 648.19ms | tok/sec: 808,846 | mfu: 50.55 | epoch: 2 | total time: 102.65m | eta: 76.9m +step 09555/16704 (57.20%) | loss: 2.660066 | lrm: 0.86 | dt: 642.10ms | tok/sec: 816,520 | mfu: 51.03 | epoch: 2 | total time: 102.66m | eta: 76.9m +step 09556/16704 (57.21%) | loss: 2.651444 | lrm: 0.86 | dt: 643.37ms | tok/sec: 814,913 | mfu: 50.93 | epoch: 2 | total time: 102.67m | eta: 76.9m +step 09557/16704 (57.21%) | loss: 2.649855 | lrm: 0.86 | dt: 645.03ms | tok/sec: 812,807 | mfu: 50.80 | epoch: 2 | total time: 102.68m | eta: 76.9m +step 09558/16704 (57.22%) | loss: 2.663133 | lrm: 0.86 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 2 | total time: 102.69m | eta: 76.9m +step 09559/16704 (57.23%) | loss: 2.657943 | lrm: 0.86 | dt: 645.73ms | tok/sec: 811,936 | mfu: 50.75 | epoch: 2 | total time: 102.70m | eta: 76.8m +step 09560/16704 (57.23%) | loss: 2.678535 | lrm: 0.86 | dt: 642.67ms | tok/sec: 815,793 | mfu: 50.99 | epoch: 2 | total time: 102.71m | eta: 76.8m +step 09561/16704 (57.24%) | loss: 2.676549 | lrm: 0.86 | dt: 646.05ms | tok/sec: 811,522 | mfu: 50.72 | epoch: 2 | total time: 102.72m | eta: 76.8m +step 09562/16704 (57.24%) | loss: 2.654622 | lrm: 0.86 | dt: 644.15ms | tok/sec: 813,926 | mfu: 50.87 | epoch: 2 | total time: 102.73m | eta: 76.8m +step 09563/16704 (57.25%) | loss: 2.661135 | lrm: 0.86 | dt: 644.07ms | tok/sec: 814,027 | mfu: 50.88 | epoch: 2 | total time: 102.74m | eta: 76.8m +step 
09564/16704 (57.26%) | loss: 2.658736 | lrm: 0.85 | dt: 643.74ms | tok/sec: 814,437 | mfu: 50.90 | epoch: 2 | total time: 102.75m | eta: 76.8m +step 09565/16704 (57.26%) | loss: 2.664267 | lrm: 0.85 | dt: 643.60ms | tok/sec: 814,620 | mfu: 50.91 | epoch: 2 | total time: 102.76m | eta: 76.8m +step 09566/16704 (57.27%) | loss: 2.688390 | lrm: 0.85 | dt: 643.68ms | tok/sec: 814,513 | mfu: 50.91 | epoch: 2 | total time: 102.77m | eta: 76.8m +step 09567/16704 (57.27%) | loss: 2.686957 | lrm: 0.85 | dt: 644.07ms | tok/sec: 814,025 | mfu: 50.88 | epoch: 2 | total time: 102.79m | eta: 76.8m +step 09568/16704 (57.28%) | loss: 2.676395 | lrm: 0.85 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 2 | total time: 102.80m | eta: 76.7m +step 09569/16704 (57.29%) | loss: 2.671846 | lrm: 0.85 | dt: 642.02ms | tok/sec: 816,620 | mfu: 51.04 | epoch: 2 | total time: 102.81m | eta: 76.7m +step 09570/16704 (57.29%) | loss: 2.679732 | lrm: 0.85 | dt: 644.29ms | tok/sec: 813,746 | mfu: 50.86 | epoch: 2 | total time: 102.82m | eta: 76.7m +step 09571/16704 (57.30%) | loss: 2.679219 | lrm: 0.85 | dt: 644.31ms | tok/sec: 813,717 | mfu: 50.86 | epoch: 2 | total time: 102.83m | eta: 76.7m +step 09572/16704 (57.30%) | loss: 2.674901 | lrm: 0.85 | dt: 644.40ms | tok/sec: 813,603 | mfu: 50.85 | epoch: 2 | total time: 102.84m | eta: 76.7m +step 09573/16704 (57.31%) | loss: 2.676831 | lrm: 0.85 | dt: 642.77ms | tok/sec: 815,670 | mfu: 50.98 | epoch: 2 | total time: 102.85m | eta: 76.7m +step 09574/16704 (57.32%) | loss: 2.688942 | lrm: 0.85 | dt: 643.23ms | tok/sec: 815,082 | mfu: 50.94 | epoch: 2 | total time: 102.86m | eta: 76.7m +step 09575/16704 (57.32%) | loss: 2.683048 | lrm: 0.85 | dt: 644.11ms | tok/sec: 813,971 | mfu: 50.87 | epoch: 2 | total time: 102.87m | eta: 76.7m +step 09576/16704 (57.33%) | loss: 2.686123 | lrm: 0.85 | dt: 643.14ms | tok/sec: 815,200 | mfu: 50.95 | epoch: 2 | total time: 102.88m | eta: 76.7m +step 09577/16704 (57.33%) | loss: 2.690945 | lrm: 0.85 | dt: 
644.32ms | tok/sec: 813,711 | mfu: 50.86 | epoch: 2 | total time: 102.89m | eta: 76.7m +step 09578/16704 (57.34%) | loss: 2.683543 | lrm: 0.85 | dt: 643.47ms | tok/sec: 814,776 | mfu: 50.92 | epoch: 2 | total time: 102.90m | eta: 76.6m +step 09579/16704 (57.35%) | loss: 2.679448 | lrm: 0.85 | dt: 643.99ms | tok/sec: 814,127 | mfu: 50.88 | epoch: 2 | total time: 102.91m | eta: 76.6m +step 09580/16704 (57.35%) | loss: 2.669560 | lrm: 0.85 | dt: 643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 2 | total time: 102.92m | eta: 76.6m +step 09581/16704 (57.36%) | loss: 2.662870 | lrm: 0.85 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 2 | total time: 102.94m | eta: 76.6m +step 09582/16704 (57.36%) | loss: 2.663465 | lrm: 0.85 | dt: 644.70ms | tok/sec: 813,228 | mfu: 50.83 | epoch: 2 | total time: 102.95m | eta: 76.6m +step 09583/16704 (57.37%) | loss: 2.659244 | lrm: 0.85 | dt: 644.66ms | tok/sec: 813,282 | mfu: 50.83 | epoch: 2 | total time: 102.96m | eta: 76.6m +step 09584/16704 (57.38%) | loss: 2.656522 | lrm: 0.85 | dt: 643.07ms | tok/sec: 815,287 | mfu: 50.96 | epoch: 2 | total time: 102.97m | eta: 76.6m +step 09585/16704 (57.38%) | loss: 2.669125 | lrm: 0.85 | dt: 643.35ms | tok/sec: 814,928 | mfu: 50.93 | epoch: 2 | total time: 102.98m | eta: 76.6m +step 09586/16704 (57.39%) | loss: 2.676460 | lrm: 0.85 | dt: 645.04ms | tok/sec: 812,802 | mfu: 50.80 | epoch: 2 | total time: 102.99m | eta: 76.6m +step 09587/16704 (57.39%) | loss: 2.664066 | lrm: 0.85 | dt: 642.69ms | tok/sec: 815,767 | mfu: 50.99 | epoch: 2 | total time: 103.00m | eta: 76.5m +step 09588/16704 (57.40%) | loss: 2.667906 | lrm: 0.85 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 2 | total time: 103.01m | eta: 76.5m +step 09589/16704 (57.41%) | loss: 2.666152 | lrm: 0.85 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 2 | total time: 103.02m | eta: 76.5m +step 09590/16704 (57.41%) | loss: 2.661637 | lrm: 0.85 | dt: 644.56ms | tok/sec: 813,400 | mfu: 50.84 | epoch: 2 | total 
time: 103.03m | eta: 76.5m +step 09591/16704 (57.42%) | loss: 2.664314 | lrm: 0.85 | dt: 642.57ms | tok/sec: 815,922 | mfu: 51.00 | epoch: 2 | total time: 103.04m | eta: 76.5m +step 09592/16704 (57.42%) | loss: 2.665815 | lrm: 0.85 | dt: 643.76ms | tok/sec: 814,413 | mfu: 50.90 | epoch: 2 | total time: 103.05m | eta: 76.5m +step 09593/16704 (57.43%) | loss: 2.659926 | lrm: 0.85 | dt: 642.53ms | tok/sec: 815,978 | mfu: 51.00 | epoch: 2 | total time: 103.06m | eta: 76.5m +step 09594/16704 (57.44%) | loss: 2.662538 | lrm: 0.85 | dt: 643.93ms | tok/sec: 814,201 | mfu: 50.89 | epoch: 2 | total time: 103.07m | eta: 76.5m +step 09595/16704 (57.44%) | loss: 2.658531 | lrm: 0.85 | dt: 643.93ms | tok/sec: 814,202 | mfu: 50.89 | epoch: 2 | total time: 103.09m | eta: 76.5m +step 09596/16704 (57.45%) | loss: 2.655969 | lrm: 0.85 | dt: 642.69ms | tok/sec: 815,770 | mfu: 50.99 | epoch: 2 | total time: 103.10m | eta: 76.4m +step 09597/16704 (57.45%) | loss: 2.644627 | lrm: 0.85 | dt: 643.76ms | tok/sec: 814,419 | mfu: 50.90 | epoch: 2 | total time: 103.11m | eta: 76.4m +step 09598/16704 (57.46%) | loss: 2.643062 | lrm: 0.85 | dt: 644.92ms | tok/sec: 812,950 | mfu: 50.81 | epoch: 2 | total time: 103.12m | eta: 76.4m +step 09599/16704 (57.47%) | loss: 2.646089 | lrm: 0.85 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 2 | total time: 103.13m | eta: 76.4m +step 09600/16704 (57.47%) | loss: 2.643850 | lrm: 0.85 | dt: 644.80ms | tok/sec: 813,103 | mfu: 50.82 | epoch: 2 | total time: 103.14m | eta: 76.4m +step 09601/16704 (57.48%) | loss: 2.649835 | lrm: 0.85 | dt: 643.98ms | tok/sec: 814,139 | mfu: 50.88 | epoch: 2 | total time: 103.15m | eta: 76.4m +step 09602/16704 (57.48%) | loss: 2.650034 | lrm: 0.85 | dt: 643.12ms | tok/sec: 815,221 | mfu: 50.95 | epoch: 2 | total time: 103.16m | eta: 76.4m +step 09603/16704 (57.49%) | loss: 2.646907 | lrm: 0.85 | dt: 645.33ms | tok/sec: 812,434 | mfu: 50.78 | epoch: 2 | total time: 103.17m | eta: 76.4m +step 09604/16704 (57.50%) | loss: 
2.626131 | lrm: 0.85 | dt: 641.91ms | tok/sec: 816,766 | mfu: 51.05 | epoch: 2 | total time: 103.18m | eta: 76.4m +step 09605/16704 (57.50%) | loss: 2.630700 | lrm: 0.85 | dt: 642.97ms | tok/sec: 815,416 | mfu: 50.96 | epoch: 2 | total time: 103.19m | eta: 76.3m +step 09606/16704 (57.51%) | loss: 2.637015 | lrm: 0.85 | dt: 645.78ms | tok/sec: 811,872 | mfu: 50.74 | epoch: 2 | total time: 103.20m | eta: 76.3m +step 09607/16704 (57.51%) | loss: 2.643228 | lrm: 0.85 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 103.21m | eta: 76.3m +step 09608/16704 (57.52%) | loss: 2.650787 | lrm: 0.85 | dt: 645.00ms | tok/sec: 812,851 | mfu: 50.80 | epoch: 2 | total time: 103.23m | eta: 76.3m +step 09609/16704 (57.53%) | loss: 2.645860 | lrm: 0.85 | dt: 642.37ms | tok/sec: 816,173 | mfu: 51.01 | epoch: 2 | total time: 103.24m | eta: 76.3m +step 09610/16704 (57.53%) | loss: 2.649297 | lrm: 0.85 | dt: 642.82ms | tok/sec: 815,603 | mfu: 50.98 | epoch: 2 | total time: 103.25m | eta: 76.3m +step 09611/16704 (57.54%) | loss: 2.645029 | lrm: 0.85 | dt: 643.47ms | tok/sec: 814,784 | mfu: 50.93 | epoch: 2 | total time: 103.26m | eta: 76.3m +step 09612/16704 (57.54%) | loss: 2.637965 | lrm: 0.85 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 2 | total time: 103.27m | eta: 76.3m +step 09613/16704 (57.55%) | loss: 2.630230 | lrm: 0.85 | dt: 644.10ms | tok/sec: 813,989 | mfu: 50.88 | epoch: 2 | total time: 103.28m | eta: 76.3m +step 09614/16704 (57.56%) | loss: 2.649152 | lrm: 0.85 | dt: 643.66ms | tok/sec: 814,539 | mfu: 50.91 | epoch: 2 | total time: 103.29m | eta: 76.3m +step 09615/16704 (57.56%) | loss: 2.643677 | lrm: 0.85 | dt: 645.54ms | tok/sec: 812,168 | mfu: 50.76 | epoch: 2 | total time: 103.30m | eta: 76.2m +step 09616/16704 (57.57%) | loss: 2.652833 | lrm: 0.85 | dt: 643.37ms | tok/sec: 814,904 | mfu: 50.93 | epoch: 2 | total time: 103.31m | eta: 76.2m +step 09617/16704 (57.57%) | loss: 2.646698 | lrm: 0.85 | dt: 642.68ms | tok/sec: 815,783 | mfu: 
50.99 | epoch: 2 | total time: 103.32m | eta: 76.2m +step 09618/16704 (57.58%) | loss: 2.644927 | lrm: 0.85 | dt: 643.43ms | tok/sec: 814,832 | mfu: 50.93 | epoch: 2 | total time: 103.33m | eta: 76.2m +step 09619/16704 (57.59%) | loss: 2.638642 | lrm: 0.85 | dt: 642.83ms | tok/sec: 815,594 | mfu: 50.98 | epoch: 2 | total time: 103.34m | eta: 76.2m +step 09620/16704 (57.59%) | loss: 2.643537 | lrm: 0.85 | dt: 641.29ms | tok/sec: 817,554 | mfu: 51.10 | epoch: 2 | total time: 103.35m | eta: 76.2m +step 09621/16704 (57.60%) | loss: 2.660098 | lrm: 0.85 | dt: 645.23ms | tok/sec: 812,560 | mfu: 50.79 | epoch: 2 | total time: 103.36m | eta: 76.2m +step 09622/16704 (57.60%) | loss: 2.650840 | lrm: 0.85 | dt: 642.22ms | tok/sec: 816,363 | mfu: 51.02 | epoch: 2 | total time: 103.38m | eta: 76.2m +step 09623/16704 (57.61%) | loss: 2.645807 | lrm: 0.85 | dt: 644.43ms | tok/sec: 813,562 | mfu: 50.85 | epoch: 2 | total time: 103.39m | eta: 76.2m +step 09624/16704 (57.61%) | loss: 2.621144 | lrm: 0.85 | dt: 642.35ms | tok/sec: 816,205 | mfu: 51.01 | epoch: 2 | total time: 103.40m | eta: 76.1m +step 09625/16704 (57.62%) | loss: 2.625425 | lrm: 0.85 | dt: 643.41ms | tok/sec: 814,859 | mfu: 50.93 | epoch: 2 | total time: 103.41m | eta: 76.1m +step 09626/16704 (57.63%) | loss: 2.620294 | lrm: 0.85 | dt: 642.11ms | tok/sec: 816,509 | mfu: 51.03 | epoch: 2 | total time: 103.42m | eta: 76.1m +step 09627/16704 (57.63%) | loss: 2.622651 | lrm: 0.85 | dt: 644.83ms | tok/sec: 813,062 | mfu: 50.82 | epoch: 2 | total time: 103.43m | eta: 76.1m +step 09628/16704 (57.64%) | loss: 2.635245 | lrm: 0.85 | dt: 644.49ms | tok/sec: 813,498 | mfu: 50.84 | epoch: 2 | total time: 103.44m | eta: 76.1m +step 09629/16704 (57.64%) | loss: 2.643203 | lrm: 0.85 | dt: 645.61ms | tok/sec: 812,079 | mfu: 50.76 | epoch: 2 | total time: 103.45m | eta: 76.1m +step 09630/16704 (57.65%) | loss: 2.647642 | lrm: 0.85 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 2 | total time: 103.46m | eta: 76.1m +step 
09631/16704 (57.66%) | loss: 2.653694 | lrm: 0.85 | dt: 641.72ms | tok/sec: 817,003 | mfu: 51.06 | epoch: 2 | total time: 103.47m | eta: 76.1m +step 09632/16704 (57.66%) | loss: 2.655490 | lrm: 0.85 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 2 | total time: 103.48m | eta: 76.1m +step 09633/16704 (57.67%) | loss: 2.654293 | lrm: 0.85 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 2 | total time: 103.49m | eta: 76.0m +step 09634/16704 (57.67%) | loss: 2.646426 | lrm: 0.85 | dt: 644.55ms | tok/sec: 813,412 | mfu: 50.84 | epoch: 2 | total time: 103.50m | eta: 76.0m +step 09635/16704 (57.68%) | loss: 2.642794 | lrm: 0.85 | dt: 644.64ms | tok/sec: 813,299 | mfu: 50.83 | epoch: 2 | total time: 103.51m | eta: 76.0m +step 09636/16704 (57.69%) | loss: 2.651385 | lrm: 0.85 | dt: 642.56ms | tok/sec: 815,931 | mfu: 51.00 | epoch: 2 | total time: 103.53m | eta: 76.0m +step 09637/16704 (57.69%) | loss: 2.641535 | lrm: 0.85 | dt: 645.52ms | tok/sec: 812,194 | mfu: 50.76 | epoch: 2 | total time: 103.54m | eta: 76.0m +step 09638/16704 (57.70%) | loss: 2.644170 | lrm: 0.85 | dt: 643.60ms | tok/sec: 814,622 | mfu: 50.92 | epoch: 2 | total time: 103.55m | eta: 76.0m +step 09639/16704 (57.70%) | loss: 2.638756 | lrm: 0.85 | dt: 643.50ms | tok/sec: 814,750 | mfu: 50.92 | epoch: 2 | total time: 103.56m | eta: 76.0m +step 09640/16704 (57.71%) | loss: 2.632569 | lrm: 0.85 | dt: 644.25ms | tok/sec: 813,791 | mfu: 50.86 | epoch: 2 | total time: 103.57m | eta: 76.0m +step 09641/16704 (57.72%) | loss: 2.635522 | lrm: 0.85 | dt: 642.69ms | tok/sec: 815,769 | mfu: 50.99 | epoch: 2 | total time: 103.58m | eta: 76.0m +step 09642/16704 (57.72%) | loss: 2.645466 | lrm: 0.85 | dt: 643.61ms | tok/sec: 814,605 | mfu: 50.91 | epoch: 2 | total time: 103.59m | eta: 76.0m +step 09643/16704 (57.73%) | loss: 2.647511 | lrm: 0.85 | dt: 643.42ms | tok/sec: 814,843 | mfu: 50.93 | epoch: 2 | total time: 103.60m | eta: 75.9m +step 09644/16704 (57.73%) | loss: 2.646656 | lrm: 0.85 | dt: 
642.89ms | tok/sec: 815,514 | mfu: 50.97 | epoch: 2 | total time: 103.61m | eta: 75.9m +step 09645/16704 (57.74%) | loss: 2.641794 | lrm: 0.85 | dt: 643.55ms | tok/sec: 814,675 | mfu: 50.92 | epoch: 2 | total time: 103.62m | eta: 75.9m +step 09646/16704 (57.75%) | loss: 2.653320 | lrm: 0.85 | dt: 642.07ms | tok/sec: 816,557 | mfu: 51.04 | epoch: 2 | total time: 103.63m | eta: 75.9m +step 09647/16704 (57.75%) | loss: 2.647402 | lrm: 0.84 | dt: 643.15ms | tok/sec: 815,190 | mfu: 50.95 | epoch: 2 | total time: 103.64m | eta: 75.9m +step 09648/16704 (57.76%) | loss: 2.641766 | lrm: 0.84 | dt: 642.26ms | tok/sec: 816,320 | mfu: 51.02 | epoch: 2 | total time: 103.65m | eta: 75.9m +step 09649/16704 (57.76%) | loss: 2.648393 | lrm: 0.84 | dt: 642.33ms | tok/sec: 816,231 | mfu: 51.02 | epoch: 2 | total time: 103.66m | eta: 75.9m +step 09650/16704 (57.77%) | loss: 2.654053 | lrm: 0.84 | dt: 642.39ms | tok/sec: 816,152 | mfu: 51.01 | epoch: 2 | total time: 103.68m | eta: 75.9m +step 09651/16704 (57.78%) | loss: 2.663412 | lrm: 0.84 | dt: 642.91ms | tok/sec: 815,492 | mfu: 50.97 | epoch: 2 | total time: 103.69m | eta: 75.9m +step 09652/16704 (57.78%) | loss: 2.667999 | lrm: 0.84 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 2 | total time: 103.70m | eta: 75.8m +step 09653/16704 (57.79%) | loss: 2.676473 | lrm: 0.84 | dt: 642.77ms | tok/sec: 815,670 | mfu: 50.98 | epoch: 2 | total time: 103.71m | eta: 75.8m +step 09654/16704 (57.79%) | loss: 2.661927 | lrm: 0.84 | dt: 643.14ms | tok/sec: 815,198 | mfu: 50.95 | epoch: 2 | total time: 103.72m | eta: 75.8m +step 09655/16704 (57.80%) | loss: 2.661324 | lrm: 0.84 | dt: 642.30ms | tok/sec: 816,264 | mfu: 51.02 | epoch: 2 | total time: 103.73m | eta: 75.8m +step 09656/16704 (57.81%) | loss: 2.647261 | lrm: 0.84 | dt: 641.76ms | tok/sec: 816,949 | mfu: 51.06 | epoch: 2 | total time: 103.74m | eta: 75.8m +step 09657/16704 (57.81%) | loss: 2.653557 | lrm: 0.84 | dt: 648.52ms | tok/sec: 808,432 | mfu: 50.53 | epoch: 2 | total 
time: 103.75m | eta: 75.8m +step 09658/16704 (57.82%) | loss: 2.657936 | lrm: 0.84 | dt: 640.63ms | tok/sec: 818,398 | mfu: 51.15 | epoch: 2 | total time: 103.76m | eta: 75.8m +step 09659/16704 (57.82%) | loss: 2.670713 | lrm: 0.84 | dt: 642.98ms | tok/sec: 815,405 | mfu: 50.96 | epoch: 2 | total time: 103.77m | eta: 75.8m +step 09660/16704 (57.83%) | loss: 2.667407 | lrm: 0.84 | dt: 644.75ms | tok/sec: 813,169 | mfu: 50.82 | epoch: 2 | total time: 103.78m | eta: 75.8m +step 09661/16704 (57.84%) | loss: 2.654390 | lrm: 0.84 | dt: 645.39ms | tok/sec: 812,358 | mfu: 50.77 | epoch: 2 | total time: 103.79m | eta: 75.7m +step 09662/16704 (57.84%) | loss: 2.653998 | lrm: 0.84 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 2 | total time: 103.80m | eta: 75.7m +step 09663/16704 (57.85%) | loss: 2.648817 | lrm: 0.84 | dt: 644.49ms | tok/sec: 813,491 | mfu: 50.84 | epoch: 2 | total time: 103.82m | eta: 75.7m +step 09664/16704 (57.85%) | loss: 2.646732 | lrm: 0.84 | dt: 641.44ms | tok/sec: 817,354 | mfu: 51.09 | epoch: 2 | total time: 103.83m | eta: 75.7m +step 09665/16704 (57.86%) | loss: 2.645129 | lrm: 0.84 | dt: 645.27ms | tok/sec: 812,510 | mfu: 50.78 | epoch: 2 | total time: 103.84m | eta: 75.7m +step 09666/16704 (57.87%) | loss: 2.643612 | lrm: 0.84 | dt: 644.27ms | tok/sec: 813,771 | mfu: 50.86 | epoch: 2 | total time: 103.85m | eta: 75.7m +step 09667/16704 (57.87%) | loss: 2.645197 | lrm: 0.84 | dt: 642.38ms | tok/sec: 816,170 | mfu: 51.01 | epoch: 2 | total time: 103.86m | eta: 75.7m +step 09668/16704 (57.88%) | loss: 2.648602 | lrm: 0.84 | dt: 642.59ms | tok/sec: 815,895 | mfu: 50.99 | epoch: 2 | total time: 103.87m | eta: 75.7m +step 09669/16704 (57.88%) | loss: 2.641472 | lrm: 0.84 | dt: 643.65ms | tok/sec: 814,560 | mfu: 50.91 | epoch: 2 | total time: 103.88m | eta: 75.7m +step 09670/16704 (57.89%) | loss: 2.644432 | lrm: 0.84 | dt: 643.14ms | tok/sec: 815,200 | mfu: 50.95 | epoch: 2 | total time: 103.89m | eta: 75.6m +step 09671/16704 (57.90%) | loss: 
2.654335 | lrm: 0.84 | dt: 643.62ms | tok/sec: 814,593 | mfu: 50.91 | epoch: 2 | total time: 103.90m | eta: 75.6m +step 09672/16704 (57.90%) | loss: 2.654150 | lrm: 0.84 | dt: 642.85ms | tok/sec: 815,573 | mfu: 50.97 | epoch: 2 | total time: 103.91m | eta: 75.6m +step 09673/16704 (57.91%) | loss: 2.661518 | lrm: 0.84 | dt: 643.96ms | tok/sec: 814,165 | mfu: 50.89 | epoch: 2 | total time: 103.92m | eta: 75.6m +step 09674/16704 (57.91%) | loss: 2.658110 | lrm: 0.84 | dt: 644.43ms | tok/sec: 813,571 | mfu: 50.85 | epoch: 2 | total time: 103.93m | eta: 75.6m +step 09675/16704 (57.92%) | loss: 2.666504 | lrm: 0.84 | dt: 643.63ms | tok/sec: 814,585 | mfu: 50.91 | epoch: 2 | total time: 103.94m | eta: 75.6m +step 09676/16704 (57.93%) | loss: 2.670673 | lrm: 0.84 | dt: 643.34ms | tok/sec: 814,947 | mfu: 50.94 | epoch: 2 | total time: 103.95m | eta: 75.6m +step 09677/16704 (57.93%) | loss: 2.660189 | lrm: 0.84 | dt: 645.78ms | tok/sec: 811,867 | mfu: 50.74 | epoch: 2 | total time: 103.97m | eta: 75.6m +step 09678/16704 (57.94%) | loss: 2.663914 | lrm: 0.84 | dt: 645.12ms | tok/sec: 812,698 | mfu: 50.79 | epoch: 2 | total time: 103.98m | eta: 75.6m +step 09679/16704 (57.94%) | loss: 2.663081 | lrm: 0.84 | dt: 644.06ms | tok/sec: 814,041 | mfu: 50.88 | epoch: 2 | total time: 103.99m | eta: 75.6m +step 09680/16704 (57.95%) | loss: 2.648382 | lrm: 0.84 | dt: 645.03ms | tok/sec: 812,808 | mfu: 50.80 | epoch: 2 | total time: 104.00m | eta: 75.5m +step 09681/16704 (57.96%) | loss: 2.649077 | lrm: 0.84 | dt: 644.21ms | tok/sec: 813,843 | mfu: 50.87 | epoch: 2 | total time: 104.01m | eta: 75.5m +step 09682/16704 (57.96%) | loss: 2.643279 | lrm: 0.84 | dt: 645.69ms | tok/sec: 811,983 | mfu: 50.75 | epoch: 2 | total time: 104.02m | eta: 75.5m +step 09683/16704 (57.97%) | loss: 2.642810 | lrm: 0.84 | dt: 642.08ms | tok/sec: 816,545 | mfu: 51.04 | epoch: 2 | total time: 104.03m | eta: 75.5m +step 09684/16704 (57.97%) | loss: 2.635333 | lrm: 0.84 | dt: 641.73ms | tok/sec: 816,997 | mfu: 
51.06 | epoch: 2 | total time: 104.04m | eta: 75.5m +step 09685/16704 (57.98%) | loss: 2.640001 | lrm: 0.84 | dt: 643.96ms | tok/sec: 814,157 | mfu: 50.89 | epoch: 2 | total time: 104.05m | eta: 75.5m +step 09686/16704 (57.99%) | loss: 2.644215 | lrm: 0.84 | dt: 642.55ms | tok/sec: 815,953 | mfu: 51.00 | epoch: 2 | total time: 104.06m | eta: 75.5m +step 09687/16704 (57.99%) | loss: 2.630466 | lrm: 0.84 | dt: 644.41ms | tok/sec: 813,599 | mfu: 50.85 | epoch: 2 | total time: 104.07m | eta: 75.5m +step 09688/16704 (58.00%) | loss: 2.628371 | lrm: 0.84 | dt: 646.34ms | tok/sec: 811,164 | mfu: 50.70 | epoch: 2 | total time: 104.08m | eta: 75.5m +step 09689/16704 (58.00%) | loss: 2.633297 | lrm: 0.84 | dt: 641.89ms | tok/sec: 816,783 | mfu: 51.05 | epoch: 2 | total time: 104.09m | eta: 75.4m +step 09690/16704 (58.01%) | loss: 2.632044 | lrm: 0.84 | dt: 643.40ms | tok/sec: 814,873 | mfu: 50.93 | epoch: 2 | total time: 104.10m | eta: 75.4m +step 09691/16704 (58.02%) | loss: 2.628451 | lrm: 0.84 | dt: 642.12ms | tok/sec: 816,492 | mfu: 51.03 | epoch: 2 | total time: 104.12m | eta: 75.4m +step 09692/16704 (58.02%) | loss: 2.628630 | lrm: 0.84 | dt: 643.35ms | tok/sec: 814,935 | mfu: 50.93 | epoch: 2 | total time: 104.13m | eta: 75.4m +step 09693/16704 (58.03%) | loss: 2.626346 | lrm: 0.84 | dt: 644.67ms | tok/sec: 813,262 | mfu: 50.83 | epoch: 2 | total time: 104.14m | eta: 75.4m +step 09694/16704 (58.03%) | loss: 2.636559 | lrm: 0.84 | dt: 642.92ms | tok/sec: 815,479 | mfu: 50.97 | epoch: 2 | total time: 104.15m | eta: 75.4m +step 09695/16704 (58.04%) | loss: 2.643163 | lrm: 0.84 | dt: 641.67ms | tok/sec: 817,063 | mfu: 51.07 | epoch: 2 | total time: 104.16m | eta: 75.4m +step 09696/16704 (58.05%) | loss: 2.637753 | lrm: 0.84 | dt: 644.45ms | tok/sec: 813,548 | mfu: 50.85 | epoch: 2 | total time: 104.17m | eta: 75.4m +step 09697/16704 (58.05%) | loss: 2.638208 | lrm: 0.84 | dt: 644.10ms | tok/sec: 813,991 | mfu: 50.88 | epoch: 2 | total time: 104.18m | eta: 75.4m +step 
09698/16704 (58.06%) | loss: 2.639942 | lrm: 0.84 | dt: 643.62ms | tok/sec: 814,593 | mfu: 50.91 | epoch: 2 | total time: 104.19m | eta: 75.3m +step 09699/16704 (58.06%) | loss: 2.628905 | lrm: 0.84 | dt: 643.85ms | tok/sec: 814,306 | mfu: 50.90 | epoch: 2 | total time: 104.20m | eta: 75.3m +step 09700/16704 (58.07%) | loss: 2.627112 | lrm: 0.84 | dt: 643.32ms | tok/sec: 814,970 | mfu: 50.94 | epoch: 2 | total time: 104.21m | eta: 75.3m +step 09701/16704 (58.08%) | loss: 2.628364 | lrm: 0.84 | dt: 643.86ms | tok/sec: 814,290 | mfu: 50.89 | epoch: 2 | total time: 104.22m | eta: 75.3m +step 09702/16704 (58.08%) | loss: 2.607783 | lrm: 0.84 | dt: 644.04ms | tok/sec: 814,064 | mfu: 50.88 | epoch: 2 | total time: 104.23m | eta: 75.3m +step 09703/16704 (58.09%) | loss: 2.604092 | lrm: 0.84 | dt: 642.89ms | tok/sec: 815,517 | mfu: 50.97 | epoch: 2 | total time: 104.24m | eta: 75.3m +step 09704/16704 (58.09%) | loss: 2.590144 | lrm: 0.84 | dt: 644.45ms | tok/sec: 813,538 | mfu: 50.85 | epoch: 2 | total time: 104.25m | eta: 75.3m +step 09705/16704 (58.10%) | loss: 2.596664 | lrm: 0.84 | dt: 642.75ms | tok/sec: 815,692 | mfu: 50.98 | epoch: 2 | total time: 104.27m | eta: 75.3m +step 09706/16704 (58.11%) | loss: 2.602590 | lrm: 0.84 | dt: 644.85ms | tok/sec: 813,037 | mfu: 50.82 | epoch: 2 | total time: 104.28m | eta: 75.3m +step 09707/16704 (58.11%) | loss: 2.606229 | lrm: 0.84 | dt: 645.58ms | tok/sec: 812,120 | mfu: 50.76 | epoch: 2 | total time: 104.29m | eta: 75.2m +step 09708/16704 (58.12%) | loss: 2.599856 | lrm: 0.84 | dt: 643.20ms | tok/sec: 815,124 | mfu: 50.95 | epoch: 2 | total time: 104.30m | eta: 75.2m +step 09709/16704 (58.12%) | loss: 2.611895 | lrm: 0.84 | dt: 644.75ms | tok/sec: 813,162 | mfu: 50.82 | epoch: 2 | total time: 104.31m | eta: 75.2m +step 09710/16704 (58.13%) | loss: 2.604615 | lrm: 0.84 | dt: 643.69ms | tok/sec: 814,502 | mfu: 50.91 | epoch: 2 | total time: 104.32m | eta: 75.2m +step 09711/16704 (58.14%) | loss: 2.614829 | lrm: 0.84 | dt: 
643.67ms | tok/sec: 814,526 | mfu: 50.91 | epoch: 2 | total time: 104.33m | eta: 75.2m +step 09712/16704 (58.14%) | loss: 2.619615 | lrm: 0.84 | dt: 644.26ms | tok/sec: 813,785 | mfu: 50.86 | epoch: 2 | total time: 104.34m | eta: 75.2m +step 09713/16704 (58.15%) | loss: 2.620246 | lrm: 0.84 | dt: 643.00ms | tok/sec: 815,382 | mfu: 50.96 | epoch: 2 | total time: 104.35m | eta: 75.2m +step 09714/16704 (58.15%) | loss: 2.630595 | lrm: 0.84 | dt: 643.45ms | tok/sec: 814,809 | mfu: 50.93 | epoch: 2 | total time: 104.36m | eta: 75.2m +step 09715/16704 (58.16%) | loss: 2.626281 | lrm: 0.84 | dt: 642.13ms | tok/sec: 816,487 | mfu: 51.03 | epoch: 2 | total time: 104.37m | eta: 75.2m +step 09716/16704 (58.17%) | loss: 2.627277 | lrm: 0.84 | dt: 644.58ms | tok/sec: 813,378 | mfu: 50.84 | epoch: 2 | total time: 104.38m | eta: 75.2m +step 09717/16704 (58.17%) | loss: 2.631892 | lrm: 0.84 | dt: 642.55ms | tok/sec: 815,944 | mfu: 51.00 | epoch: 2 | total time: 104.39m | eta: 75.1m +step 09718/16704 (58.18%) | loss: 2.640363 | lrm: 0.84 | dt: 643.39ms | tok/sec: 814,888 | mfu: 50.93 | epoch: 2 | total time: 104.41m | eta: 75.1m +step 09719/16704 (58.18%) | loss: 2.638746 | lrm: 0.84 | dt: 643.97ms | tok/sec: 814,147 | mfu: 50.89 | epoch: 2 | total time: 104.42m | eta: 75.1m +step 09720/16704 (58.19%) | loss: 2.643895 | lrm: 0.84 | dt: 646.45ms | tok/sec: 811,026 | mfu: 50.69 | epoch: 2 | total time: 104.43m | eta: 75.1m +step 09721/16704 (58.20%) | loss: 2.645318 | lrm: 0.84 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 2 | total time: 104.44m | eta: 75.1m +step 09722/16704 (58.20%) | loss: 2.635752 | lrm: 0.84 | dt: 643.40ms | tok/sec: 814,870 | mfu: 50.93 | epoch: 2 | total time: 104.45m | eta: 75.1m +step 09723/16704 (58.21%) | loss: 2.629353 | lrm: 0.84 | dt: 644.25ms | tok/sec: 813,792 | mfu: 50.86 | epoch: 2 | total time: 104.46m | eta: 75.1m +step 09724/16704 (58.21%) | loss: 2.631047 | lrm: 0.84 | dt: 643.13ms | tok/sec: 815,219 | mfu: 50.95 | epoch: 2 | total 
time: 104.47m | eta: 75.1m +step 09725/16704 (58.22%) | loss: 2.628291 | lrm: 0.84 | dt: 645.72ms | tok/sec: 811,944 | mfu: 50.75 | epoch: 2 | total time: 104.48m | eta: 75.1m +step 09726/16704 (58.23%) | loss: 2.623633 | lrm: 0.84 | dt: 641.87ms | tok/sec: 816,812 | mfu: 51.05 | epoch: 2 | total time: 104.49m | eta: 75.0m +step 09727/16704 (58.23%) | loss: 2.607863 | lrm: 0.84 | dt: 643.73ms | tok/sec: 814,453 | mfu: 50.90 | epoch: 2 | total time: 104.50m | eta: 75.0m +step 09728/16704 (58.24%) | loss: 2.607587 | lrm: 0.84 | dt: 645.28ms | tok/sec: 812,501 | mfu: 50.78 | epoch: 2 | total time: 104.51m | eta: 75.0m +step 09729/16704 (58.24%) | loss: 2.612808 | lrm: 0.84 | dt: 643.94ms | tok/sec: 814,181 | mfu: 50.89 | epoch: 2 | total time: 104.52m | eta: 75.0m +step 09730/16704 (58.25%) | loss: 2.622370 | lrm: 0.84 | dt: 643.86ms | tok/sec: 814,292 | mfu: 50.89 | epoch: 2 | total time: 104.53m | eta: 75.0m +step 09731/16704 (58.26%) | loss: 2.626054 | lrm: 0.83 | dt: 644.66ms | tok/sec: 813,274 | mfu: 50.83 | epoch: 2 | total time: 104.54m | eta: 75.0m +step 09732/16704 (58.26%) | loss: 2.621591 | lrm: 0.83 | dt: 642.68ms | tok/sec: 815,787 | mfu: 50.99 | epoch: 2 | total time: 104.56m | eta: 75.0m +step 09733/16704 (58.27%) | loss: 2.617959 | lrm: 0.83 | dt: 645.07ms | tok/sec: 812,758 | mfu: 50.80 | epoch: 2 | total time: 104.57m | eta: 75.0m +step 09734/16704 (58.27%) | loss: 2.626110 | lrm: 0.83 | dt: 644.01ms | tok/sec: 814,097 | mfu: 50.88 | epoch: 2 | total time: 104.58m | eta: 75.0m +step 09735/16704 (58.28%) | loss: 2.640728 | lrm: 0.83 | dt: 644.40ms | tok/sec: 813,602 | mfu: 50.85 | epoch: 2 | total time: 104.59m | eta: 74.9m +step 09736/16704 (58.29%) | loss: 2.638176 | lrm: 0.83 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 2 | total time: 104.60m | eta: 74.9m +step 09737/16704 (58.29%) | loss: 2.628333 | lrm: 0.83 | dt: 644.92ms | tok/sec: 812,951 | mfu: 50.81 | epoch: 2 | total time: 104.61m | eta: 74.9m +step 09738/16704 (58.30%) | loss: 
2.617794 | lrm: 0.83 | dt: 643.34ms | tok/sec: 814,943 | mfu: 50.94 | epoch: 2 | total time: 104.62m | eta: 74.9m +step 09739/16704 (58.30%) | loss: 2.622182 | lrm: 0.83 | dt: 644.42ms | tok/sec: 813,586 | mfu: 50.85 | epoch: 2 | total time: 104.63m | eta: 74.9m +step 09740/16704 (58.31%) | loss: 2.619927 | lrm: 0.83 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 2 | total time: 104.64m | eta: 74.9m +step 09741/16704 (58.32%) | loss: 2.632637 | lrm: 0.83 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 2 | total time: 104.65m | eta: 74.9m +step 09742/16704 (58.32%) | loss: 2.634066 | lrm: 0.83 | dt: 643.29ms | tok/sec: 815,012 | mfu: 50.94 | epoch: 2 | total time: 104.66m | eta: 74.9m +step 09743/16704 (58.33%) | loss: 2.639695 | lrm: 0.83 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 2 | total time: 104.67m | eta: 74.9m +step 09744/16704 (58.33%) | loss: 2.644605 | lrm: 0.83 | dt: 643.69ms | tok/sec: 814,501 | mfu: 50.91 | epoch: 2 | total time: 104.68m | eta: 74.9m +step 09745/16704 (58.34%) | loss: 2.636731 | lrm: 0.83 | dt: 643.45ms | tok/sec: 814,804 | mfu: 50.93 | epoch: 2 | total time: 104.70m | eta: 74.8m +step 09746/16704 (58.35%) | loss: 2.641888 | lrm: 0.83 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 2 | total time: 104.71m | eta: 74.8m +step 09747/16704 (58.35%) | loss: 2.639232 | lrm: 0.83 | dt: 645.36ms | tok/sec: 812,389 | mfu: 50.78 | epoch: 2 | total time: 104.72m | eta: 74.8m +step 09748/16704 (58.36%) | loss: 2.647363 | lrm: 0.83 | dt: 644.70ms | tok/sec: 813,223 | mfu: 50.83 | epoch: 2 | total time: 104.73m | eta: 74.8m +step 09749/16704 (58.36%) | loss: 2.640698 | lrm: 0.83 | dt: 641.94ms | tok/sec: 816,725 | mfu: 51.05 | epoch: 2 | total time: 104.74m | eta: 74.8m +Step 09750 | Validation bpb: 0.807252 +step 09750/16704 (58.37%) | loss: 2.635741 | lrm: 0.83 | dt: 647.95ms | tok/sec: 809,151 | mfu: 50.57 | epoch: 2 | total time: 104.75m | eta: 74.8m +step 09751/16704 (58.38%) | loss: 2.641293 | lrm: 0.83 | 
dt: 646.51ms | tok/sec: 810,954 | mfu: 50.69 | epoch: 2 | total time: 104.76m | eta: 74.8m +step 09752/16704 (58.38%) | loss: 2.642072 | lrm: 0.83 | dt: 648.65ms | tok/sec: 808,275 | mfu: 50.52 | epoch: 2 | total time: 104.77m | eta: 74.8m +step 09753/16704 (58.39%) | loss: 2.637993 | lrm: 0.83 | dt: 641.31ms | tok/sec: 817,523 | mfu: 51.10 | epoch: 2 | total time: 104.78m | eta: 74.8m +step 09754/16704 (58.39%) | loss: 2.656406 | lrm: 0.83 | dt: 645.47ms | tok/sec: 812,253 | mfu: 50.77 | epoch: 2 | total time: 104.79m | eta: 74.7m +step 09755/16704 (58.40%) | loss: 2.652191 | lrm: 0.83 | dt: 642.85ms | tok/sec: 815,569 | mfu: 50.97 | epoch: 2 | total time: 104.80m | eta: 74.7m +step 09756/16704 (58.41%) | loss: 2.639716 | lrm: 0.83 | dt: 643.62ms | tok/sec: 814,596 | mfu: 50.91 | epoch: 2 | total time: 104.81m | eta: 74.7m +step 09757/16704 (58.41%) | loss: 2.643249 | lrm: 0.83 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 2 | total time: 104.82m | eta: 74.7m +step 09758/16704 (58.42%) | loss: 2.643620 | lrm: 0.83 | dt: 644.33ms | tok/sec: 813,700 | mfu: 50.86 | epoch: 2 | total time: 104.83m | eta: 74.7m +step 09759/16704 (58.42%) | loss: 2.646345 | lrm: 0.83 | dt: 643.87ms | tok/sec: 814,269 | mfu: 50.89 | epoch: 2 | total time: 104.85m | eta: 74.7m +step 09760/16704 (58.43%) | loss: 2.649066 | lrm: 0.83 | dt: 644.21ms | tok/sec: 813,847 | mfu: 50.87 | epoch: 2 | total time: 104.86m | eta: 74.7m +step 09761/16704 (58.44%) | loss: 2.641361 | lrm: 0.83 | dt: 642.91ms | tok/sec: 815,486 | mfu: 50.97 | epoch: 2 | total time: 104.87m | eta: 74.7m +step 09762/16704 (58.44%) | loss: 2.641815 | lrm: 0.83 | dt: 646.01ms | tok/sec: 811,577 | mfu: 50.72 | epoch: 2 | total time: 104.88m | eta: 74.7m +step 09763/16704 (58.45%) | loss: 2.646350 | lrm: 0.83 | dt: 642.17ms | tok/sec: 816,429 | mfu: 51.03 | epoch: 2 | total time: 104.89m | eta: 74.6m +step 09764/16704 (58.45%) | loss: 2.651372 | lrm: 0.83 | dt: 642.27ms | tok/sec: 816,303 | mfu: 51.02 | epoch: 2 | 
total time: 104.90m | eta: 74.6m +step 09765/16704 (58.46%) | loss: 2.638996 | lrm: 0.83 | dt: 645.74ms | tok/sec: 811,923 | mfu: 50.75 | epoch: 2 | total time: 104.91m | eta: 74.6m +step 09766/16704 (58.47%) | loss: 2.633841 | lrm: 0.83 | dt: 645.79ms | tok/sec: 811,860 | mfu: 50.74 | epoch: 2 | total time: 104.92m | eta: 74.6m +step 09767/16704 (58.47%) | loss: 2.642508 | lrm: 0.83 | dt: 642.57ms | tok/sec: 815,922 | mfu: 51.00 | epoch: 2 | total time: 104.93m | eta: 74.6m +step 09768/16704 (58.48%) | loss: 2.641003 | lrm: 0.83 | dt: 644.19ms | tok/sec: 813,875 | mfu: 50.87 | epoch: 2 | total time: 104.94m | eta: 74.6m +step 09769/16704 (58.48%) | loss: 2.644255 | lrm: 0.83 | dt: 644.52ms | tok/sec: 813,457 | mfu: 50.84 | epoch: 2 | total time: 104.95m | eta: 74.6m +step 09770/16704 (58.49%) | loss: 2.655980 | lrm: 0.83 | dt: 643.51ms | tok/sec: 814,735 | mfu: 50.92 | epoch: 2 | total time: 104.96m | eta: 74.6m +step 09771/16704 (58.49%) | loss: 2.657779 | lrm: 0.83 | dt: 645.29ms | tok/sec: 812,480 | mfu: 50.78 | epoch: 2 | total time: 104.97m | eta: 74.6m +step 09772/16704 (58.50%) | loss: 2.653216 | lrm: 0.83 | dt: 644.60ms | tok/sec: 813,353 | mfu: 50.84 | epoch: 2 | total time: 104.99m | eta: 74.5m +step 09773/16704 (58.51%) | loss: 2.655288 | lrm: 0.83 | dt: 643.17ms | tok/sec: 815,159 | mfu: 50.95 | epoch: 2 | total time: 105.00m | eta: 74.5m +step 09774/16704 (58.51%) | loss: 2.645295 | lrm: 0.83 | dt: 642.76ms | tok/sec: 815,686 | mfu: 50.98 | epoch: 2 | total time: 105.01m | eta: 74.5m +step 09775/16704 (58.52%) | loss: 2.638924 | lrm: 0.83 | dt: 642.13ms | tok/sec: 816,487 | mfu: 51.03 | epoch: 2 | total time: 105.02m | eta: 74.5m +step 09776/16704 (58.52%) | loss: 2.637925 | lrm: 0.83 | dt: 645.51ms | tok/sec: 812,209 | mfu: 50.76 | epoch: 2 | total time: 105.03m | eta: 74.5m +step 09777/16704 (58.53%) | loss: 2.633435 | lrm: 0.83 | dt: 643.95ms | tok/sec: 814,171 | mfu: 50.89 | epoch: 2 | total time: 105.04m | eta: 74.5m +step 09778/16704 (58.54%) | 
loss: 2.643436 | lrm: 0.83 | dt: 643.30ms | tok/sec: 814,992 | mfu: 50.94 | epoch: 2 | total time: 105.05m | eta: 74.5m +step 09779/16704 (58.54%) | loss: 2.649179 | lrm: 0.83 | dt: 642.59ms | tok/sec: 815,901 | mfu: 50.99 | epoch: 2 | total time: 105.06m | eta: 74.5m +step 09780/16704 (58.55%) | loss: 2.647102 | lrm: 0.83 | dt: 642.80ms | tok/sec: 815,632 | mfu: 50.98 | epoch: 2 | total time: 105.07m | eta: 74.5m +step 09781/16704 (58.55%) | loss: 2.640485 | lrm: 0.83 | dt: 644.43ms | tok/sec: 813,565 | mfu: 50.85 | epoch: 2 | total time: 105.08m | eta: 74.5m +step 09782/16704 (58.56%) | loss: 2.614490 | lrm: 0.83 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 2 | total time: 105.09m | eta: 74.4m +step 09783/16704 (58.57%) | loss: 2.615415 | lrm: 0.83 | dt: 642.50ms | tok/sec: 816,016 | mfu: 51.00 | epoch: 2 | total time: 105.10m | eta: 74.4m +step 09784/16704 (58.57%) | loss: 2.627525 | lrm: 0.83 | dt: 643.34ms | tok/sec: 814,951 | mfu: 50.94 | epoch: 2 | total time: 105.11m | eta: 74.4m +step 09785/16704 (58.58%) | loss: 2.624036 | lrm: 0.83 | dt: 645.17ms | tok/sec: 812,634 | mfu: 50.79 | epoch: 2 | total time: 105.12m | eta: 74.4m +step 09786/16704 (58.58%) | loss: 2.624746 | lrm: 0.83 | dt: 644.78ms | tok/sec: 813,127 | mfu: 50.82 | epoch: 2 | total time: 105.14m | eta: 74.4m +step 09787/16704 (58.59%) | loss: 2.624052 | lrm: 0.83 | dt: 645.77ms | tok/sec: 811,878 | mfu: 50.74 | epoch: 2 | total time: 105.15m | eta: 74.4m +step 09788/16704 (58.60%) | loss: 2.618392 | lrm: 0.83 | dt: 643.96ms | tok/sec: 814,156 | mfu: 50.89 | epoch: 2 | total time: 105.16m | eta: 74.4m +step 09789/16704 (58.60%) | loss: 2.612141 | lrm: 0.83 | dt: 642.91ms | tok/sec: 815,493 | mfu: 50.97 | epoch: 2 | total time: 105.17m | eta: 74.4m +step 09790/16704 (58.61%) | loss: 2.617224 | lrm: 0.83 | dt: 643.20ms | tok/sec: 815,126 | mfu: 50.95 | epoch: 2 | total time: 105.18m | eta: 74.4m +step 09791/16704 (58.61%) | loss: 2.618860 | lrm: 0.83 | dt: 644.15ms | tok/sec: 813,923 | 
mfu: 50.87 | epoch: 2 | total time: 105.19m | eta: 74.3m +step 09792/16704 (58.62%) | loss: 2.626795 | lrm: 0.83 | dt: 643.34ms | tok/sec: 814,951 | mfu: 50.94 | epoch: 2 | total time: 105.20m | eta: 74.3m +step 09793/16704 (58.63%) | loss: 2.629420 | lrm: 0.83 | dt: 643.06ms | tok/sec: 815,302 | mfu: 50.96 | epoch: 2 | total time: 105.21m | eta: 74.3m +step 09794/16704 (58.63%) | loss: 2.619818 | lrm: 0.83 | dt: 642.94ms | tok/sec: 815,447 | mfu: 50.97 | epoch: 2 | total time: 105.22m | eta: 74.3m +step 09795/16704 (58.64%) | loss: 2.620825 | lrm: 0.83 | dt: 643.17ms | tok/sec: 815,157 | mfu: 50.95 | epoch: 2 | total time: 105.23m | eta: 74.3m +step 09796/16704 (58.64%) | loss: 2.617152 | lrm: 0.83 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 2 | total time: 105.24m | eta: 74.3m +step 09797/16704 (58.65%) | loss: 2.618555 | lrm: 0.83 | dt: 643.44ms | tok/sec: 814,820 | mfu: 50.93 | epoch: 2 | total time: 105.25m | eta: 74.3m +step 09798/16704 (58.66%) | loss: 2.615511 | lrm: 0.83 | dt: 645.40ms | tok/sec: 812,347 | mfu: 50.77 | epoch: 2 | total time: 105.26m | eta: 74.3m +step 09799/16704 (58.66%) | loss: 2.631068 | lrm: 0.83 | dt: 644.89ms | tok/sec: 812,983 | mfu: 50.81 | epoch: 2 | total time: 105.27m | eta: 74.3m +step 09800/16704 (58.67%) | loss: 2.637581 | lrm: 0.83 | dt: 642.31ms | tok/sec: 816,255 | mfu: 51.02 | epoch: 2 | total time: 105.29m | eta: 74.2m +step 09801/16704 (58.67%) | loss: 2.635614 | lrm: 0.83 | dt: 643.68ms | tok/sec: 814,520 | mfu: 50.91 | epoch: 2 | total time: 105.30m | eta: 74.2m +step 09802/16704 (58.68%) | loss: 2.642184 | lrm: 0.83 | dt: 644.02ms | tok/sec: 814,085 | mfu: 50.88 | epoch: 2 | total time: 105.31m | eta: 74.2m +step 09803/16704 (58.69%) | loss: 2.652284 | lrm: 0.83 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 2 | total time: 105.32m | eta: 74.2m +step 09804/16704 (58.69%) | loss: 2.662396 | lrm: 0.83 | dt: 641.50ms | tok/sec: 817,290 | mfu: 51.08 | epoch: 2 | total time: 105.33m | eta: 74.2m +step 
09805/16704 (58.70%) | loss: 2.666090 | lrm: 0.83 | dt: 644.50ms | tok/sec: 813,485 | mfu: 50.84 | epoch: 2 | total time: 105.34m | eta: 74.2m +step 09806/16704 (58.70%) | loss: 2.658799 | lrm: 0.83 | dt: 643.96ms | tok/sec: 814,156 | mfu: 50.89 | epoch: 2 | total time: 105.35m | eta: 74.2m +step 09807/16704 (58.71%) | loss: 2.660465 | lrm: 0.83 | dt: 642.82ms | tok/sec: 815,601 | mfu: 50.98 | epoch: 2 | total time: 105.36m | eta: 74.2m +step 09808/16704 (58.72%) | loss: 2.645172 | lrm: 0.83 | dt: 642.91ms | tok/sec: 815,495 | mfu: 50.97 | epoch: 2 | total time: 105.37m | eta: 74.2m +step 09809/16704 (58.72%) | loss: 2.635454 | lrm: 0.83 | dt: 642.73ms | tok/sec: 815,724 | mfu: 50.98 | epoch: 2 | total time: 105.38m | eta: 74.2m +step 09810/16704 (58.73%) | loss: 2.632382 | lrm: 0.83 | dt: 643.00ms | tok/sec: 815,376 | mfu: 50.96 | epoch: 2 | total time: 105.39m | eta: 74.1m +step 09811/16704 (58.73%) | loss: 2.633611 | lrm: 0.83 | dt: 642.56ms | tok/sec: 815,937 | mfu: 51.00 | epoch: 2 | total time: 105.40m | eta: 74.1m +step 09812/16704 (58.74%) | loss: 2.647158 | lrm: 0.83 | dt: 643.94ms | tok/sec: 814,183 | mfu: 50.89 | epoch: 2 | total time: 105.41m | eta: 74.1m +step 09813/16704 (58.75%) | loss: 2.634841 | lrm: 0.83 | dt: 644.34ms | tok/sec: 813,680 | mfu: 50.86 | epoch: 2 | total time: 105.42m | eta: 74.1m +step 09814/16704 (58.75%) | loss: 2.630617 | lrm: 0.82 | dt: 644.09ms | tok/sec: 813,995 | mfu: 50.88 | epoch: 2 | total time: 105.44m | eta: 74.1m +step 09815/16704 (58.76%) | loss: 2.643449 | lrm: 0.82 | dt: 641.52ms | tok/sec: 817,256 | mfu: 51.08 | epoch: 2 | total time: 105.45m | eta: 74.1m +step 09816/16704 (58.76%) | loss: 2.649612 | lrm: 0.82 | dt: 641.31ms | tok/sec: 817,528 | mfu: 51.10 | epoch: 2 | total time: 105.46m | eta: 74.1m +step 09817/16704 (58.77%) | loss: 2.656086 | lrm: 0.82 | dt: 643.59ms | tok/sec: 814,632 | mfu: 50.92 | epoch: 2 | total time: 105.47m | eta: 74.1m +step 09818/16704 (58.78%) | loss: 2.661679 | lrm: 0.82 | dt: 
641.73ms | tok/sec: 816,991 | mfu: 51.06 | epoch: 2 | total time: 105.48m | eta: 74.1m +step 09819/16704 (58.78%) | loss: 2.662495 | lrm: 0.82 | dt: 644.72ms | tok/sec: 813,203 | mfu: 50.83 | epoch: 2 | total time: 105.49m | eta: 74.0m +step 09820/16704 (58.79%) | loss: 2.657306 | lrm: 0.82 | dt: 644.10ms | tok/sec: 813,983 | mfu: 50.88 | epoch: 2 | total time: 105.50m | eta: 74.0m +step 09821/16704 (58.79%) | loss: 2.662497 | lrm: 0.82 | dt: 643.65ms | tok/sec: 814,549 | mfu: 50.91 | epoch: 2 | total time: 105.51m | eta: 74.0m +step 09822/16704 (58.80%) | loss: 2.652713 | lrm: 0.82 | dt: 644.47ms | tok/sec: 813,519 | mfu: 50.85 | epoch: 2 | total time: 105.52m | eta: 74.0m +step 09823/16704 (58.81%) | loss: 2.657476 | lrm: 0.82 | dt: 642.38ms | tok/sec: 816,163 | mfu: 51.01 | epoch: 2 | total time: 105.53m | eta: 74.0m +step 09824/16704 (58.81%) | loss: 2.658966 | lrm: 0.82 | dt: 644.55ms | tok/sec: 813,419 | mfu: 50.84 | epoch: 2 | total time: 105.54m | eta: 74.0m +step 09825/16704 (58.82%) | loss: 2.660660 | lrm: 0.82 | dt: 642.52ms | tok/sec: 815,991 | mfu: 51.00 | epoch: 2 | total time: 105.55m | eta: 74.0m +step 09826/16704 (58.82%) | loss: 2.650786 | lrm: 0.82 | dt: 643.75ms | tok/sec: 814,430 | mfu: 50.90 | epoch: 2 | total time: 105.56m | eta: 74.0m +step 09827/16704 (58.83%) | loss: 2.657101 | lrm: 0.82 | dt: 645.29ms | tok/sec: 812,488 | mfu: 50.78 | epoch: 2 | total time: 105.57m | eta: 74.0m +step 09828/16704 (58.84%) | loss: 2.666589 | lrm: 0.82 | dt: 642.74ms | tok/sec: 815,713 | mfu: 50.98 | epoch: 2 | total time: 105.59m | eta: 73.9m +step 09829/16704 (58.84%) | loss: 2.668921 | lrm: 0.82 | dt: 643.34ms | tok/sec: 814,948 | mfu: 50.94 | epoch: 2 | total time: 105.60m | eta: 73.9m +step 09830/16704 (58.85%) | loss: 2.657927 | lrm: 0.82 | dt: 645.14ms | tok/sec: 812,667 | mfu: 50.79 | epoch: 2 | total time: 105.61m | eta: 73.9m +step 09831/16704 (58.85%) | loss: 2.655677 | lrm: 0.82 | dt: 644.00ms | tok/sec: 814,118 | mfu: 50.88 | epoch: 2 | total 
time: 105.62m | eta: 73.9m +step 09832/16704 (58.86%) | loss: 2.652201 | lrm: 0.82 | dt: 644.56ms | tok/sec: 813,408 | mfu: 50.84 | epoch: 2 | total time: 105.63m | eta: 73.9m +step 09833/16704 (58.87%) | loss: 2.659825 | lrm: 0.82 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 2 | total time: 105.64m | eta: 73.9m +step 09834/16704 (58.87%) | loss: 2.648434 | lrm: 0.82 | dt: 643.91ms | tok/sec: 814,222 | mfu: 50.89 | epoch: 2 | total time: 105.65m | eta: 73.9m +step 09835/16704 (58.88%) | loss: 2.658775 | lrm: 0.82 | dt: 644.76ms | tok/sec: 813,152 | mfu: 50.82 | epoch: 2 | total time: 105.66m | eta: 73.9m +step 09836/16704 (58.88%) | loss: 2.657522 | lrm: 0.82 | dt: 644.18ms | tok/sec: 813,880 | mfu: 50.87 | epoch: 2 | total time: 105.67m | eta: 73.9m +step 09837/16704 (58.89%) | loss: 2.655932 | lrm: 0.82 | dt: 643.37ms | tok/sec: 814,910 | mfu: 50.93 | epoch: 2 | total time: 105.68m | eta: 73.8m +step 09838/16704 (58.90%) | loss: 2.664319 | lrm: 0.82 | dt: 643.81ms | tok/sec: 814,352 | mfu: 50.90 | epoch: 2 | total time: 105.69m | eta: 73.8m +step 09839/16704 (58.90%) | loss: 2.651768 | lrm: 0.82 | dt: 644.13ms | tok/sec: 813,945 | mfu: 50.87 | epoch: 2 | total time: 105.70m | eta: 73.8m +step 09840/16704 (58.91%) | loss: 2.642193 | lrm: 0.82 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 2 | total time: 105.71m | eta: 73.8m +step 09841/16704 (58.91%) | loss: 2.651855 | lrm: 0.82 | dt: 644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 2 | total time: 105.73m | eta: 73.8m +step 09842/16704 (58.92%) | loss: 2.649721 | lrm: 0.82 | dt: 644.17ms | tok/sec: 813,898 | mfu: 50.87 | epoch: 2 | total time: 105.74m | eta: 73.8m +step 09843/16704 (58.93%) | loss: 2.655832 | lrm: 0.82 | dt: 643.29ms | tok/sec: 815,014 | mfu: 50.94 | epoch: 2 | total time: 105.75m | eta: 73.8m +step 09844/16704 (58.93%) | loss: 2.658182 | lrm: 0.82 | dt: 646.24ms | tok/sec: 811,293 | mfu: 50.71 | epoch: 2 | total time: 105.76m | eta: 73.8m +step 09845/16704 (58.94%) | loss: 
2.652418 | lrm: 0.82 | dt: 644.72ms | tok/sec: 813,204 | mfu: 50.83 | epoch: 2 | total time: 105.77m | eta: 73.8m +step 09846/16704 (58.94%) | loss: 2.636767 | lrm: 0.82 | dt: 643.75ms | tok/sec: 814,432 | mfu: 50.90 | epoch: 2 | total time: 105.78m | eta: 73.8m +step 09847/16704 (58.95%) | loss: 2.653993 | lrm: 0.82 | dt: 645.25ms | tok/sec: 812,533 | mfu: 50.78 | epoch: 2 | total time: 105.79m | eta: 73.7m +step 09848/16704 (58.96%) | loss: 2.660635 | lrm: 0.82 | dt: 643.89ms | tok/sec: 814,253 | mfu: 50.89 | epoch: 2 | total time: 105.80m | eta: 73.7m +step 09849/16704 (58.96%) | loss: 2.658308 | lrm: 0.82 | dt: 647.16ms | tok/sec: 810,136 | mfu: 50.63 | epoch: 2 | total time: 105.81m | eta: 73.7m +step 09850/16704 (58.97%) | loss: 2.665763 | lrm: 0.82 | dt: 644.66ms | tok/sec: 813,279 | mfu: 50.83 | epoch: 2 | total time: 105.82m | eta: 73.7m +step 09851/16704 (58.97%) | loss: 2.663992 | lrm: 0.82 | dt: 644.73ms | tok/sec: 813,189 | mfu: 50.83 | epoch: 2 | total time: 105.83m | eta: 73.7m +step 09852/16704 (58.98%) | loss: 2.657936 | lrm: 0.82 | dt: 645.32ms | tok/sec: 812,449 | mfu: 50.78 | epoch: 2 | total time: 105.84m | eta: 73.7m +step 09853/16704 (58.99%) | loss: 2.654109 | lrm: 0.82 | dt: 644.65ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 2 | total time: 105.85m | eta: 73.7m +step 09854/16704 (58.99%) | loss: 2.666010 | lrm: 0.82 | dt: 642.13ms | tok/sec: 816,482 | mfu: 51.03 | epoch: 2 | total time: 105.86m | eta: 73.7m +step 09855/16704 (59.00%) | loss: 2.659773 | lrm: 0.82 | dt: 645.21ms | tok/sec: 812,583 | mfu: 50.79 | epoch: 2 | total time: 105.88m | eta: 73.7m +step 09856/16704 (59.00%) | loss: 2.650309 | lrm: 0.82 | dt: 644.20ms | tok/sec: 813,855 | mfu: 50.87 | epoch: 2 | total time: 105.89m | eta: 73.6m +step 09857/16704 (59.01%) | loss: 2.646795 | lrm: 0.82 | dt: 643.11ms | tok/sec: 815,233 | mfu: 50.95 | epoch: 2 | total time: 105.90m | eta: 73.6m +step 09858/16704 (59.02%) | loss: 2.655745 | lrm: 0.82 | dt: 642.51ms | tok/sec: 816,000 | mfu: 
51.00 | epoch: 2 | total time: 105.91m | eta: 73.6m +step 09859/16704 (59.02%) | loss: 2.664824 | lrm: 0.82 | dt: 643.10ms | tok/sec: 815,246 | mfu: 50.95 | epoch: 2 | total time: 105.92m | eta: 73.6m +step 09860/16704 (59.03%) | loss: 2.667717 | lrm: 0.82 | dt: 643.80ms | tok/sec: 814,361 | mfu: 50.90 | epoch: 2 | total time: 105.93m | eta: 73.6m +step 09861/16704 (59.03%) | loss: 2.664937 | lrm: 0.82 | dt: 643.71ms | tok/sec: 814,482 | mfu: 50.91 | epoch: 2 | total time: 105.94m | eta: 73.6m +step 09862/16704 (59.04%) | loss: 2.664531 | lrm: 0.82 | dt: 648.08ms | tok/sec: 808,987 | mfu: 50.56 | epoch: 2 | total time: 105.95m | eta: 73.6m +step 09863/16704 (59.05%) | loss: 2.680400 | lrm: 0.82 | dt: 645.07ms | tok/sec: 812,759 | mfu: 50.80 | epoch: 2 | total time: 105.96m | eta: 73.6m +step 09864/16704 (59.05%) | loss: 2.688826 | lrm: 0.82 | dt: 642.90ms | tok/sec: 815,499 | mfu: 50.97 | epoch: 2 | total time: 105.97m | eta: 73.6m +step 09865/16704 (59.06%) | loss: 2.691368 | lrm: 0.82 | dt: 645.99ms | tok/sec: 811,607 | mfu: 50.73 | epoch: 2 | total time: 105.98m | eta: 73.5m +step 09866/16704 (59.06%) | loss: 2.684968 | lrm: 0.82 | dt: 645.01ms | tok/sec: 812,840 | mfu: 50.80 | epoch: 2 | total time: 105.99m | eta: 73.5m +step 09867/16704 (59.07%) | loss: 2.683858 | lrm: 0.82 | dt: 645.26ms | tok/sec: 812,525 | mfu: 50.78 | epoch: 2 | total time: 106.00m | eta: 73.5m +step 09868/16704 (59.08%) | loss: 2.681030 | lrm: 0.82 | dt: 643.31ms | tok/sec: 814,980 | mfu: 50.94 | epoch: 2 | total time: 106.02m | eta: 73.5m +step 09869/16704 (59.08%) | loss: 2.672059 | lrm: 0.82 | dt: 644.10ms | tok/sec: 813,985 | mfu: 50.88 | epoch: 2 | total time: 106.03m | eta: 73.5m +step 09870/16704 (59.09%) | loss: 2.683273 | lrm: 0.82 | dt: 644.08ms | tok/sec: 814,008 | mfu: 50.88 | epoch: 2 | total time: 106.04m | eta: 73.5m +step 09871/16704 (59.09%) | loss: 2.675673 | lrm: 0.82 | dt: 645.14ms | tok/sec: 812,678 | mfu: 50.79 | epoch: 2 | total time: 106.05m | eta: 73.5m +step 
09872/16704 (59.10%) | loss: 2.681129 | lrm: 0.82 | dt: 642.59ms | tok/sec: 815,892 | mfu: 50.99 | epoch: 2 | total time: 106.06m | eta: 73.5m +step 09873/16704 (59.11%) | loss: 2.686781 | lrm: 0.82 | dt: 645.75ms | tok/sec: 811,911 | mfu: 50.75 | epoch: 2 | total time: 106.07m | eta: 73.5m +step 09874/16704 (59.11%) | loss: 2.684171 | lrm: 0.82 | dt: 645.92ms | tok/sec: 811,692 | mfu: 50.73 | epoch: 2 | total time: 106.08m | eta: 73.5m +step 09875/16704 (59.12%) | loss: 2.686683 | lrm: 0.82 | dt: 643.52ms | tok/sec: 814,719 | mfu: 50.92 | epoch: 2 | total time: 106.09m | eta: 73.4m +step 09876/16704 (59.12%) | loss: 2.675769 | lrm: 0.82 | dt: 645.71ms | tok/sec: 811,955 | mfu: 50.75 | epoch: 2 | total time: 106.10m | eta: 73.4m +step 09877/16704 (59.13%) | loss: 2.671342 | lrm: 0.82 | dt: 644.89ms | tok/sec: 812,989 | mfu: 50.81 | epoch: 2 | total time: 106.11m | eta: 73.4m +step 09878/16704 (59.14%) | loss: 2.670927 | lrm: 0.82 | dt: 644.13ms | tok/sec: 813,945 | mfu: 50.87 | epoch: 2 | total time: 106.12m | eta: 73.4m +step 09879/16704 (59.14%) | loss: 2.646759 | lrm: 0.82 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 2 | total time: 106.13m | eta: 73.4m +step 09880/16704 (59.15%) | loss: 2.659588 | lrm: 0.82 | dt: 641.50ms | tok/sec: 817,278 | mfu: 51.08 | epoch: 2 | total time: 106.14m | eta: 73.4m +step 09881/16704 (59.15%) | loss: 2.650162 | lrm: 0.82 | dt: 644.38ms | tok/sec: 813,630 | mfu: 50.85 | epoch: 2 | total time: 106.15m | eta: 73.4m +step 09882/16704 (59.16%) | loss: 2.647643 | lrm: 0.82 | dt: 643.63ms | tok/sec: 814,584 | mfu: 50.91 | epoch: 2 | total time: 106.17m | eta: 73.4m +step 09883/16704 (59.17%) | loss: 2.653161 | lrm: 0.82 | dt: 645.21ms | tok/sec: 812,579 | mfu: 50.79 | epoch: 2 | total time: 106.18m | eta: 73.4m +step 09884/16704 (59.17%) | loss: 2.657864 | lrm: 0.82 | dt: 643.35ms | tok/sec: 814,938 | mfu: 50.93 | epoch: 2 | total time: 106.19m | eta: 73.3m +step 09885/16704 (59.18%) | loss: 2.657504 | lrm: 0.82 | dt: 
644.04ms | tok/sec: 814,057 | mfu: 50.88 | epoch: 2 | total time: 106.20m | eta: 73.3m +step 09886/16704 (59.18%) | loss: 2.650765 | lrm: 0.82 | dt: 644.15ms | tok/sec: 813,927 | mfu: 50.87 | epoch: 2 | total time: 106.21m | eta: 73.3m +step 09887/16704 (59.19%) | loss: 2.646774 | lrm: 0.82 | dt: 643.67ms | tok/sec: 814,528 | mfu: 50.91 | epoch: 2 | total time: 106.22m | eta: 73.3m +step 09888/16704 (59.20%) | loss: 2.656780 | lrm: 0.82 | dt: 644.91ms | tok/sec: 812,964 | mfu: 50.81 | epoch: 2 | total time: 106.23m | eta: 73.3m +step 09889/16704 (59.20%) | loss: 2.639858 | lrm: 0.82 | dt: 643.77ms | tok/sec: 814,400 | mfu: 50.90 | epoch: 2 | total time: 106.24m | eta: 73.3m +step 09890/16704 (59.21%) | loss: 2.634085 | lrm: 0.82 | dt: 644.64ms | tok/sec: 813,298 | mfu: 50.83 | epoch: 2 | total time: 106.25m | eta: 73.3m +step 09891/16704 (59.21%) | loss: 2.638224 | lrm: 0.82 | dt: 644.47ms | tok/sec: 813,521 | mfu: 50.85 | epoch: 2 | total time: 106.26m | eta: 73.3m +step 09892/16704 (59.22%) | loss: 2.637321 | lrm: 0.82 | dt: 644.84ms | tok/sec: 813,048 | mfu: 50.82 | epoch: 2 | total time: 106.27m | eta: 73.3m +step 09893/16704 (59.23%) | loss: 2.633934 | lrm: 0.82 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 2 | total time: 106.28m | eta: 73.2m +step 09894/16704 (59.23%) | loss: 2.633554 | lrm: 0.82 | dt: 643.16ms | tok/sec: 815,180 | mfu: 50.95 | epoch: 2 | total time: 106.29m | eta: 73.2m +step 09895/16704 (59.24%) | loss: 2.629658 | lrm: 0.82 | dt: 642.47ms | tok/sec: 816,047 | mfu: 51.00 | epoch: 2 | total time: 106.31m | eta: 73.2m +step 09896/16704 (59.24%) | loss: 2.632651 | lrm: 0.82 | dt: 645.20ms | tok/sec: 812,598 | mfu: 50.79 | epoch: 2 | total time: 106.32m | eta: 73.2m +step 09897/16704 (59.25%) | loss: 2.636366 | lrm: 0.82 | dt: 643.10ms | tok/sec: 815,247 | mfu: 50.95 | epoch: 2 | total time: 106.33m | eta: 73.2m +step 09898/16704 (59.26%) | loss: 2.638345 | lrm: 0.81 | dt: 645.54ms | tok/sec: 812,168 | mfu: 50.76 | epoch: 2 | total 
time: 106.34m | eta: 73.2m +step 09899/16704 (59.26%) | loss: 2.634361 | lrm: 0.81 | dt: 642.65ms | tok/sec: 815,822 | mfu: 50.99 | epoch: 2 | total time: 106.35m | eta: 73.2m +step 09900/16704 (59.27%) | loss: 2.625466 | lrm: 0.81 | dt: 642.64ms | tok/sec: 815,829 | mfu: 50.99 | epoch: 2 | total time: 106.36m | eta: 73.2m +step 09901/16704 (59.27%) | loss: 2.623477 | lrm: 0.81 | dt: 644.63ms | tok/sec: 813,310 | mfu: 50.83 | epoch: 2 | total time: 106.37m | eta: 73.2m +step 09902/16704 (59.28%) | loss: 2.629270 | lrm: 0.81 | dt: 644.58ms | tok/sec: 813,382 | mfu: 50.84 | epoch: 2 | total time: 106.38m | eta: 73.1m +step 09903/16704 (59.29%) | loss: 2.640227 | lrm: 0.81 | dt: 645.34ms | tok/sec: 812,424 | mfu: 50.78 | epoch: 2 | total time: 106.39m | eta: 73.1m +step 09904/16704 (59.29%) | loss: 2.639635 | lrm: 0.81 | dt: 644.03ms | tok/sec: 814,069 | mfu: 50.88 | epoch: 2 | total time: 106.40m | eta: 73.1m +step 09905/16704 (59.30%) | loss: 2.643357 | lrm: 0.81 | dt: 642.62ms | tok/sec: 815,854 | mfu: 50.99 | epoch: 2 | total time: 106.41m | eta: 73.1m +step 09906/16704 (59.30%) | loss: 2.642570 | lrm: 0.81 | dt: 644.27ms | tok/sec: 813,767 | mfu: 50.86 | epoch: 2 | total time: 106.42m | eta: 73.1m +step 09907/16704 (59.31%) | loss: 2.646096 | lrm: 0.81 | dt: 643.70ms | tok/sec: 814,490 | mfu: 50.91 | epoch: 2 | total time: 106.43m | eta: 73.1m +step 09908/16704 (59.32%) | loss: 2.653480 | lrm: 0.81 | dt: 646.92ms | tok/sec: 810,439 | mfu: 50.65 | epoch: 2 | total time: 106.44m | eta: 73.1m +step 09909/16704 (59.32%) | loss: 2.654818 | lrm: 0.81 | dt: 641.14ms | tok/sec: 817,745 | mfu: 51.11 | epoch: 2 | total time: 106.46m | eta: 73.1m +step 09910/16704 (59.33%) | loss: 2.650015 | lrm: 0.81 | dt: 643.24ms | tok/sec: 815,075 | mfu: 50.94 | epoch: 2 | total time: 106.47m | eta: 73.1m +step 09911/16704 (59.33%) | loss: 2.657507 | lrm: 0.81 | dt: 642.57ms | tok/sec: 815,928 | mfu: 51.00 | epoch: 2 | total time: 106.48m | eta: 73.1m +step 09912/16704 (59.34%) | loss: 
2.658692 | lrm: 0.81 | dt: 642.41ms | tok/sec: 816,120 | mfu: 51.01 | epoch: 2 | total time: 106.49m | eta: 73.0m +step 09913/16704 (59.35%) | loss: 2.650437 | lrm: 0.81 | dt: 645.80ms | tok/sec: 811,842 | mfu: 50.74 | epoch: 2 | total time: 106.50m | eta: 73.0m +step 09914/16704 (59.35%) | loss: 2.659522 | lrm: 0.81 | dt: 643.69ms | tok/sec: 814,508 | mfu: 50.91 | epoch: 2 | total time: 106.51m | eta: 73.0m +step 09915/16704 (59.36%) | loss: 2.654980 | lrm: 0.81 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | epoch: 2 | total time: 106.52m | eta: 73.0m +step 09916/16704 (59.36%) | loss: 2.650230 | lrm: 0.81 | dt: 644.83ms | tok/sec: 813,068 | mfu: 50.82 | epoch: 2 | total time: 106.53m | eta: 73.0m +step 09917/16704 (59.37%) | loss: 2.649878 | lrm: 0.81 | dt: 645.93ms | tok/sec: 811,677 | mfu: 50.73 | epoch: 2 | total time: 106.54m | eta: 73.0m +step 09918/16704 (59.38%) | loss: 2.647479 | lrm: 0.81 | dt: 642.44ms | tok/sec: 816,092 | mfu: 51.01 | epoch: 2 | total time: 106.55m | eta: 73.0m +step 09919/16704 (59.38%) | loss: 2.644031 | lrm: 0.81 | dt: 642.42ms | tok/sec: 816,108 | mfu: 51.01 | epoch: 2 | total time: 106.56m | eta: 73.0m +step 09920/16704 (59.39%) | loss: 2.636101 | lrm: 0.81 | dt: 644.16ms | tok/sec: 813,913 | mfu: 50.87 | epoch: 2 | total time: 106.57m | eta: 73.0m +step 09921/16704 (59.39%) | loss: 2.634860 | lrm: 0.81 | dt: 642.99ms | tok/sec: 815,387 | mfu: 50.96 | epoch: 2 | total time: 106.58m | eta: 72.9m +step 09922/16704 (59.40%) | loss: 2.620621 | lrm: 0.81 | dt: 644.61ms | tok/sec: 813,346 | mfu: 50.84 | epoch: 2 | total time: 106.59m | eta: 72.9m +step 09923/16704 (59.40%) | loss: 2.611715 | lrm: 0.81 | dt: 644.06ms | tok/sec: 814,040 | mfu: 50.88 | epoch: 2 | total time: 106.61m | eta: 72.9m +step 09924/16704 (59.41%) | loss: 2.606573 | lrm: 0.81 | dt: 643.23ms | tok/sec: 815,089 | mfu: 50.94 | epoch: 2 | total time: 106.62m | eta: 72.9m +step 09925/16704 (59.42%) | loss: 2.602369 | lrm: 0.81 | dt: 646.38ms | tok/sec: 811,108 | mfu: 
50.70 | epoch: 2 | total time: 106.63m | eta: 72.9m +step 09926/16704 (59.42%) | loss: 2.607032 | lrm: 0.81 | dt: 645.66ms | tok/sec: 812,018 | mfu: 50.75 | epoch: 2 | total time: 106.64m | eta: 72.9m +step 09927/16704 (59.43%) | loss: 2.610077 | lrm: 0.81 | dt: 644.12ms | tok/sec: 813,962 | mfu: 50.87 | epoch: 2 | total time: 106.65m | eta: 72.9m +step 09928/16704 (59.43%) | loss: 2.632916 | lrm: 0.81 | dt: 645.36ms | tok/sec: 812,393 | mfu: 50.78 | epoch: 2 | total time: 106.66m | eta: 72.9m +step 09929/16704 (59.44%) | loss: 2.625529 | lrm: 0.81 | dt: 643.22ms | tok/sec: 815,103 | mfu: 50.95 | epoch: 2 | total time: 106.67m | eta: 72.9m +step 09930/16704 (59.45%) | loss: 2.638797 | lrm: 0.81 | dt: 642.59ms | tok/sec: 815,902 | mfu: 51.00 | epoch: 2 | total time: 106.68m | eta: 72.8m +step 09931/16704 (59.45%) | loss: 2.641347 | lrm: 0.81 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 2 | total time: 106.69m | eta: 72.8m +step 09932/16704 (59.46%) | loss: 2.645683 | lrm: 0.81 | dt: 643.05ms | tok/sec: 815,320 | mfu: 50.96 | epoch: 2 | total time: 106.70m | eta: 72.8m +step 09933/16704 (59.46%) | loss: 2.648804 | lrm: 0.81 | dt: 644.45ms | tok/sec: 813,540 | mfu: 50.85 | epoch: 2 | total time: 106.71m | eta: 72.8m +step 09934/16704 (59.47%) | loss: 2.655336 | lrm: 0.81 | dt: 642.36ms | tok/sec: 816,196 | mfu: 51.01 | epoch: 2 | total time: 106.72m | eta: 72.8m +step 09935/16704 (59.48%) | loss: 2.645250 | lrm: 0.81 | dt: 645.20ms | tok/sec: 812,592 | mfu: 50.79 | epoch: 2 | total time: 106.73m | eta: 72.8m +step 09936/16704 (59.48%) | loss: 2.652231 | lrm: 0.81 | dt: 645.45ms | tok/sec: 812,285 | mfu: 50.77 | epoch: 2 | total time: 106.75m | eta: 72.8m +step 09937/16704 (59.49%) | loss: 2.658497 | lrm: 0.81 | dt: 644.41ms | tok/sec: 813,597 | mfu: 50.85 | epoch: 2 | total time: 106.76m | eta: 72.8m +step 09938/16704 (59.49%) | loss: 2.670289 | lrm: 0.81 | dt: 642.48ms | tok/sec: 816,041 | mfu: 51.00 | epoch: 2 | total time: 106.77m | eta: 72.8m +step 
09939/16704 (59.50%) | loss: 2.660275 | lrm: 0.81 | dt: 644.63ms | tok/sec: 813,317 | mfu: 50.83 | epoch: 2 | total time: 106.78m | eta: 72.8m +step 09940/16704 (59.51%) | loss: 2.658244 | lrm: 0.81 | dt: 643.54ms | tok/sec: 814,688 | mfu: 50.92 | epoch: 2 | total time: 106.79m | eta: 72.7m +step 09941/16704 (59.51%) | loss: 2.648658 | lrm: 0.81 | dt: 644.31ms | tok/sec: 813,716 | mfu: 50.86 | epoch: 2 | total time: 106.80m | eta: 72.7m +step 09942/16704 (59.52%) | loss: 2.649454 | lrm: 0.81 | dt: 646.88ms | tok/sec: 810,487 | mfu: 50.66 | epoch: 2 | total time: 106.81m | eta: 72.7m +step 09943/16704 (59.52%) | loss: 2.637895 | lrm: 0.81 | dt: 643.74ms | tok/sec: 814,435 | mfu: 50.90 | epoch: 2 | total time: 106.82m | eta: 72.7m +step 09944/16704 (59.53%) | loss: 2.622658 | lrm: 0.81 | dt: 643.06ms | tok/sec: 815,295 | mfu: 50.96 | epoch: 2 | total time: 106.83m | eta: 72.7m +step 09945/16704 (59.54%) | loss: 2.630200 | lrm: 0.81 | dt: 644.76ms | tok/sec: 813,148 | mfu: 50.82 | epoch: 2 | total time: 106.84m | eta: 72.7m +step 09946/16704 (59.54%) | loss: 2.634244 | lrm: 0.81 | dt: 643.35ms | tok/sec: 814,932 | mfu: 50.93 | epoch: 2 | total time: 106.85m | eta: 72.7m +step 09947/16704 (59.55%) | loss: 2.631016 | lrm: 0.81 | dt: 643.62ms | tok/sec: 814,594 | mfu: 50.91 | epoch: 2 | total time: 106.86m | eta: 72.7m +step 09948/16704 (59.55%) | loss: 2.647914 | lrm: 0.81 | dt: 645.06ms | tok/sec: 812,779 | mfu: 50.80 | epoch: 2 | total time: 106.87m | eta: 72.7m +step 09949/16704 (59.56%) | loss: 2.655545 | lrm: 0.81 | dt: 641.80ms | tok/sec: 816,908 | mfu: 51.06 | epoch: 2 | total time: 106.88m | eta: 72.6m +step 09950/16704 (59.57%) | loss: 2.655381 | lrm: 0.81 | dt: 644.47ms | tok/sec: 813,523 | mfu: 50.85 | epoch: 2 | total time: 106.90m | eta: 72.6m +step 09951/16704 (59.57%) | loss: 2.652034 | lrm: 0.81 | dt: 644.44ms | tok/sec: 813,557 | mfu: 50.85 | epoch: 2 | total time: 106.91m | eta: 72.6m +step 09952/16704 (59.58%) | loss: 2.655425 | lrm: 0.81 | dt: 
642.59ms | tok/sec: 815,904 | mfu: 51.00 | epoch: 2 | total time: 106.92m | eta: 72.6m +step 09953/16704 (59.58%) | loss: 2.651560 | lrm: 0.81 | dt: 644.49ms | tok/sec: 813,487 | mfu: 50.84 | epoch: 2 | total time: 106.93m | eta: 72.6m +step 09954/16704 (59.59%) | loss: 2.625144 | lrm: 0.81 | dt: 643.59ms | tok/sec: 814,627 | mfu: 50.92 | epoch: 2 | total time: 106.94m | eta: 72.6m +step 09955/16704 (59.60%) | loss: 2.618252 | lrm: 0.81 | dt: 642.44ms | tok/sec: 816,088 | mfu: 51.01 | epoch: 2 | total time: 106.95m | eta: 72.6m +step 09956/16704 (59.60%) | loss: 2.606561 | lrm: 0.81 | dt: 643.90ms | tok/sec: 814,238 | mfu: 50.89 | epoch: 2 | total time: 106.96m | eta: 72.6m +step 09957/16704 (59.61%) | loss: 2.602150 | lrm: 0.81 | dt: 643.15ms | tok/sec: 815,186 | mfu: 50.95 | epoch: 2 | total time: 106.97m | eta: 72.6m +step 09958/16704 (59.61%) | loss: 2.602762 | lrm: 0.81 | dt: 644.54ms | tok/sec: 813,424 | mfu: 50.84 | epoch: 2 | total time: 106.98m | eta: 72.5m +step 09959/16704 (59.62%) | loss: 2.597128 | lrm: 0.81 | dt: 643.91ms | tok/sec: 814,219 | mfu: 50.89 | epoch: 2 | total time: 106.99m | eta: 72.5m +step 09960/16704 (59.63%) | loss: 2.588700 | lrm: 0.81 | dt: 642.80ms | tok/sec: 815,633 | mfu: 50.98 | epoch: 2 | total time: 107.00m | eta: 72.5m +step 09961/16704 (59.63%) | loss: 2.597383 | lrm: 0.81 | dt: 643.98ms | tok/sec: 814,136 | mfu: 50.88 | epoch: 2 | total time: 107.01m | eta: 72.5m +step 09962/16704 (59.64%) | loss: 2.596298 | lrm: 0.81 | dt: 644.53ms | tok/sec: 813,436 | mfu: 50.84 | epoch: 2 | total time: 107.02m | eta: 72.5m +step 09963/16704 (59.64%) | loss: 2.604676 | lrm: 0.81 | dt: 644.64ms | tok/sec: 813,305 | mfu: 50.83 | epoch: 2 | total time: 107.04m | eta: 72.5m +step 09964/16704 (59.65%) | loss: 2.590953 | lrm: 0.81 | dt: 643.33ms | tok/sec: 814,964 | mfu: 50.94 | epoch: 2 | total time: 107.05m | eta: 72.5m +step 09965/16704 (59.66%) | loss: 2.596330 | lrm: 0.81 | dt: 643.31ms | tok/sec: 814,983 | mfu: 50.94 | epoch: 2 | total 
time: 107.06m | eta: 72.5m +step 09966/16704 (59.66%) | loss: 2.592520 | lrm: 0.81 | dt: 644.03ms | tok/sec: 814,078 | mfu: 50.88 | epoch: 2 | total time: 107.07m | eta: 72.5m +step 09967/16704 (59.67%) | loss: 2.591272 | lrm: 0.81 | dt: 643.77ms | tok/sec: 814,403 | mfu: 50.90 | epoch: 2 | total time: 107.08m | eta: 72.4m +step 09968/16704 (59.67%) | loss: 2.597077 | lrm: 0.81 | dt: 646.07ms | tok/sec: 811,509 | mfu: 50.72 | epoch: 2 | total time: 107.09m | eta: 72.4m +step 09969/16704 (59.68%) | loss: 2.590350 | lrm: 0.81 | dt: 644.05ms | tok/sec: 814,044 | mfu: 50.88 | epoch: 2 | total time: 107.10m | eta: 72.4m +step 09970/16704 (59.69%) | loss: 2.585211 | lrm: 0.81 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 2 | total time: 107.11m | eta: 72.4m +step 09971/16704 (59.69%) | loss: 2.607357 | lrm: 0.81 | dt: 642.85ms | tok/sec: 815,573 | mfu: 50.97 | epoch: 2 | total time: 107.12m | eta: 72.4m +step 09972/16704 (59.70%) | loss: 2.619326 | lrm: 0.81 | dt: 642.82ms | tok/sec: 815,600 | mfu: 50.98 | epoch: 2 | total time: 107.13m | eta: 72.4m +step 09973/16704 (59.70%) | loss: 2.611824 | lrm: 0.81 | dt: 644.36ms | tok/sec: 813,660 | mfu: 50.85 | epoch: 2 | total time: 107.14m | eta: 72.4m +step 09974/16704 (59.71%) | loss: 2.615547 | lrm: 0.81 | dt: 643.38ms | tok/sec: 814,897 | mfu: 50.93 | epoch: 2 | total time: 107.15m | eta: 72.4m +step 09975/16704 (59.72%) | loss: 2.615538 | lrm: 0.81 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 2 | total time: 107.16m | eta: 72.4m +step 09976/16704 (59.72%) | loss: 2.623606 | lrm: 0.81 | dt: 645.45ms | tok/sec: 812,286 | mfu: 50.77 | epoch: 2 | total time: 107.17m | eta: 72.4m +step 09977/16704 (59.73%) | loss: 2.616589 | lrm: 0.81 | dt: 644.24ms | tok/sec: 813,803 | mfu: 50.86 | epoch: 2 | total time: 107.19m | eta: 72.3m +step 09978/16704 (59.73%) | loss: 2.625233 | lrm: 0.81 | dt: 645.51ms | tok/sec: 812,201 | mfu: 50.76 | epoch: 2 | total time: 107.20m | eta: 72.3m +step 09979/16704 (59.74%) | loss: 
2.630579 | lrm: 0.81 | dt: 641.91ms | tok/sec: 816,761 | mfu: 51.05 | epoch: 2 | total time: 107.21m | eta: 72.3m +step 09980/16704 (59.75%) | loss: 2.623295 | lrm: 0.81 | dt: 645.15ms | tok/sec: 812,664 | mfu: 50.79 | epoch: 2 | total time: 107.22m | eta: 72.3m +step 09981/16704 (59.75%) | loss: 2.623614 | lrm: 0.80 | dt: 644.02ms | tok/sec: 814,081 | mfu: 50.88 | epoch: 2 | total time: 107.23m | eta: 72.3m +step 09982/16704 (59.76%) | loss: 2.619275 | lrm: 0.80 | dt: 643.01ms | tok/sec: 815,366 | mfu: 50.96 | epoch: 2 | total time: 107.24m | eta: 72.3m +step 09983/16704 (59.76%) | loss: 2.641128 | lrm: 0.80 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 2 | total time: 107.25m | eta: 72.3m +step 09984/16704 (59.77%) | loss: 2.632856 | lrm: 0.80 | dt: 642.22ms | tok/sec: 816,371 | mfu: 51.02 | epoch: 2 | total time: 107.26m | eta: 72.3m +step 09985/16704 (59.78%) | loss: 2.630258 | lrm: 0.80 | dt: 643.29ms | tok/sec: 815,014 | mfu: 50.94 | epoch: 2 | total time: 107.27m | eta: 72.3m +step 09986/16704 (59.78%) | loss: 2.613162 | lrm: 0.80 | dt: 643.88ms | tok/sec: 814,257 | mfu: 50.89 | epoch: 2 | total time: 107.28m | eta: 72.2m +step 09987/16704 (59.79%) | loss: 2.612786 | lrm: 0.80 | dt: 641.67ms | tok/sec: 817,072 | mfu: 51.07 | epoch: 2 | total time: 107.29m | eta: 72.2m +step 09988/16704 (59.79%) | loss: 2.605449 | lrm: 0.80 | dt: 641.44ms | tok/sec: 817,361 | mfu: 51.09 | epoch: 2 | total time: 107.30m | eta: 72.2m +step 09989/16704 (59.80%) | loss: 2.607921 | lrm: 0.80 | dt: 640.61ms | tok/sec: 818,413 | mfu: 51.15 | epoch: 2 | total time: 107.31m | eta: 72.2m +step 09990/16704 (59.81%) | loss: 2.617583 | lrm: 0.80 | dt: 644.00ms | tok/sec: 814,117 | mfu: 50.88 | epoch: 2 | total time: 107.32m | eta: 72.2m +step 09991/16704 (59.81%) | loss: 2.625510 | lrm: 0.80 | dt: 642.28ms | tok/sec: 816,290 | mfu: 51.02 | epoch: 2 | total time: 107.34m | eta: 72.2m +step 09992/16704 (59.82%) | loss: 2.623374 | lrm: 0.80 | dt: 644.39ms | tok/sec: 813,624 | mfu: 
50.85 | epoch: 2 | total time: 107.35m | eta: 72.2m +step 09993/16704 (59.82%) | loss: 2.620111 | lrm: 0.80 | dt: 642.96ms | tok/sec: 815,433 | mfu: 50.97 | epoch: 2 | total time: 107.36m | eta: 72.2m +step 09994/16704 (59.83%) | loss: 2.618385 | lrm: 0.80 | dt: 642.32ms | tok/sec: 816,242 | mfu: 51.02 | epoch: 2 | total time: 107.37m | eta: 72.2m +step 09995/16704 (59.84%) | loss: 2.612383 | lrm: 0.80 | dt: 643.08ms | tok/sec: 815,271 | mfu: 50.96 | epoch: 2 | total time: 107.38m | eta: 72.1m +step 09996/16704 (59.84%) | loss: 2.621847 | lrm: 0.80 | dt: 644.32ms | tok/sec: 813,703 | mfu: 50.86 | epoch: 2 | total time: 107.39m | eta: 72.1m +step 09997/16704 (59.85%) | loss: 2.625118 | lrm: 0.80 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 2 | total time: 107.40m | eta: 72.1m +step 09998/16704 (59.85%) | loss: 2.620908 | lrm: 0.80 | dt: 645.04ms | tok/sec: 812,803 | mfu: 50.80 | epoch: 2 | total time: 107.41m | eta: 72.1m +step 09999/16704 (59.86%) | loss: 2.617971 | lrm: 0.80 | dt: 643.55ms | tok/sec: 814,685 | mfu: 50.92 | epoch: 2 | total time: 107.42m | eta: 72.1m +[GC rank3] gen2: 232.3ms collected 91048 objects +[GC rank6] gen2: 234.1ms collected 90984 objects +[GC rank0] gen2: 235.4ms collected 91112 objects +[GC rank2] gen2: 235.6ms collected 91056 objects +[GC rank5] gen2: 237.6ms collected 91008 objects +[GC rank1] gen2: 238.1ms collected 91088 objects +[GC rank7] gen2: 288.0ms collected 90976 objects +[GC rank4] gen2: 298.7ms collected 91024 objects +Step 10000 | Validation bpb: 0.804764 +step 10000/16704 (59.87%) | loss: 2.630324 | lrm: 0.80 | dt: 642.36ms | tok/sec: 816,190 | mfu: 51.01 | epoch: 2 | total time: 107.43m | eta: 72.1m +step 10001/16704 (59.87%) | loss: 2.636710 | lrm: 0.80 | dt: 646.10ms | tok/sec: 811,463 | mfu: 50.72 | epoch: 2 | total time: 107.44m | eta: 72.1m +step 10002/16704 (59.88%) | loss: 2.640309 | lrm: 0.80 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 2 | total time: 107.45m | eta: 72.1m +step 10003/16704 
(59.88%) | loss: 2.649532 | lrm: 0.80 | dt: 640.15ms | tok/sec: 819,003 | mfu: 51.19 | epoch: 2 | total time: 107.46m | eta: 72.1m +step 10004/16704 (59.89%) | loss: 2.637412 | lrm: 0.80 | dt: 649.49ms | tok/sec: 807,230 | mfu: 50.45 | epoch: 2 | total time: 107.47m | eta: 72.1m +step 10005/16704 (59.90%) | loss: 2.626000 | lrm: 0.80 | dt: 640.62ms | tok/sec: 818,407 | mfu: 51.15 | epoch: 2 | total time: 107.49m | eta: 72.0m +step 10006/16704 (59.90%) | loss: 2.635646 | lrm: 0.80 | dt: 643.94ms | tok/sec: 814,185 | mfu: 50.89 | epoch: 2 | total time: 107.50m | eta: 72.0m +step 10007/16704 (59.91%) | loss: 2.627343 | lrm: 0.80 | dt: 644.06ms | tok/sec: 814,033 | mfu: 50.88 | epoch: 2 | total time: 107.51m | eta: 72.0m +step 10008/16704 (59.91%) | loss: 2.643060 | lrm: 0.80 | dt: 641.77ms | tok/sec: 816,943 | mfu: 51.06 | epoch: 2 | total time: 107.52m | eta: 72.0m +step 10009/16704 (59.92%) | loss: 2.670092 | lrm: 0.80 | dt: 646.98ms | tok/sec: 810,367 | mfu: 50.65 | epoch: 2 | total time: 107.53m | eta: 72.0m +step 10010/16704 (59.93%) | loss: 2.670993 | lrm: 0.80 | dt: 643.71ms | tok/sec: 814,482 | mfu: 50.91 | epoch: 2 | total time: 107.54m | eta: 72.0m +step 10011/16704 (59.93%) | loss: 2.658672 | lrm: 0.80 | dt: 642.20ms | tok/sec: 816,393 | mfu: 51.03 | epoch: 2 | total time: 107.55m | eta: 72.0m +step 10012/16704 (59.94%) | loss: 2.680513 | lrm: 0.80 | dt: 645.09ms | tok/sec: 812,731 | mfu: 50.80 | epoch: 2 | total time: 107.56m | eta: 72.0m +step 10013/16704 (59.94%) | loss: 2.682803 | lrm: 0.80 | dt: 643.27ms | tok/sec: 815,035 | mfu: 50.94 | epoch: 2 | total time: 107.57m | eta: 72.0m +step 10014/16704 (59.95%) | loss: 2.687120 | lrm: 0.80 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 2 | total time: 107.58m | eta: 71.9m +step 10015/16704 (59.96%) | loss: 2.688242 | lrm: 0.80 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 2 | total time: 107.59m | eta: 71.9m +step 10016/16704 (59.96%) | loss: 2.688840 | lrm: 0.80 | dt: 642.83ms | 
tok/sec: 815,597 | mfu: 50.98 | epoch: 2 | total time: 107.60m | eta: 71.9m +step 10017/16704 (59.97%) | loss: 2.690218 | lrm: 0.80 | dt: 644.55ms | tok/sec: 813,415 | mfu: 50.84 | epoch: 2 | total time: 107.61m | eta: 71.9m +step 10018/16704 (59.97%) | loss: 2.681088 | lrm: 0.80 | dt: 643.93ms | tok/sec: 814,203 | mfu: 50.89 | epoch: 2 | total time: 107.63m | eta: 71.9m +step 10019/16704 (59.98%) | loss: 2.686292 | lrm: 0.80 | dt: 645.31ms | tok/sec: 812,458 | mfu: 50.78 | epoch: 2 | total time: 107.64m | eta: 71.9m +step 10020/16704 (59.99%) | loss: 2.679350 | lrm: 0.80 | dt: 643.26ms | tok/sec: 815,050 | mfu: 50.94 | epoch: 2 | total time: 107.65m | eta: 71.9m +step 10021/16704 (59.99%) | loss: 2.688515 | lrm: 0.80 | dt: 644.28ms | tok/sec: 813,757 | mfu: 50.86 | epoch: 2 | total time: 107.66m | eta: 71.9m +step 10022/16704 (60.00%) | loss: 2.680147 | lrm: 0.80 | dt: 643.79ms | tok/sec: 814,381 | mfu: 50.90 | epoch: 2 | total time: 107.67m | eta: 71.9m +step 10023/16704 (60.00%) | loss: 2.663504 | lrm: 0.80 | dt: 644.05ms | tok/sec: 814,045 | mfu: 50.88 | epoch: 2 | total time: 107.68m | eta: 71.8m +step 10024/16704 (60.01%) | loss: 2.646449 | lrm: 0.80 | dt: 643.72ms | tok/sec: 814,470 | mfu: 50.91 | epoch: 2 | total time: 107.69m | eta: 71.8m +step 10025/16704 (60.02%) | loss: 2.635211 | lrm: 0.80 | dt: 645.19ms | tok/sec: 812,612 | mfu: 50.79 | epoch: 2 | total time: 107.70m | eta: 71.8m +step 10026/16704 (60.02%) | loss: 2.637786 | lrm: 0.80 | dt: 646.11ms | tok/sec: 811,458 | mfu: 50.72 | epoch: 2 | total time: 107.71m | eta: 71.8m +step 10027/16704 (60.03%) | loss: 2.626741 | lrm: 0.80 | dt: 642.88ms | tok/sec: 815,528 | mfu: 50.97 | epoch: 2 | total time: 107.72m | eta: 71.8m +step 10028/16704 (60.03%) | loss: 2.612505 | lrm: 0.80 | dt: 643.68ms | tok/sec: 814,517 | mfu: 50.91 | epoch: 2 | total time: 107.73m | eta: 71.8m +step 10029/16704 (60.04%) | loss: 2.620809 | lrm: 0.80 | dt: 646.18ms | tok/sec: 811,359 | mfu: 50.71 | epoch: 2 | total time: 107.74m 
| eta: 71.8m +step 10030/16704 (60.05%) | loss: 2.620570 | lrm: 0.80 | dt: 646.35ms | tok/sec: 811,147 | mfu: 50.70 | epoch: 2 | total time: 107.75m | eta: 71.8m +step 10031/16704 (60.05%) | loss: 2.604600 | lrm: 0.80 | dt: 644.77ms | tok/sec: 813,139 | mfu: 50.82 | epoch: 2 | total time: 107.76m | eta: 71.8m +step 10032/16704 (60.06%) | loss: 2.602903 | lrm: 0.80 | dt: 644.70ms | tok/sec: 813,233 | mfu: 50.83 | epoch: 2 | total time: 107.78m | eta: 71.7m +step 10033/16704 (60.06%) | loss: 2.597088 | lrm: 0.80 | dt: 644.83ms | tok/sec: 813,066 | mfu: 50.82 | epoch: 2 | total time: 107.79m | eta: 71.7m +step 10034/16704 (60.07%) | loss: 2.602065 | lrm: 0.80 | dt: 644.34ms | tok/sec: 813,677 | mfu: 50.86 | epoch: 2 | total time: 107.80m | eta: 71.7m +step 10035/16704 (60.08%) | loss: 2.593837 | lrm: 0.80 | dt: 644.76ms | tok/sec: 813,146 | mfu: 50.82 | epoch: 2 | total time: 107.81m | eta: 71.7m +step 10036/16704 (60.08%) | loss: 2.598417 | lrm: 0.80 | dt: 643.61ms | tok/sec: 814,600 | mfu: 50.91 | epoch: 2 | total time: 107.82m | eta: 71.7m +step 10037/16704 (60.09%) | loss: 2.588513 | lrm: 0.80 | dt: 646.45ms | tok/sec: 811,030 | mfu: 50.69 | epoch: 2 | total time: 107.83m | eta: 71.7m +step 10038/16704 (60.09%) | loss: 2.591961 | lrm: 0.80 | dt: 646.20ms | tok/sec: 811,343 | mfu: 50.71 | epoch: 2 | total time: 107.84m | eta: 71.7m +step 10039/16704 (60.10%) | loss: 2.581192 | lrm: 0.80 | dt: 643.53ms | tok/sec: 814,703 | mfu: 50.92 | epoch: 2 | total time: 107.85m | eta: 71.7m +step 10040/16704 (60.11%) | loss: 2.578703 | lrm: 0.80 | dt: 644.30ms | tok/sec: 813,737 | mfu: 50.86 | epoch: 2 | total time: 107.86m | eta: 71.7m +step 10041/16704 (60.11%) | loss: 2.583002 | lrm: 0.80 | dt: 643.75ms | tok/sec: 814,424 | mfu: 50.90 | epoch: 2 | total time: 107.87m | eta: 71.7m +step 10042/16704 (60.12%) | loss: 2.597677 | lrm: 0.80 | dt: 646.39ms | tok/sec: 811,107 | mfu: 50.70 | epoch: 2 | total time: 107.88m | eta: 71.6m +step 10043/16704 (60.12%) | loss: 2.596831 | 
lrm: 0.80 | dt: 644.11ms | tok/sec: 813,968 | mfu: 50.87 | epoch: 2 | total time: 107.89m | eta: 71.6m +step 10044/16704 (60.13%) | loss: 2.601651 | lrm: 0.80 | dt: 644.83ms | tok/sec: 813,063 | mfu: 50.82 | epoch: 2 | total time: 107.90m | eta: 71.6m +step 10045/16704 (60.14%) | loss: 2.614411 | lrm: 0.80 | dt: 645.49ms | tok/sec: 812,232 | mfu: 50.77 | epoch: 2 | total time: 107.92m | eta: 71.6m +step 10046/16704 (60.14%) | loss: 2.606203 | lrm: 0.80 | dt: 643.05ms | tok/sec: 815,310 | mfu: 50.96 | epoch: 2 | total time: 107.93m | eta: 71.6m +step 10047/16704 (60.15%) | loss: 2.595888 | lrm: 0.80 | dt: 642.73ms | tok/sec: 815,722 | mfu: 50.98 | epoch: 2 | total time: 107.94m | eta: 71.6m +step 10048/16704 (60.15%) | loss: 2.603894 | lrm: 0.80 | dt: 644.84ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 2 | total time: 107.95m | eta: 71.6m +step 10049/16704 (60.16%) | loss: 2.606895 | lrm: 0.80 | dt: 644.28ms | tok/sec: 813,760 | mfu: 50.86 | epoch: 2 | total time: 107.96m | eta: 71.6m +step 10050/16704 (60.17%) | loss: 2.598282 | lrm: 0.80 | dt: 644.02ms | tok/sec: 814,084 | mfu: 50.88 | epoch: 2 | total time: 107.97m | eta: 71.6m +step 10051/16704 (60.17%) | loss: 2.601268 | lrm: 0.80 | dt: 644.67ms | tok/sec: 813,260 | mfu: 50.83 | epoch: 2 | total time: 107.98m | eta: 71.5m +step 10052/16704 (60.18%) | loss: 2.600592 | lrm: 0.80 | dt: 645.11ms | tok/sec: 812,713 | mfu: 50.80 | epoch: 2 | total time: 107.99m | eta: 71.5m +step 10053/16704 (60.18%) | loss: 2.609866 | lrm: 0.80 | dt: 644.33ms | tok/sec: 813,694 | mfu: 50.86 | epoch: 2 | total time: 108.00m | eta: 71.5m +step 10054/16704 (60.19%) | loss: 2.611640 | lrm: 0.80 | dt: 644.73ms | tok/sec: 813,186 | mfu: 50.83 | epoch: 2 | total time: 108.01m | eta: 71.5m +step 10055/16704 (60.20%) | loss: 2.631707 | lrm: 0.80 | dt: 645.33ms | tok/sec: 812,434 | mfu: 50.78 | epoch: 2 | total time: 108.02m | eta: 71.5m +step 10056/16704 (60.20%) | loss: 2.639198 | lrm: 0.80 | dt: 643.71ms | tok/sec: 814,478 | mfu: 50.91 | 
epoch: 2 | total time: 108.03m | eta: 71.5m +step 10057/16704 (60.21%) | loss: 2.637805 | lrm: 0.80 | dt: 644.44ms | tok/sec: 813,555 | mfu: 50.85 | epoch: 2 | total time: 108.04m | eta: 71.5m +step 10058/16704 (60.21%) | loss: 2.627191 | lrm: 0.80 | dt: 643.11ms | tok/sec: 815,233 | mfu: 50.95 | epoch: 2 | total time: 108.05m | eta: 71.5m +step 10059/16704 (60.22%) | loss: 2.626718 | lrm: 0.80 | dt: 644.29ms | tok/sec: 813,740 | mfu: 50.86 | epoch: 2 | total time: 108.07m | eta: 71.5m +step 10060/16704 (60.23%) | loss: 2.633369 | lrm: 0.80 | dt: 644.07ms | tok/sec: 814,028 | mfu: 50.88 | epoch: 2 | total time: 108.08m | eta: 71.4m +step 10061/16704 (60.23%) | loss: 2.632483 | lrm: 0.80 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 2 | total time: 108.09m | eta: 71.4m +step 10062/16704 (60.24%) | loss: 2.630638 | lrm: 0.80 | dt: 643.33ms | tok/sec: 814,960 | mfu: 50.94 | epoch: 2 | total time: 108.10m | eta: 71.4m +step 10063/16704 (60.24%) | loss: 2.624951 | lrm: 0.80 | dt: 644.63ms | tok/sec: 813,320 | mfu: 50.83 | epoch: 2 | total time: 108.11m | eta: 71.4m +step 10064/16704 (60.25%) | loss: 2.624375 | lrm: 0.80 | dt: 644.56ms | tok/sec: 813,407 | mfu: 50.84 | epoch: 2 | total time: 108.12m | eta: 71.4m +step 10065/16704 (60.26%) | loss: 2.627191 | lrm: 0.79 | dt: 645.40ms | tok/sec: 812,345 | mfu: 50.77 | epoch: 2 | total time: 108.13m | eta: 71.4m +step 10066/16704 (60.26%) | loss: 2.631338 | lrm: 0.79 | dt: 645.13ms | tok/sec: 812,682 | mfu: 50.79 | epoch: 2 | total time: 108.14m | eta: 71.4m +step 10067/16704 (60.27%) | loss: 2.622852 | lrm: 0.79 | dt: 642.50ms | tok/sec: 816,017 | mfu: 51.00 | epoch: 2 | total time: 108.15m | eta: 71.4m +step 10068/16704 (60.27%) | loss: 2.623477 | lrm: 0.79 | dt: 644.67ms | tok/sec: 813,265 | mfu: 50.83 | epoch: 2 | total time: 108.16m | eta: 71.4m +step 10069/16704 (60.28%) | loss: 2.613815 | lrm: 0.79 | dt: 643.42ms | tok/sec: 814,846 | mfu: 50.93 | epoch: 2 | total time: 108.17m | eta: 71.4m +step 10070/16704 
(60.28%) | loss: 2.600776 | lrm: 0.79 | dt: 642.88ms | tok/sec: 815,535 | mfu: 50.97 | epoch: 2 | total time: 108.18m | eta: 71.3m +step 10071/16704 (60.29%) | loss: 2.596424 | lrm: 0.79 | dt: 644.73ms | tok/sec: 813,192 | mfu: 50.83 | epoch: 2 | total time: 108.19m | eta: 71.3m +step 10072/16704 (60.30%) | loss: 2.610893 | lrm: 0.79 | dt: 643.33ms | tok/sec: 814,953 | mfu: 50.94 | epoch: 2 | total time: 108.21m | eta: 71.3m +step 10073/16704 (60.30%) | loss: 2.614490 | lrm: 0.79 | dt: 644.79ms | tok/sec: 813,116 | mfu: 50.82 | epoch: 2 | total time: 108.22m | eta: 71.3m +step 10074/16704 (60.31%) | loss: 2.623490 | lrm: 0.79 | dt: 643.90ms | tok/sec: 814,235 | mfu: 50.89 | epoch: 2 | total time: 108.23m | eta: 71.3m +step 10075/16704 (60.31%) | loss: 2.627732 | lrm: 0.79 | dt: 642.76ms | tok/sec: 815,684 | mfu: 50.98 | epoch: 2 | total time: 108.24m | eta: 71.3m +step 10076/16704 (60.32%) | loss: 2.622964 | lrm: 0.79 | dt: 642.27ms | tok/sec: 816,310 | mfu: 51.02 | epoch: 2 | total time: 108.25m | eta: 71.3m +step 10077/16704 (60.33%) | loss: 2.614313 | lrm: 0.79 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 2 | total time: 108.26m | eta: 71.3m +step 10078/16704 (60.33%) | loss: 2.622263 | lrm: 0.79 | dt: 643.45ms | tok/sec: 814,813 | mfu: 50.93 | epoch: 2 | total time: 108.27m | eta: 71.3m +step 10079/16704 (60.34%) | loss: 2.620409 | lrm: 0.79 | dt: 645.38ms | tok/sec: 812,373 | mfu: 50.77 | epoch: 2 | total time: 108.28m | eta: 71.2m +step 10080/16704 (60.34%) | loss: 2.613247 | lrm: 0.79 | dt: 643.32ms | tok/sec: 814,972 | mfu: 50.94 | epoch: 2 | total time: 108.29m | eta: 71.2m +step 10081/16704 (60.35%) | loss: 2.633185 | lrm: 0.79 | dt: 642.64ms | tok/sec: 815,836 | mfu: 50.99 | epoch: 2 | total time: 108.30m | eta: 71.2m +step 10082/16704 (60.36%) | loss: 2.630648 | lrm: 0.79 | dt: 643.37ms | tok/sec: 814,913 | mfu: 50.93 | epoch: 2 | total time: 108.31m | eta: 71.2m +step 10083/16704 (60.36%) | loss: 2.625413 | lrm: 0.79 | dt: 644.05ms | 
tok/sec: 814,049 | mfu: 50.88 | epoch: 2 | total time: 108.32m | eta: 71.2m +step 10084/16704 (60.37%) | loss: 2.622032 | lrm: 0.79 | dt: 643.68ms | tok/sec: 814,520 | mfu: 50.91 | epoch: 2 | total time: 108.33m | eta: 71.2m +step 10085/16704 (60.37%) | loss: 2.619932 | lrm: 0.79 | dt: 643.90ms | tok/sec: 814,232 | mfu: 50.89 | epoch: 2 | total time: 108.34m | eta: 71.2m +step 10086/16704 (60.38%) | loss: 2.599288 | lrm: 0.79 | dt: 643.60ms | tok/sec: 814,622 | mfu: 50.92 | epoch: 2 | total time: 108.36m | eta: 71.2m +step 10087/16704 (60.39%) | loss: 2.618923 | lrm: 0.79 | dt: 643.69ms | tok/sec: 814,502 | mfu: 50.91 | epoch: 2 | total time: 108.37m | eta: 71.2m +step 10088/16704 (60.39%) | loss: 2.622468 | lrm: 0.79 | dt: 643.03ms | tok/sec: 815,334 | mfu: 50.96 | epoch: 2 | total time: 108.38m | eta: 71.1m +step 10089/16704 (60.40%) | loss: 2.634875 | lrm: 0.79 | dt: 644.47ms | tok/sec: 813,519 | mfu: 50.85 | epoch: 2 | total time: 108.39m | eta: 71.1m +step 10090/16704 (60.40%) | loss: 2.634580 | lrm: 0.79 | dt: 645.95ms | tok/sec: 811,650 | mfu: 50.73 | epoch: 2 | total time: 108.40m | eta: 71.1m +step 10091/16704 (60.41%) | loss: 2.633756 | lrm: 0.79 | dt: 645.91ms | tok/sec: 811,704 | mfu: 50.73 | epoch: 2 | total time: 108.41m | eta: 71.1m +step 10092/16704 (60.42%) | loss: 2.626863 | lrm: 0.79 | dt: 644.34ms | tok/sec: 813,683 | mfu: 50.86 | epoch: 2 | total time: 108.42m | eta: 71.1m +step 10093/16704 (60.42%) | loss: 2.632824 | lrm: 0.79 | dt: 646.05ms | tok/sec: 811,533 | mfu: 50.72 | epoch: 2 | total time: 108.43m | eta: 71.1m +step 10094/16704 (60.43%) | loss: 2.645176 | lrm: 0.79 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 2 | total time: 108.44m | eta: 71.1m +step 10095/16704 (60.43%) | loss: 2.643674 | lrm: 0.79 | dt: 642.09ms | tok/sec: 816,530 | mfu: 51.03 | epoch: 2 | total time: 108.45m | eta: 71.1m +step 10096/16704 (60.44%) | loss: 2.648985 | lrm: 0.79 | dt: 643.78ms | tok/sec: 814,392 | mfu: 50.90 | epoch: 2 | total time: 108.46m 
| eta: 71.1m +step 10097/16704 (60.45%) | loss: 2.647043 | lrm: 0.79 | dt: 643.96ms | tok/sec: 814,167 | mfu: 50.89 | epoch: 2 | total time: 108.47m | eta: 71.1m +step 10098/16704 (60.45%) | loss: 2.643867 | lrm: 0.79 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 2 | total time: 108.48m | eta: 71.0m +step 10099/16704 (60.46%) | loss: 2.645667 | lrm: 0.79 | dt: 646.21ms | tok/sec: 811,325 | mfu: 50.71 | epoch: 2 | total time: 108.49m | eta: 71.0m +step 10100/16704 (60.46%) | loss: 2.627282 | lrm: 0.79 | dt: 643.03ms | tok/sec: 815,338 | mfu: 50.96 | epoch: 2 | total time: 108.51m | eta: 71.0m +step 10101/16704 (60.47%) | loss: 2.629052 | lrm: 0.79 | dt: 644.85ms | tok/sec: 813,039 | mfu: 50.82 | epoch: 2 | total time: 108.52m | eta: 71.0m +step 10102/16704 (60.48%) | loss: 2.634706 | lrm: 0.79 | dt: 647.00ms | tok/sec: 810,342 | mfu: 50.65 | epoch: 2 | total time: 108.53m | eta: 71.0m +step 10103/16704 (60.48%) | loss: 2.632218 | lrm: 0.79 | dt: 640.07ms | tok/sec: 819,116 | mfu: 51.20 | epoch: 2 | total time: 108.54m | eta: 71.0m +step 10104/16704 (60.49%) | loss: 2.644024 | lrm: 0.79 | dt: 645.90ms | tok/sec: 811,722 | mfu: 50.73 | epoch: 2 | total time: 108.55m | eta: 71.0m +step 10105/16704 (60.49%) | loss: 2.627216 | lrm: 0.79 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 2 | total time: 108.56m | eta: 71.0m +step 10106/16704 (60.50%) | loss: 2.625031 | lrm: 0.79 | dt: 643.18ms | tok/sec: 815,146 | mfu: 50.95 | epoch: 2 | total time: 108.57m | eta: 71.0m +step 10107/16704 (60.51%) | loss: 2.618372 | lrm: 0.79 | dt: 645.38ms | tok/sec: 812,375 | mfu: 50.77 | epoch: 2 | total time: 108.58m | eta: 70.9m +step 10108/16704 (60.51%) | loss: 2.620433 | lrm: 0.79 | dt: 643.55ms | tok/sec: 814,678 | mfu: 50.92 | epoch: 2 | total time: 108.59m | eta: 70.9m +step 10109/16704 (60.52%) | loss: 2.608176 | lrm: 0.79 | dt: 642.39ms | tok/sec: 816,148 | mfu: 51.01 | epoch: 2 | total time: 108.60m | eta: 70.9m +step 10110/16704 (60.52%) | loss: 2.606674 | 
lrm: 0.79 | dt: 644.73ms | tok/sec: 813,192 | mfu: 50.83 | epoch: 2 | total time: 108.61m | eta: 70.9m +step 10111/16704 (60.53%) | loss: 2.609653 | lrm: 0.79 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 108.62m | eta: 70.9m +step 10112/16704 (60.54%) | loss: 2.616495 | lrm: 0.79 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 2 | total time: 108.63m | eta: 70.9m +step 10113/16704 (60.54%) | loss: 2.629561 | lrm: 0.79 | dt: 645.52ms | tok/sec: 812,196 | mfu: 50.76 | epoch: 2 | total time: 108.65m | eta: 70.9m +step 10114/16704 (60.55%) | loss: 2.633777 | lrm: 0.79 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 2 | total time: 108.66m | eta: 70.9m +step 10115/16704 (60.55%) | loss: 2.637355 | lrm: 0.79 | dt: 644.24ms | tok/sec: 813,806 | mfu: 50.86 | epoch: 2 | total time: 108.67m | eta: 70.9m +step 10116/16704 (60.56%) | loss: 2.638102 | lrm: 0.79 | dt: 646.64ms | tok/sec: 810,784 | mfu: 50.68 | epoch: 2 | total time: 108.68m | eta: 70.8m +step 10117/16704 (60.57%) | loss: 2.630097 | lrm: 0.79 | dt: 641.47ms | tok/sec: 817,320 | mfu: 51.08 | epoch: 2 | total time: 108.69m | eta: 70.8m +step 10118/16704 (60.57%) | loss: 2.633353 | lrm: 0.79 | dt: 646.17ms | tok/sec: 811,382 | mfu: 50.71 | epoch: 2 | total time: 108.70m | eta: 70.8m +step 10119/16704 (60.58%) | loss: 2.634413 | lrm: 0.79 | dt: 642.13ms | tok/sec: 816,487 | mfu: 51.03 | epoch: 2 | total time: 108.71m | eta: 70.8m +step 10120/16704 (60.58%) | loss: 2.637497 | lrm: 0.79 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 2 | total time: 108.72m | eta: 70.8m +step 10121/16704 (60.59%) | loss: 2.633210 | lrm: 0.79 | dt: 643.39ms | tok/sec: 814,888 | mfu: 50.93 | epoch: 2 | total time: 108.73m | eta: 70.8m +step 10122/16704 (60.60%) | loss: 2.641221 | lrm: 0.79 | dt: 643.25ms | tok/sec: 815,066 | mfu: 50.94 | epoch: 2 | total time: 108.74m | eta: 70.8m +step 10123/16704 (60.60%) | loss: 2.635703 | lrm: 0.79 | dt: 643.49ms | tok/sec: 814,755 | mfu: 50.92 | 
epoch: 2 | total time: 108.75m | eta: 70.8m +step 10124/16704 (60.61%) | loss: 2.616246 | lrm: 0.79 | dt: 645.05ms | tok/sec: 812,789 | mfu: 50.80 | epoch: 2 | total time: 108.76m | eta: 70.8m +step 10125/16704 (60.61%) | loss: 2.639409 | lrm: 0.79 | dt: 644.73ms | tok/sec: 813,194 | mfu: 50.83 | epoch: 2 | total time: 108.77m | eta: 70.7m +step 10126/16704 (60.62%) | loss: 2.623099 | lrm: 0.79 | dt: 643.53ms | tok/sec: 814,710 | mfu: 50.92 | epoch: 2 | total time: 108.78m | eta: 70.7m +step 10127/16704 (60.63%) | loss: 2.632051 | lrm: 0.79 | dt: 643.58ms | tok/sec: 814,648 | mfu: 50.92 | epoch: 2 | total time: 108.80m | eta: 70.7m +step 10128/16704 (60.63%) | loss: 2.629473 | lrm: 0.79 | dt: 642.55ms | tok/sec: 815,951 | mfu: 51.00 | epoch: 2 | total time: 108.81m | eta: 70.7m +step 10129/16704 (60.64%) | loss: 2.627546 | lrm: 0.79 | dt: 645.36ms | tok/sec: 812,397 | mfu: 50.78 | epoch: 2 | total time: 108.82m | eta: 70.7m +step 10130/16704 (60.64%) | loss: 2.626809 | lrm: 0.79 | dt: 643.77ms | tok/sec: 814,396 | mfu: 50.90 | epoch: 2 | total time: 108.83m | eta: 70.7m +step 10131/16704 (60.65%) | loss: 2.617420 | lrm: 0.79 | dt: 645.06ms | tok/sec: 812,774 | mfu: 50.80 | epoch: 2 | total time: 108.84m | eta: 70.7m +step 10132/16704 (60.66%) | loss: 2.617598 | lrm: 0.79 | dt: 646.08ms | tok/sec: 811,496 | mfu: 50.72 | epoch: 2 | total time: 108.85m | eta: 70.7m +step 10133/16704 (60.66%) | loss: 2.610668 | lrm: 0.79 | dt: 644.15ms | tok/sec: 813,919 | mfu: 50.87 | epoch: 2 | total time: 108.86m | eta: 70.7m +step 10134/16704 (60.67%) | loss: 2.618486 | lrm: 0.79 | dt: 641.83ms | tok/sec: 816,864 | mfu: 51.06 | epoch: 2 | total time: 108.87m | eta: 70.7m +step 10135/16704 (60.67%) | loss: 2.622628 | lrm: 0.79 | dt: 644.50ms | tok/sec: 813,481 | mfu: 50.84 | epoch: 2 | total time: 108.88m | eta: 70.6m +step 10136/16704 (60.68%) | loss: 2.620493 | lrm: 0.79 | dt: 645.88ms | tok/sec: 811,736 | mfu: 50.73 | epoch: 2 | total time: 108.89m | eta: 70.6m +step 10137/16704 
(60.69%) | loss: 2.620376 | lrm: 0.79 | dt: 642.42ms | tok/sec: 816,112 | mfu: 51.01 | epoch: 2 | total time: 108.90m | eta: 70.6m +step 10138/16704 (60.69%) | loss: 2.615312 | lrm: 0.79 | dt: 645.54ms | tok/sec: 812,169 | mfu: 50.76 | epoch: 2 | total time: 108.91m | eta: 70.6m +step 10139/16704 (60.70%) | loss: 2.610464 | lrm: 0.79 | dt: 643.22ms | tok/sec: 815,099 | mfu: 50.94 | epoch: 2 | total time: 108.92m | eta: 70.6m +step 10140/16704 (60.70%) | loss: 2.596831 | lrm: 0.79 | dt: 645.31ms | tok/sec: 812,463 | mfu: 50.78 | epoch: 2 | total time: 108.94m | eta: 70.6m +step 10141/16704 (60.71%) | loss: 2.585857 | lrm: 0.79 | dt: 645.09ms | tok/sec: 812,741 | mfu: 50.80 | epoch: 2 | total time: 108.95m | eta: 70.6m +step 10142/16704 (60.72%) | loss: 2.585900 | lrm: 0.79 | dt: 642.36ms | tok/sec: 816,188 | mfu: 51.01 | epoch: 2 | total time: 108.96m | eta: 70.6m +step 10143/16704 (60.72%) | loss: 2.606394 | lrm: 0.79 | dt: 644.42ms | tok/sec: 813,575 | mfu: 50.85 | epoch: 2 | total time: 108.97m | eta: 70.6m +step 10144/16704 (60.73%) | loss: 2.609997 | lrm: 0.79 | dt: 646.68ms | tok/sec: 810,738 | mfu: 50.67 | epoch: 2 | total time: 108.98m | eta: 70.5m +step 10145/16704 (60.73%) | loss: 2.620670 | lrm: 0.79 | dt: 642.30ms | tok/sec: 816,270 | mfu: 51.02 | epoch: 2 | total time: 108.99m | eta: 70.5m +step 10146/16704 (60.74%) | loss: 2.630168 | lrm: 0.79 | dt: 646.16ms | tok/sec: 811,388 | mfu: 50.71 | epoch: 2 | total time: 109.00m | eta: 70.5m +step 10147/16704 (60.75%) | loss: 2.633902 | lrm: 0.79 | dt: 645.44ms | tok/sec: 812,300 | mfu: 50.77 | epoch: 2 | total time: 109.01m | eta: 70.5m +step 10148/16704 (60.75%) | loss: 2.635344 | lrm: 0.78 | dt: 643.00ms | tok/sec: 815,382 | mfu: 50.96 | epoch: 2 | total time: 109.02m | eta: 70.5m +step 10149/16704 (60.76%) | loss: 2.627761 | lrm: 0.78 | dt: 645.33ms | tok/sec: 812,433 | mfu: 50.78 | epoch: 2 | total time: 109.03m | eta: 70.5m +step 10150/16704 (60.76%) | loss: 2.626243 | lrm: 0.78 | dt: 642.11ms | 
tok/sec: 816,506 | mfu: 51.03 | epoch: 2 | total time: 109.04m | eta: 70.5m +step 10151/16704 (60.77%) | loss: 2.620955 | lrm: 0.78 | dt: 644.44ms | tok/sec: 813,551 | mfu: 50.85 | epoch: 2 | total time: 109.05m | eta: 70.5m +step 10152/16704 (60.78%) | loss: 2.620042 | lrm: 0.78 | dt: 644.68ms | tok/sec: 813,252 | mfu: 50.83 | epoch: 2 | total time: 109.06m | eta: 70.5m +step 10153/16704 (60.78%) | loss: 2.620874 | lrm: 0.78 | dt: 643.83ms | tok/sec: 814,328 | mfu: 50.90 | epoch: 2 | total time: 109.07m | eta: 70.4m +step 10154/16704 (60.79%) | loss: 2.633424 | lrm: 0.78 | dt: 645.12ms | tok/sec: 812,700 | mfu: 50.79 | epoch: 2 | total time: 109.09m | eta: 70.4m +step 10155/16704 (60.79%) | loss: 2.640303 | lrm: 0.78 | dt: 646.36ms | tok/sec: 811,143 | mfu: 50.70 | epoch: 2 | total time: 109.10m | eta: 70.4m +step 10156/16704 (60.80%) | loss: 2.649445 | lrm: 0.78 | dt: 642.00ms | tok/sec: 816,647 | mfu: 51.04 | epoch: 2 | total time: 109.11m | eta: 70.4m +step 10157/16704 (60.81%) | loss: 2.649967 | lrm: 0.78 | dt: 644.55ms | tok/sec: 813,419 | mfu: 50.84 | epoch: 2 | total time: 109.12m | eta: 70.4m +step 10158/16704 (60.81%) | loss: 2.654503 | lrm: 0.78 | dt: 645.00ms | tok/sec: 812,844 | mfu: 50.80 | epoch: 2 | total time: 109.13m | eta: 70.4m +step 10159/16704 (60.82%) | loss: 2.656479 | lrm: 0.78 | dt: 645.45ms | tok/sec: 812,287 | mfu: 50.77 | epoch: 2 | total time: 109.14m | eta: 70.4m +step 10160/16704 (60.82%) | loss: 2.650156 | lrm: 0.78 | dt: 646.15ms | tok/sec: 811,399 | mfu: 50.71 | epoch: 2 | total time: 109.15m | eta: 70.4m +step 10161/16704 (60.83%) | loss: 2.652949 | lrm: 0.78 | dt: 642.74ms | tok/sec: 815,707 | mfu: 50.98 | epoch: 2 | total time: 109.16m | eta: 70.4m +step 10162/16704 (60.84%) | loss: 2.646599 | lrm: 0.78 | dt: 644.43ms | tok/sec: 813,564 | mfu: 50.85 | epoch: 2 | total time: 109.17m | eta: 70.4m +step 10163/16704 (60.84%) | loss: 2.644606 | lrm: 0.78 | dt: 644.23ms | tok/sec: 813,824 | mfu: 50.87 | epoch: 2 | total time: 109.18m 
| eta: 70.3m +step 10164/16704 (60.85%) | loss: 2.640818 | lrm: 0.78 | dt: 642.45ms | tok/sec: 816,071 | mfu: 51.01 | epoch: 2 | total time: 109.19m | eta: 70.3m +step 10165/16704 (60.85%) | loss: 2.649982 | lrm: 0.78 | dt: 645.95ms | tok/sec: 811,652 | mfu: 50.73 | epoch: 2 | total time: 109.20m | eta: 70.3m +step 10166/16704 (60.86%) | loss: 2.645061 | lrm: 0.78 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 2 | total time: 109.21m | eta: 70.3m +step 10167/16704 (60.87%) | loss: 2.629805 | lrm: 0.78 | dt: 646.18ms | tok/sec: 811,369 | mfu: 50.71 | epoch: 2 | total time: 109.23m | eta: 70.3m +step 10168/16704 (60.87%) | loss: 2.614245 | lrm: 0.78 | dt: 642.51ms | tok/sec: 815,999 | mfu: 51.00 | epoch: 2 | total time: 109.24m | eta: 70.3m +step 10169/16704 (60.88%) | loss: 2.622129 | lrm: 0.78 | dt: 642.66ms | tok/sec: 815,812 | mfu: 50.99 | epoch: 2 | total time: 109.25m | eta: 70.3m +step 10170/16704 (60.88%) | loss: 2.624246 | lrm: 0.78 | dt: 643.90ms | tok/sec: 814,233 | mfu: 50.89 | epoch: 2 | total time: 109.26m | eta: 70.3m +step 10171/16704 (60.89%) | loss: 2.619671 | lrm: 0.78 | dt: 643.41ms | tok/sec: 814,853 | mfu: 50.93 | epoch: 2 | total time: 109.27m | eta: 70.3m +step 10172/16704 (60.90%) | loss: 2.627470 | lrm: 0.78 | dt: 643.78ms | tok/sec: 814,384 | mfu: 50.90 | epoch: 2 | total time: 109.28m | eta: 70.2m +step 10173/16704 (60.90%) | loss: 2.613451 | lrm: 0.78 | dt: 644.79ms | tok/sec: 813,118 | mfu: 50.82 | epoch: 2 | total time: 109.29m | eta: 70.2m +step 10174/16704 (60.91%) | loss: 2.614338 | lrm: 0.78 | dt: 643.05ms | tok/sec: 815,320 | mfu: 50.96 | epoch: 2 | total time: 109.30m | eta: 70.2m +step 10175/16704 (60.91%) | loss: 2.610733 | lrm: 0.78 | dt: 645.20ms | tok/sec: 812,600 | mfu: 50.79 | epoch: 2 | total time: 109.31m | eta: 70.2m +step 10176/16704 (60.92%) | loss: 2.601545 | lrm: 0.78 | dt: 643.82ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 2 | total time: 109.32m | eta: 70.2m +step 10177/16704 (60.93%) | loss: 2.616323 | 
lrm: 0.78 | dt: 643.52ms | tok/sec: 814,713 | mfu: 50.92 | epoch: 2 | total time: 109.33m | eta: 70.2m +step 10178/16704 (60.93%) | loss: 2.602753 | lrm: 0.78 | dt: 643.16ms | tok/sec: 815,177 | mfu: 50.95 | epoch: 2 | total time: 109.34m | eta: 70.2m +step 10179/16704 (60.94%) | loss: 2.596572 | lrm: 0.78 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 2 | total time: 109.35m | eta: 70.2m +step 10180/16704 (60.94%) | loss: 2.602563 | lrm: 0.78 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | epoch: 2 | total time: 109.36m | eta: 70.2m +step 10181/16704 (60.95%) | loss: 2.612227 | lrm: 0.78 | dt: 643.83ms | tok/sec: 814,324 | mfu: 50.90 | epoch: 2 | total time: 109.38m | eta: 70.1m +step 10182/16704 (60.96%) | loss: 2.615107 | lrm: 0.78 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 2 | total time: 109.39m | eta: 70.1m +step 10183/16704 (60.96%) | loss: 2.616162 | lrm: 0.78 | dt: 643.05ms | tok/sec: 815,313 | mfu: 50.96 | epoch: 2 | total time: 109.40m | eta: 70.1m +step 10184/16704 (60.97%) | loss: 2.618598 | lrm: 0.78 | dt: 643.55ms | tok/sec: 814,679 | mfu: 50.92 | epoch: 2 | total time: 109.41m | eta: 70.1m +step 10185/16704 (60.97%) | loss: 2.624476 | lrm: 0.78 | dt: 644.48ms | tok/sec: 813,500 | mfu: 50.84 | epoch: 2 | total time: 109.42m | eta: 70.1m +step 10186/16704 (60.98%) | loss: 2.636133 | lrm: 0.78 | dt: 644.88ms | tok/sec: 812,999 | mfu: 50.81 | epoch: 2 | total time: 109.43m | eta: 70.1m +step 10187/16704 (60.99%) | loss: 2.633023 | lrm: 0.78 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 2 | total time: 109.44m | eta: 70.1m +step 10188/16704 (60.99%) | loss: 2.643767 | lrm: 0.78 | dt: 643.08ms | tok/sec: 815,271 | mfu: 50.96 | epoch: 2 | total time: 109.45m | eta: 70.1m +step 10189/16704 (61.00%) | loss: 2.637349 | lrm: 0.78 | dt: 644.77ms | tok/sec: 813,142 | mfu: 50.82 | epoch: 2 | total time: 109.46m | eta: 70.1m +step 10190/16704 (61.00%) | loss: 2.633456 | lrm: 0.78 | dt: 642.63ms | tok/sec: 815,842 | mfu: 50.99 | 
epoch: 2 | total time: 109.47m | eta: 70.0m +step 10191/16704 (61.01%) | loss: 2.633076 | lrm: 0.78 | dt: 646.09ms | tok/sec: 811,475 | mfu: 50.72 | epoch: 2 | total time: 109.48m | eta: 70.0m +step 10192/16704 (61.02%) | loss: 2.619388 | lrm: 0.78 | dt: 641.62ms | tok/sec: 817,135 | mfu: 51.07 | epoch: 2 | total time: 109.49m | eta: 70.0m +step 10193/16704 (61.02%) | loss: 2.622380 | lrm: 0.78 | dt: 646.68ms | tok/sec: 810,738 | mfu: 50.67 | epoch: 2 | total time: 109.50m | eta: 70.0m +step 10194/16704 (61.03%) | loss: 2.629501 | lrm: 0.78 | dt: 643.16ms | tok/sec: 815,171 | mfu: 50.95 | epoch: 2 | total time: 109.51m | eta: 70.0m +step 10195/16704 (61.03%) | loss: 2.629171 | lrm: 0.78 | dt: 646.11ms | tok/sec: 811,447 | mfu: 50.72 | epoch: 2 | total time: 109.53m | eta: 70.0m +step 10196/16704 (61.04%) | loss: 2.632295 | lrm: 0.78 | dt: 645.30ms | tok/sec: 812,470 | mfu: 50.78 | epoch: 2 | total time: 109.54m | eta: 70.0m +step 10197/16704 (61.05%) | loss: 2.623449 | lrm: 0.78 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 2 | total time: 109.55m | eta: 70.0m +step 10198/16704 (61.05%) | loss: 2.619776 | lrm: 0.78 | dt: 643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 2 | total time: 109.56m | eta: 70.0m +step 10199/16704 (61.06%) | loss: 2.627731 | lrm: 0.78 | dt: 642.91ms | tok/sec: 815,495 | mfu: 50.97 | epoch: 2 | total time: 109.57m | eta: 70.0m +step 10200/16704 (61.06%) | loss: 2.633079 | lrm: 0.78 | dt: 644.28ms | tok/sec: 813,760 | mfu: 50.86 | epoch: 2 | total time: 109.58m | eta: 69.9m +step 10201/16704 (61.07%) | loss: 2.624823 | lrm: 0.78 | dt: 642.15ms | tok/sec: 816,453 | mfu: 51.03 | epoch: 2 | total time: 109.59m | eta: 69.9m +step 10202/16704 (61.08%) | loss: 2.637362 | lrm: 0.78 | dt: 646.62ms | tok/sec: 810,810 | mfu: 50.68 | epoch: 2 | total time: 109.60m | eta: 69.9m +step 10203/16704 (61.08%) | loss: 2.635856 | lrm: 0.78 | dt: 643.55ms | tok/sec: 814,680 | mfu: 50.92 | epoch: 2 | total time: 109.61m | eta: 69.9m +step 10204/16704 
(61.09%) | loss: 2.634420 | lrm: 0.78 | dt: 644.64ms | tok/sec: 813,304 | mfu: 50.83 | epoch: 2 | total time: 109.62m | eta: 69.9m +step 10205/16704 (61.09%) | loss: 2.637540 | lrm: 0.78 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 2 | total time: 109.63m | eta: 69.9m +step 10206/16704 (61.10%) | loss: 2.628419 | lrm: 0.78 | dt: 641.59ms | tok/sec: 817,170 | mfu: 51.07 | epoch: 2 | total time: 109.64m | eta: 69.9m +step 10207/16704 (61.11%) | loss: 2.630715 | lrm: 0.78 | dt: 645.18ms | tok/sec: 812,623 | mfu: 50.79 | epoch: 2 | total time: 109.65m | eta: 69.9m +step 10208/16704 (61.11%) | loss: 2.641252 | lrm: 0.78 | dt: 643.12ms | tok/sec: 815,231 | mfu: 50.95 | epoch: 2 | total time: 109.67m | eta: 69.9m +step 10209/16704 (61.12%) | loss: 2.642214 | lrm: 0.78 | dt: 643.94ms | tok/sec: 814,189 | mfu: 50.89 | epoch: 2 | total time: 109.68m | eta: 69.8m +step 10210/16704 (61.12%) | loss: 2.638775 | lrm: 0.78 | dt: 645.41ms | tok/sec: 812,334 | mfu: 50.77 | epoch: 2 | total time: 109.69m | eta: 69.8m +step 10211/16704 (61.13%) | loss: 2.638869 | lrm: 0.78 | dt: 643.97ms | tok/sec: 814,149 | mfu: 50.89 | epoch: 2 | total time: 109.70m | eta: 69.8m +step 10212/16704 (61.14%) | loss: 2.634316 | lrm: 0.78 | dt: 642.74ms | tok/sec: 815,703 | mfu: 50.98 | epoch: 2 | total time: 109.71m | eta: 69.8m +step 10213/16704 (61.14%) | loss: 2.631133 | lrm: 0.78 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 2 | total time: 109.72m | eta: 69.8m +step 10214/16704 (61.15%) | loss: 2.637367 | lrm: 0.78 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 2 | total time: 109.73m | eta: 69.8m +step 10215/16704 (61.15%) | loss: 2.627862 | lrm: 0.78 | dt: 645.44ms | tok/sec: 812,301 | mfu: 50.77 | epoch: 2 | total time: 109.74m | eta: 69.8m +step 10216/16704 (61.16%) | loss: 2.613590 | lrm: 0.78 | dt: 643.38ms | tok/sec: 814,891 | mfu: 50.93 | epoch: 2 | total time: 109.75m | eta: 69.8m +step 10217/16704 (61.16%) | loss: 2.625002 | lrm: 0.78 | dt: 643.80ms | 
tok/sec: 814,361 | mfu: 50.90 | epoch: 2 | total time: 109.76m | eta: 69.8m +step 10218/16704 (61.17%) | loss: 2.622814 | lrm: 0.78 | dt: 644.90ms | tok/sec: 812,975 | mfu: 50.81 | epoch: 2 | total time: 109.77m | eta: 69.7m +step 10219/16704 (61.18%) | loss: 2.611182 | lrm: 0.78 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 2 | total time: 109.78m | eta: 69.7m +step 10220/16704 (61.18%) | loss: 2.613140 | lrm: 0.78 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 2 | total time: 109.79m | eta: 69.7m +step 10221/16704 (61.19%) | loss: 2.606894 | lrm: 0.78 | dt: 643.62ms | tok/sec: 814,594 | mfu: 50.91 | epoch: 2 | total time: 109.80m | eta: 69.7m +step 10222/16704 (61.19%) | loss: 2.617638 | lrm: 0.78 | dt: 643.74ms | tok/sec: 814,440 | mfu: 50.90 | epoch: 2 | total time: 109.82m | eta: 69.7m +step 10223/16704 (61.20%) | loss: 2.613857 | lrm: 0.78 | dt: 646.37ms | tok/sec: 811,131 | mfu: 50.70 | epoch: 2 | total time: 109.83m | eta: 69.7m +step 10224/16704 (61.21%) | loss: 2.603355 | lrm: 0.78 | dt: 644.69ms | tok/sec: 813,244 | mfu: 50.83 | epoch: 2 | total time: 109.84m | eta: 69.7m +step 10225/16704 (61.21%) | loss: 2.600850 | lrm: 0.78 | dt: 642.61ms | tok/sec: 815,874 | mfu: 50.99 | epoch: 2 | total time: 109.85m | eta: 69.7m +step 10226/16704 (61.22%) | loss: 2.607646 | lrm: 0.78 | dt: 643.33ms | tok/sec: 814,964 | mfu: 50.94 | epoch: 2 | total time: 109.86m | eta: 69.7m +step 10227/16704 (61.22%) | loss: 2.603231 | lrm: 0.78 | dt: 644.04ms | tok/sec: 814,061 | mfu: 50.88 | epoch: 2 | total time: 109.87m | eta: 69.7m +step 10228/16704 (61.23%) | loss: 2.606005 | lrm: 0.78 | dt: 643.81ms | tok/sec: 814,346 | mfu: 50.90 | epoch: 2 | total time: 109.88m | eta: 69.6m +step 10229/16704 (61.24%) | loss: 2.611564 | lrm: 0.78 | dt: 645.37ms | tok/sec: 812,385 | mfu: 50.78 | epoch: 2 | total time: 109.89m | eta: 69.6m +step 10230/16704 (61.24%) | loss: 2.618306 | lrm: 0.78 | dt: 644.87ms | tok/sec: 813,013 | mfu: 50.81 | epoch: 2 | total time: 109.90m 
| eta: 69.6m +step 10231/16704 (61.25%) | loss: 2.610054 | lrm: 0.78 | dt: 643.82ms | tok/sec: 814,337 | mfu: 50.90 | epoch: 2 | total time: 109.91m | eta: 69.6m +step 10232/16704 (61.25%) | loss: 2.606043 | lrm: 0.77 | dt: 644.81ms | tok/sec: 813,084 | mfu: 50.82 | epoch: 2 | total time: 109.92m | eta: 69.6m +step 10233/16704 (61.26%) | loss: 2.606846 | lrm: 0.77 | dt: 643.06ms | tok/sec: 815,306 | mfu: 50.96 | epoch: 2 | total time: 109.93m | eta: 69.6m +step 10234/16704 (61.27%) | loss: 2.609860 | lrm: 0.77 | dt: 644.34ms | tok/sec: 813,685 | mfu: 50.86 | epoch: 2 | total time: 109.94m | eta: 69.6m +step 10235/16704 (61.27%) | loss: 2.616856 | lrm: 0.77 | dt: 643.18ms | tok/sec: 815,145 | mfu: 50.95 | epoch: 2 | total time: 109.96m | eta: 69.6m +step 10236/16704 (61.28%) | loss: 2.616267 | lrm: 0.77 | dt: 644.34ms | tok/sec: 813,684 | mfu: 50.86 | epoch: 2 | total time: 109.97m | eta: 69.6m +step 10237/16704 (61.28%) | loss: 2.610868 | lrm: 0.77 | dt: 645.11ms | tok/sec: 812,708 | mfu: 50.80 | epoch: 2 | total time: 109.98m | eta: 69.5m +step 10238/16704 (61.29%) | loss: 2.605518 | lrm: 0.77 | dt: 644.59ms | tok/sec: 813,370 | mfu: 50.84 | epoch: 2 | total time: 109.99m | eta: 69.5m +step 10239/16704 (61.30%) | loss: 2.608599 | lrm: 0.77 | dt: 643.92ms | tok/sec: 814,212 | mfu: 50.89 | epoch: 2 | total time: 110.00m | eta: 69.5m +step 10240/16704 (61.30%) | loss: 2.607358 | lrm: 0.77 | dt: 643.35ms | tok/sec: 814,931 | mfu: 50.93 | epoch: 2 | total time: 110.01m | eta: 69.5m +step 10241/16704 (61.31%) | loss: 2.607314 | lrm: 0.77 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 2 | total time: 110.02m | eta: 69.5m +step 10242/16704 (61.31%) | loss: 2.600682 | lrm: 0.77 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 2 | total time: 110.03m | eta: 69.5m +step 10243/16704 (61.32%) | loss: 2.616417 | lrm: 0.77 | dt: 645.87ms | tok/sec: 811,754 | mfu: 50.74 | epoch: 2 | total time: 110.04m | eta: 69.5m +step 10244/16704 (61.33%) | loss: 2.634720 | 
lrm: 0.77 | dt: 643.69ms | tok/sec: 814,506 | mfu: 50.91 | epoch: 2 | total time: 110.05m | eta: 69.5m +step 10245/16704 (61.33%) | loss: 2.628642 | lrm: 0.77 | dt: 642.94ms | tok/sec: 815,450 | mfu: 50.97 | epoch: 2 | total time: 110.06m | eta: 69.5m +step 10246/16704 (61.34%) | loss: 2.622510 | lrm: 0.77 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 2 | total time: 110.07m | eta: 69.4m +step 10247/16704 (61.34%) | loss: 2.619303 | lrm: 0.77 | dt: 644.85ms | tok/sec: 813,042 | mfu: 50.82 | epoch: 2 | total time: 110.08m | eta: 69.4m +step 10248/16704 (61.35%) | loss: 2.613764 | lrm: 0.77 | dt: 643.65ms | tok/sec: 814,560 | mfu: 50.91 | epoch: 2 | total time: 110.09m | eta: 69.4m +step 10249/16704 (61.36%) | loss: 2.608642 | lrm: 0.77 | dt: 643.27ms | tok/sec: 815,038 | mfu: 50.94 | epoch: 2 | total time: 110.11m | eta: 69.4m +Step 10250 | Validation bpb: 0.802586 +step 10250/16704 (61.36%) | loss: 2.604704 | lrm: 0.77 | dt: 644.52ms | tok/sec: 813,457 | mfu: 50.84 | epoch: 2 | total time: 110.12m | eta: 69.4m +step 10251/16704 (61.37%) | loss: 2.615244 | lrm: 0.77 | dt: 647.02ms | tok/sec: 810,313 | mfu: 50.65 | epoch: 2 | total time: 110.13m | eta: 69.4m +step 10252/16704 (61.37%) | loss: 2.607589 | lrm: 0.77 | dt: 642.92ms | tok/sec: 815,474 | mfu: 50.97 | epoch: 2 | total time: 110.14m | eta: 69.4m +step 10253/16704 (61.38%) | loss: 2.616007 | lrm: 0.77 | dt: 641.18ms | tok/sec: 817,694 | mfu: 51.11 | epoch: 2 | total time: 110.15m | eta: 69.4m +step 10254/16704 (61.39%) | loss: 2.621054 | lrm: 0.77 | dt: 645.43ms | tok/sec: 812,305 | mfu: 50.77 | epoch: 2 | total time: 110.16m | eta: 69.4m +step 10255/16704 (61.39%) | loss: 2.627044 | lrm: 0.77 | dt: 642.73ms | tok/sec: 815,720 | mfu: 50.98 | epoch: 2 | total time: 110.17m | eta: 69.3m +step 10256/16704 (61.40%) | loss: 2.632542 | lrm: 0.77 | dt: 644.83ms | tok/sec: 813,066 | mfu: 50.82 | epoch: 2 | total time: 110.18m | eta: 69.3m +step 10257/16704 (61.40%) | loss: 2.640245 | lrm: 0.77 | dt: 
645.10ms | tok/sec: 812,725 | mfu: 50.80 | epoch: 2 | total time: 110.19m | eta: 69.3m +step 10258/16704 (61.41%) | loss: 2.634036 | lrm: 0.77 | dt: 642.22ms | tok/sec: 816,370 | mfu: 51.02 | epoch: 2 | total time: 110.20m | eta: 69.3m +step 10259/16704 (61.42%) | loss: 2.637193 | lrm: 0.77 | dt: 642.47ms | tok/sec: 816,052 | mfu: 51.00 | epoch: 2 | total time: 110.21m | eta: 69.3m +step 10260/16704 (61.42%) | loss: 2.642166 | lrm: 0.77 | dt: 646.52ms | tok/sec: 810,933 | mfu: 50.68 | epoch: 2 | total time: 110.22m | eta: 69.3m +step 10261/16704 (61.43%) | loss: 2.652420 | lrm: 0.77 | dt: 644.61ms | tok/sec: 813,347 | mfu: 50.84 | epoch: 2 | total time: 110.23m | eta: 69.3m +step 10262/16704 (61.43%) | loss: 2.651973 | lrm: 0.77 | dt: 646.49ms | tok/sec: 810,974 | mfu: 50.69 | epoch: 2 | total time: 110.24m | eta: 69.3m +step 10263/16704 (61.44%) | loss: 2.656981 | lrm: 0.77 | dt: 642.43ms | tok/sec: 816,106 | mfu: 51.01 | epoch: 2 | total time: 110.26m | eta: 69.3m +step 10264/16704 (61.45%) | loss: 2.652735 | lrm: 0.77 | dt: 644.22ms | tok/sec: 813,832 | mfu: 50.87 | epoch: 2 | total time: 110.27m | eta: 69.3m +step 10265/16704 (61.45%) | loss: 2.660329 | lrm: 0.77 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 2 | total time: 110.28m | eta: 69.2m +step 10266/16704 (61.46%) | loss: 2.659636 | lrm: 0.77 | dt: 644.81ms | tok/sec: 813,092 | mfu: 50.82 | epoch: 2 | total time: 110.29m | eta: 69.2m +step 10267/16704 (61.46%) | loss: 2.667050 | lrm: 0.77 | dt: 647.71ms | tok/sec: 809,450 | mfu: 50.59 | epoch: 2 | total time: 110.30m | eta: 69.2m +step 10268/16704 (61.47%) | loss: 2.668530 | lrm: 0.77 | dt: 641.98ms | tok/sec: 816,669 | mfu: 51.04 | epoch: 2 | total time: 110.31m | eta: 69.2m +step 10269/16704 (61.48%) | loss: 2.673656 | lrm: 0.77 | dt: 643.80ms | tok/sec: 814,369 | mfu: 50.90 | epoch: 2 | total time: 110.32m | eta: 69.2m +step 10270/16704 (61.48%) | loss: 2.668018 | lrm: 0.77 | dt: 644.99ms | tok/sec: 812,865 | mfu: 50.81 | epoch: 2 | total 
time: 110.33m | eta: 69.2m +step 10271/16704 (61.49%) | loss: 2.655854 | lrm: 0.77 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 2 | total time: 110.34m | eta: 69.2m +step 10272/16704 (61.49%) | loss: 2.656134 | lrm: 0.77 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 2 | total time: 110.35m | eta: 69.2m +step 10273/16704 (61.50%) | loss: 2.669322 | lrm: 0.77 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 2 | total time: 110.36m | eta: 69.2m +step 10274/16704 (61.51%) | loss: 2.664631 | lrm: 0.77 | dt: 644.50ms | tok/sec: 813,483 | mfu: 50.84 | epoch: 2 | total time: 110.37m | eta: 69.1m +step 10275/16704 (61.51%) | loss: 2.665717 | lrm: 0.77 | dt: 646.46ms | tok/sec: 811,015 | mfu: 50.69 | epoch: 2 | total time: 110.38m | eta: 69.1m +step 10276/16704 (61.52%) | loss: 2.668696 | lrm: 0.77 | dt: 645.20ms | tok/sec: 812,596 | mfu: 50.79 | epoch: 2 | total time: 110.40m | eta: 69.1m +step 10277/16704 (61.52%) | loss: 2.657407 | lrm: 0.77 | dt: 642.57ms | tok/sec: 815,924 | mfu: 51.00 | epoch: 2 | total time: 110.41m | eta: 69.1m +step 10278/16704 (61.53%) | loss: 2.654663 | lrm: 0.77 | dt: 643.34ms | tok/sec: 814,952 | mfu: 50.94 | epoch: 2 | total time: 110.42m | eta: 69.1m +step 10279/16704 (61.54%) | loss: 2.648096 | lrm: 0.77 | dt: 644.82ms | tok/sec: 813,073 | mfu: 50.82 | epoch: 2 | total time: 110.43m | eta: 69.1m +step 10280/16704 (61.54%) | loss: 2.652852 | lrm: 0.77 | dt: 643.99ms | tok/sec: 814,129 | mfu: 50.88 | epoch: 2 | total time: 110.44m | eta: 69.1m +step 10281/16704 (61.55%) | loss: 2.635978 | lrm: 0.77 | dt: 642.39ms | tok/sec: 816,151 | mfu: 51.01 | epoch: 2 | total time: 110.45m | eta: 69.1m +step 10282/16704 (61.55%) | loss: 2.637961 | lrm: 0.77 | dt: 645.27ms | tok/sec: 812,505 | mfu: 50.78 | epoch: 2 | total time: 110.46m | eta: 69.1m +step 10283/16704 (61.56%) | loss: 2.639820 | lrm: 0.77 | dt: 643.76ms | tok/sec: 814,417 | mfu: 50.90 | epoch: 2 | total time: 110.47m | eta: 69.0m +step 10284/16704 (61.57%) | loss: 
2.636296 | lrm: 0.77 | dt: 644.13ms | tok/sec: 813,942 | mfu: 50.87 | epoch: 2 | total time: 110.48m | eta: 69.0m +step 10285/16704 (61.57%) | loss: 2.639229 | lrm: 0.77 | dt: 644.86ms | tok/sec: 813,023 | mfu: 50.82 | epoch: 2 | total time: 110.49m | eta: 69.0m +step 10286/16704 (61.58%) | loss: 2.641284 | lrm: 0.77 | dt: 645.86ms | tok/sec: 811,765 | mfu: 50.74 | epoch: 2 | total time: 110.50m | eta: 69.0m +step 10287/16704 (61.58%) | loss: 2.646881 | lrm: 0.77 | dt: 644.52ms | tok/sec: 813,457 | mfu: 50.84 | epoch: 2 | total time: 110.51m | eta: 69.0m +step 10288/16704 (61.59%) | loss: 2.640218 | lrm: 0.77 | dt: 645.36ms | tok/sec: 812,394 | mfu: 50.78 | epoch: 2 | total time: 110.52m | eta: 69.0m +step 10289/16704 (61.60%) | loss: 2.634567 | lrm: 0.77 | dt: 645.39ms | tok/sec: 812,360 | mfu: 50.77 | epoch: 2 | total time: 110.53m | eta: 69.0m +step 10290/16704 (61.60%) | loss: 2.641960 | lrm: 0.77 | dt: 644.31ms | tok/sec: 813,720 | mfu: 50.86 | epoch: 2 | total time: 110.55m | eta: 69.0m +step 10291/16704 (61.61%) | loss: 2.638848 | lrm: 0.77 | dt: 644.19ms | tok/sec: 813,874 | mfu: 50.87 | epoch: 2 | total time: 110.56m | eta: 69.0m +step 10292/16704 (61.61%) | loss: 2.627981 | lrm: 0.77 | dt: 642.03ms | tok/sec: 816,614 | mfu: 51.04 | epoch: 2 | total time: 110.57m | eta: 69.0m +step 10293/16704 (61.62%) | loss: 2.624198 | lrm: 0.77 | dt: 644.90ms | tok/sec: 812,971 | mfu: 50.81 | epoch: 2 | total time: 110.58m | eta: 68.9m +step 10294/16704 (61.63%) | loss: 2.621933 | lrm: 0.77 | dt: 645.44ms | tok/sec: 812,297 | mfu: 50.77 | epoch: 2 | total time: 110.59m | eta: 68.9m +step 10295/16704 (61.63%) | loss: 2.612729 | lrm: 0.77 | dt: 642.75ms | tok/sec: 815,697 | mfu: 50.98 | epoch: 2 | total time: 110.60m | eta: 68.9m +step 10296/16704 (61.64%) | loss: 2.616332 | lrm: 0.77 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 2 | total time: 110.61m | eta: 68.9m +step 10297/16704 (61.64%) | loss: 2.622917 | lrm: 0.77 | dt: 645.52ms | tok/sec: 812,191 | mfu: 
50.76 | epoch: 2 | total time: 110.62m | eta: 68.9m +step 10298/16704 (61.65%) | loss: 2.611875 | lrm: 0.77 | dt: 643.39ms | tok/sec: 814,885 | mfu: 50.93 | epoch: 2 | total time: 110.63m | eta: 68.9m +step 10299/16704 (61.66%) | loss: 2.608951 | lrm: 0.77 | dt: 645.29ms | tok/sec: 812,479 | mfu: 50.78 | epoch: 2 | total time: 110.64m | eta: 68.9m +step 10300/16704 (61.66%) | loss: 2.613340 | lrm: 0.77 | dt: 643.65ms | tok/sec: 814,555 | mfu: 50.91 | epoch: 2 | total time: 110.65m | eta: 68.9m +step 10301/16704 (61.67%) | loss: 2.614893 | lrm: 0.77 | dt: 644.89ms | tok/sec: 812,985 | mfu: 50.81 | epoch: 2 | total time: 110.66m | eta: 68.9m +step 10302/16704 (61.67%) | loss: 2.613062 | lrm: 0.77 | dt: 643.36ms | tok/sec: 814,917 | mfu: 50.93 | epoch: 2 | total time: 110.67m | eta: 68.8m +step 10303/16704 (61.68%) | loss: 2.608043 | lrm: 0.77 | dt: 643.09ms | tok/sec: 815,265 | mfu: 50.96 | epoch: 2 | total time: 110.69m | eta: 68.8m +step 10304/16704 (61.69%) | loss: 2.617893 | lrm: 0.77 | dt: 642.19ms | tok/sec: 816,405 | mfu: 51.03 | epoch: 2 | total time: 110.70m | eta: 68.8m +step 10305/16704 (61.69%) | loss: 2.618596 | lrm: 0.77 | dt: 645.21ms | tok/sec: 812,580 | mfu: 50.79 | epoch: 2 | total time: 110.71m | eta: 68.8m +step 10306/16704 (61.70%) | loss: 2.612876 | lrm: 0.77 | dt: 645.08ms | tok/sec: 812,752 | mfu: 50.80 | epoch: 2 | total time: 110.72m | eta: 68.8m +step 10307/16704 (61.70%) | loss: 2.615350 | lrm: 0.77 | dt: 644.65ms | tok/sec: 813,296 | mfu: 50.83 | epoch: 2 | total time: 110.73m | eta: 68.8m +step 10308/16704 (61.71%) | loss: 2.606809 | lrm: 0.77 | dt: 645.47ms | tok/sec: 812,261 | mfu: 50.77 | epoch: 2 | total time: 110.74m | eta: 68.8m +step 10309/16704 (61.72%) | loss: 2.612706 | lrm: 0.77 | dt: 643.52ms | tok/sec: 814,723 | mfu: 50.92 | epoch: 2 | total time: 110.75m | eta: 68.8m +step 10310/16704 (61.72%) | loss: 2.609441 | lrm: 0.77 | dt: 644.52ms | tok/sec: 813,456 | mfu: 50.84 | epoch: 2 | total time: 110.76m | eta: 68.8m +step 
10311/16704 (61.73%) | loss: 2.610215 | lrm: 0.77 | dt: 644.17ms | tok/sec: 813,891 | mfu: 50.87 | epoch: 2 | total time: 110.77m | eta: 68.7m +step 10312/16704 (61.73%) | loss: 2.610114 | lrm: 0.77 | dt: 646.51ms | tok/sec: 810,944 | mfu: 50.69 | epoch: 2 | total time: 110.78m | eta: 68.7m +step 10313/16704 (61.74%) | loss: 2.595074 | lrm: 0.77 | dt: 640.86ms | tok/sec: 818,094 | mfu: 51.13 | epoch: 2 | total time: 110.79m | eta: 68.7m +step 10314/16704 (61.75%) | loss: 2.595897 | lrm: 0.77 | dt: 644.75ms | tok/sec: 813,170 | mfu: 50.82 | epoch: 2 | total time: 110.80m | eta: 68.7m +step 10315/16704 (61.75%) | loss: 2.598434 | lrm: 0.76 | dt: 645.02ms | tok/sec: 812,825 | mfu: 50.80 | epoch: 2 | total time: 110.81m | eta: 68.7m +step 10316/16704 (61.76%) | loss: 2.589527 | lrm: 0.76 | dt: 645.97ms | tok/sec: 811,630 | mfu: 50.73 | epoch: 2 | total time: 110.82m | eta: 68.7m +step 10317/16704 (61.76%) | loss: 2.593138 | lrm: 0.76 | dt: 646.07ms | tok/sec: 811,502 | mfu: 50.72 | epoch: 2 | total time: 110.84m | eta: 68.7m +step 10318/16704 (61.77%) | loss: 2.598315 | lrm: 0.76 | dt: 643.49ms | tok/sec: 814,758 | mfu: 50.92 | epoch: 2 | total time: 110.85m | eta: 68.7m +step 10319/16704 (61.78%) | loss: 2.599319 | lrm: 0.76 | dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 2 | total time: 110.86m | eta: 68.7m +step 10320/16704 (61.78%) | loss: 2.604035 | lrm: 0.76 | dt: 645.87ms | tok/sec: 811,759 | mfu: 50.74 | epoch: 2 | total time: 110.87m | eta: 68.6m +step 10321/16704 (61.79%) | loss: 2.613945 | lrm: 0.76 | dt: 642.57ms | tok/sec: 815,924 | mfu: 51.00 | epoch: 2 | total time: 110.88m | eta: 68.6m +step 10322/16704 (61.79%) | loss: 2.618256 | lrm: 0.76 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 2 | total time: 110.89m | eta: 68.6m +step 10323/16704 (61.80%) | loss: 2.627747 | lrm: 0.76 | dt: 644.79ms | tok/sec: 813,109 | mfu: 50.82 | epoch: 2 | total time: 110.90m | eta: 68.6m +step 10324/16704 (61.81%) | loss: 2.609909 | lrm: 0.76 | dt: 
644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 110.91m | eta: 68.6m +step 10325/16704 (61.81%) | loss: 2.610089 | lrm: 0.76 | dt: 644.32ms | tok/sec: 813,711 | mfu: 50.86 | epoch: 2 | total time: 110.92m | eta: 68.6m +step 10326/16704 (61.82%) | loss: 2.598778 | lrm: 0.76 | dt: 644.33ms | tok/sec: 813,692 | mfu: 50.86 | epoch: 2 | total time: 110.93m | eta: 68.6m +step 10327/16704 (61.82%) | loss: 2.593158 | lrm: 0.76 | dt: 643.24ms | tok/sec: 815,069 | mfu: 50.94 | epoch: 2 | total time: 110.94m | eta: 68.6m +step 10328/16704 (61.83%) | loss: 2.591131 | lrm: 0.76 | dt: 646.27ms | tok/sec: 811,256 | mfu: 50.70 | epoch: 2 | total time: 110.95m | eta: 68.6m +step 10329/16704 (61.84%) | loss: 2.591196 | lrm: 0.76 | dt: 644.50ms | tok/sec: 813,482 | mfu: 50.84 | epoch: 2 | total time: 110.96m | eta: 68.6m +step 10330/16704 (61.84%) | loss: 2.590459 | lrm: 0.76 | dt: 647.66ms | tok/sec: 809,512 | mfu: 50.60 | epoch: 2 | total time: 110.98m | eta: 68.5m +step 10331/16704 (61.85%) | loss: 2.596359 | lrm: 0.76 | dt: 643.58ms | tok/sec: 814,641 | mfu: 50.92 | epoch: 2 | total time: 110.99m | eta: 68.5m +step 10332/16704 (61.85%) | loss: 2.610564 | lrm: 0.76 | dt: 643.90ms | tok/sec: 814,240 | mfu: 50.89 | epoch: 2 | total time: 111.00m | eta: 68.5m +step 10333/16704 (61.86%) | loss: 2.615479 | lrm: 0.76 | dt: 645.92ms | tok/sec: 811,685 | mfu: 50.73 | epoch: 2 | total time: 111.01m | eta: 68.5m +step 10334/16704 (61.87%) | loss: 2.617983 | lrm: 0.76 | dt: 641.96ms | tok/sec: 816,698 | mfu: 51.04 | epoch: 2 | total time: 111.02m | eta: 68.5m +step 10335/16704 (61.87%) | loss: 2.609857 | lrm: 0.76 | dt: 646.92ms | tok/sec: 810,435 | mfu: 50.65 | epoch: 2 | total time: 111.03m | eta: 68.5m +step 10336/16704 (61.88%) | loss: 2.616370 | lrm: 0.76 | dt: 644.12ms | tok/sec: 813,955 | mfu: 50.87 | epoch: 2 | total time: 111.04m | eta: 68.5m +step 10337/16704 (61.88%) | loss: 2.622112 | lrm: 0.76 | dt: 644.54ms | tok/sec: 813,433 | mfu: 50.84 | epoch: 2 | total 
time: 111.05m | eta: 68.5m +step 10338/16704 (61.89%) | loss: 2.625433 | lrm: 0.76 | dt: 645.00ms | tok/sec: 812,855 | mfu: 50.80 | epoch: 2 | total time: 111.06m | eta: 68.5m +step 10339/16704 (61.90%) | loss: 2.621383 | lrm: 0.76 | dt: 643.89ms | tok/sec: 814,254 | mfu: 50.89 | epoch: 2 | total time: 111.07m | eta: 68.4m +step 10340/16704 (61.90%) | loss: 2.630851 | lrm: 0.76 | dt: 644.53ms | tok/sec: 813,448 | mfu: 50.84 | epoch: 2 | total time: 111.08m | eta: 68.4m +step 10341/16704 (61.91%) | loss: 2.642866 | lrm: 0.76 | dt: 643.74ms | tok/sec: 814,436 | mfu: 50.90 | epoch: 2 | total time: 111.09m | eta: 68.4m +step 10342/16704 (61.91%) | loss: 2.635961 | lrm: 0.76 | dt: 646.69ms | tok/sec: 810,724 | mfu: 50.67 | epoch: 2 | total time: 111.10m | eta: 68.4m +step 10343/16704 (61.92%) | loss: 2.639643 | lrm: 0.76 | dt: 643.59ms | tok/sec: 814,627 | mfu: 50.92 | epoch: 2 | total time: 111.11m | eta: 68.4m +step 10344/16704 (61.93%) | loss: 2.630442 | lrm: 0.76 | dt: 644.48ms | tok/sec: 813,510 | mfu: 50.85 | epoch: 2 | total time: 111.13m | eta: 68.4m +step 10345/16704 (61.93%) | loss: 2.611687 | lrm: 0.76 | dt: 645.60ms | tok/sec: 812,094 | mfu: 50.76 | epoch: 2 | total time: 111.14m | eta: 68.4m +step 10346/16704 (61.94%) | loss: 2.616666 | lrm: 0.76 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 2 | total time: 111.15m | eta: 68.4m +step 10347/16704 (61.94%) | loss: 2.621730 | lrm: 0.76 | dt: 645.79ms | tok/sec: 811,856 | mfu: 50.74 | epoch: 2 | total time: 111.16m | eta: 68.4m +step 10348/16704 (61.95%) | loss: 2.613687 | lrm: 0.76 | dt: 651.52ms | tok/sec: 804,713 | mfu: 50.30 | epoch: 2 | total time: 111.17m | eta: 68.3m +step 10349/16704 (61.96%) | loss: 2.609309 | lrm: 0.76 | dt: 640.66ms | tok/sec: 818,355 | mfu: 51.15 | epoch: 2 | total time: 111.18m | eta: 68.3m +step 10350/16704 (61.96%) | loss: 2.611459 | lrm: 0.76 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 2 | total time: 111.19m | eta: 68.3m +step 10351/16704 (61.97%) | loss: 
2.609351 | lrm: 0.76 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 2 | total time: 111.20m | eta: 68.3m +step 10352/16704 (61.97%) | loss: 2.619704 | lrm: 0.76 | dt: 641.97ms | tok/sec: 816,683 | mfu: 51.04 | epoch: 2 | total time: 111.21m | eta: 68.3m +step 10353/16704 (61.98%) | loss: 2.609924 | lrm: 0.76 | dt: 646.02ms | tok/sec: 811,562 | mfu: 50.72 | epoch: 2 | total time: 111.22m | eta: 68.3m +step 10354/16704 (61.99%) | loss: 2.595358 | lrm: 0.76 | dt: 641.59ms | tok/sec: 817,163 | mfu: 51.07 | epoch: 2 | total time: 111.23m | eta: 68.3m +step 10355/16704 (61.99%) | loss: 2.612341 | lrm: 0.76 | dt: 643.88ms | tok/sec: 814,259 | mfu: 50.89 | epoch: 2 | total time: 111.24m | eta: 68.3m +step 10356/16704 (62.00%) | loss: 2.603132 | lrm: 0.76 | dt: 645.78ms | tok/sec: 811,873 | mfu: 50.74 | epoch: 2 | total time: 111.25m | eta: 68.3m +step 10357/16704 (62.00%) | loss: 2.606334 | lrm: 0.76 | dt: 644.30ms | tok/sec: 813,727 | mfu: 50.86 | epoch: 2 | total time: 111.27m | eta: 68.3m +step 10358/16704 (62.01%) | loss: 2.614791 | lrm: 0.76 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 2 | total time: 111.28m | eta: 68.2m +step 10359/16704 (62.02%) | loss: 2.615008 | lrm: 0.76 | dt: 643.44ms | tok/sec: 814,814 | mfu: 50.93 | epoch: 2 | total time: 111.29m | eta: 68.2m +step 10360/16704 (62.02%) | loss: 2.616701 | lrm: 0.76 | dt: 643.95ms | tok/sec: 814,175 | mfu: 50.89 | epoch: 2 | total time: 111.30m | eta: 68.2m +step 10361/16704 (62.03%) | loss: 2.605223 | lrm: 0.76 | dt: 645.80ms | tok/sec: 811,840 | mfu: 50.74 | epoch: 2 | total time: 111.31m | eta: 68.2m +step 10362/16704 (62.03%) | loss: 2.615220 | lrm: 0.76 | dt: 644.69ms | tok/sec: 813,238 | mfu: 50.83 | epoch: 2 | total time: 111.32m | eta: 68.2m +step 10363/16704 (62.04%) | loss: 2.607017 | lrm: 0.76 | dt: 643.68ms | tok/sec: 814,513 | mfu: 50.91 | epoch: 2 | total time: 111.33m | eta: 68.2m +step 10364/16704 (62.05%) | loss: 2.615512 | lrm: 0.76 | dt: 644.43ms | tok/sec: 813,564 | mfu: 
50.85 | epoch: 2 | total time: 111.34m | eta: 68.2m +step 10365/16704 (62.05%) | loss: 2.612465 | lrm: 0.76 | dt: 644.09ms | tok/sec: 813,994 | mfu: 50.88 | epoch: 2 | total time: 111.35m | eta: 68.2m +step 10366/16704 (62.06%) | loss: 2.608541 | lrm: 0.76 | dt: 643.26ms | tok/sec: 815,046 | mfu: 50.94 | epoch: 2 | total time: 111.36m | eta: 68.2m +step 10367/16704 (62.06%) | loss: 2.603216 | lrm: 0.76 | dt: 644.76ms | tok/sec: 813,157 | mfu: 50.82 | epoch: 2 | total time: 111.37m | eta: 68.1m +step 10368/16704 (62.07%) | loss: 2.595773 | lrm: 0.76 | dt: 643.85ms | tok/sec: 814,305 | mfu: 50.90 | epoch: 2 | total time: 111.38m | eta: 68.1m +step 10369/16704 (62.07%) | loss: 2.600069 | lrm: 0.76 | dt: 644.11ms | tok/sec: 813,971 | mfu: 50.87 | epoch: 2 | total time: 111.39m | eta: 68.1m +step 10370/16704 (62.08%) | loss: 2.612302 | lrm: 0.76 | dt: 644.66ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 2 | total time: 111.40m | eta: 68.1m +step 10371/16704 (62.09%) | loss: 2.612853 | lrm: 0.76 | dt: 644.19ms | tok/sec: 813,874 | mfu: 50.87 | epoch: 2 | total time: 111.42m | eta: 68.1m +step 10372/16704 (62.09%) | loss: 2.614833 | lrm: 0.76 | dt: 643.52ms | tok/sec: 814,716 | mfu: 50.92 | epoch: 2 | total time: 111.43m | eta: 68.1m +step 10373/16704 (62.10%) | loss: 2.614093 | lrm: 0.76 | dt: 644.63ms | tok/sec: 813,314 | mfu: 50.83 | epoch: 2 | total time: 111.44m | eta: 68.1m +step 10374/16704 (62.10%) | loss: 2.620521 | lrm: 0.76 | dt: 644.79ms | tok/sec: 813,117 | mfu: 50.82 | epoch: 2 | total time: 111.45m | eta: 68.1m +step 10375/16704 (62.11%) | loss: 2.630766 | lrm: 0.76 | dt: 646.53ms | tok/sec: 810,921 | mfu: 50.68 | epoch: 2 | total time: 111.46m | eta: 68.1m +step 10376/16704 (62.12%) | loss: 2.615458 | lrm: 0.76 | dt: 645.46ms | tok/sec: 812,271 | mfu: 50.77 | epoch: 2 | total time: 111.47m | eta: 68.0m +step 10377/16704 (62.12%) | loss: 2.623798 | lrm: 0.76 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 2 | total time: 111.48m | eta: 68.0m +step 
10378/16704 (62.13%) | loss: 2.621026 | lrm: 0.76 | dt: 644.18ms | tok/sec: 813,883 | mfu: 50.87 | epoch: 2 | total time: 111.49m | eta: 68.0m +step 10379/16704 (62.13%) | loss: 2.626387 | lrm: 0.76 | dt: 645.71ms | tok/sec: 811,953 | mfu: 50.75 | epoch: 2 | total time: 111.50m | eta: 68.0m +step 10380/16704 (62.14%) | loss: 2.613381 | lrm: 0.76 | dt: 642.94ms | tok/sec: 815,459 | mfu: 50.97 | epoch: 2 | total time: 111.51m | eta: 68.0m +step 10381/16704 (62.15%) | loss: 2.635582 | lrm: 0.76 | dt: 644.93ms | tok/sec: 812,938 | mfu: 50.81 | epoch: 2 | total time: 111.52m | eta: 68.0m +step 10382/16704 (62.15%) | loss: 2.639069 | lrm: 0.76 | dt: 644.06ms | tok/sec: 814,041 | mfu: 50.88 | epoch: 2 | total time: 111.53m | eta: 68.0m +step 10383/16704 (62.16%) | loss: 2.632135 | lrm: 0.76 | dt: 644.72ms | tok/sec: 813,201 | mfu: 50.83 | epoch: 2 | total time: 111.54m | eta: 68.0m +step 10384/16704 (62.16%) | loss: 2.630947 | lrm: 0.76 | dt: 644.39ms | tok/sec: 813,619 | mfu: 50.85 | epoch: 2 | total time: 111.56m | eta: 68.0m +step 10385/16704 (62.17%) | loss: 2.633077 | lrm: 0.76 | dt: 646.62ms | tok/sec: 810,811 | mfu: 50.68 | epoch: 2 | total time: 111.57m | eta: 68.0m +step 10386/16704 (62.18%) | loss: 2.630984 | lrm: 0.76 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 2 | total time: 111.58m | eta: 67.9m +step 10387/16704 (62.18%) | loss: 2.630033 | lrm: 0.76 | dt: 645.39ms | tok/sec: 812,357 | mfu: 50.77 | epoch: 2 | total time: 111.59m | eta: 67.9m +step 10388/16704 (62.19%) | loss: 2.633716 | lrm: 0.76 | dt: 643.77ms | tok/sec: 814,397 | mfu: 50.90 | epoch: 2 | total time: 111.60m | eta: 67.9m +step 10389/16704 (62.19%) | loss: 2.626619 | lrm: 0.76 | dt: 642.74ms | tok/sec: 815,701 | mfu: 50.98 | epoch: 2 | total time: 111.61m | eta: 67.9m +step 10390/16704 (62.20%) | loss: 2.625101 | lrm: 0.76 | dt: 644.78ms | tok/sec: 813,130 | mfu: 50.82 | epoch: 2 | total time: 111.62m | eta: 67.9m +step 10391/16704 (62.21%) | loss: 2.629870 | lrm: 0.76 | dt: 
645.76ms | tok/sec: 811,888 | mfu: 50.74 | epoch: 2 | total time: 111.63m | eta: 67.9m +step 10392/16704 (62.21%) | loss: 2.613125 | lrm: 0.76 | dt: 646.23ms | tok/sec: 811,303 | mfu: 50.71 | epoch: 2 | total time: 111.64m | eta: 67.9m +step 10393/16704 (62.22%) | loss: 2.619407 | lrm: 0.76 | dt: 644.64ms | tok/sec: 813,302 | mfu: 50.83 | epoch: 2 | total time: 111.65m | eta: 67.9m +step 10394/16704 (62.22%) | loss: 2.628219 | lrm: 0.76 | dt: 644.50ms | tok/sec: 813,478 | mfu: 50.84 | epoch: 2 | total time: 111.66m | eta: 67.9m +step 10395/16704 (62.23%) | loss: 2.627894 | lrm: 0.76 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 2 | total time: 111.67m | eta: 67.8m +step 10396/16704 (62.24%) | loss: 2.630879 | lrm: 0.76 | dt: 646.52ms | tok/sec: 810,934 | mfu: 50.68 | epoch: 2 | total time: 111.68m | eta: 67.8m +step 10397/16704 (62.24%) | loss: 2.632811 | lrm: 0.76 | dt: 646.95ms | tok/sec: 810,396 | mfu: 50.65 | epoch: 2 | total time: 111.70m | eta: 67.8m +step 10398/16704 (62.25%) | loss: 2.626408 | lrm: 0.76 | dt: 641.61ms | tok/sec: 817,142 | mfu: 51.07 | epoch: 2 | total time: 111.71m | eta: 67.8m +step 10399/16704 (62.25%) | loss: 2.626280 | lrm: 0.75 | dt: 646.43ms | tok/sec: 811,052 | mfu: 50.69 | epoch: 2 | total time: 111.72m | eta: 67.8m +step 10400/16704 (62.26%) | loss: 2.609865 | lrm: 0.75 | dt: 644.29ms | tok/sec: 813,745 | mfu: 50.86 | epoch: 2 | total time: 111.73m | eta: 67.8m +step 10401/16704 (62.27%) | loss: 2.613731 | lrm: 0.75 | dt: 644.54ms | tok/sec: 813,432 | mfu: 50.84 | epoch: 2 | total time: 111.74m | eta: 67.8m +step 10402/16704 (62.27%) | loss: 2.619749 | lrm: 0.75 | dt: 643.34ms | tok/sec: 814,941 | mfu: 50.93 | epoch: 2 | total time: 111.75m | eta: 67.8m +step 10403/16704 (62.28%) | loss: 2.619652 | lrm: 0.75 | dt: 645.07ms | tok/sec: 812,760 | mfu: 50.80 | epoch: 2 | total time: 111.76m | eta: 67.8m +step 10404/16704 (62.28%) | loss: 2.613967 | lrm: 0.75 | dt: 646.19ms | tok/sec: 811,346 | mfu: 50.71 | epoch: 2 | total 
time: 111.77m | eta: 67.7m +step 10405/16704 (62.29%) | loss: 2.618756 | lrm: 0.75 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 2 | total time: 111.78m | eta: 67.7m +step 10406/16704 (62.30%) | loss: 2.612124 | lrm: 0.75 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 2 | total time: 111.79m | eta: 67.7m +step 10407/16704 (62.30%) | loss: 2.623545 | lrm: 0.75 | dt: 645.67ms | tok/sec: 812,005 | mfu: 50.75 | epoch: 2 | total time: 111.80m | eta: 67.7m +step 10408/16704 (62.31%) | loss: 2.611047 | lrm: 0.75 | dt: 645.42ms | tok/sec: 812,320 | mfu: 50.77 | epoch: 2 | total time: 111.81m | eta: 67.7m +step 10409/16704 (62.31%) | loss: 2.615613 | lrm: 0.75 | dt: 647.08ms | tok/sec: 810,232 | mfu: 50.64 | epoch: 2 | total time: 111.82m | eta: 67.7m +step 10410/16704 (62.32%) | loss: 2.633851 | lrm: 0.75 | dt: 642.68ms | tok/sec: 815,781 | mfu: 50.99 | epoch: 2 | total time: 111.83m | eta: 67.7m +step 10411/16704 (62.33%) | loss: 2.638761 | lrm: 0.75 | dt: 645.23ms | tok/sec: 812,560 | mfu: 50.79 | epoch: 2 | total time: 111.85m | eta: 67.7m +step 10412/16704 (62.33%) | loss: 2.643811 | lrm: 0.75 | dt: 646.73ms | tok/sec: 810,670 | mfu: 50.67 | epoch: 2 | total time: 111.86m | eta: 67.7m +step 10413/16704 (62.34%) | loss: 2.639323 | lrm: 0.75 | dt: 642.65ms | tok/sec: 815,820 | mfu: 50.99 | epoch: 2 | total time: 111.87m | eta: 67.6m +step 10414/16704 (62.34%) | loss: 2.660395 | lrm: 0.75 | dt: 644.40ms | tok/sec: 813,607 | mfu: 50.85 | epoch: 2 | total time: 111.88m | eta: 67.6m +step 10415/16704 (62.35%) | loss: 2.657754 | lrm: 0.75 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 2 | total time: 111.89m | eta: 67.6m +step 10416/16704 (62.36%) | loss: 2.664455 | lrm: 0.75 | dt: 646.60ms | tok/sec: 810,840 | mfu: 50.68 | epoch: 2 | total time: 111.90m | eta: 67.6m +step 10417/16704 (62.36%) | loss: 2.650877 | lrm: 0.75 | dt: 645.51ms | tok/sec: 812,210 | mfu: 50.76 | epoch: 2 | total time: 111.91m | eta: 67.6m +step 10418/16704 (62.37%) | loss: 
2.646807 | lrm: 0.75 | dt: 642.81ms | tok/sec: 815,624 | mfu: 50.98 | epoch: 2 | total time: 111.92m | eta: 67.6m +step 10419/16704 (62.37%) | loss: 2.636625 | lrm: 0.75 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 2 | total time: 111.93m | eta: 67.6m +step 10420/16704 (62.38%) | loss: 2.635856 | lrm: 0.75 | dt: 647.74ms | tok/sec: 809,412 | mfu: 50.59 | epoch: 2 | total time: 111.94m | eta: 67.6m +step 10421/16704 (62.39%) | loss: 2.635564 | lrm: 0.75 | dt: 644.58ms | tok/sec: 813,377 | mfu: 50.84 | epoch: 2 | total time: 111.95m | eta: 67.6m +step 10422/16704 (62.39%) | loss: 2.647255 | lrm: 0.75 | dt: 643.25ms | tok/sec: 815,066 | mfu: 50.94 | epoch: 2 | total time: 111.96m | eta: 67.6m +step 10423/16704 (62.40%) | loss: 2.636757 | lrm: 0.75 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 2 | total time: 111.97m | eta: 67.5m +step 10424/16704 (62.40%) | loss: 2.627651 | lrm: 0.75 | dt: 646.29ms | tok/sec: 811,227 | mfu: 50.70 | epoch: 2 | total time: 111.99m | eta: 67.5m +step 10425/16704 (62.41%) | loss: 2.638393 | lrm: 0.75 | dt: 642.19ms | tok/sec: 816,405 | mfu: 51.03 | epoch: 2 | total time: 112.00m | eta: 67.5m +step 10426/16704 (62.42%) | loss: 2.628097 | lrm: 0.75 | dt: 645.69ms | tok/sec: 811,985 | mfu: 50.75 | epoch: 2 | total time: 112.01m | eta: 67.5m +step 10427/16704 (62.42%) | loss: 2.638506 | lrm: 0.75 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 112.02m | eta: 67.5m +step 10428/16704 (62.43%) | loss: 2.647566 | lrm: 0.75 | dt: 644.42ms | tok/sec: 813,577 | mfu: 50.85 | epoch: 2 | total time: 112.03m | eta: 67.5m +step 10429/16704 (62.43%) | loss: 2.645358 | lrm: 0.75 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 2 | total time: 112.04m | eta: 67.5m +step 10430/16704 (62.44%) | loss: 2.639733 | lrm: 0.75 | dt: 642.94ms | tok/sec: 815,448 | mfu: 50.97 | epoch: 2 | total time: 112.05m | eta: 67.5m +step 10431/16704 (62.45%) | loss: 2.633402 | lrm: 0.75 | dt: 645.19ms | tok/sec: 812,610 | mfu: 
50.79 | epoch: 2 | total time: 112.06m | eta: 67.5m +step 10432/16704 (62.45%) | loss: 2.622988 | lrm: 0.75 | dt: 646.15ms | tok/sec: 811,406 | mfu: 50.71 | epoch: 2 | total time: 112.07m | eta: 67.4m +step 10433/16704 (62.46%) | loss: 2.617784 | lrm: 0.75 | dt: 643.69ms | tok/sec: 814,502 | mfu: 50.91 | epoch: 2 | total time: 112.08m | eta: 67.4m +step 10434/16704 (62.46%) | loss: 2.610313 | lrm: 0.75 | dt: 647.04ms | tok/sec: 810,283 | mfu: 50.64 | epoch: 2 | total time: 112.09m | eta: 67.4m +step 10435/16704 (62.47%) | loss: 2.603958 | lrm: 0.75 | dt: 644.87ms | tok/sec: 813,008 | mfu: 50.81 | epoch: 2 | total time: 112.10m | eta: 67.4m +step 10436/16704 (62.48%) | loss: 2.608564 | lrm: 0.75 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 2 | total time: 112.11m | eta: 67.4m +step 10437/16704 (62.48%) | loss: 2.603966 | lrm: 0.75 | dt: 646.60ms | tok/sec: 810,839 | mfu: 50.68 | epoch: 2 | total time: 112.13m | eta: 67.4m +step 10438/16704 (62.49%) | loss: 2.597385 | lrm: 0.75 | dt: 645.37ms | tok/sec: 812,389 | mfu: 50.78 | epoch: 2 | total time: 112.14m | eta: 67.4m +step 10439/16704 (62.49%) | loss: 2.604666 | lrm: 0.75 | dt: 644.79ms | tok/sec: 813,115 | mfu: 50.82 | epoch: 2 | total time: 112.15m | eta: 67.4m +step 10440/16704 (62.50%) | loss: 2.611540 | lrm: 0.75 | dt: 647.83ms | tok/sec: 809,303 | mfu: 50.58 | epoch: 2 | total time: 112.16m | eta: 67.4m +step 10441/16704 (62.51%) | loss: 2.608600 | lrm: 0.75 | dt: 647.89ms | tok/sec: 809,219 | mfu: 50.58 | epoch: 2 | total time: 112.17m | eta: 67.3m +step 10442/16704 (62.51%) | loss: 2.596729 | lrm: 0.75 | dt: 643.94ms | tok/sec: 814,188 | mfu: 50.89 | epoch: 2 | total time: 112.18m | eta: 67.3m +step 10443/16704 (62.52%) | loss: 2.607770 | lrm: 0.75 | dt: 649.08ms | tok/sec: 807,741 | mfu: 50.48 | epoch: 2 | total time: 112.19m | eta: 67.3m +step 10444/16704 (62.52%) | loss: 2.600655 | lrm: 0.75 | dt: 643.11ms | tok/sec: 815,234 | mfu: 50.95 | epoch: 2 | total time: 112.20m | eta: 67.3m +step 
10445/16704 (62.53%) | loss: 2.593571 | lrm: 0.75 | dt: 646.71ms | tok/sec: 810,703 | mfu: 50.67 | epoch: 2 | total time: 112.21m | eta: 67.3m +step 10446/16704 (62.54%) | loss: 2.594738 | lrm: 0.75 | dt: 645.03ms | tok/sec: 812,809 | mfu: 50.80 | epoch: 2 | total time: 112.22m | eta: 67.3m +step 10447/16704 (62.54%) | loss: 2.594833 | lrm: 0.75 | dt: 644.11ms | tok/sec: 813,977 | mfu: 50.87 | epoch: 2 | total time: 112.23m | eta: 67.3m +step 10448/16704 (62.55%) | loss: 2.587579 | lrm: 0.75 | dt: 646.29ms | tok/sec: 811,223 | mfu: 50.70 | epoch: 2 | total time: 112.24m | eta: 67.3m +step 10449/16704 (62.55%) | loss: 2.598069 | lrm: 0.75 | dt: 646.06ms | tok/sec: 811,517 | mfu: 50.72 | epoch: 2 | total time: 112.25m | eta: 67.3m +step 10450/16704 (62.56%) | loss: 2.610401 | lrm: 0.75 | dt: 648.14ms | tok/sec: 808,908 | mfu: 50.56 | epoch: 2 | total time: 112.26m | eta: 67.3m +step 10451/16704 (62.57%) | loss: 2.627032 | lrm: 0.75 | dt: 644.49ms | tok/sec: 813,491 | mfu: 50.84 | epoch: 2 | total time: 112.28m | eta: 67.2m +step 10452/16704 (62.57%) | loss: 2.616965 | lrm: 0.75 | dt: 645.42ms | tok/sec: 812,314 | mfu: 50.77 | epoch: 2 | total time: 112.29m | eta: 67.2m +step 10453/16704 (62.58%) | loss: 2.618139 | lrm: 0.75 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 2 | total time: 112.30m | eta: 67.2m +step 10454/16704 (62.58%) | loss: 2.611448 | lrm: 0.75 | dt: 644.96ms | tok/sec: 812,898 | mfu: 50.81 | epoch: 2 | total time: 112.31m | eta: 67.2m +step 10455/16704 (62.59%) | loss: 2.596708 | lrm: 0.75 | dt: 644.70ms | tok/sec: 813,222 | mfu: 50.83 | epoch: 2 | total time: 112.32m | eta: 67.2m +step 10456/16704 (62.60%) | loss: 2.614340 | lrm: 0.75 | dt: 645.65ms | tok/sec: 812,035 | mfu: 50.75 | epoch: 2 | total time: 112.33m | eta: 67.2m +step 10457/16704 (62.60%) | loss: 2.614063 | lrm: 0.75 | dt: 646.42ms | tok/sec: 811,065 | mfu: 50.69 | epoch: 2 | total time: 112.34m | eta: 67.2m +step 10458/16704 (62.61%) | loss: 2.617125 | lrm: 0.75 | dt: 
645.96ms | tok/sec: 811,642 | mfu: 50.73 | epoch: 2 | total time: 112.35m | eta: 67.2m +step 10459/16704 (62.61%) | loss: 2.610519 | lrm: 0.75 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 2 | total time: 112.36m | eta: 67.2m +step 10460/16704 (62.62%) | loss: 2.610639 | lrm: 0.75 | dt: 646.63ms | tok/sec: 810,806 | mfu: 50.68 | epoch: 2 | total time: 112.37m | eta: 67.1m +step 10461/16704 (62.63%) | loss: 2.607489 | lrm: 0.75 | dt: 645.93ms | tok/sec: 811,680 | mfu: 50.73 | epoch: 2 | total time: 112.38m | eta: 67.1m +step 10462/16704 (62.63%) | loss: 2.603969 | lrm: 0.75 | dt: 643.06ms | tok/sec: 815,298 | mfu: 50.96 | epoch: 2 | total time: 112.39m | eta: 67.1m +step 10463/16704 (62.64%) | loss: 2.609131 | lrm: 0.75 | dt: 648.09ms | tok/sec: 808,976 | mfu: 50.56 | epoch: 2 | total time: 112.40m | eta: 67.1m +step 10464/16704 (62.64%) | loss: 2.616048 | lrm: 0.75 | dt: 644.90ms | tok/sec: 812,970 | mfu: 50.81 | epoch: 2 | total time: 112.42m | eta: 67.1m +step 10465/16704 (62.65%) | loss: 2.609744 | lrm: 0.75 | dt: 645.74ms | tok/sec: 811,924 | mfu: 50.75 | epoch: 2 | total time: 112.43m | eta: 67.1m +step 10466/16704 (62.66%) | loss: 2.610739 | lrm: 0.75 | dt: 644.65ms | tok/sec: 813,284 | mfu: 50.83 | epoch: 2 | total time: 112.44m | eta: 67.1m +step 10467/16704 (62.66%) | loss: 2.609608 | lrm: 0.75 | dt: 647.22ms | tok/sec: 810,057 | mfu: 50.63 | epoch: 2 | total time: 112.45m | eta: 67.1m +step 10468/16704 (62.67%) | loss: 2.623540 | lrm: 0.75 | dt: 643.16ms | tok/sec: 815,175 | mfu: 50.95 | epoch: 2 | total time: 112.46m | eta: 67.1m +step 10469/16704 (62.67%) | loss: 2.624747 | lrm: 0.75 | dt: 642.92ms | tok/sec: 815,485 | mfu: 50.97 | epoch: 2 | total time: 112.47m | eta: 67.0m +step 10470/16704 (62.68%) | loss: 2.618885 | lrm: 0.75 | dt: 645.66ms | tok/sec: 812,018 | mfu: 50.75 | epoch: 2 | total time: 112.48m | eta: 67.0m +step 10471/16704 (62.69%) | loss: 2.611337 | lrm: 0.75 | dt: 645.73ms | tok/sec: 811,926 | mfu: 50.75 | epoch: 2 | total 
time: 112.49m | eta: 67.0m +step 10472/16704 (62.69%) | loss: 2.597398 | lrm: 0.75 | dt: 644.00ms | tok/sec: 814,114 | mfu: 50.88 | epoch: 2 | total time: 112.50m | eta: 67.0m +step 10473/16704 (62.70%) | loss: 2.579896 | lrm: 0.75 | dt: 643.12ms | tok/sec: 815,225 | mfu: 50.95 | epoch: 2 | total time: 112.51m | eta: 67.0m +step 10474/16704 (62.70%) | loss: 2.579204 | lrm: 0.75 | dt: 645.34ms | tok/sec: 812,420 | mfu: 50.78 | epoch: 2 | total time: 112.52m | eta: 67.0m +step 10475/16704 (62.71%) | loss: 2.588999 | lrm: 0.75 | dt: 644.78ms | tok/sec: 813,132 | mfu: 50.82 | epoch: 2 | total time: 112.53m | eta: 67.0m +step 10476/16704 (62.72%) | loss: 2.585411 | lrm: 0.75 | dt: 644.21ms | tok/sec: 813,847 | mfu: 50.87 | epoch: 2 | total time: 112.54m | eta: 67.0m +step 10477/16704 (62.72%) | loss: 2.573297 | lrm: 0.75 | dt: 644.32ms | tok/sec: 813,711 | mfu: 50.86 | epoch: 2 | total time: 112.56m | eta: 67.0m +step 10478/16704 (62.73%) | loss: 2.567291 | lrm: 0.75 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.85 | epoch: 2 | total time: 112.57m | eta: 67.0m +step 10479/16704 (62.73%) | loss: 2.561352 | lrm: 0.75 | dt: 645.65ms | tok/sec: 812,031 | mfu: 50.75 | epoch: 2 | total time: 112.58m | eta: 66.9m +step 10480/16704 (62.74%) | loss: 2.572797 | lrm: 0.75 | dt: 646.66ms | tok/sec: 810,767 | mfu: 50.67 | epoch: 2 | total time: 112.59m | eta: 66.9m +step 10481/16704 (62.75%) | loss: 2.569633 | lrm: 0.75 | dt: 643.81ms | tok/sec: 814,357 | mfu: 50.90 | epoch: 2 | total time: 112.60m | eta: 66.9m +step 10482/16704 (62.75%) | loss: 2.572529 | lrm: 0.74 | dt: 645.20ms | tok/sec: 812,602 | mfu: 50.79 | epoch: 2 | total time: 112.61m | eta: 66.9m +step 10483/16704 (62.76%) | loss: 2.575433 | lrm: 0.74 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 2 | total time: 112.62m | eta: 66.9m +step 10484/16704 (62.76%) | loss: 2.596717 | lrm: 0.74 | dt: 645.85ms | tok/sec: 811,784 | mfu: 50.74 | epoch: 2 | total time: 112.63m | eta: 66.9m +step 10485/16704 (62.77%) | loss: 
2.600410 | lrm: 0.74 | dt: 643.49ms | tok/sec: 814,760 | mfu: 50.92 | epoch: 2 | total time: 112.64m | eta: 66.9m +step 10486/16704 (62.78%) | loss: 2.602314 | lrm: 0.74 | dt: 644.83ms | tok/sec: 813,063 | mfu: 50.82 | epoch: 2 | total time: 112.65m | eta: 66.9m +step 10487/16704 (62.78%) | loss: 2.616875 | lrm: 0.74 | dt: 644.42ms | tok/sec: 813,581 | mfu: 50.85 | epoch: 2 | total time: 112.66m | eta: 66.9m +step 10488/16704 (62.79%) | loss: 2.633364 | lrm: 0.74 | dt: 646.80ms | tok/sec: 810,585 | mfu: 50.66 | epoch: 2 | total time: 112.67m | eta: 66.8m +step 10489/16704 (62.79%) | loss: 2.634797 | lrm: 0.74 | dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 2 | total time: 112.68m | eta: 66.8m +step 10490/16704 (62.80%) | loss: 2.649598 | lrm: 0.74 | dt: 644.99ms | tok/sec: 812,863 | mfu: 50.81 | epoch: 2 | total time: 112.70m | eta: 66.8m +step 10491/16704 (62.81%) | loss: 2.653759 | lrm: 0.74 | dt: 643.79ms | tok/sec: 814,371 | mfu: 50.90 | epoch: 2 | total time: 112.71m | eta: 66.8m +step 10492/16704 (62.81%) | loss: 2.649549 | lrm: 0.74 | dt: 646.85ms | tok/sec: 810,521 | mfu: 50.66 | epoch: 2 | total time: 112.72m | eta: 66.8m +step 10493/16704 (62.82%) | loss: 2.650103 | lrm: 0.74 | dt: 643.27ms | tok/sec: 815,037 | mfu: 50.94 | epoch: 2 | total time: 112.73m | eta: 66.8m +step 10494/16704 (62.82%) | loss: 2.644767 | lrm: 0.74 | dt: 644.52ms | tok/sec: 813,460 | mfu: 50.84 | epoch: 2 | total time: 112.74m | eta: 66.8m +step 10495/16704 (62.83%) | loss: 2.650126 | lrm: 0.74 | dt: 646.75ms | tok/sec: 810,649 | mfu: 50.67 | epoch: 2 | total time: 112.75m | eta: 66.8m +step 10496/16704 (62.84%) | loss: 2.649512 | lrm: 0.74 | dt: 645.13ms | tok/sec: 812,688 | mfu: 50.79 | epoch: 2 | total time: 112.76m | eta: 66.8m +step 10497/16704 (62.84%) | loss: 2.648582 | lrm: 0.74 | dt: 643.59ms | tok/sec: 814,632 | mfu: 50.92 | epoch: 2 | total time: 112.77m | eta: 66.7m +step 10498/16704 (62.85%) | loss: 2.663943 | lrm: 0.74 | dt: 645.26ms | tok/sec: 812,525 | mfu: 
50.78 | epoch: 2 | total time: 112.78m | eta: 66.7m +step 10499/16704 (62.85%) | loss: 2.660179 | lrm: 0.74 | dt: 644.27ms | tok/sec: 813,764 | mfu: 50.86 | epoch: 2 | total time: 112.79m | eta: 66.7m +Step 10500 | Validation bpb: 0.800301 +step 10500/16704 (62.86%) | loss: 2.639804 | lrm: 0.74 | dt: 646.83ms | tok/sec: 810,553 | mfu: 50.66 | epoch: 2 | total time: 112.80m | eta: 66.7m +step 10501/16704 (62.87%) | loss: 2.647878 | lrm: 0.74 | dt: 649.76ms | tok/sec: 806,889 | mfu: 50.43 | epoch: 2 | total time: 112.81m | eta: 66.7m +step 10502/16704 (62.87%) | loss: 2.642141 | lrm: 0.74 | dt: 644.99ms | tok/sec: 812,856 | mfu: 50.80 | epoch: 2 | total time: 112.82m | eta: 66.7m +step 10503/16704 (62.88%) | loss: 2.620013 | lrm: 0.74 | dt: 640.38ms | tok/sec: 818,713 | mfu: 51.17 | epoch: 2 | total time: 112.83m | eta: 66.7m +step 10504/16704 (62.88%) | loss: 2.610295 | lrm: 0.74 | dt: 650.32ms | tok/sec: 806,197 | mfu: 50.39 | epoch: 2 | total time: 112.85m | eta: 66.7m +step 10505/16704 (62.89%) | loss: 2.599403 | lrm: 0.74 | dt: 641.54ms | tok/sec: 817,233 | mfu: 51.08 | epoch: 2 | total time: 112.86m | eta: 66.7m +step 10506/16704 (62.90%) | loss: 2.592344 | lrm: 0.74 | dt: 647.84ms | tok/sec: 809,284 | mfu: 50.58 | epoch: 2 | total time: 112.87m | eta: 66.6m +step 10507/16704 (62.90%) | loss: 2.598209 | lrm: 0.74 | dt: 646.04ms | tok/sec: 811,542 | mfu: 50.72 | epoch: 2 | total time: 112.88m | eta: 66.6m +step 10508/16704 (62.91%) | loss: 2.605909 | lrm: 0.74 | dt: 644.77ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 2 | total time: 112.89m | eta: 66.6m +step 10509/16704 (62.91%) | loss: 2.612903 | lrm: 0.74 | dt: 646.45ms | tok/sec: 811,032 | mfu: 50.69 | epoch: 2 | total time: 112.90m | eta: 66.6m +step 10510/16704 (62.92%) | loss: 2.615061 | lrm: 0.74 | dt: 646.91ms | tok/sec: 810,446 | mfu: 50.65 | epoch: 2 | total time: 112.91m | eta: 66.6m +step 10511/16704 (62.93%) | loss: 2.627808 | lrm: 0.74 | dt: 643.94ms | tok/sec: 814,189 | mfu: 50.89 | epoch: 2 | 
total time: 112.92m | eta: 66.6m +step 10512/16704 (62.93%) | loss: 2.616870 | lrm: 0.74 | dt: 645.52ms | tok/sec: 812,191 | mfu: 50.76 | epoch: 2 | total time: 112.93m | eta: 66.6m +step 10513/16704 (62.94%) | loss: 2.618077 | lrm: 0.74 | dt: 646.77ms | tok/sec: 810,627 | mfu: 50.67 | epoch: 2 | total time: 112.94m | eta: 66.6m +step 10514/16704 (62.94%) | loss: 2.621860 | lrm: 0.74 | dt: 645.55ms | tok/sec: 812,160 | mfu: 50.76 | epoch: 2 | total time: 112.95m | eta: 66.6m +step 10515/16704 (62.95%) | loss: 2.621048 | lrm: 0.74 | dt: 642.63ms | tok/sec: 815,841 | mfu: 50.99 | epoch: 2 | total time: 112.96m | eta: 66.6m +step 10516/16704 (62.95%) | loss: 2.611793 | lrm: 0.74 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 2 | total time: 112.97m | eta: 66.5m +step 10517/16704 (62.96%) | loss: 2.607192 | lrm: 0.74 | dt: 644.00ms | tok/sec: 814,113 | mfu: 50.88 | epoch: 2 | total time: 112.99m | eta: 66.5m +step 10518/16704 (62.97%) | loss: 2.612760 | lrm: 0.74 | dt: 646.36ms | tok/sec: 811,143 | mfu: 50.70 | epoch: 2 | total time: 113.00m | eta: 66.5m +step 10519/16704 (62.97%) | loss: 2.623491 | lrm: 0.74 | dt: 646.60ms | tok/sec: 810,842 | mfu: 50.68 | epoch: 2 | total time: 113.01m | eta: 66.5m +step 10520/16704 (62.98%) | loss: 2.631194 | lrm: 0.74 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 2 | total time: 113.02m | eta: 66.5m +step 10521/16704 (62.98%) | loss: 2.629317 | lrm: 0.74 | dt: 645.08ms | tok/sec: 812,752 | mfu: 50.80 | epoch: 2 | total time: 113.03m | eta: 66.5m +step 10522/16704 (62.99%) | loss: 2.630662 | lrm: 0.74 | dt: 646.50ms | tok/sec: 810,961 | mfu: 50.69 | epoch: 2 | total time: 113.04m | eta: 66.5m +step 10523/16704 (63.00%) | loss: 2.631763 | lrm: 0.74 | dt: 643.22ms | tok/sec: 815,100 | mfu: 50.94 | epoch: 2 | total time: 113.05m | eta: 66.5m +step 10524/16704 (63.00%) | loss: 2.634320 | lrm: 0.74 | dt: 645.13ms | tok/sec: 812,684 | mfu: 50.79 | epoch: 2 | total time: 113.06m | eta: 66.5m +step 10525/16704 (63.01%) | 
loss: 2.630186 | lrm: 0.74 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 2 | total time: 113.07m | eta: 66.4m +step 10526/16704 (63.01%) | loss: 2.635487 | lrm: 0.74 | dt: 646.36ms | tok/sec: 811,133 | mfu: 50.70 | epoch: 2 | total time: 113.08m | eta: 66.4m +step 10527/16704 (63.02%) | loss: 2.634344 | lrm: 0.74 | dt: 644.92ms | tok/sec: 812,950 | mfu: 50.81 | epoch: 2 | total time: 113.09m | eta: 66.4m +step 10528/16704 (63.03%) | loss: 2.635285 | lrm: 0.74 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 2 | total time: 113.10m | eta: 66.4m +step 10529/16704 (63.03%) | loss: 2.634844 | lrm: 0.74 | dt: 644.51ms | tok/sec: 813,461 | mfu: 50.84 | epoch: 2 | total time: 113.11m | eta: 66.4m +step 10530/16704 (63.04%) | loss: 2.637776 | lrm: 0.74 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 2 | total time: 113.13m | eta: 66.4m +step 10531/16704 (63.04%) | loss: 2.633048 | lrm: 0.74 | dt: 644.40ms | tok/sec: 813,609 | mfu: 50.85 | epoch: 2 | total time: 113.14m | eta: 66.4m +step 10532/16704 (63.05%) | loss: 2.627123 | lrm: 0.74 | dt: 644.33ms | tok/sec: 813,694 | mfu: 50.86 | epoch: 2 | total time: 113.15m | eta: 66.4m +step 10533/16704 (63.06%) | loss: 2.635587 | lrm: 0.74 | dt: 651.65ms | tok/sec: 804,554 | mfu: 50.29 | epoch: 2 | total time: 113.16m | eta: 66.4m +step 10534/16704 (63.06%) | loss: 2.631782 | lrm: 0.74 | dt: 641.70ms | tok/sec: 817,026 | mfu: 51.07 | epoch: 2 | total time: 113.17m | eta: 66.3m +step 10535/16704 (63.07%) | loss: 2.621215 | lrm: 0.74 | dt: 646.28ms | tok/sec: 811,244 | mfu: 50.70 | epoch: 2 | total time: 113.18m | eta: 66.3m +step 10536/16704 (63.07%) | loss: 2.622000 | lrm: 0.74 | dt: 644.67ms | tok/sec: 813,269 | mfu: 50.83 | epoch: 2 | total time: 113.19m | eta: 66.3m +step 10537/16704 (63.08%) | loss: 2.618917 | lrm: 0.74 | dt: 644.59ms | tok/sec: 813,370 | mfu: 50.84 | epoch: 2 | total time: 113.20m | eta: 66.3m +step 10538/16704 (63.09%) | loss: 2.621815 | lrm: 0.74 | dt: 646.75ms | tok/sec: 810,653 | 
mfu: 50.67 | epoch: 2 | total time: 113.21m | eta: 66.3m +step 10539/16704 (63.09%) | loss: 2.618807 | lrm: 0.74 | dt: 646.35ms | tok/sec: 811,157 | mfu: 50.70 | epoch: 2 | total time: 113.22m | eta: 66.3m +step 10540/16704 (63.10%) | loss: 2.623499 | lrm: 0.74 | dt: 646.31ms | tok/sec: 811,198 | mfu: 50.70 | epoch: 2 | total time: 113.23m | eta: 66.3m +step 10541/16704 (63.10%) | loss: 2.622539 | lrm: 0.74 | dt: 645.68ms | tok/sec: 811,993 | mfu: 50.75 | epoch: 2 | total time: 113.24m | eta: 66.3m +step 10542/16704 (63.11%) | loss: 2.625643 | lrm: 0.74 | dt: 643.33ms | tok/sec: 814,956 | mfu: 50.94 | epoch: 2 | total time: 113.25m | eta: 66.3m +step 10543/16704 (63.12%) | loss: 2.629033 | lrm: 0.74 | dt: 643.74ms | tok/sec: 814,446 | mfu: 50.90 | epoch: 2 | total time: 113.27m | eta: 66.3m +step 10544/16704 (63.12%) | loss: 2.627066 | lrm: 0.74 | dt: 645.52ms | tok/sec: 812,190 | mfu: 50.76 | epoch: 2 | total time: 113.28m | eta: 66.2m +step 10545/16704 (63.13%) | loss: 2.622585 | lrm: 0.74 | dt: 645.75ms | tok/sec: 811,904 | mfu: 50.75 | epoch: 2 | total time: 113.29m | eta: 66.2m +step 10546/16704 (63.13%) | loss: 2.620198 | lrm: 0.74 | dt: 644.96ms | tok/sec: 812,897 | mfu: 50.81 | epoch: 2 | total time: 113.30m | eta: 66.2m +step 10547/16704 (63.14%) | loss: 2.617432 | lrm: 0.74 | dt: 646.32ms | tok/sec: 811,191 | mfu: 50.70 | epoch: 2 | total time: 113.31m | eta: 66.2m +step 10548/16704 (63.15%) | loss: 2.612497 | lrm: 0.74 | dt: 644.67ms | tok/sec: 813,271 | mfu: 50.83 | epoch: 2 | total time: 113.32m | eta: 66.2m +step 10549/16704 (63.15%) | loss: 2.589781 | lrm: 0.74 | dt: 644.72ms | tok/sec: 813,207 | mfu: 50.83 | epoch: 2 | total time: 113.33m | eta: 66.2m +step 10550/16704 (63.16%) | loss: 2.589094 | lrm: 0.74 | dt: 643.57ms | tok/sec: 814,659 | mfu: 50.92 | epoch: 2 | total time: 113.34m | eta: 66.2m +step 10551/16704 (63.16%) | loss: 2.600482 | lrm: 0.74 | dt: 646.91ms | tok/sec: 810,445 | mfu: 50.65 | epoch: 2 | total time: 113.35m | eta: 66.2m +step 
10552/16704 (63.17%) | loss: 2.606412 | lrm: 0.74 | dt: 643.38ms | tok/sec: 814,892 | mfu: 50.93 | epoch: 2 | total time: 113.36m | eta: 66.2m +step 10553/16704 (63.18%) | loss: 2.603582 | lrm: 0.74 | dt: 645.72ms | tok/sec: 811,938 | mfu: 50.75 | epoch: 2 | total time: 113.37m | eta: 66.1m +step 10554/16704 (63.18%) | loss: 2.619329 | lrm: 0.74 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 2 | total time: 113.38m | eta: 66.1m +step 10555/16704 (63.19%) | loss: 2.634077 | lrm: 0.74 | dt: 644.40ms | tok/sec: 813,611 | mfu: 50.85 | epoch: 2 | total time: 113.39m | eta: 66.1m +step 10556/16704 (63.19%) | loss: 2.628659 | lrm: 0.74 | dt: 647.69ms | tok/sec: 809,472 | mfu: 50.59 | epoch: 2 | total time: 113.40m | eta: 66.1m +step 10557/16704 (63.20%) | loss: 2.625686 | lrm: 0.74 | dt: 641.22ms | tok/sec: 817,643 | mfu: 51.10 | epoch: 2 | total time: 113.42m | eta: 66.1m +step 10558/16704 (63.21%) | loss: 2.621436 | lrm: 0.74 | dt: 645.70ms | tok/sec: 811,963 | mfu: 50.75 | epoch: 2 | total time: 113.43m | eta: 66.1m +step 10559/16704 (63.21%) | loss: 2.614439 | lrm: 0.74 | dt: 644.44ms | tok/sec: 813,555 | mfu: 50.85 | epoch: 2 | total time: 113.44m | eta: 66.1m +step 10560/16704 (63.22%) | loss: 2.607072 | lrm: 0.74 | dt: 644.25ms | tok/sec: 813,799 | mfu: 50.86 | epoch: 2 | total time: 113.45m | eta: 66.1m +step 10561/16704 (63.22%) | loss: 2.617264 | lrm: 0.74 | dt: 645.33ms | tok/sec: 812,428 | mfu: 50.78 | epoch: 2 | total time: 113.46m | eta: 66.1m +step 10562/16704 (63.23%) | loss: 2.623720 | lrm: 0.74 | dt: 643.28ms | tok/sec: 815,023 | mfu: 50.94 | epoch: 2 | total time: 113.47m | eta: 66.0m +step 10563/16704 (63.24%) | loss: 2.624687 | lrm: 0.74 | dt: 646.87ms | tok/sec: 810,505 | mfu: 50.66 | epoch: 2 | total time: 113.48m | eta: 66.0m +step 10564/16704 (63.24%) | loss: 2.640838 | lrm: 0.74 | dt: 645.99ms | tok/sec: 811,606 | mfu: 50.73 | epoch: 2 | total time: 113.49m | eta: 66.0m +step 10565/16704 (63.25%) | loss: 2.639148 | lrm: 0.74 | dt: 
644.21ms | tok/sec: 813,846 | mfu: 50.87 | epoch: 2 | total time: 113.50m | eta: 66.0m +step 10566/16704 (63.25%) | loss: 2.636745 | lrm: 0.73 | dt: 646.16ms | tok/sec: 811,386 | mfu: 50.71 | epoch: 2 | total time: 113.51m | eta: 66.0m +step 10567/16704 (63.26%) | loss: 2.645171 | lrm: 0.73 | dt: 643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 2 | total time: 113.52m | eta: 66.0m +step 10568/16704 (63.27%) | loss: 2.644445 | lrm: 0.73 | dt: 642.58ms | tok/sec: 815,913 | mfu: 51.00 | epoch: 2 | total time: 113.53m | eta: 66.0m +step 10569/16704 (63.27%) | loss: 2.647653 | lrm: 0.73 | dt: 646.04ms | tok/sec: 811,541 | mfu: 50.72 | epoch: 2 | total time: 113.54m | eta: 66.0m +step 10570/16704 (63.28%) | loss: 2.650327 | lrm: 0.73 | dt: 645.45ms | tok/sec: 812,278 | mfu: 50.77 | epoch: 2 | total time: 113.56m | eta: 66.0m +step 10571/16704 (63.28%) | loss: 2.640387 | lrm: 0.73 | dt: 645.87ms | tok/sec: 811,757 | mfu: 50.74 | epoch: 2 | total time: 113.57m | eta: 66.0m +step 10572/16704 (63.29%) | loss: 2.640659 | lrm: 0.73 | dt: 643.52ms | tok/sec: 814,713 | mfu: 50.92 | epoch: 2 | total time: 113.58m | eta: 65.9m +step 10573/16704 (63.30%) | loss: 2.631076 | lrm: 0.73 | dt: 643.96ms | tok/sec: 814,156 | mfu: 50.89 | epoch: 2 | total time: 113.59m | eta: 65.9m +step 10574/16704 (63.30%) | loss: 2.635377 | lrm: 0.73 | dt: 647.07ms | tok/sec: 810,252 | mfu: 50.64 | epoch: 2 | total time: 113.60m | eta: 65.9m +step 10575/16704 (63.31%) | loss: 2.605935 | lrm: 0.73 | dt: 646.65ms | tok/sec: 810,772 | mfu: 50.67 | epoch: 2 | total time: 113.61m | eta: 65.9m +step 10576/16704 (63.31%) | loss: 2.619140 | lrm: 0.73 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 2 | total time: 113.62m | eta: 65.9m +step 10577/16704 (63.32%) | loss: 2.609316 | lrm: 0.73 | dt: 644.99ms | tok/sec: 812,865 | mfu: 50.81 | epoch: 2 | total time: 113.63m | eta: 65.9m +step 10578/16704 (63.33%) | loss: 2.607382 | lrm: 0.73 | dt: 645.17ms | tok/sec: 812,635 | mfu: 50.79 | epoch: 2 | total 
time: 113.64m | eta: 65.9m +step 10579/16704 (63.33%) | loss: 2.598828 | lrm: 0.73 | dt: 644.83ms | tok/sec: 813,067 | mfu: 50.82 | epoch: 2 | total time: 113.65m | eta: 65.9m +step 10580/16704 (63.34%) | loss: 2.600439 | lrm: 0.73 | dt: 644.22ms | tok/sec: 813,830 | mfu: 50.87 | epoch: 2 | total time: 113.66m | eta: 65.9m +step 10581/16704 (63.34%) | loss: 2.604908 | lrm: 0.73 | dt: 645.70ms | tok/sec: 811,964 | mfu: 50.75 | epoch: 2 | total time: 113.67m | eta: 65.8m +step 10582/16704 (63.35%) | loss: 2.596540 | lrm: 0.73 | dt: 644.33ms | tok/sec: 813,693 | mfu: 50.86 | epoch: 2 | total time: 113.68m | eta: 65.8m +step 10583/16704 (63.36%) | loss: 2.604833 | lrm: 0.73 | dt: 643.24ms | tok/sec: 815,078 | mfu: 50.94 | epoch: 2 | total time: 113.70m | eta: 65.8m +step 10584/16704 (63.36%) | loss: 2.595677 | lrm: 0.73 | dt: 646.36ms | tok/sec: 811,145 | mfu: 50.70 | epoch: 2 | total time: 113.71m | eta: 65.8m +step 10585/16704 (63.37%) | loss: 2.600410 | lrm: 0.73 | dt: 643.67ms | tok/sec: 814,523 | mfu: 50.91 | epoch: 2 | total time: 113.72m | eta: 65.8m +step 10586/16704 (63.37%) | loss: 2.602795 | lrm: 0.73 | dt: 643.76ms | tok/sec: 814,418 | mfu: 50.90 | epoch: 2 | total time: 113.73m | eta: 65.8m +step 10587/16704 (63.38%) | loss: 2.599564 | lrm: 0.73 | dt: 646.27ms | tok/sec: 811,247 | mfu: 50.70 | epoch: 2 | total time: 113.74m | eta: 65.8m +step 10588/16704 (63.39%) | loss: 2.601939 | lrm: 0.73 | dt: 643.36ms | tok/sec: 814,920 | mfu: 50.93 | epoch: 2 | total time: 113.75m | eta: 65.8m +step 10589/16704 (63.39%) | loss: 2.607999 | lrm: 0.73 | dt: 646.64ms | tok/sec: 810,788 | mfu: 50.68 | epoch: 2 | total time: 113.76m | eta: 65.8m +step 10590/16704 (63.40%) | loss: 2.604332 | lrm: 0.73 | dt: 646.41ms | tok/sec: 811,078 | mfu: 50.69 | epoch: 2 | total time: 113.77m | eta: 65.7m +step 10591/16704 (63.40%) | loss: 2.607397 | lrm: 0.73 | dt: 642.61ms | tok/sec: 815,867 | mfu: 50.99 | epoch: 2 | total time: 113.78m | eta: 65.7m +step 10592/16704 (63.41%) | loss: 
2.603569 | lrm: 0.73 | dt: 646.24ms | tok/sec: 811,285 | mfu: 50.71 | epoch: 2 | total time: 113.79m | eta: 65.7m +step 10593/16704 (63.42%) | loss: 2.603969 | lrm: 0.73 | dt: 646.04ms | tok/sec: 811,540 | mfu: 50.72 | epoch: 2 | total time: 113.80m | eta: 65.7m +step 10594/16704 (63.42%) | loss: 2.599174 | lrm: 0.73 | dt: 642.49ms | tok/sec: 816,024 | mfu: 51.00 | epoch: 2 | total time: 113.81m | eta: 65.7m +step 10595/16704 (63.43%) | loss: 2.596515 | lrm: 0.73 | dt: 647.30ms | tok/sec: 809,956 | mfu: 50.62 | epoch: 2 | total time: 113.82m | eta: 65.7m +step 10596/16704 (63.43%) | loss: 2.613011 | lrm: 0.73 | dt: 644.58ms | tok/sec: 813,378 | mfu: 50.84 | epoch: 2 | total time: 113.83m | eta: 65.7m +step 10597/16704 (63.44%) | loss: 2.618081 | lrm: 0.73 | dt: 644.06ms | tok/sec: 814,034 | mfu: 50.88 | epoch: 2 | total time: 113.85m | eta: 65.7m +step 10598/16704 (63.45%) | loss: 2.618427 | lrm: 0.73 | dt: 645.04ms | tok/sec: 812,804 | mfu: 50.80 | epoch: 2 | total time: 113.86m | eta: 65.7m +step 10599/16704 (63.45%) | loss: 2.620800 | lrm: 0.73 | dt: 647.23ms | tok/sec: 810,054 | mfu: 50.63 | epoch: 2 | total time: 113.87m | eta: 65.6m +step 10600/16704 (63.46%) | loss: 2.633643 | lrm: 0.73 | dt: 644.00ms | tok/sec: 814,106 | mfu: 50.88 | epoch: 2 | total time: 113.88m | eta: 65.6m +step 10601/16704 (63.46%) | loss: 2.633487 | lrm: 0.73 | dt: 644.79ms | tok/sec: 813,114 | mfu: 50.82 | epoch: 2 | total time: 113.89m | eta: 65.6m +step 10602/16704 (63.47%) | loss: 2.627249 | lrm: 0.73 | dt: 645.27ms | tok/sec: 812,510 | mfu: 50.78 | epoch: 2 | total time: 113.90m | eta: 65.6m +step 10603/16704 (63.48%) | loss: 2.643925 | lrm: 0.73 | dt: 644.21ms | tok/sec: 813,844 | mfu: 50.87 | epoch: 2 | total time: 113.91m | eta: 65.6m +step 10604/16704 (63.48%) | loss: 2.638445 | lrm: 0.73 | dt: 646.47ms | tok/sec: 811,005 | mfu: 50.69 | epoch: 2 | total time: 113.92m | eta: 65.6m +step 10605/16704 (63.49%) | loss: 2.634524 | lrm: 0.73 | dt: 645.07ms | tok/sec: 812,762 | mfu: 
50.80 | epoch: 2 | total time: 113.93m | eta: 65.6m +step 10606/16704 (63.49%) | loss: 2.634191 | lrm: 0.73 | dt: 644.94ms | tok/sec: 812,920 | mfu: 50.81 | epoch: 2 | total time: 113.94m | eta: 65.6m +step 10607/16704 (63.50%) | loss: 2.629182 | lrm: 0.73 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 2 | total time: 113.95m | eta: 65.6m +step 10608/16704 (63.51%) | loss: 2.626981 | lrm: 0.73 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 2 | total time: 113.96m | eta: 65.6m +step 10609/16704 (63.51%) | loss: 2.624491 | lrm: 0.73 | dt: 644.55ms | tok/sec: 813,414 | mfu: 50.84 | epoch: 2 | total time: 113.97m | eta: 65.5m +step 10610/16704 (63.52%) | loss: 2.638141 | lrm: 0.73 | dt: 644.99ms | tok/sec: 812,866 | mfu: 50.81 | epoch: 2 | total time: 113.99m | eta: 65.5m +step 10611/16704 (63.52%) | loss: 2.646234 | lrm: 0.73 | dt: 644.15ms | tok/sec: 813,925 | mfu: 50.87 | epoch: 2 | total time: 114.00m | eta: 65.5m +step 10612/16704 (63.53%) | loss: 2.649801 | lrm: 0.73 | dt: 645.55ms | tok/sec: 812,162 | mfu: 50.76 | epoch: 2 | total time: 114.01m | eta: 65.5m +step 10613/16704 (63.54%) | loss: 2.649410 | lrm: 0.73 | dt: 643.47ms | tok/sec: 814,778 | mfu: 50.92 | epoch: 2 | total time: 114.02m | eta: 65.5m +step 10614/16704 (63.54%) | loss: 2.655293 | lrm: 0.73 | dt: 646.04ms | tok/sec: 811,538 | mfu: 50.72 | epoch: 2 | total time: 114.03m | eta: 65.5m +step 10615/16704 (63.55%) | loss: 2.656837 | lrm: 0.73 | dt: 645.20ms | tok/sec: 812,601 | mfu: 50.79 | epoch: 2 | total time: 114.04m | eta: 65.5m +step 10616/16704 (63.55%) | loss: 2.655126 | lrm: 0.73 | dt: 642.59ms | tok/sec: 815,902 | mfu: 51.00 | epoch: 2 | total time: 114.05m | eta: 65.5m +step 10617/16704 (63.56%) | loss: 2.664488 | lrm: 0.73 | dt: 645.02ms | tok/sec: 812,820 | mfu: 50.80 | epoch: 2 | total time: 114.06m | eta: 65.5m +step 10618/16704 (63.57%) | loss: 2.651145 | lrm: 0.73 | dt: 645.17ms | tok/sec: 812,638 | mfu: 50.79 | epoch: 2 | total time: 114.07m | eta: 65.4m +step 
10619/16704 (63.57%) | loss: 2.643428 | lrm: 0.73 | dt: 642.57ms | tok/sec: 815,926 | mfu: 51.00 | epoch: 2 | total time: 114.08m | eta: 65.4m +step 10620/16704 (63.58%) | loss: 2.635799 | lrm: 0.73 | dt: 646.88ms | tok/sec: 810,485 | mfu: 50.66 | epoch: 2 | total time: 114.09m | eta: 65.4m +step 10621/16704 (63.58%) | loss: 2.629521 | lrm: 0.73 | dt: 644.35ms | tok/sec: 813,671 | mfu: 50.86 | epoch: 2 | total time: 114.10m | eta: 65.4m +step 10622/16704 (63.59%) | loss: 2.638917 | lrm: 0.73 | dt: 646.01ms | tok/sec: 811,582 | mfu: 50.73 | epoch: 2 | total time: 114.11m | eta: 65.4m +step 10623/16704 (63.60%) | loss: 2.638450 | lrm: 0.73 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 2 | total time: 114.12m | eta: 65.4m +step 10624/16704 (63.60%) | loss: 2.628059 | lrm: 0.73 | dt: 643.44ms | tok/sec: 814,822 | mfu: 50.93 | epoch: 2 | total time: 114.14m | eta: 65.4m +step 10625/16704 (63.61%) | loss: 2.624731 | lrm: 0.73 | dt: 646.30ms | tok/sec: 811,215 | mfu: 50.70 | epoch: 2 | total time: 114.15m | eta: 65.4m +step 10626/16704 (63.61%) | loss: 2.628930 | lrm: 0.73 | dt: 645.80ms | tok/sec: 811,845 | mfu: 50.74 | epoch: 2 | total time: 114.16m | eta: 65.4m +step 10627/16704 (63.62%) | loss: 2.633685 | lrm: 0.73 | dt: 642.94ms | tok/sec: 815,455 | mfu: 50.97 | epoch: 2 | total time: 114.17m | eta: 65.3m +step 10628/16704 (63.63%) | loss: 2.642132 | lrm: 0.73 | dt: 645.19ms | tok/sec: 812,612 | mfu: 50.79 | epoch: 2 | total time: 114.18m | eta: 65.3m +step 10629/16704 (63.63%) | loss: 2.641474 | lrm: 0.73 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 2 | total time: 114.19m | eta: 65.3m +step 10630/16704 (63.64%) | loss: 2.640547 | lrm: 0.73 | dt: 644.39ms | tok/sec: 813,615 | mfu: 50.85 | epoch: 2 | total time: 114.20m | eta: 65.3m +step 10631/16704 (63.64%) | loss: 2.646444 | lrm: 0.73 | dt: 646.97ms | tok/sec: 810,379 | mfu: 50.65 | epoch: 2 | total time: 114.21m | eta: 65.3m +step 10632/16704 (63.65%) | loss: 2.646177 | lrm: 0.73 | dt: 
645.94ms | tok/sec: 811,662 | mfu: 50.73 | epoch: 2 | total time: 114.22m | eta: 65.3m +step 10633/16704 (63.66%) | loss: 2.645365 | lrm: 0.73 | dt: 644.09ms | tok/sec: 814,003 | mfu: 50.88 | epoch: 2 | total time: 114.23m | eta: 65.3m +step 10634/16704 (63.66%) | loss: 2.646991 | lrm: 0.73 | dt: 645.15ms | tok/sec: 812,661 | mfu: 50.79 | epoch: 2 | total time: 114.24m | eta: 65.3m +step 10635/16704 (63.67%) | loss: 2.642280 | lrm: 0.73 | dt: 643.26ms | tok/sec: 815,048 | mfu: 50.94 | epoch: 2 | total time: 114.25m | eta: 65.3m +step 10636/16704 (63.67%) | loss: 2.639106 | lrm: 0.73 | dt: 645.49ms | tok/sec: 812,232 | mfu: 50.77 | epoch: 2 | total time: 114.26m | eta: 65.3m +step 10637/16704 (63.68%) | loss: 2.649475 | lrm: 0.73 | dt: 645.48ms | tok/sec: 812,239 | mfu: 50.77 | epoch: 2 | total time: 114.28m | eta: 65.2m +step 10638/16704 (63.69%) | loss: 2.645400 | lrm: 0.73 | dt: 645.67ms | tok/sec: 812,007 | mfu: 50.75 | epoch: 2 | total time: 114.29m | eta: 65.2m +step 10639/16704 (63.69%) | loss: 2.631414 | lrm: 0.73 | dt: 644.33ms | tok/sec: 813,689 | mfu: 50.86 | epoch: 2 | total time: 114.30m | eta: 65.2m +step 10640/16704 (63.70%) | loss: 2.626911 | lrm: 0.73 | dt: 646.61ms | tok/sec: 810,820 | mfu: 50.68 | epoch: 2 | total time: 114.31m | eta: 65.2m +step 10641/16704 (63.70%) | loss: 2.634909 | lrm: 0.73 | dt: 643.08ms | tok/sec: 815,279 | mfu: 50.96 | epoch: 2 | total time: 114.32m | eta: 65.2m +step 10642/16704 (63.71%) | loss: 2.627185 | lrm: 0.73 | dt: 646.56ms | tok/sec: 810,885 | mfu: 50.68 | epoch: 2 | total time: 114.33m | eta: 65.2m +step 10643/16704 (63.72%) | loss: 2.629969 | lrm: 0.73 | dt: 645.14ms | tok/sec: 812,679 | mfu: 50.79 | epoch: 2 | total time: 114.34m | eta: 65.2m +step 10644/16704 (63.72%) | loss: 2.624832 | lrm: 0.73 | dt: 644.54ms | tok/sec: 813,424 | mfu: 50.84 | epoch: 2 | total time: 114.35m | eta: 65.2m +step 10645/16704 (63.73%) | loss: 2.623098 | lrm: 0.73 | dt: 644.60ms | tok/sec: 813,354 | mfu: 50.84 | epoch: 2 | total 
time: 114.36m | eta: 65.2m +step 10646/16704 (63.73%) | loss: 2.614741 | lrm: 0.73 | dt: 644.41ms | tok/sec: 813,591 | mfu: 50.85 | epoch: 2 | total time: 114.37m | eta: 65.1m +step 10647/16704 (63.74%) | loss: 2.604594 | lrm: 0.73 | dt: 645.43ms | tok/sec: 812,306 | mfu: 50.77 | epoch: 2 | total time: 114.38m | eta: 65.1m +step 10648/16704 (63.75%) | loss: 2.601912 | lrm: 0.73 | dt: 644.06ms | tok/sec: 814,035 | mfu: 50.88 | epoch: 2 | total time: 114.39m | eta: 65.1m +step 10649/16704 (63.75%) | loss: 2.598760 | lrm: 0.72 | dt: 645.50ms | tok/sec: 812,224 | mfu: 50.77 | epoch: 2 | total time: 114.40m | eta: 65.1m +step 10650/16704 (63.76%) | loss: 2.601387 | lrm: 0.72 | dt: 646.05ms | tok/sec: 811,522 | mfu: 50.72 | epoch: 2 | total time: 114.42m | eta: 65.1m +step 10651/16704 (63.76%) | loss: 2.598853 | lrm: 0.72 | dt: 645.41ms | tok/sec: 812,327 | mfu: 50.77 | epoch: 2 | total time: 114.43m | eta: 65.1m +step 10652/16704 (63.77%) | loss: 2.606665 | lrm: 0.72 | dt: 644.43ms | tok/sec: 813,567 | mfu: 50.85 | epoch: 2 | total time: 114.44m | eta: 65.1m +step 10653/16704 (63.78%) | loss: 2.611832 | lrm: 0.72 | dt: 646.25ms | tok/sec: 811,275 | mfu: 50.71 | epoch: 2 | total time: 114.45m | eta: 65.1m +step 10654/16704 (63.78%) | loss: 2.615175 | lrm: 0.72 | dt: 646.54ms | tok/sec: 810,916 | mfu: 50.68 | epoch: 2 | total time: 114.46m | eta: 65.1m +step 10655/16704 (63.79%) | loss: 2.610136 | lrm: 0.72 | dt: 646.26ms | tok/sec: 811,265 | mfu: 50.71 | epoch: 2 | total time: 114.47m | eta: 65.0m +step 10656/16704 (63.79%) | loss: 2.610374 | lrm: 0.72 | dt: 646.59ms | tok/sec: 810,846 | mfu: 50.68 | epoch: 2 | total time: 114.48m | eta: 65.0m +step 10657/16704 (63.80%) | loss: 2.606659 | lrm: 0.72 | dt: 647.79ms | tok/sec: 809,350 | mfu: 50.59 | epoch: 2 | total time: 114.49m | eta: 65.0m +step 10658/16704 (63.81%) | loss: 2.610771 | lrm: 0.72 | dt: 643.26ms | tok/sec: 815,042 | mfu: 50.94 | epoch: 2 | total time: 114.50m | eta: 65.0m +step 10659/16704 (63.81%) | loss: 
2.602668 | lrm: 0.72 | dt: 647.20ms | tok/sec: 810,088 | mfu: 50.63 | epoch: 2 | total time: 114.51m | eta: 65.0m +step 10660/16704 (63.82%) | loss: 2.581067 | lrm: 0.72 | dt: 643.30ms | tok/sec: 815,003 | mfu: 50.94 | epoch: 2 | total time: 114.52m | eta: 65.0m +step 10661/16704 (63.82%) | loss: 2.584483 | lrm: 0.72 | dt: 647.43ms | tok/sec: 809,795 | mfu: 50.61 | epoch: 2 | total time: 114.53m | eta: 65.0m +step 10662/16704 (63.83%) | loss: 2.591737 | lrm: 0.72 | dt: 643.38ms | tok/sec: 814,901 | mfu: 50.93 | epoch: 2 | total time: 114.54m | eta: 65.0m +step 10663/16704 (63.84%) | loss: 2.593078 | lrm: 0.72 | dt: 645.66ms | tok/sec: 812,017 | mfu: 50.75 | epoch: 2 | total time: 114.56m | eta: 65.0m +step 10664/16704 (63.84%) | loss: 2.591364 | lrm: 0.72 | dt: 644.93ms | tok/sec: 812,941 | mfu: 50.81 | epoch: 2 | total time: 114.57m | eta: 65.0m +step 10665/16704 (63.85%) | loss: 2.582339 | lrm: 0.72 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 2 | total time: 114.58m | eta: 64.9m +step 10666/16704 (63.85%) | loss: 2.588333 | lrm: 0.72 | dt: 645.97ms | tok/sec: 811,632 | mfu: 50.73 | epoch: 2 | total time: 114.59m | eta: 64.9m +step 10667/16704 (63.86%) | loss: 2.598077 | lrm: 0.72 | dt: 643.27ms | tok/sec: 815,039 | mfu: 50.94 | epoch: 2 | total time: 114.60m | eta: 64.9m +step 10668/16704 (63.86%) | loss: 2.594208 | lrm: 0.72 | dt: 646.84ms | tok/sec: 810,531 | mfu: 50.66 | epoch: 2 | total time: 114.61m | eta: 64.9m +step 10669/16704 (63.87%) | loss: 2.585599 | lrm: 0.72 | dt: 644.92ms | tok/sec: 812,954 | mfu: 50.81 | epoch: 2 | total time: 114.62m | eta: 64.9m +step 10670/16704 (63.88%) | loss: 2.584342 | lrm: 0.72 | dt: 644.33ms | tok/sec: 813,689 | mfu: 50.86 | epoch: 2 | total time: 114.63m | eta: 64.9m +step 10671/16704 (63.88%) | loss: 2.589269 | lrm: 0.72 | dt: 646.13ms | tok/sec: 811,426 | mfu: 50.72 | epoch: 2 | total time: 114.64m | eta: 64.9m +step 10672/16704 (63.89%) | loss: 2.579810 | lrm: 0.72 | dt: 645.97ms | tok/sec: 811,633 | mfu: 
50.73 | epoch: 2 | total time: 114.65m | eta: 64.9m +step 10673/16704 (63.89%) | loss: 2.581619 | lrm: 0.72 | dt: 646.23ms | tok/sec: 811,297 | mfu: 50.71 | epoch: 2 | total time: 114.66m | eta: 64.9m +step 10674/16704 (63.90%) | loss: 2.563918 | lrm: 0.72 | dt: 646.55ms | tok/sec: 810,898 | mfu: 50.68 | epoch: 2 | total time: 114.67m | eta: 64.8m +step 10675/16704 (63.91%) | loss: 2.563093 | lrm: 0.72 | dt: 643.88ms | tok/sec: 814,264 | mfu: 50.89 | epoch: 2 | total time: 114.68m | eta: 64.8m +step 10676/16704 (63.91%) | loss: 2.573828 | lrm: 0.72 | dt: 646.79ms | tok/sec: 810,601 | mfu: 50.66 | epoch: 2 | total time: 114.69m | eta: 64.8m +step 10677/16704 (63.92%) | loss: 2.581747 | lrm: 0.72 | dt: 645.51ms | tok/sec: 812,206 | mfu: 50.76 | epoch: 2 | total time: 114.71m | eta: 64.8m +step 10678/16704 (63.92%) | loss: 2.580905 | lrm: 0.72 | dt: 648.02ms | tok/sec: 809,058 | mfu: 50.57 | epoch: 2 | total time: 114.72m | eta: 64.8m +step 10679/16704 (63.93%) | loss: 2.586489 | lrm: 0.72 | dt: 645.81ms | tok/sec: 811,828 | mfu: 50.74 | epoch: 2 | total time: 114.73m | eta: 64.8m +step 10680/16704 (63.94%) | loss: 2.594268 | lrm: 0.72 | dt: 645.63ms | tok/sec: 812,052 | mfu: 50.75 | epoch: 2 | total time: 114.74m | eta: 64.8m +step 10681/16704 (63.94%) | loss: 2.586590 | lrm: 0.72 | dt: 649.37ms | tok/sec: 807,376 | mfu: 50.46 | epoch: 2 | total time: 114.75m | eta: 64.8m +step 10682/16704 (63.95%) | loss: 2.592730 | lrm: 0.72 | dt: 645.96ms | tok/sec: 811,646 | mfu: 50.73 | epoch: 2 | total time: 114.76m | eta: 64.8m +step 10683/16704 (63.95%) | loss: 2.602771 | lrm: 0.72 | dt: 646.39ms | tok/sec: 811,102 | mfu: 50.70 | epoch: 2 | total time: 114.77m | eta: 64.7m +step 10684/16704 (63.96%) | loss: 2.595167 | lrm: 0.72 | dt: 648.32ms | tok/sec: 808,691 | mfu: 50.54 | epoch: 2 | total time: 114.78m | eta: 64.7m +step 10685/16704 (63.97%) | loss: 2.610170 | lrm: 0.72 | dt: 646.65ms | tok/sec: 810,774 | mfu: 50.67 | epoch: 2 | total time: 114.79m | eta: 64.7m +step 
10686/16704 (63.97%) | loss: 2.604493 | lrm: 0.72 | dt: 644.97ms | tok/sec: 812,882 | mfu: 50.81 | epoch: 2 | total time: 114.80m | eta: 64.7m +step 10687/16704 (63.98%) | loss: 2.611511 | lrm: 0.72 | dt: 644.92ms | tok/sec: 812,955 | mfu: 50.81 | epoch: 2 | total time: 114.81m | eta: 64.7m +step 10688/16704 (63.98%) | loss: 2.621977 | lrm: 0.72 | dt: 644.00ms | tok/sec: 814,109 | mfu: 50.88 | epoch: 2 | total time: 114.82m | eta: 64.7m +step 10689/16704 (63.99%) | loss: 2.616461 | lrm: 0.72 | dt: 644.29ms | tok/sec: 813,740 | mfu: 50.86 | epoch: 2 | total time: 114.83m | eta: 64.7m +step 10690/16704 (64.00%) | loss: 2.612287 | lrm: 0.72 | dt: 645.55ms | tok/sec: 812,158 | mfu: 50.76 | epoch: 2 | total time: 114.85m | eta: 64.7m +step 10691/16704 (64.00%) | loss: 2.621482 | lrm: 0.72 | dt: 646.98ms | tok/sec: 810,367 | mfu: 50.65 | epoch: 2 | total time: 114.86m | eta: 64.7m +step 10692/16704 (64.01%) | loss: 2.618097 | lrm: 0.72 | dt: 647.70ms | tok/sec: 809,458 | mfu: 50.59 | epoch: 2 | total time: 114.87m | eta: 64.6m +step 10693/16704 (64.01%) | loss: 2.617053 | lrm: 0.72 | dt: 644.96ms | tok/sec: 812,901 | mfu: 50.81 | epoch: 2 | total time: 114.88m | eta: 64.6m +step 10694/16704 (64.02%) | loss: 2.604826 | lrm: 0.72 | dt: 646.33ms | tok/sec: 811,174 | mfu: 50.70 | epoch: 2 | total time: 114.89m | eta: 64.6m +step 10695/16704 (64.03%) | loss: 2.623129 | lrm: 0.72 | dt: 646.28ms | tok/sec: 811,239 | mfu: 50.70 | epoch: 2 | total time: 114.90m | eta: 64.6m +step 10696/16704 (64.03%) | loss: 2.622541 | lrm: 0.72 | dt: 646.72ms | tok/sec: 810,684 | mfu: 50.67 | epoch: 2 | total time: 114.91m | eta: 64.6m +step 10697/16704 (64.04%) | loss: 2.617890 | lrm: 0.72 | dt: 646.98ms | tok/sec: 810,368 | mfu: 50.65 | epoch: 2 | total time: 114.92m | eta: 64.6m +step 10698/16704 (64.04%) | loss: 2.609127 | lrm: 0.72 | dt: 645.48ms | tok/sec: 812,240 | mfu: 50.77 | epoch: 2 | total time: 114.93m | eta: 64.6m +step 10699/16704 (64.05%) | loss: 2.604065 | lrm: 0.72 | dt: 
644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 2 | total time: 114.94m | eta: 64.6m +step 10700/16704 (64.06%) | loss: 2.607219 | lrm: 0.72 | dt: 645.98ms | tok/sec: 811,611 | mfu: 50.73 | epoch: 2 | total time: 114.95m | eta: 64.6m +step 10701/16704 (64.06%) | loss: 2.605859 | lrm: 0.72 | dt: 646.43ms | tok/sec: 811,049 | mfu: 50.69 | epoch: 2 | total time: 114.96m | eta: 64.6m +step 10702/16704 (64.07%) | loss: 2.620171 | lrm: 0.72 | dt: 645.50ms | tok/sec: 812,223 | mfu: 50.77 | epoch: 2 | total time: 114.97m | eta: 64.5m +step 10703/16704 (64.07%) | loss: 2.618782 | lrm: 0.72 | dt: 644.88ms | tok/sec: 813,000 | mfu: 50.81 | epoch: 2 | total time: 114.99m | eta: 64.5m +step 10704/16704 (64.08%) | loss: 2.618111 | lrm: 0.72 | dt: 646.26ms | tok/sec: 811,261 | mfu: 50.70 | epoch: 2 | total time: 115.00m | eta: 64.5m +step 10705/16704 (64.09%) | loss: 2.622455 | lrm: 0.72 | dt: 645.41ms | tok/sec: 812,337 | mfu: 50.77 | epoch: 2 | total time: 115.01m | eta: 64.5m +step 10706/16704 (64.09%) | loss: 2.623727 | lrm: 0.72 | dt: 645.74ms | tok/sec: 811,924 | mfu: 50.75 | epoch: 2 | total time: 115.02m | eta: 64.5m +step 10707/16704 (64.10%) | loss: 2.619616 | lrm: 0.72 | dt: 647.15ms | tok/sec: 810,148 | mfu: 50.64 | epoch: 2 | total time: 115.03m | eta: 64.5m +step 10708/16704 (64.10%) | loss: 2.623102 | lrm: 0.72 | dt: 647.49ms | tok/sec: 809,726 | mfu: 50.61 | epoch: 2 | total time: 115.04m | eta: 64.5m +step 10709/16704 (64.11%) | loss: 2.606694 | lrm: 0.72 | dt: 646.80ms | tok/sec: 810,588 | mfu: 50.66 | epoch: 2 | total time: 115.05m | eta: 64.5m +step 10710/16704 (64.12%) | loss: 2.618409 | lrm: 0.72 | dt: 646.78ms | tok/sec: 810,606 | mfu: 50.66 | epoch: 2 | total time: 115.06m | eta: 64.5m +step 10711/16704 (64.12%) | loss: 2.616765 | lrm: 0.72 | dt: 644.75ms | tok/sec: 813,166 | mfu: 50.82 | epoch: 2 | total time: 115.07m | eta: 64.4m +step 10712/16704 (64.13%) | loss: 2.616752 | lrm: 0.72 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total 
time: 115.08m | eta: 64.4m +step 10713/16704 (64.13%) | loss: 2.609720 | lrm: 0.72 | dt: 647.10ms | tok/sec: 810,206 | mfu: 50.64 | epoch: 2 | total time: 115.09m | eta: 64.4m +step 10714/16704 (64.14%) | loss: 2.596079 | lrm: 0.72 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 115.10m | eta: 64.4m +step 10715/16704 (64.15%) | loss: 2.611806 | lrm: 0.72 | dt: 643.70ms | tok/sec: 814,494 | mfu: 50.91 | epoch: 2 | total time: 115.11m | eta: 64.4m +step 10716/16704 (64.15%) | loss: 2.616273 | lrm: 0.72 | dt: 646.30ms | tok/sec: 811,214 | mfu: 50.70 | epoch: 2 | total time: 115.13m | eta: 64.4m +step 10717/16704 (64.16%) | loss: 2.623236 | lrm: 0.72 | dt: 644.83ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 2 | total time: 115.14m | eta: 64.4m +step 10718/16704 (64.16%) | loss: 2.633634 | lrm: 0.72 | dt: 648.56ms | tok/sec: 808,383 | mfu: 50.53 | epoch: 2 | total time: 115.15m | eta: 64.4m +step 10719/16704 (64.17%) | loss: 2.637380 | lrm: 0.72 | dt: 646.18ms | tok/sec: 811,365 | mfu: 50.71 | epoch: 2 | total time: 115.16m | eta: 64.4m +step 10720/16704 (64.18%) | loss: 2.648178 | lrm: 0.72 | dt: 645.53ms | tok/sec: 812,176 | mfu: 50.76 | epoch: 2 | total time: 115.17m | eta: 64.3m +step 10721/16704 (64.18%) | loss: 2.637857 | lrm: 0.72 | dt: 645.30ms | tok/sec: 812,467 | mfu: 50.78 | epoch: 2 | total time: 115.18m | eta: 64.3m +step 10722/16704 (64.19%) | loss: 2.638011 | lrm: 0.72 | dt: 645.80ms | tok/sec: 811,843 | mfu: 50.74 | epoch: 2 | total time: 115.19m | eta: 64.3m +step 10723/16704 (64.19%) | loss: 2.623089 | lrm: 0.72 | dt: 644.09ms | tok/sec: 814,002 | mfu: 50.88 | epoch: 2 | total time: 115.20m | eta: 64.3m +step 10724/16704 (64.20%) | loss: 2.622811 | lrm: 0.72 | dt: 646.00ms | tok/sec: 811,586 | mfu: 50.73 | epoch: 2 | total time: 115.21m | eta: 64.3m +step 10725/16704 (64.21%) | loss: 2.620972 | lrm: 0.72 | dt: 645.96ms | tok/sec: 811,646 | mfu: 50.73 | epoch: 2 | total time: 115.22m | eta: 64.3m +step 10726/16704 (64.21%) | loss: 
2.610205 | lrm: 0.72 | dt: 645.93ms | tok/sec: 811,681 | mfu: 50.73 | epoch: 2 | total time: 115.23m | eta: 64.3m +step 10727/16704 (64.22%) | loss: 2.613268 | lrm: 0.72 | dt: 646.78ms | tok/sec: 810,612 | mfu: 50.66 | epoch: 2 | total time: 115.24m | eta: 64.3m +step 10728/16704 (64.22%) | loss: 2.625522 | lrm: 0.72 | dt: 646.32ms | tok/sec: 811,188 | mfu: 50.70 | epoch: 2 | total time: 115.25m | eta: 64.3m +step 10729/16704 (64.23%) | loss: 2.610916 | lrm: 0.72 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 115.27m | eta: 64.3m +step 10730/16704 (64.24%) | loss: 2.613061 | lrm: 0.72 | dt: 646.32ms | tok/sec: 811,192 | mfu: 50.70 | epoch: 2 | total time: 115.28m | eta: 64.2m +step 10731/16704 (64.24%) | loss: 2.615854 | lrm: 0.72 | dt: 647.08ms | tok/sec: 810,235 | mfu: 50.64 | epoch: 2 | total time: 115.29m | eta: 64.2m +step 10732/16704 (64.25%) | loss: 2.622188 | lrm: 0.72 | dt: 645.10ms | tok/sec: 812,726 | mfu: 50.80 | epoch: 2 | total time: 115.30m | eta: 64.2m +step 10733/16704 (64.25%) | loss: 2.623871 | lrm: 0.71 | dt: 647.88ms | tok/sec: 809,231 | mfu: 50.58 | epoch: 2 | total time: 115.31m | eta: 64.2m +step 10734/16704 (64.26%) | loss: 2.620582 | lrm: 0.71 | dt: 644.73ms | tok/sec: 813,188 | mfu: 50.83 | epoch: 2 | total time: 115.32m | eta: 64.2m +step 10735/16704 (64.27%) | loss: 2.617241 | lrm: 0.71 | dt: 645.21ms | tok/sec: 812,580 | mfu: 50.79 | epoch: 2 | total time: 115.33m | eta: 64.2m +step 10736/16704 (64.27%) | loss: 2.617580 | lrm: 0.71 | dt: 645.91ms | tok/sec: 811,700 | mfu: 50.73 | epoch: 2 | total time: 115.34m | eta: 64.2m +step 10737/16704 (64.28%) | loss: 2.618577 | lrm: 0.71 | dt: 646.20ms | tok/sec: 811,338 | mfu: 50.71 | epoch: 2 | total time: 115.35m | eta: 64.2m +step 10738/16704 (64.28%) | loss: 2.619047 | lrm: 0.71 | dt: 646.00ms | tok/sec: 811,589 | mfu: 50.73 | epoch: 2 | total time: 115.36m | eta: 64.2m +step 10739/16704 (64.29%) | loss: 2.616334 | lrm: 0.71 | dt: 646.60ms | tok/sec: 810,833 | mfu: 
50.68 | epoch: 2 | total time: 115.37m | eta: 64.1m +step 10740/16704 (64.30%) | loss: 2.617781 | lrm: 0.71 | dt: 646.85ms | tok/sec: 810,529 | mfu: 50.66 | epoch: 2 | total time: 115.38m | eta: 64.1m +step 10741/16704 (64.30%) | loss: 2.620489 | lrm: 0.71 | dt: 646.09ms | tok/sec: 811,481 | mfu: 50.72 | epoch: 2 | total time: 115.39m | eta: 64.1m +step 10742/16704 (64.31%) | loss: 2.609813 | lrm: 0.71 | dt: 644.21ms | tok/sec: 813,842 | mfu: 50.87 | epoch: 2 | total time: 115.41m | eta: 64.1m +step 10743/16704 (64.31%) | loss: 2.602815 | lrm: 0.71 | dt: 644.98ms | tok/sec: 812,876 | mfu: 50.81 | epoch: 2 | total time: 115.42m | eta: 64.1m +step 10744/16704 (64.32%) | loss: 2.603130 | lrm: 0.71 | dt: 645.70ms | tok/sec: 811,967 | mfu: 50.75 | epoch: 2 | total time: 115.43m | eta: 64.1m +step 10745/16704 (64.33%) | loss: 2.589186 | lrm: 0.71 | dt: 645.48ms | tok/sec: 812,242 | mfu: 50.77 | epoch: 2 | total time: 115.44m | eta: 64.1m +step 10746/16704 (64.33%) | loss: 2.576661 | lrm: 0.71 | dt: 645.30ms | tok/sec: 812,471 | mfu: 50.78 | epoch: 2 | total time: 115.45m | eta: 64.1m +step 10747/16704 (64.34%) | loss: 2.576281 | lrm: 0.71 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 2 | total time: 115.46m | eta: 64.1m +step 10748/16704 (64.34%) | loss: 2.576658 | lrm: 0.71 | dt: 643.63ms | tok/sec: 814,585 | mfu: 50.91 | epoch: 2 | total time: 115.47m | eta: 64.0m +step 10749/16704 (64.35%) | loss: 2.568254 | lrm: 0.71 | dt: 643.79ms | tok/sec: 814,379 | mfu: 50.90 | epoch: 2 | total time: 115.48m | eta: 64.0m +Step 10750 | Validation bpb: 0.797846 +step 10750/16704 (64.36%) | loss: 2.582632 | lrm: 0.71 | dt: 650.34ms | tok/sec: 806,178 | mfu: 50.39 | epoch: 2 | total time: 115.49m | eta: 64.0m +step 10751/16704 (64.36%) | loss: 2.577208 | lrm: 0.71 | dt: 643.27ms | tok/sec: 815,039 | mfu: 50.94 | epoch: 2 | total time: 115.50m | eta: 64.0m +step 10752/16704 (64.37%) | loss: 2.589616 | lrm: 0.71 | dt: 646.64ms | tok/sec: 810,789 | mfu: 50.68 | epoch: 2 | 
total time: 115.51m | eta: 64.0m +step 10753/16704 (64.37%) | loss: 2.591892 | lrm: 0.71 | dt: 643.14ms | tok/sec: 815,203 | mfu: 50.95 | epoch: 2 | total time: 115.52m | eta: 64.0m +step 10754/16704 (64.38%) | loss: 2.600950 | lrm: 0.71 | dt: 645.15ms | tok/sec: 812,655 | mfu: 50.79 | epoch: 2 | total time: 115.53m | eta: 64.0m +step 10755/16704 (64.39%) | loss: 2.595233 | lrm: 0.71 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 2 | total time: 115.55m | eta: 64.0m +step 10756/16704 (64.39%) | loss: 2.593838 | lrm: 0.71 | dt: 643.93ms | tok/sec: 814,199 | mfu: 50.89 | epoch: 2 | total time: 115.56m | eta: 64.0m +step 10757/16704 (64.40%) | loss: 2.585764 | lrm: 0.71 | dt: 647.66ms | tok/sec: 809,514 | mfu: 50.60 | epoch: 2 | total time: 115.57m | eta: 64.0m +step 10758/16704 (64.40%) | loss: 2.582889 | lrm: 0.71 | dt: 645.32ms | tok/sec: 812,448 | mfu: 50.78 | epoch: 2 | total time: 115.58m | eta: 63.9m +step 10759/16704 (64.41%) | loss: 2.595323 | lrm: 0.71 | dt: 644.92ms | tok/sec: 812,949 | mfu: 50.81 | epoch: 2 | total time: 115.59m | eta: 63.9m +step 10760/16704 (64.42%) | loss: 2.591835 | lrm: 0.71 | dt: 644.62ms | tok/sec: 813,332 | mfu: 50.83 | epoch: 2 | total time: 115.60m | eta: 63.9m +step 10761/16704 (64.42%) | loss: 2.584248 | lrm: 0.71 | dt: 647.59ms | tok/sec: 809,594 | mfu: 50.60 | epoch: 2 | total time: 115.61m | eta: 63.9m +step 10762/16704 (64.43%) | loss: 2.589850 | lrm: 0.71 | dt: 644.91ms | tok/sec: 812,964 | mfu: 50.81 | epoch: 2 | total time: 115.62m | eta: 63.9m +step 10763/16704 (64.43%) | loss: 2.589849 | lrm: 0.71 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 2 | total time: 115.63m | eta: 63.9m +step 10764/16704 (64.44%) | loss: 2.595116 | lrm: 0.71 | dt: 644.29ms | tok/sec: 813,744 | mfu: 50.86 | epoch: 2 | total time: 115.64m | eta: 63.9m +step 10765/16704 (64.45%) | loss: 2.607213 | lrm: 0.71 | dt: 643.99ms | tok/sec: 814,120 | mfu: 50.88 | epoch: 2 | total time: 115.65m | eta: 63.9m +step 10766/16704 (64.45%) | 
loss: 2.607348 | lrm: 0.71 | dt: 645.15ms | tok/sec: 812,662 | mfu: 50.79 | epoch: 2 | total time: 115.66m | eta: 63.9m +step 10767/16704 (64.46%) | loss: 2.623600 | lrm: 0.71 | dt: 641.00ms | tok/sec: 817,923 | mfu: 51.12 | epoch: 2 | total time: 115.67m | eta: 63.8m +step 10768/16704 (64.46%) | loss: 2.624670 | lrm: 0.71 | dt: 645.89ms | tok/sec: 811,726 | mfu: 50.73 | epoch: 2 | total time: 115.69m | eta: 63.8m +step 10769/16704 (64.47%) | loss: 2.620362 | lrm: 0.71 | dt: 647.31ms | tok/sec: 809,948 | mfu: 50.62 | epoch: 2 | total time: 115.70m | eta: 63.8m +step 10770/16704 (64.48%) | loss: 2.616975 | lrm: 0.71 | dt: 643.94ms | tok/sec: 814,192 | mfu: 50.89 | epoch: 2 | total time: 115.71m | eta: 63.8m +step 10771/16704 (64.48%) | loss: 2.610569 | lrm: 0.71 | dt: 646.46ms | tok/sec: 811,014 | mfu: 50.69 | epoch: 2 | total time: 115.72m | eta: 63.8m +step 10772/16704 (64.49%) | loss: 2.616118 | lrm: 0.71 | dt: 646.49ms | tok/sec: 810,971 | mfu: 50.69 | epoch: 2 | total time: 115.73m | eta: 63.8m +step 10773/16704 (64.49%) | loss: 2.609929 | lrm: 0.71 | dt: 648.11ms | tok/sec: 808,949 | mfu: 50.56 | epoch: 2 | total time: 115.74m | eta: 63.8m +step 10774/16704 (64.50%) | loss: 2.607416 | lrm: 0.71 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 2 | total time: 115.75m | eta: 63.8m +step 10775/16704 (64.51%) | loss: 2.603654 | lrm: 0.71 | dt: 644.81ms | tok/sec: 813,083 | mfu: 50.82 | epoch: 2 | total time: 115.76m | eta: 63.8m +step 10776/16704 (64.51%) | loss: 2.602939 | lrm: 0.71 | dt: 648.00ms | tok/sec: 809,083 | mfu: 50.57 | epoch: 2 | total time: 115.77m | eta: 63.7m +step 10777/16704 (64.52%) | loss: 2.595452 | lrm: 0.71 | dt: 644.30ms | tok/sec: 813,737 | mfu: 50.86 | epoch: 2 | total time: 115.78m | eta: 63.7m +step 10778/16704 (64.52%) | loss: 2.612554 | lrm: 0.71 | dt: 643.04ms | tok/sec: 815,325 | mfu: 50.96 | epoch: 2 | total time: 115.79m | eta: 63.7m +step 10779/16704 (64.53%) | loss: 2.614668 | lrm: 0.71 | dt: 645.17ms | tok/sec: 812,638 | 
mfu: 50.79 | epoch: 2 | total time: 115.80m | eta: 63.7m +step 10780/16704 (64.54%) | loss: 2.629786 | lrm: 0.71 | dt: 643.62ms | tok/sec: 814,594 | mfu: 50.91 | epoch: 2 | total time: 115.81m | eta: 63.7m +step 10781/16704 (64.54%) | loss: 2.635180 | lrm: 0.71 | dt: 644.58ms | tok/sec: 813,374 | mfu: 50.84 | epoch: 2 | total time: 115.82m | eta: 63.7m +step 10782/16704 (64.55%) | loss: 2.635204 | lrm: 0.71 | dt: 645.01ms | tok/sec: 812,839 | mfu: 50.80 | epoch: 2 | total time: 115.84m | eta: 63.7m +step 10783/16704 (64.55%) | loss: 2.643369 | lrm: 0.71 | dt: 646.23ms | tok/sec: 811,306 | mfu: 50.71 | epoch: 2 | total time: 115.85m | eta: 63.7m +step 10784/16704 (64.56%) | loss: 2.629566 | lrm: 0.71 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 2 | total time: 115.86m | eta: 63.7m +step 10785/16704 (64.57%) | loss: 2.638545 | lrm: 0.71 | dt: 647.09ms | tok/sec: 810,218 | mfu: 50.64 | epoch: 2 | total time: 115.87m | eta: 63.6m +step 10786/16704 (64.57%) | loss: 2.647436 | lrm: 0.71 | dt: 644.43ms | tok/sec: 813,565 | mfu: 50.85 | epoch: 2 | total time: 115.88m | eta: 63.6m +step 10787/16704 (64.58%) | loss: 2.647367 | lrm: 0.71 | dt: 647.86ms | tok/sec: 809,258 | mfu: 50.58 | epoch: 2 | total time: 115.89m | eta: 63.6m +step 10788/16704 (64.58%) | loss: 2.650336 | lrm: 0.71 | dt: 645.05ms | tok/sec: 812,786 | mfu: 50.80 | epoch: 2 | total time: 115.90m | eta: 63.6m +step 10789/16704 (64.59%) | loss: 2.644487 | lrm: 0.71 | dt: 646.03ms | tok/sec: 811,555 | mfu: 50.72 | epoch: 2 | total time: 115.91m | eta: 63.6m +step 10790/16704 (64.60%) | loss: 2.652653 | lrm: 0.71 | dt: 646.15ms | tok/sec: 811,397 | mfu: 50.71 | epoch: 2 | total time: 115.92m | eta: 63.6m +step 10791/16704 (64.60%) | loss: 2.654803 | lrm: 0.71 | dt: 645.97ms | tok/sec: 811,627 | mfu: 50.73 | epoch: 2 | total time: 115.93m | eta: 63.6m +step 10792/16704 (64.61%) | loss: 2.656380 | lrm: 0.71 | dt: 643.55ms | tok/sec: 814,680 | mfu: 50.92 | epoch: 2 | total time: 115.94m | eta: 63.6m +step 
10793/16704 (64.61%) | loss: 2.651760 | lrm: 0.71 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 2 | total time: 115.95m | eta: 63.6m +step 10794/16704 (64.62%) | loss: 2.650831 | lrm: 0.71 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 2 | total time: 115.96m | eta: 63.6m +step 10795/16704 (64.63%) | loss: 2.643096 | lrm: 0.71 | dt: 645.59ms | tok/sec: 812,104 | mfu: 50.76 | epoch: 2 | total time: 115.98m | eta: 63.5m +step 10796/16704 (64.63%) | loss: 2.636173 | lrm: 0.71 | dt: 644.48ms | tok/sec: 813,499 | mfu: 50.84 | epoch: 2 | total time: 115.99m | eta: 63.5m +step 10797/16704 (64.64%) | loss: 2.634954 | lrm: 0.71 | dt: 644.70ms | tok/sec: 813,222 | mfu: 50.83 | epoch: 2 | total time: 116.00m | eta: 63.5m +step 10798/16704 (64.64%) | loss: 2.633894 | lrm: 0.71 | dt: 645.90ms | tok/sec: 811,720 | mfu: 50.73 | epoch: 2 | total time: 116.01m | eta: 63.5m +step 10799/16704 (64.65%) | loss: 2.631162 | lrm: 0.71 | dt: 646.76ms | tok/sec: 810,637 | mfu: 50.67 | epoch: 2 | total time: 116.02m | eta: 63.5m +step 10800/16704 (64.66%) | loss: 2.647006 | lrm: 0.71 | dt: 644.92ms | tok/sec: 812,946 | mfu: 50.81 | epoch: 2 | total time: 116.03m | eta: 63.5m +step 10801/16704 (64.66%) | loss: 2.636618 | lrm: 0.71 | dt: 644.30ms | tok/sec: 813,733 | mfu: 50.86 | epoch: 2 | total time: 116.04m | eta: 63.5m +step 10802/16704 (64.67%) | loss: 2.632929 | lrm: 0.71 | dt: 648.23ms | tok/sec: 808,793 | mfu: 50.55 | epoch: 2 | total time: 116.05m | eta: 63.5m +step 10803/16704 (64.67%) | loss: 2.631707 | lrm: 0.71 | dt: 643.94ms | tok/sec: 814,192 | mfu: 50.89 | epoch: 2 | total time: 116.06m | eta: 63.5m +step 10804/16704 (64.68%) | loss: 2.619605 | lrm: 0.71 | dt: 646.47ms | tok/sec: 811,001 | mfu: 50.69 | epoch: 2 | total time: 116.07m | eta: 63.4m +step 10805/16704 (64.69%) | loss: 2.612843 | lrm: 0.71 | dt: 646.20ms | tok/sec: 811,345 | mfu: 50.71 | epoch: 2 | total time: 116.08m | eta: 63.4m +step 10806/16704 (64.69%) | loss: 2.608381 | lrm: 0.71 | dt: 
644.45ms | tok/sec: 813,543 | mfu: 50.85 | epoch: 2 | total time: 116.09m | eta: 63.4m +step 10807/16704 (64.70%) | loss: 2.601719 | lrm: 0.71 | dt: 644.05ms | tok/sec: 814,054 | mfu: 50.88 | epoch: 2 | total time: 116.10m | eta: 63.4m +step 10808/16704 (64.70%) | loss: 2.601646 | lrm: 0.71 | dt: 644.65ms | tok/sec: 813,291 | mfu: 50.83 | epoch: 2 | total time: 116.12m | eta: 63.4m +step 10809/16704 (64.71%) | loss: 2.604805 | lrm: 0.71 | dt: 644.83ms | tok/sec: 813,062 | mfu: 50.82 | epoch: 2 | total time: 116.13m | eta: 63.4m +step 10810/16704 (64.72%) | loss: 2.613734 | lrm: 0.71 | dt: 646.57ms | tok/sec: 810,875 | mfu: 50.68 | epoch: 2 | total time: 116.14m | eta: 63.4m +step 10811/16704 (64.72%) | loss: 2.618291 | lrm: 0.71 | dt: 644.93ms | tok/sec: 812,943 | mfu: 50.81 | epoch: 2 | total time: 116.15m | eta: 63.4m +step 10812/16704 (64.73%) | loss: 2.618257 | lrm: 0.71 | dt: 644.25ms | tok/sec: 813,790 | mfu: 50.86 | epoch: 2 | total time: 116.16m | eta: 63.4m +step 10813/16704 (64.73%) | loss: 2.624653 | lrm: 0.71 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 2 | total time: 116.17m | eta: 63.3m +step 10814/16704 (64.74%) | loss: 2.632524 | lrm: 0.71 | dt: 644.20ms | tok/sec: 813,861 | mfu: 50.87 | epoch: 2 | total time: 116.18m | eta: 63.3m +step 10815/16704 (64.74%) | loss: 2.627875 | lrm: 0.71 | dt: 644.61ms | tok/sec: 813,339 | mfu: 50.83 | epoch: 2 | total time: 116.19m | eta: 63.3m +step 10816/16704 (64.75%) | loss: 2.623109 | lrm: 0.70 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 2 | total time: 116.20m | eta: 63.3m +step 10817/16704 (64.76%) | loss: 2.636713 | lrm: 0.70 | dt: 645.34ms | tok/sec: 812,425 | mfu: 50.78 | epoch: 2 | total time: 116.21m | eta: 63.3m +step 10818/16704 (64.76%) | loss: 2.625683 | lrm: 0.70 | dt: 647.17ms | tok/sec: 810,129 | mfu: 50.63 | epoch: 2 | total time: 116.22m | eta: 63.3m +step 10819/16704 (64.77%) | loss: 2.612848 | lrm: 0.70 | dt: 646.35ms | tok/sec: 811,152 | mfu: 50.70 | epoch: 2 | total 
time: 116.23m | eta: 63.3m +step 10820/16704 (64.77%) | loss: 2.614566 | lrm: 0.70 | dt: 644.34ms | tok/sec: 813,680 | mfu: 50.86 | epoch: 2 | total time: 116.24m | eta: 63.3m +step 10821/16704 (64.78%) | loss: 2.615277 | lrm: 0.70 | dt: 645.51ms | tok/sec: 812,212 | mfu: 50.76 | epoch: 2 | total time: 116.26m | eta: 63.3m +step 10822/16704 (64.79%) | loss: 2.619803 | lrm: 0.70 | dt: 646.83ms | tok/sec: 810,551 | mfu: 50.66 | epoch: 2 | total time: 116.27m | eta: 63.3m +step 10823/16704 (64.79%) | loss: 2.609355 | lrm: 0.70 | dt: 644.39ms | tok/sec: 813,622 | mfu: 50.85 | epoch: 2 | total time: 116.28m | eta: 63.2m +step 10824/16704 (64.80%) | loss: 2.613779 | lrm: 0.70 | dt: 646.14ms | tok/sec: 811,409 | mfu: 50.71 | epoch: 2 | total time: 116.29m | eta: 63.2m +step 10825/16704 (64.80%) | loss: 2.619835 | lrm: 0.70 | dt: 645.48ms | tok/sec: 812,244 | mfu: 50.77 | epoch: 2 | total time: 116.30m | eta: 63.2m +step 10826/16704 (64.81%) | loss: 2.615928 | lrm: 0.70 | dt: 647.37ms | tok/sec: 809,868 | mfu: 50.62 | epoch: 2 | total time: 116.31m | eta: 63.2m +step 10827/16704 (64.82%) | loss: 2.611904 | lrm: 0.70 | dt: 643.73ms | tok/sec: 814,456 | mfu: 50.90 | epoch: 2 | total time: 116.32m | eta: 63.2m +step 10828/16704 (64.82%) | loss: 2.612461 | lrm: 0.70 | dt: 647.26ms | tok/sec: 810,017 | mfu: 50.63 | epoch: 2 | total time: 116.33m | eta: 63.2m +step 10829/16704 (64.83%) | loss: 2.618091 | lrm: 0.70 | dt: 645.26ms | tok/sec: 812,518 | mfu: 50.78 | epoch: 2 | total time: 116.34m | eta: 63.2m +step 10830/16704 (64.83%) | loss: 2.620341 | lrm: 0.70 | dt: 648.37ms | tok/sec: 808,621 | mfu: 50.54 | epoch: 2 | total time: 116.35m | eta: 63.2m +step 10831/16704 (64.84%) | loss: 2.626138 | lrm: 0.70 | dt: 644.59ms | tok/sec: 813,371 | mfu: 50.84 | epoch: 2 | total time: 116.36m | eta: 63.2m +step 10832/16704 (64.85%) | loss: 2.623025 | lrm: 0.70 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 2 | total time: 116.37m | eta: 63.1m +step 10833/16704 (64.85%) | loss: 
2.615212 | lrm: 0.70 | dt: 644.46ms | tok/sec: 813,531 | mfu: 50.85 | epoch: 2 | total time: 116.38m | eta: 63.1m +step 10834/16704 (64.86%) | loss: 2.609692 | lrm: 0.70 | dt: 643.18ms | tok/sec: 815,153 | mfu: 50.95 | epoch: 2 | total time: 116.40m | eta: 63.1m +step 10835/16704 (64.86%) | loss: 2.607227 | lrm: 0.70 | dt: 643.89ms | tok/sec: 814,253 | mfu: 50.89 | epoch: 2 | total time: 116.41m | eta: 63.1m +step 10836/16704 (64.87%) | loss: 2.604330 | lrm: 0.70 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 2 | total time: 116.42m | eta: 63.1m +step 10837/16704 (64.88%) | loss: 2.602665 | lrm: 0.70 | dt: 644.18ms | tok/sec: 813,887 | mfu: 50.87 | epoch: 2 | total time: 116.43m | eta: 63.1m +step 10838/16704 (64.88%) | loss: 2.585526 | lrm: 0.70 | dt: 646.53ms | tok/sec: 810,919 | mfu: 50.68 | epoch: 2 | total time: 116.44m | eta: 63.1m +step 10839/16704 (64.89%) | loss: 2.585062 | lrm: 0.70 | dt: 643.82ms | tok/sec: 814,334 | mfu: 50.90 | epoch: 2 | total time: 116.45m | eta: 63.1m +step 10840/16704 (64.89%) | loss: 2.587426 | lrm: 0.70 | dt: 644.10ms | tok/sec: 813,980 | mfu: 50.87 | epoch: 2 | total time: 116.46m | eta: 63.1m +step 10841/16704 (64.90%) | loss: 2.588175 | lrm: 0.70 | dt: 645.67ms | tok/sec: 812,011 | mfu: 50.75 | epoch: 2 | total time: 116.47m | eta: 63.0m +step 10842/16704 (64.91%) | loss: 2.583917 | lrm: 0.70 | dt: 646.13ms | tok/sec: 811,425 | mfu: 50.72 | epoch: 2 | total time: 116.48m | eta: 63.0m +step 10843/16704 (64.91%) | loss: 2.588701 | lrm: 0.70 | dt: 644.28ms | tok/sec: 813,759 | mfu: 50.86 | epoch: 2 | total time: 116.49m | eta: 63.0m +step 10844/16704 (64.92%) | loss: 2.582895 | lrm: 0.70 | dt: 644.70ms | tok/sec: 813,229 | mfu: 50.83 | epoch: 2 | total time: 116.50m | eta: 63.0m +step 10845/16704 (64.92%) | loss: 2.580716 | lrm: 0.70 | dt: 647.06ms | tok/sec: 810,263 | mfu: 50.64 | epoch: 2 | total time: 116.51m | eta: 63.0m +step 10846/16704 (64.93%) | loss: 2.600979 | lrm: 0.70 | dt: 646.82ms | tok/sec: 810,565 | mfu: 
50.66 | epoch: 2 | total time: 116.52m | eta: 63.0m +step 10847/16704 (64.94%) | loss: 2.596446 | lrm: 0.70 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 2 | total time: 116.53m | eta: 63.0m +step 10848/16704 (64.94%) | loss: 2.591482 | lrm: 0.70 | dt: 645.13ms | tok/sec: 812,680 | mfu: 50.79 | epoch: 2 | total time: 116.55m | eta: 63.0m +step 10849/16704 (64.95%) | loss: 2.593458 | lrm: 0.70 | dt: 647.56ms | tok/sec: 809,636 | mfu: 50.60 | epoch: 2 | total time: 116.56m | eta: 63.0m +step 10850/16704 (64.95%) | loss: 2.587716 | lrm: 0.70 | dt: 643.69ms | tok/sec: 814,509 | mfu: 50.91 | epoch: 2 | total time: 116.57m | eta: 63.0m +step 10851/16704 (64.96%) | loss: 2.579977 | lrm: 0.70 | dt: 647.23ms | tok/sec: 810,044 | mfu: 50.63 | epoch: 2 | total time: 116.58m | eta: 62.9m +step 10852/16704 (64.97%) | loss: 2.586586 | lrm: 0.70 | dt: 645.36ms | tok/sec: 812,400 | mfu: 50.78 | epoch: 2 | total time: 116.59m | eta: 62.9m +step 10853/16704 (64.97%) | loss: 2.599904 | lrm: 0.70 | dt: 645.16ms | tok/sec: 812,645 | mfu: 50.79 | epoch: 2 | total time: 116.60m | eta: 62.9m +step 10854/16704 (64.98%) | loss: 2.596830 | lrm: 0.70 | dt: 647.04ms | tok/sec: 810,288 | mfu: 50.64 | epoch: 2 | total time: 116.61m | eta: 62.9m +step 10855/16704 (64.98%) | loss: 2.600193 | lrm: 0.70 | dt: 646.85ms | tok/sec: 810,527 | mfu: 50.66 | epoch: 2 | total time: 116.62m | eta: 62.9m +step 10856/16704 (64.99%) | loss: 2.596504 | lrm: 0.70 | dt: 644.32ms | tok/sec: 813,707 | mfu: 50.86 | epoch: 2 | total time: 116.63m | eta: 62.9m +step 10857/16704 (65.00%) | loss: 2.600849 | lrm: 0.70 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 2 | total time: 116.64m | eta: 62.9m +step 10858/16704 (65.00%) | loss: 2.598431 | lrm: 0.70 | dt: 644.36ms | tok/sec: 813,654 | mfu: 50.85 | epoch: 2 | total time: 116.65m | eta: 62.9m +step 10859/16704 (65.01%) | loss: 2.605592 | lrm: 0.70 | dt: 645.23ms | tok/sec: 812,558 | mfu: 50.79 | epoch: 2 | total time: 116.66m | eta: 62.9m +step 
10860/16704 (65.01%) | loss: 2.608471 | lrm: 0.70 | dt: 645.80ms | tok/sec: 811,844 | mfu: 50.74 | epoch: 2 | total time: 116.67m | eta: 62.8m +step 10861/16704 (65.02%) | loss: 2.602970 | lrm: 0.70 | dt: 642.62ms | tok/sec: 815,854 | mfu: 50.99 | epoch: 2 | total time: 116.69m | eta: 62.8m +step 10862/16704 (65.03%) | loss: 2.589811 | lrm: 0.70 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 2 | total time: 116.70m | eta: 62.8m +step 10863/16704 (65.03%) | loss: 2.590746 | lrm: 0.70 | dt: 646.16ms | tok/sec: 811,395 | mfu: 50.71 | epoch: 2 | total time: 116.71m | eta: 62.8m +step 10864/16704 (65.04%) | loss: 2.589468 | lrm: 0.70 | dt: 647.76ms | tok/sec: 809,383 | mfu: 50.59 | epoch: 2 | total time: 116.72m | eta: 62.8m +step 10865/16704 (65.04%) | loss: 2.595198 | lrm: 0.70 | dt: 645.23ms | tok/sec: 812,562 | mfu: 50.79 | epoch: 2 | total time: 116.73m | eta: 62.8m +step 10866/16704 (65.05%) | loss: 2.608320 | lrm: 0.70 | dt: 645.44ms | tok/sec: 812,299 | mfu: 50.77 | epoch: 2 | total time: 116.74m | eta: 62.8m +step 10867/16704 (65.06%) | loss: 2.608023 | lrm: 0.70 | dt: 648.43ms | tok/sec: 808,553 | mfu: 50.54 | epoch: 2 | total time: 116.75m | eta: 62.8m +step 10868/16704 (65.06%) | loss: 2.617363 | lrm: 0.70 | dt: 645.11ms | tok/sec: 812,715 | mfu: 50.80 | epoch: 2 | total time: 116.76m | eta: 62.8m +step 10869/16704 (65.07%) | loss: 2.597472 | lrm: 0.70 | dt: 643.42ms | tok/sec: 814,846 | mfu: 50.93 | epoch: 2 | total time: 116.77m | eta: 62.7m +step 10870/16704 (65.07%) | loss: 2.600838 | lrm: 0.70 | dt: 646.91ms | tok/sec: 810,449 | mfu: 50.65 | epoch: 2 | total time: 116.78m | eta: 62.7m +step 10871/16704 (65.08%) | loss: 2.601024 | lrm: 0.70 | dt: 643.76ms | tok/sec: 814,417 | mfu: 50.90 | epoch: 2 | total time: 116.79m | eta: 62.7m +step 10872/16704 (65.09%) | loss: 2.585912 | lrm: 0.70 | dt: 646.29ms | tok/sec: 811,228 | mfu: 50.70 | epoch: 2 | total time: 116.80m | eta: 62.7m +step 10873/16704 (65.09%) | loss: 2.593215 | lrm: 0.70 | dt: 
644.27ms | tok/sec: 813,767 | mfu: 50.86 | epoch: 2 | total time: 116.81m | eta: 62.7m +step 10874/16704 (65.10%) | loss: 2.590591 | lrm: 0.70 | dt: 644.39ms | tok/sec: 813,613 | mfu: 50.85 | epoch: 2 | total time: 116.83m | eta: 62.7m +step 10875/16704 (65.10%) | loss: 2.582702 | lrm: 0.70 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 2 | total time: 116.84m | eta: 62.7m +step 10876/16704 (65.11%) | loss: 2.604561 | lrm: 0.70 | dt: 644.69ms | tok/sec: 813,237 | mfu: 50.83 | epoch: 2 | total time: 116.85m | eta: 62.7m +step 10877/16704 (65.12%) | loss: 2.591783 | lrm: 0.70 | dt: 646.83ms | tok/sec: 810,550 | mfu: 50.66 | epoch: 2 | total time: 116.86m | eta: 62.7m +step 10878/16704 (65.12%) | loss: 2.594616 | lrm: 0.70 | dt: 646.93ms | tok/sec: 810,429 | mfu: 50.65 | epoch: 2 | total time: 116.87m | eta: 62.6m +step 10879/16704 (65.13%) | loss: 2.600544 | lrm: 0.70 | dt: 644.69ms | tok/sec: 813,240 | mfu: 50.83 | epoch: 2 | total time: 116.88m | eta: 62.6m +step 10880/16704 (65.13%) | loss: 2.589722 | lrm: 0.70 | dt: 645.11ms | tok/sec: 812,715 | mfu: 50.80 | epoch: 2 | total time: 116.89m | eta: 62.6m +step 10881/16704 (65.14%) | loss: 2.591118 | lrm: 0.70 | dt: 645.91ms | tok/sec: 811,699 | mfu: 50.73 | epoch: 2 | total time: 116.90m | eta: 62.6m +step 10882/16704 (65.15%) | loss: 2.573563 | lrm: 0.70 | dt: 645.93ms | tok/sec: 811,679 | mfu: 50.73 | epoch: 2 | total time: 116.91m | eta: 62.6m +step 10883/16704 (65.15%) | loss: 2.583218 | lrm: 0.70 | dt: 645.19ms | tok/sec: 812,613 | mfu: 50.79 | epoch: 2 | total time: 116.92m | eta: 62.6m +step 10884/16704 (65.16%) | loss: 2.575560 | lrm: 0.70 | dt: 647.58ms | tok/sec: 809,611 | mfu: 50.60 | epoch: 2 | total time: 116.93m | eta: 62.6m +step 10885/16704 (65.16%) | loss: 2.592064 | lrm: 0.70 | dt: 645.41ms | tok/sec: 812,328 | mfu: 50.77 | epoch: 2 | total time: 116.94m | eta: 62.6m +step 10886/16704 (65.17%) | loss: 2.589816 | lrm: 0.70 | dt: 644.99ms | tok/sec: 812,864 | mfu: 50.81 | epoch: 2 | total 
time: 116.95m | eta: 62.6m +step 10887/16704 (65.18%) | loss: 2.585531 | lrm: 0.70 | dt: 646.41ms | tok/sec: 811,072 | mfu: 50.69 | epoch: 2 | total time: 116.97m | eta: 62.6m +step 10888/16704 (65.18%) | loss: 2.607244 | lrm: 0.70 | dt: 647.13ms | tok/sec: 810,175 | mfu: 50.64 | epoch: 2 | total time: 116.98m | eta: 62.5m +step 10889/16704 (65.19%) | loss: 2.606878 | lrm: 0.70 | dt: 644.36ms | tok/sec: 813,654 | mfu: 50.85 | epoch: 2 | total time: 116.99m | eta: 62.5m +step 10890/16704 (65.19%) | loss: 2.611637 | lrm: 0.70 | dt: 647.05ms | tok/sec: 810,279 | mfu: 50.64 | epoch: 2 | total time: 117.00m | eta: 62.5m +step 10891/16704 (65.20%) | loss: 2.606286 | lrm: 0.70 | dt: 646.86ms | tok/sec: 810,518 | mfu: 50.66 | epoch: 2 | total time: 117.01m | eta: 62.5m +step 10892/16704 (65.21%) | loss: 2.614935 | lrm: 0.70 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 2 | total time: 117.02m | eta: 62.5m +step 10893/16704 (65.21%) | loss: 2.608568 | lrm: 0.70 | dt: 644.70ms | tok/sec: 813,234 | mfu: 50.83 | epoch: 2 | total time: 117.03m | eta: 62.5m +step 10894/16704 (65.22%) | loss: 2.596120 | lrm: 0.70 | dt: 645.62ms | tok/sec: 812,067 | mfu: 50.76 | epoch: 2 | total time: 117.04m | eta: 62.5m +step 10895/16704 (65.22%) | loss: 2.592224 | lrm: 0.70 | dt: 646.74ms | tok/sec: 810,663 | mfu: 50.67 | epoch: 2 | total time: 117.05m | eta: 62.5m +step 10896/16704 (65.23%) | loss: 2.597036 | lrm: 0.70 | dt: 644.48ms | tok/sec: 813,500 | mfu: 50.84 | epoch: 2 | total time: 117.06m | eta: 62.5m +step 10897/16704 (65.24%) | loss: 2.590271 | lrm: 0.70 | dt: 645.72ms | tok/sec: 811,948 | mfu: 50.75 | epoch: 2 | total time: 117.07m | eta: 62.4m +step 10898/16704 (65.24%) | loss: 2.595919 | lrm: 0.70 | dt: 649.31ms | tok/sec: 807,456 | mfu: 50.47 | epoch: 2 | total time: 117.08m | eta: 62.4m +step 10899/16704 (65.25%) | loss: 2.596280 | lrm: 0.70 | dt: 647.09ms | tok/sec: 810,223 | mfu: 50.64 | epoch: 2 | total time: 117.09m | eta: 62.4m +step 10900/16704 (65.25%) | loss: 
2.593924 | lrm: 0.69 | dt: 644.62ms | tok/sec: 813,330 | mfu: 50.83 | epoch: 2 | total time: 117.11m | eta: 62.4m +step 10901/16704 (65.26%) | loss: 2.592638 | lrm: 0.69 | dt: 649.51ms | tok/sec: 807,206 | mfu: 50.45 | epoch: 2 | total time: 117.12m | eta: 62.4m +step 10902/16704 (65.27%) | loss: 2.583468 | lrm: 0.69 | dt: 645.36ms | tok/sec: 812,402 | mfu: 50.78 | epoch: 2 | total time: 117.13m | eta: 62.4m +step 10903/16704 (65.27%) | loss: 2.584334 | lrm: 0.69 | dt: 645.07ms | tok/sec: 812,760 | mfu: 50.80 | epoch: 2 | total time: 117.14m | eta: 62.4m +step 10904/16704 (65.28%) | loss: 2.581271 | lrm: 0.69 | dt: 645.15ms | tok/sec: 812,662 | mfu: 50.79 | epoch: 2 | total time: 117.15m | eta: 62.4m +step 10905/16704 (65.28%) | loss: 2.576916 | lrm: 0.69 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 2 | total time: 117.16m | eta: 62.4m +step 10906/16704 (65.29%) | loss: 2.577631 | lrm: 0.69 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 2 | total time: 117.17m | eta: 62.3m +step 10907/16704 (65.30%) | loss: 2.571974 | lrm: 0.69 | dt: 645.63ms | tok/sec: 812,052 | mfu: 50.75 | epoch: 2 | total time: 117.18m | eta: 62.3m +step 10908/16704 (65.30%) | loss: 2.565439 | lrm: 0.69 | dt: 645.22ms | tok/sec: 812,576 | mfu: 50.79 | epoch: 2 | total time: 117.19m | eta: 62.3m +step 10909/16704 (65.31%) | loss: 2.560895 | lrm: 0.69 | dt: 647.10ms | tok/sec: 810,212 | mfu: 50.64 | epoch: 2 | total time: 117.20m | eta: 62.3m +step 10910/16704 (65.31%) | loss: 2.566471 | lrm: 0.69 | dt: 646.90ms | tok/sec: 810,465 | mfu: 50.66 | epoch: 2 | total time: 117.21m | eta: 62.3m +step 10911/16704 (65.32%) | loss: 2.560525 | lrm: 0.69 | dt: 646.93ms | tok/sec: 810,422 | mfu: 50.65 | epoch: 2 | total time: 117.22m | eta: 62.3m +step 10912/16704 (65.33%) | loss: 2.571907 | lrm: 0.69 | dt: 646.46ms | tok/sec: 811,008 | mfu: 50.69 | epoch: 2 | total time: 117.23m | eta: 62.3m +step 10913/16704 (65.33%) | loss: 2.571191 | lrm: 0.69 | dt: 646.48ms | tok/sec: 810,987 | mfu: 
50.69 | epoch: 2 | total time: 117.25m | eta: 62.3m +step 10914/16704 (65.34%) | loss: 2.571241 | lrm: 0.69 | dt: 644.87ms | tok/sec: 813,019 | mfu: 50.81 | epoch: 2 | total time: 117.26m | eta: 62.3m +step 10915/16704 (65.34%) | loss: 2.570274 | lrm: 0.69 | dt: 645.54ms | tok/sec: 812,167 | mfu: 50.76 | epoch: 2 | total time: 117.27m | eta: 62.3m +step 10916/16704 (65.35%) | loss: 2.575033 | lrm: 0.69 | dt: 644.73ms | tok/sec: 813,189 | mfu: 50.83 | epoch: 2 | total time: 117.28m | eta: 62.2m +step 10917/16704 (65.36%) | loss: 2.578182 | lrm: 0.69 | dt: 644.64ms | tok/sec: 813,307 | mfu: 50.83 | epoch: 2 | total time: 117.29m | eta: 62.2m +step 10918/16704 (65.36%) | loss: 2.583435 | lrm: 0.69 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 2 | total time: 117.30m | eta: 62.2m +step 10919/16704 (65.37%) | loss: 2.584549 | lrm: 0.69 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 2 | total time: 117.31m | eta: 62.2m +step 10920/16704 (65.37%) | loss: 2.585414 | lrm: 0.69 | dt: 646.13ms | tok/sec: 811,428 | mfu: 50.72 | epoch: 2 | total time: 117.32m | eta: 62.2m +step 10921/16704 (65.38%) | loss: 2.592749 | lrm: 0.69 | dt: 646.17ms | tok/sec: 811,382 | mfu: 50.71 | epoch: 2 | total time: 117.33m | eta: 62.2m +step 10922/16704 (65.39%) | loss: 2.603082 | lrm: 0.69 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 2 | total time: 117.34m | eta: 62.2m +step 10923/16704 (65.39%) | loss: 2.601575 | lrm: 0.69 | dt: 645.47ms | tok/sec: 812,256 | mfu: 50.77 | epoch: 2 | total time: 117.35m | eta: 62.2m +step 10924/16704 (65.40%) | loss: 2.602962 | lrm: 0.69 | dt: 646.43ms | tok/sec: 811,057 | mfu: 50.69 | epoch: 2 | total time: 117.36m | eta: 62.2m +step 10925/16704 (65.40%) | loss: 2.597248 | lrm: 0.69 | dt: 646.68ms | tok/sec: 810,742 | mfu: 50.67 | epoch: 2 | total time: 117.37m | eta: 62.1m +step 10926/16704 (65.41%) | loss: 2.611648 | lrm: 0.69 | dt: 645.63ms | tok/sec: 812,057 | mfu: 50.75 | epoch: 2 | total time: 117.39m | eta: 62.1m +step 
10927/16704 (65.42%) | loss: 2.594201 | lrm: 0.69 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 2 | total time: 117.40m | eta: 62.1m +step 10928/16704 (65.42%) | loss: 2.588482 | lrm: 0.69 | dt: 646.20ms | tok/sec: 811,338 | mfu: 50.71 | epoch: 2 | total time: 117.41m | eta: 62.1m +step 10929/16704 (65.43%) | loss: 2.591039 | lrm: 0.69 | dt: 645.61ms | tok/sec: 812,085 | mfu: 50.76 | epoch: 2 | total time: 117.42m | eta: 62.1m +step 10930/16704 (65.43%) | loss: 2.583835 | lrm: 0.69 | dt: 647.05ms | tok/sec: 810,276 | mfu: 50.64 | epoch: 2 | total time: 117.43m | eta: 62.1m +step 10931/16704 (65.44%) | loss: 2.588664 | lrm: 0.69 | dt: 645.46ms | tok/sec: 812,275 | mfu: 50.77 | epoch: 2 | total time: 117.44m | eta: 62.1m +step 10932/16704 (65.45%) | loss: 2.596887 | lrm: 0.69 | dt: 643.42ms | tok/sec: 814,849 | mfu: 50.93 | epoch: 2 | total time: 117.45m | eta: 62.1m +step 10933/16704 (65.45%) | loss: 2.590323 | lrm: 0.69 | dt: 649.20ms | tok/sec: 807,585 | mfu: 50.48 | epoch: 2 | total time: 117.46m | eta: 62.1m +step 10934/16704 (65.46%) | loss: 2.593896 | lrm: 0.69 | dt: 643.41ms | tok/sec: 814,858 | mfu: 50.93 | epoch: 2 | total time: 117.47m | eta: 62.0m +step 10935/16704 (65.46%) | loss: 2.589648 | lrm: 0.69 | dt: 646.69ms | tok/sec: 810,726 | mfu: 50.67 | epoch: 2 | total time: 117.48m | eta: 62.0m +step 10936/16704 (65.47%) | loss: 2.596362 | lrm: 0.69 | dt: 645.24ms | tok/sec: 812,543 | mfu: 50.79 | epoch: 2 | total time: 117.49m | eta: 62.0m +step 10937/16704 (65.48%) | loss: 2.600729 | lrm: 0.69 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 2 | total time: 117.50m | eta: 62.0m +step 10938/16704 (65.48%) | loss: 2.596490 | lrm: 0.69 | dt: 650.29ms | tok/sec: 806,242 | mfu: 50.39 | epoch: 2 | total time: 117.51m | eta: 62.0m +step 10939/16704 (65.49%) | loss: 2.580755 | lrm: 0.69 | dt: 643.35ms | tok/sec: 814,938 | mfu: 50.93 | epoch: 2 | total time: 117.53m | eta: 62.0m +step 10940/16704 (65.49%) | loss: 2.581406 | lrm: 0.69 | dt: 
647.31ms | tok/sec: 809,947 | mfu: 50.62 | epoch: 2 | total time: 117.54m | eta: 62.0m +step 10941/16704 (65.50%) | loss: 2.572360 | lrm: 0.69 | dt: 645.01ms | tok/sec: 812,839 | mfu: 50.80 | epoch: 2 | total time: 117.55m | eta: 62.0m +step 10942/16704 (65.51%) | loss: 2.580228 | lrm: 0.69 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 2 | total time: 117.56m | eta: 62.0m +step 10943/16704 (65.51%) | loss: 2.587657 | lrm: 0.69 | dt: 647.22ms | tok/sec: 810,058 | mfu: 50.63 | epoch: 2 | total time: 117.57m | eta: 62.0m +step 10944/16704 (65.52%) | loss: 2.600667 | lrm: 0.69 | dt: 643.95ms | tok/sec: 814,178 | mfu: 50.89 | epoch: 2 | total time: 117.58m | eta: 61.9m +step 10945/16704 (65.52%) | loss: 2.590756 | lrm: 0.69 | dt: 646.24ms | tok/sec: 811,286 | mfu: 50.71 | epoch: 2 | total time: 117.59m | eta: 61.9m +step 10946/16704 (65.53%) | loss: 2.588093 | lrm: 0.69 | dt: 646.77ms | tok/sec: 810,630 | mfu: 50.67 | epoch: 2 | total time: 117.60m | eta: 61.9m +step 10947/16704 (65.54%) | loss: 2.596638 | lrm: 0.69 | dt: 646.39ms | tok/sec: 811,096 | mfu: 50.69 | epoch: 2 | total time: 117.61m | eta: 61.9m +step 10948/16704 (65.54%) | loss: 2.589394 | lrm: 0.69 | dt: 645.67ms | tok/sec: 812,002 | mfu: 50.75 | epoch: 2 | total time: 117.62m | eta: 61.9m +step 10949/16704 (65.55%) | loss: 2.587495 | lrm: 0.69 | dt: 643.32ms | tok/sec: 814,970 | mfu: 50.94 | epoch: 2 | total time: 117.63m | eta: 61.9m +step 10950/16704 (65.55%) | loss: 2.589118 | lrm: 0.69 | dt: 644.77ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 2 | total time: 117.64m | eta: 61.9m +step 10951/16704 (65.56%) | loss: 2.601357 | lrm: 0.69 | dt: 643.42ms | tok/sec: 814,840 | mfu: 50.93 | epoch: 2 | total time: 117.65m | eta: 61.9m +step 10952/16704 (65.57%) | loss: 2.597659 | lrm: 0.69 | dt: 647.47ms | tok/sec: 809,746 | mfu: 50.61 | epoch: 2 | total time: 117.66m | eta: 61.9m +step 10953/16704 (65.57%) | loss: 2.599696 | lrm: 0.69 | dt: 642.93ms | tok/sec: 815,460 | mfu: 50.97 | epoch: 2 | total 
time: 117.68m | eta: 61.8m +step 10954/16704 (65.58%) | loss: 2.593739 | lrm: 0.69 | dt: 647.32ms | tok/sec: 809,936 | mfu: 50.62 | epoch: 2 | total time: 117.69m | eta: 61.8m +step 10955/16704 (65.58%) | loss: 2.600655 | lrm: 0.69 | dt: 646.29ms | tok/sec: 811,230 | mfu: 50.70 | epoch: 2 | total time: 117.70m | eta: 61.8m +step 10956/16704 (65.59%) | loss: 2.598866 | lrm: 0.69 | dt: 644.23ms | tok/sec: 813,823 | mfu: 50.87 | epoch: 2 | total time: 117.71m | eta: 61.8m +step 10957/16704 (65.60%) | loss: 2.594998 | lrm: 0.69 | dt: 645.39ms | tok/sec: 812,355 | mfu: 50.77 | epoch: 2 | total time: 117.72m | eta: 61.8m +step 10958/16704 (65.60%) | loss: 2.600364 | lrm: 0.69 | dt: 647.05ms | tok/sec: 810,273 | mfu: 50.64 | epoch: 2 | total time: 117.73m | eta: 61.8m +step 10959/16704 (65.61%) | loss: 2.594070 | lrm: 0.69 | dt: 645.66ms | tok/sec: 812,015 | mfu: 50.75 | epoch: 2 | total time: 117.74m | eta: 61.8m +step 10960/16704 (65.61%) | loss: 2.589617 | lrm: 0.69 | dt: 645.02ms | tok/sec: 812,819 | mfu: 50.80 | epoch: 2 | total time: 117.75m | eta: 61.8m +step 10961/16704 (65.62%) | loss: 2.586159 | lrm: 0.69 | dt: 644.49ms | tok/sec: 813,492 | mfu: 50.84 | epoch: 2 | total time: 117.76m | eta: 61.8m +step 10962/16704 (65.62%) | loss: 2.582637 | lrm: 0.69 | dt: 646.04ms | tok/sec: 811,537 | mfu: 50.72 | epoch: 2 | total time: 117.77m | eta: 61.7m +step 10963/16704 (65.63%) | loss: 2.582149 | lrm: 0.69 | dt: 645.13ms | tok/sec: 812,685 | mfu: 50.79 | epoch: 2 | total time: 117.78m | eta: 61.7m +step 10964/16704 (65.64%) | loss: 2.584150 | lrm: 0.69 | dt: 646.21ms | tok/sec: 811,321 | mfu: 50.71 | epoch: 2 | total time: 117.79m | eta: 61.7m +step 10965/16704 (65.64%) | loss: 2.587602 | lrm: 0.69 | dt: 647.27ms | tok/sec: 809,992 | mfu: 50.63 | epoch: 2 | total time: 117.80m | eta: 61.7m +step 10966/16704 (65.65%) | loss: 2.587134 | lrm: 0.69 | dt: 645.70ms | tok/sec: 811,968 | mfu: 50.75 | epoch: 2 | total time: 117.82m | eta: 61.7m +step 10967/16704 (65.65%) | loss: 
2.594265 | lrm: 0.69 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 2 | total time: 117.83m | eta: 61.7m +step 10968/16704 (65.66%) | loss: 2.596754 | lrm: 0.69 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 2 | total time: 117.84m | eta: 61.7m +step 10969/16704 (65.67%) | loss: 2.601605 | lrm: 0.69 | dt: 648.30ms | tok/sec: 808,715 | mfu: 50.55 | epoch: 2 | total time: 117.85m | eta: 61.7m +step 10970/16704 (65.67%) | loss: 2.605915 | lrm: 0.69 | dt: 641.95ms | tok/sec: 816,714 | mfu: 51.05 | epoch: 2 | total time: 117.86m | eta: 61.7m +step 10971/16704 (65.68%) | loss: 2.593845 | lrm: 0.69 | dt: 649.37ms | tok/sec: 807,378 | mfu: 50.46 | epoch: 2 | total time: 117.87m | eta: 61.6m +step 10972/16704 (65.68%) | loss: 2.582010 | lrm: 0.69 | dt: 647.13ms | tok/sec: 810,174 | mfu: 50.64 | epoch: 2 | total time: 117.88m | eta: 61.6m +step 10973/16704 (65.69%) | loss: 2.577793 | lrm: 0.69 | dt: 644.29ms | tok/sec: 813,742 | mfu: 50.86 | epoch: 2 | total time: 117.89m | eta: 61.6m +step 10974/16704 (65.70%) | loss: 2.585011 | lrm: 0.69 | dt: 648.14ms | tok/sec: 808,917 | mfu: 50.56 | epoch: 2 | total time: 117.90m | eta: 61.6m +step 10975/16704 (65.70%) | loss: 2.579472 | lrm: 0.69 | dt: 646.18ms | tok/sec: 811,360 | mfu: 50.71 | epoch: 2 | total time: 117.91m | eta: 61.6m +step 10976/16704 (65.71%) | loss: 2.585732 | lrm: 0.69 | dt: 644.13ms | tok/sec: 813,945 | mfu: 50.87 | epoch: 2 | total time: 117.92m | eta: 61.6m +step 10977/16704 (65.71%) | loss: 2.576862 | lrm: 0.69 | dt: 646.56ms | tok/sec: 810,887 | mfu: 50.68 | epoch: 2 | total time: 117.93m | eta: 61.6m +step 10978/16704 (65.72%) | loss: 2.559858 | lrm: 0.69 | dt: 645.58ms | tok/sec: 812,114 | mfu: 50.76 | epoch: 2 | total time: 117.94m | eta: 61.6m +step 10979/16704 (65.73%) | loss: 2.565616 | lrm: 0.69 | dt: 644.57ms | tok/sec: 813,395 | mfu: 50.84 | epoch: 2 | total time: 117.96m | eta: 61.6m +step 10980/16704 (65.73%) | loss: 2.554058 | lrm: 0.69 | dt: 645.41ms | tok/sec: 812,332 | mfu: 
50.77 | epoch: 2 | total time: 117.97m | eta: 61.6m +step 10981/16704 (65.74%) | loss: 2.553083 | lrm: 0.69 | dt: 645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 2 | total time: 117.98m | eta: 61.5m +step 10982/16704 (65.74%) | loss: 2.558450 | lrm: 0.69 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 2 | total time: 117.99m | eta: 61.5m +step 10983/16704 (65.75%) | loss: 2.573990 | lrm: 0.68 | dt: 645.81ms | tok/sec: 811,832 | mfu: 50.74 | epoch: 2 | total time: 118.00m | eta: 61.5m +step 10984/16704 (65.76%) | loss: 2.575275 | lrm: 0.68 | dt: 643.44ms | tok/sec: 814,825 | mfu: 50.93 | epoch: 2 | total time: 118.01m | eta: 61.5m +step 10985/16704 (65.76%) | loss: 2.577084 | lrm: 0.68 | dt: 647.21ms | tok/sec: 810,072 | mfu: 50.63 | epoch: 2 | total time: 118.02m | eta: 61.5m +step 10986/16704 (65.77%) | loss: 2.586437 | lrm: 0.68 | dt: 643.51ms | tok/sec: 814,737 | mfu: 50.92 | epoch: 2 | total time: 118.03m | eta: 61.5m +step 10987/16704 (65.77%) | loss: 2.596478 | lrm: 0.68 | dt: 645.39ms | tok/sec: 812,354 | mfu: 50.77 | epoch: 2 | total time: 118.04m | eta: 61.5m +step 10988/16704 (65.78%) | loss: 2.593180 | lrm: 0.68 | dt: 645.62ms | tok/sec: 812,068 | mfu: 50.76 | epoch: 2 | total time: 118.05m | eta: 61.5m +step 10989/16704 (65.79%) | loss: 2.601840 | lrm: 0.68 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 2 | total time: 118.06m | eta: 61.5m +step 10990/16704 (65.79%) | loss: 2.611065 | lrm: 0.68 | dt: 646.64ms | tok/sec: 810,791 | mfu: 50.68 | epoch: 2 | total time: 118.07m | eta: 61.4m +step 10991/16704 (65.80%) | loss: 2.606954 | lrm: 0.68 | dt: 644.89ms | tok/sec: 812,992 | mfu: 50.81 | epoch: 2 | total time: 118.08m | eta: 61.4m +step 10992/16704 (65.80%) | loss: 2.612248 | lrm: 0.68 | dt: 645.49ms | tok/sec: 812,236 | mfu: 50.77 | epoch: 2 | total time: 118.10m | eta: 61.4m +step 10993/16704 (65.81%) | loss: 2.604989 | lrm: 0.68 | dt: 646.12ms | tok/sec: 811,440 | mfu: 50.72 | epoch: 2 | total time: 118.11m | eta: 61.4m +step 
10994/16704 (65.82%) | loss: 2.615002 | lrm: 0.68 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 2 | total time: 118.12m | eta: 61.4m +step 10995/16704 (65.82%) | loss: 2.618441 | lrm: 0.68 | dt: 644.02ms | tok/sec: 814,085 | mfu: 50.88 | epoch: 2 | total time: 118.13m | eta: 61.4m +step 10996/16704 (65.83%) | loss: 2.617521 | lrm: 0.68 | dt: 645.76ms | tok/sec: 811,892 | mfu: 50.74 | epoch: 2 | total time: 118.14m | eta: 61.4m +step 10997/16704 (65.83%) | loss: 2.616105 | lrm: 0.68 | dt: 645.60ms | tok/sec: 812,096 | mfu: 50.76 | epoch: 2 | total time: 118.15m | eta: 61.4m +step 10998/16704 (65.84%) | loss: 2.610412 | lrm: 0.68 | dt: 644.19ms | tok/sec: 813,869 | mfu: 50.87 | epoch: 2 | total time: 118.16m | eta: 61.4m +step 10999/16704 (65.85%) | loss: 2.608060 | lrm: 0.68 | dt: 646.13ms | tok/sec: 811,430 | mfu: 50.72 | epoch: 2 | total time: 118.17m | eta: 61.3m +Step 11000 | Validation bpb: 0.795606 +step 11000/16704 (65.85%) | loss: 2.607695 | lrm: 0.68 | dt: 651.40ms | tok/sec: 804,869 | mfu: 50.31 | epoch: 2 | total time: 118.18m | eta: 61.3m +step 11001/16704 (65.86%) | loss: 2.599706 | lrm: 0.68 | dt: 645.15ms | tok/sec: 812,665 | mfu: 50.79 | epoch: 2 | total time: 118.19m | eta: 61.3m +step 11002/16704 (65.86%) | loss: 2.600250 | lrm: 0.68 | dt: 649.20ms | tok/sec: 807,595 | mfu: 50.48 | epoch: 2 | total time: 118.20m | eta: 61.3m +step 11003/16704 (65.87%) | loss: 2.593867 | lrm: 0.68 | dt: 641.05ms | tok/sec: 817,856 | mfu: 51.12 | epoch: 2 | total time: 118.21m | eta: 61.3m +step 11004/16704 (65.88%) | loss: 2.596102 | lrm: 0.68 | dt: 647.41ms | tok/sec: 809,818 | mfu: 50.61 | epoch: 2 | total time: 118.22m | eta: 61.3m +step 11005/16704 (65.88%) | loss: 2.602172 | lrm: 0.68 | dt: 645.37ms | tok/sec: 812,388 | mfu: 50.78 | epoch: 2 | total time: 118.24m | eta: 61.3m +step 11006/16704 (65.89%) | loss: 2.598459 | lrm: 0.68 | dt: 643.07ms | tok/sec: 815,284 | mfu: 50.96 | epoch: 2 | total time: 118.25m | eta: 61.3m +step 11007/16704 (65.89%) | 
loss: 2.591391 | lrm: 0.68 | dt: 651.58ms | tok/sec: 804,640 | mfu: 50.29 | epoch: 2 | total time: 118.26m | eta: 61.3m +step 11008/16704 (65.90%) | loss: 2.598549 | lrm: 0.68 | dt: 642.38ms | tok/sec: 816,160 | mfu: 51.01 | epoch: 2 | total time: 118.27m | eta: 61.3m +step 11009/16704 (65.91%) | loss: 2.594140 | lrm: 0.68 | dt: 647.37ms | tok/sec: 809,870 | mfu: 50.62 | epoch: 2 | total time: 118.28m | eta: 61.2m +step 11010/16704 (65.91%) | loss: 2.595775 | lrm: 0.68 | dt: 644.07ms | tok/sec: 814,029 | mfu: 50.88 | epoch: 2 | total time: 118.29m | eta: 61.2m +step 11011/16704 (65.92%) | loss: 2.593625 | lrm: 0.68 | dt: 646.91ms | tok/sec: 810,454 | mfu: 50.65 | epoch: 2 | total time: 118.30m | eta: 61.2m +step 11012/16704 (65.92%) | loss: 2.606109 | lrm: 0.68 | dt: 647.52ms | tok/sec: 809,686 | mfu: 50.61 | epoch: 2 | total time: 118.31m | eta: 61.2m +step 11013/16704 (65.93%) | loss: 2.586263 | lrm: 0.68 | dt: 643.72ms | tok/sec: 814,471 | mfu: 50.91 | epoch: 2 | total time: 118.32m | eta: 61.2m +step 11014/16704 (65.94%) | loss: 2.576895 | lrm: 0.68 | dt: 646.29ms | tok/sec: 811,226 | mfu: 50.70 | epoch: 2 | total time: 118.33m | eta: 61.2m +step 11015/16704 (65.94%) | loss: 2.577023 | lrm: 0.68 | dt: 645.24ms | tok/sec: 812,547 | mfu: 50.79 | epoch: 2 | total time: 118.34m | eta: 61.2m +step 11016/16704 (65.95%) | loss: 2.579432 | lrm: 0.68 | dt: 646.55ms | tok/sec: 810,902 | mfu: 50.68 | epoch: 2 | total time: 118.35m | eta: 61.2m +step 11017/16704 (65.95%) | loss: 2.561858 | lrm: 0.68 | dt: 644.56ms | tok/sec: 813,406 | mfu: 50.84 | epoch: 2 | total time: 118.36m | eta: 61.2m +step 11018/16704 (65.96%) | loss: 2.567697 | lrm: 0.68 | dt: 645.69ms | tok/sec: 811,979 | mfu: 50.75 | epoch: 2 | total time: 118.38m | eta: 61.1m +step 11019/16704 (65.97%) | loss: 2.568471 | lrm: 0.68 | dt: 646.21ms | tok/sec: 811,328 | mfu: 50.71 | epoch: 2 | total time: 118.39m | eta: 61.1m +step 11020/16704 (65.97%) | loss: 2.571870 | lrm: 0.68 | dt: 647.16ms | tok/sec: 810,133 | 
mfu: 50.63 | epoch: 2 | total time: 118.40m | eta: 61.1m +step 11021/16704 (65.98%) | loss: 2.574592 | lrm: 0.68 | dt: 644.68ms | tok/sec: 813,256 | mfu: 50.83 | epoch: 2 | total time: 118.41m | eta: 61.1m +step 11022/16704 (65.98%) | loss: 2.582382 | lrm: 0.68 | dt: 644.43ms | tok/sec: 813,570 | mfu: 50.85 | epoch: 2 | total time: 118.42m | eta: 61.1m +step 11023/16704 (65.99%) | loss: 2.583200 | lrm: 0.68 | dt: 648.48ms | tok/sec: 808,481 | mfu: 50.53 | epoch: 2 | total time: 118.43m | eta: 61.1m +step 11024/16704 (66.00%) | loss: 2.605733 | lrm: 0.68 | dt: 646.06ms | tok/sec: 811,519 | mfu: 50.72 | epoch: 2 | total time: 118.44m | eta: 61.1m +step 11025/16704 (66.00%) | loss: 2.604567 | lrm: 0.68 | dt: 644.39ms | tok/sec: 813,622 | mfu: 50.85 | epoch: 2 | total time: 118.45m | eta: 61.1m +step 11026/16704 (66.01%) | loss: 2.598635 | lrm: 0.68 | dt: 646.12ms | tok/sec: 811,446 | mfu: 50.72 | epoch: 2 | total time: 118.46m | eta: 61.1m +step 11027/16704 (66.01%) | loss: 2.604751 | lrm: 0.68 | dt: 646.76ms | tok/sec: 810,634 | mfu: 50.67 | epoch: 2 | total time: 118.47m | eta: 61.0m +step 11028/16704 (66.02%) | loss: 2.606549 | lrm: 0.68 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | epoch: 2 | total time: 118.48m | eta: 61.0m +step 11029/16704 (66.03%) | loss: 2.606500 | lrm: 0.68 | dt: 646.29ms | tok/sec: 811,222 | mfu: 50.70 | epoch: 2 | total time: 118.49m | eta: 61.0m +step 11030/16704 (66.03%) | loss: 2.606664 | lrm: 0.68 | dt: 646.26ms | tok/sec: 811,263 | mfu: 50.71 | epoch: 2 | total time: 118.50m | eta: 61.0m +step 11031/16704 (66.04%) | loss: 2.603256 | lrm: 0.68 | dt: 642.77ms | tok/sec: 815,667 | mfu: 50.98 | epoch: 2 | total time: 118.52m | eta: 61.0m +step 11032/16704 (66.04%) | loss: 2.599348 | lrm: 0.68 | dt: 646.96ms | tok/sec: 810,388 | mfu: 50.65 | epoch: 2 | total time: 118.53m | eta: 61.0m +step 11033/16704 (66.05%) | loss: 2.590251 | lrm: 0.68 | dt: 650.31ms | tok/sec: 806,216 | mfu: 50.39 | epoch: 2 | total time: 118.54m | eta: 61.0m +step 
11034/16704 (66.06%) | loss: 2.589355 | lrm: 0.68 | dt: 645.80ms | tok/sec: 811,848 | mfu: 50.74 | epoch: 2 | total time: 118.55m | eta: 61.0m +step 11035/16704 (66.06%) | loss: 2.589466 | lrm: 0.68 | dt: 643.59ms | tok/sec: 814,636 | mfu: 50.92 | epoch: 2 | total time: 118.56m | eta: 61.0m +step 11036/16704 (66.07%) | loss: 2.600249 | lrm: 0.68 | dt: 648.65ms | tok/sec: 808,279 | mfu: 50.52 | epoch: 2 | total time: 118.57m | eta: 61.0m +step 11037/16704 (66.07%) | loss: 2.611783 | lrm: 0.68 | dt: 642.83ms | tok/sec: 815,587 | mfu: 50.98 | epoch: 2 | total time: 118.58m | eta: 60.9m +step 11038/16704 (66.08%) | loss: 2.616761 | lrm: 0.68 | dt: 646.87ms | tok/sec: 810,497 | mfu: 50.66 | epoch: 2 | total time: 118.59m | eta: 60.9m +step 11039/16704 (66.09%) | loss: 2.607610 | lrm: 0.68 | dt: 643.82ms | tok/sec: 814,339 | mfu: 50.90 | epoch: 2 | total time: 118.60m | eta: 60.9m +step 11040/16704 (66.09%) | loss: 2.611903 | lrm: 0.68 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 2 | total time: 118.61m | eta: 60.9m +step 11041/16704 (66.10%) | loss: 2.628289 | lrm: 0.68 | dt: 643.46ms | tok/sec: 814,792 | mfu: 50.93 | epoch: 2 | total time: 118.62m | eta: 60.9m +step 11042/16704 (66.10%) | loss: 2.623549 | lrm: 0.68 | dt: 644.33ms | tok/sec: 813,695 | mfu: 50.86 | epoch: 2 | total time: 118.63m | eta: 60.9m +step 11043/16704 (66.11%) | loss: 2.621495 | lrm: 0.68 | dt: 646.83ms | tok/sec: 810,548 | mfu: 50.66 | epoch: 2 | total time: 118.64m | eta: 60.9m +step 11044/16704 (66.12%) | loss: 2.626525 | lrm: 0.68 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 2 | total time: 118.66m | eta: 60.9m +step 11045/16704 (66.12%) | loss: 2.621967 | lrm: 0.68 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.87 | epoch: 2 | total time: 118.67m | eta: 60.9m +step 11046/16704 (66.13%) | loss: 2.619763 | lrm: 0.68 | dt: 645.30ms | tok/sec: 812,466 | mfu: 50.78 | epoch: 2 | total time: 118.68m | eta: 60.8m +step 11047/16704 (66.13%) | loss: 2.617869 | lrm: 0.68 | dt: 
646.06ms | tok/sec: 811,519 | mfu: 50.72 | epoch: 2 | total time: 118.69m | eta: 60.8m +step 11048/16704 (66.14%) | loss: 2.625126 | lrm: 0.68 | dt: 646.00ms | tok/sec: 811,592 | mfu: 50.73 | epoch: 2 | total time: 118.70m | eta: 60.8m +step 11049/16704 (66.15%) | loss: 2.614827 | lrm: 0.68 | dt: 646.50ms | tok/sec: 810,968 | mfu: 50.69 | epoch: 2 | total time: 118.71m | eta: 60.8m +step 11050/16704 (66.15%) | loss: 2.617330 | lrm: 0.68 | dt: 646.27ms | tok/sec: 811,255 | mfu: 50.70 | epoch: 2 | total time: 118.72m | eta: 60.8m +step 11051/16704 (66.16%) | loss: 2.626809 | lrm: 0.68 | dt: 646.21ms | tok/sec: 811,328 | mfu: 50.71 | epoch: 2 | total time: 118.73m | eta: 60.8m +step 11052/16704 (66.16%) | loss: 2.619589 | lrm: 0.68 | dt: 645.32ms | tok/sec: 812,448 | mfu: 50.78 | epoch: 2 | total time: 118.74m | eta: 60.8m +step 11053/16704 (66.17%) | loss: 2.597053 | lrm: 0.68 | dt: 643.52ms | tok/sec: 814,714 | mfu: 50.92 | epoch: 2 | total time: 118.75m | eta: 60.8m +step 11054/16704 (66.18%) | loss: 2.605181 | lrm: 0.68 | dt: 646.74ms | tok/sec: 810,659 | mfu: 50.67 | epoch: 2 | total time: 118.76m | eta: 60.8m +step 11055/16704 (66.18%) | loss: 2.603363 | lrm: 0.68 | dt: 647.39ms | tok/sec: 809,847 | mfu: 50.62 | epoch: 2 | total time: 118.77m | eta: 60.7m +step 11056/16704 (66.19%) | loss: 2.585754 | lrm: 0.68 | dt: 644.33ms | tok/sec: 813,691 | mfu: 50.86 | epoch: 2 | total time: 118.78m | eta: 60.7m +step 11057/16704 (66.19%) | loss: 2.594956 | lrm: 0.68 | dt: 645.51ms | tok/sec: 812,203 | mfu: 50.76 | epoch: 2 | total time: 118.79m | eta: 60.7m +step 11058/16704 (66.20%) | loss: 2.581663 | lrm: 0.68 | dt: 643.06ms | tok/sec: 815,305 | mfu: 50.96 | epoch: 2 | total time: 118.81m | eta: 60.7m +step 11059/16704 (66.21%) | loss: 2.589802 | lrm: 0.68 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 2 | total time: 118.82m | eta: 60.7m +step 11060/16704 (66.21%) | loss: 2.585129 | lrm: 0.68 | dt: 647.02ms | tok/sec: 810,307 | mfu: 50.65 | epoch: 2 | total 
time: 118.83m | eta: 60.7m +step 11061/16704 (66.22%) | loss: 2.581694 | lrm: 0.68 | dt: 646.07ms | tok/sec: 811,508 | mfu: 50.72 | epoch: 2 | total time: 118.84m | eta: 60.7m +step 11062/16704 (66.22%) | loss: 2.575856 | lrm: 0.68 | dt: 647.56ms | tok/sec: 809,631 | mfu: 50.60 | epoch: 2 | total time: 118.85m | eta: 60.7m +step 11063/16704 (66.23%) | loss: 2.577266 | lrm: 0.68 | dt: 643.99ms | tok/sec: 814,120 | mfu: 50.88 | epoch: 2 | total time: 118.86m | eta: 60.7m +step 11064/16704 (66.24%) | loss: 2.583847 | lrm: 0.68 | dt: 645.19ms | tok/sec: 812,613 | mfu: 50.79 | epoch: 2 | total time: 118.87m | eta: 60.7m +step 11065/16704 (66.24%) | loss: 2.589257 | lrm: 0.68 | dt: 645.82ms | tok/sec: 811,820 | mfu: 50.74 | epoch: 2 | total time: 118.88m | eta: 60.6m +step 11066/16704 (66.25%) | loss: 2.596469 | lrm: 0.68 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 2 | total time: 118.89m | eta: 60.6m +step 11067/16704 (66.25%) | loss: 2.587734 | lrm: 0.67 | dt: 643.85ms | tok/sec: 814,297 | mfu: 50.89 | epoch: 2 | total time: 118.90m | eta: 60.6m +step 11068/16704 (66.26%) | loss: 2.578359 | lrm: 0.67 | dt: 646.84ms | tok/sec: 810,539 | mfu: 50.66 | epoch: 2 | total time: 118.91m | eta: 60.6m +step 11069/16704 (66.27%) | loss: 2.590716 | lrm: 0.67 | dt: 643.66ms | tok/sec: 814,542 | mfu: 50.91 | epoch: 2 | total time: 118.92m | eta: 60.6m +step 11070/16704 (66.27%) | loss: 2.588044 | lrm: 0.67 | dt: 647.27ms | tok/sec: 810,003 | mfu: 50.63 | epoch: 2 | total time: 118.93m | eta: 60.6m +step 11071/16704 (66.28%) | loss: 2.590545 | lrm: 0.67 | dt: 644.07ms | tok/sec: 814,020 | mfu: 50.88 | epoch: 2 | total time: 118.95m | eta: 60.6m +step 11072/16704 (66.28%) | loss: 2.588994 | lrm: 0.67 | dt: 647.26ms | tok/sec: 810,016 | mfu: 50.63 | epoch: 2 | total time: 118.96m | eta: 60.6m +step 11073/16704 (66.29%) | loss: 2.595828 | lrm: 0.67 | dt: 646.20ms | tok/sec: 811,338 | mfu: 50.71 | epoch: 2 | total time: 118.97m | eta: 60.6m +step 11074/16704 (66.30%) | loss: 
2.589949 | lrm: 0.67 | dt: 645.18ms | tok/sec: 812,624 | mfu: 50.79 | epoch: 2 | total time: 118.98m | eta: 60.5m +step 11075/16704 (66.30%) | loss: 2.586211 | lrm: 0.67 | dt: 648.57ms | tok/sec: 808,374 | mfu: 50.52 | epoch: 2 | total time: 118.99m | eta: 60.5m +step 11076/16704 (66.31%) | loss: 2.605908 | lrm: 0.67 | dt: 643.45ms | tok/sec: 814,804 | mfu: 50.93 | epoch: 2 | total time: 119.00m | eta: 60.5m +step 11077/16704 (66.31%) | loss: 2.603623 | lrm: 0.67 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 2 | total time: 119.01m | eta: 60.5m +step 11078/16704 (66.32%) | loss: 2.613797 | lrm: 0.67 | dt: 645.03ms | tok/sec: 812,815 | mfu: 50.80 | epoch: 2 | total time: 119.02m | eta: 60.5m +step 11079/16704 (66.33%) | loss: 2.617752 | lrm: 0.67 | dt: 647.23ms | tok/sec: 810,054 | mfu: 50.63 | epoch: 2 | total time: 119.03m | eta: 60.5m +step 11080/16704 (66.33%) | loss: 2.615627 | lrm: 0.67 | dt: 645.95ms | tok/sec: 811,652 | mfu: 50.73 | epoch: 2 | total time: 119.04m | eta: 60.5m +step 11081/16704 (66.34%) | loss: 2.625087 | lrm: 0.67 | dt: 644.71ms | tok/sec: 813,220 | mfu: 50.83 | epoch: 2 | total time: 119.05m | eta: 60.5m +step 11082/16704 (66.34%) | loss: 2.622273 | lrm: 0.67 | dt: 644.68ms | tok/sec: 813,257 | mfu: 50.83 | epoch: 2 | total time: 119.06m | eta: 60.5m +step 11083/16704 (66.35%) | loss: 2.608954 | lrm: 0.67 | dt: 644.89ms | tok/sec: 812,983 | mfu: 50.81 | epoch: 2 | total time: 119.07m | eta: 60.4m +step 11084/16704 (66.36%) | loss: 2.601873 | lrm: 0.67 | dt: 645.08ms | tok/sec: 812,752 | mfu: 50.80 | epoch: 2 | total time: 119.09m | eta: 60.4m +step 11085/16704 (66.36%) | loss: 2.612714 | lrm: 0.67 | dt: 646.13ms | tok/sec: 811,424 | mfu: 50.72 | epoch: 2 | total time: 119.10m | eta: 60.4m +step 11086/16704 (66.37%) | loss: 2.614108 | lrm: 0.67 | dt: 647.50ms | tok/sec: 809,710 | mfu: 50.61 | epoch: 2 | total time: 119.11m | eta: 60.4m +step 11087/16704 (66.37%) | loss: 2.614775 | lrm: 0.67 | dt: 644.27ms | tok/sec: 813,772 | mfu: 
50.86 | epoch: 2 | total time: 119.12m | eta: 60.4m +step 11088/16704 (66.38%) | loss: 2.623340 | lrm: 0.67 | dt: 644.41ms | tok/sec: 813,587 | mfu: 50.85 | epoch: 2 | total time: 119.13m | eta: 60.4m +step 11089/16704 (66.39%) | loss: 2.613930 | lrm: 0.67 | dt: 646.23ms | tok/sec: 811,307 | mfu: 50.71 | epoch: 2 | total time: 119.14m | eta: 60.4m +step 11090/16704 (66.39%) | loss: 2.598325 | lrm: 0.67 | dt: 644.40ms | tok/sec: 813,609 | mfu: 50.85 | epoch: 2 | total time: 119.15m | eta: 60.4m +step 11091/16704 (66.40%) | loss: 2.610529 | lrm: 0.67 | dt: 646.89ms | tok/sec: 810,472 | mfu: 50.66 | epoch: 2 | total time: 119.16m | eta: 60.4m +step 11092/16704 (66.40%) | loss: 2.616076 | lrm: 0.67 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.87 | epoch: 2 | total time: 119.17m | eta: 60.3m +step 11093/16704 (66.41%) | loss: 2.612352 | lrm: 0.67 | dt: 646.36ms | tok/sec: 811,138 | mfu: 50.70 | epoch: 2 | total time: 119.18m | eta: 60.3m +step 11094/16704 (66.42%) | loss: 2.607800 | lrm: 0.67 | dt: 644.22ms | tok/sec: 813,832 | mfu: 50.87 | epoch: 2 | total time: 119.19m | eta: 60.3m +step 11095/16704 (66.42%) | loss: 2.616330 | lrm: 0.67 | dt: 645.26ms | tok/sec: 812,527 | mfu: 50.78 | epoch: 2 | total time: 119.20m | eta: 60.3m +step 11096/16704 (66.43%) | loss: 2.619166 | lrm: 0.67 | dt: 644.58ms | tok/sec: 813,380 | mfu: 50.84 | epoch: 2 | total time: 119.21m | eta: 60.3m +step 11097/16704 (66.43%) | loss: 2.617878 | lrm: 0.67 | dt: 645.63ms | tok/sec: 812,055 | mfu: 50.75 | epoch: 2 | total time: 119.23m | eta: 60.3m +step 11098/16704 (66.44%) | loss: 2.596867 | lrm: 0.67 | dt: 645.63ms | tok/sec: 812,050 | mfu: 50.75 | epoch: 2 | total time: 119.24m | eta: 60.3m +step 11099/16704 (66.45%) | loss: 2.589465 | lrm: 0.67 | dt: 645.94ms | tok/sec: 811,666 | mfu: 50.73 | epoch: 2 | total time: 119.25m | eta: 60.3m +step 11100/16704 (66.45%) | loss: 2.589817 | lrm: 0.67 | dt: 644.27ms | tok/sec: 813,773 | mfu: 50.86 | epoch: 2 | total time: 119.26m | eta: 60.3m +step 
11101/16704 (66.46%) | loss: 2.586414 | lrm: 0.67 | dt: 647.43ms | tok/sec: 809,795 | mfu: 50.61 | epoch: 2 | total time: 119.27m | eta: 60.3m +step 11102/16704 (66.46%) | loss: 2.597342 | lrm: 0.67 | dt: 645.99ms | tok/sec: 811,600 | mfu: 50.73 | epoch: 2 | total time: 119.28m | eta: 60.2m +step 11103/16704 (66.47%) | loss: 2.611637 | lrm: 0.67 | dt: 644.84ms | tok/sec: 813,045 | mfu: 50.82 | epoch: 2 | total time: 119.29m | eta: 60.2m +step 11104/16704 (66.48%) | loss: 2.617388 | lrm: 0.67 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 2 | total time: 119.30m | eta: 60.2m +step 11105/16704 (66.48%) | loss: 2.617996 | lrm: 0.67 | dt: 644.63ms | tok/sec: 813,311 | mfu: 50.83 | epoch: 2 | total time: 119.31m | eta: 60.2m +step 11106/16704 (66.49%) | loss: 2.596819 | lrm: 0.67 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 2 | total time: 119.32m | eta: 60.2m +step 11107/16704 (66.49%) | loss: 2.598442 | lrm: 0.67 | dt: 645.48ms | tok/sec: 812,249 | mfu: 50.77 | epoch: 2 | total time: 119.33m | eta: 60.2m +step 11108/16704 (66.50%) | loss: 2.597541 | lrm: 0.67 | dt: 644.67ms | tok/sec: 813,270 | mfu: 50.83 | epoch: 2 | total time: 119.34m | eta: 60.2m +step 11109/16704 (66.51%) | loss: 2.601353 | lrm: 0.67 | dt: 646.51ms | tok/sec: 810,948 | mfu: 50.69 | epoch: 2 | total time: 119.35m | eta: 60.2m +step 11110/16704 (66.51%) | loss: 2.594760 | lrm: 0.67 | dt: 647.06ms | tok/sec: 810,260 | mfu: 50.64 | epoch: 2 | total time: 119.37m | eta: 60.2m +step 11111/16704 (66.52%) | loss: 2.585736 | lrm: 0.67 | dt: 644.56ms | tok/sec: 813,398 | mfu: 50.84 | epoch: 2 | total time: 119.38m | eta: 60.1m +step 11112/16704 (66.52%) | loss: 2.577431 | lrm: 0.67 | dt: 645.84ms | tok/sec: 811,795 | mfu: 50.74 | epoch: 2 | total time: 119.39m | eta: 60.1m +step 11113/16704 (66.53%) | loss: 2.579456 | lrm: 0.67 | dt: 645.14ms | tok/sec: 812,673 | mfu: 50.79 | epoch: 2 | total time: 119.40m | eta: 60.1m +step 11114/16704 (66.53%) | loss: 2.565933 | lrm: 0.67 | dt: 
646.92ms | tok/sec: 810,439 | mfu: 50.65 | epoch: 2 | total time: 119.41m | eta: 60.1m +step 11115/16704 (66.54%) | loss: 2.577917 | lrm: 0.67 | dt: 648.20ms | tok/sec: 808,840 | mfu: 50.55 | epoch: 2 | total time: 119.42m | eta: 60.1m +step 11116/16704 (66.55%) | loss: 2.582124 | lrm: 0.67 | dt: 645.22ms | tok/sec: 812,573 | mfu: 50.79 | epoch: 2 | total time: 119.43m | eta: 60.1m +step 11117/16704 (66.55%) | loss: 2.581188 | lrm: 0.67 | dt: 645.87ms | tok/sec: 811,754 | mfu: 50.74 | epoch: 2 | total time: 119.44m | eta: 60.1m +step 11118/16704 (66.56%) | loss: 2.588377 | lrm: 0.67 | dt: 645.50ms | tok/sec: 812,222 | mfu: 50.77 | epoch: 2 | total time: 119.45m | eta: 60.1m +step 11119/16704 (66.56%) | loss: 2.583772 | lrm: 0.67 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 119.46m | eta: 60.1m +step 11120/16704 (66.57%) | loss: 2.583309 | lrm: 0.67 | dt: 643.62ms | tok/sec: 814,587 | mfu: 50.91 | epoch: 2 | total time: 119.47m | eta: 60.0m +step 11121/16704 (66.58%) | loss: 2.591199 | lrm: 0.67 | dt: 645.41ms | tok/sec: 812,329 | mfu: 50.77 | epoch: 2 | total time: 119.48m | eta: 60.0m +step 11122/16704 (66.58%) | loss: 2.584770 | lrm: 0.67 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 2 | total time: 119.49m | eta: 60.0m +step 11123/16704 (66.59%) | loss: 2.582660 | lrm: 0.67 | dt: 645.64ms | tok/sec: 812,049 | mfu: 50.75 | epoch: 2 | total time: 119.50m | eta: 60.0m +step 11124/16704 (66.59%) | loss: 2.600043 | lrm: 0.67 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 2 | total time: 119.52m | eta: 60.0m +step 11125/16704 (66.60%) | loss: 2.605876 | lrm: 0.67 | dt: 646.76ms | tok/sec: 810,638 | mfu: 50.67 | epoch: 2 | total time: 119.53m | eta: 60.0m +step 11126/16704 (66.61%) | loss: 2.605778 | lrm: 0.67 | dt: 645.13ms | tok/sec: 812,685 | mfu: 50.79 | epoch: 2 | total time: 119.54m | eta: 60.0m +step 11127/16704 (66.61%) | loss: 2.617550 | lrm: 0.67 | dt: 645.08ms | tok/sec: 812,746 | mfu: 50.80 | epoch: 2 | total 
time: 119.55m | eta: 60.0m +step 11128/16704 (66.62%) | loss: 2.611817 | lrm: 0.67 | dt: 646.19ms | tok/sec: 811,355 | mfu: 50.71 | epoch: 2 | total time: 119.56m | eta: 60.0m +step 11129/16704 (66.62%) | loss: 2.623422 | lrm: 0.67 | dt: 643.98ms | tok/sec: 814,140 | mfu: 50.88 | epoch: 2 | total time: 119.57m | eta: 60.0m +step 11130/16704 (66.63%) | loss: 2.618584 | lrm: 0.67 | dt: 645.12ms | tok/sec: 812,697 | mfu: 50.79 | epoch: 2 | total time: 119.58m | eta: 59.9m +step 11131/16704 (66.64%) | loss: 2.608872 | lrm: 0.67 | dt: 646.14ms | tok/sec: 811,418 | mfu: 50.71 | epoch: 2 | total time: 119.59m | eta: 59.9m +step 11132/16704 (66.64%) | loss: 2.608081 | lrm: 0.67 | dt: 646.25ms | tok/sec: 811,271 | mfu: 50.71 | epoch: 2 | total time: 119.60m | eta: 59.9m +step 11133/16704 (66.65%) | loss: 2.593465 | lrm: 0.67 | dt: 646.52ms | tok/sec: 810,938 | mfu: 50.68 | epoch: 2 | total time: 119.61m | eta: 59.9m +step 11134/16704 (66.65%) | loss: 2.601251 | lrm: 0.67 | dt: 644.49ms | tok/sec: 813,495 | mfu: 50.84 | epoch: 2 | total time: 119.62m | eta: 59.9m +step 11135/16704 (66.66%) | loss: 2.589602 | lrm: 0.67 | dt: 644.60ms | tok/sec: 813,359 | mfu: 50.84 | epoch: 2 | total time: 119.63m | eta: 59.9m +step 11136/16704 (66.67%) | loss: 2.600786 | lrm: 0.67 | dt: 647.82ms | tok/sec: 809,307 | mfu: 50.58 | epoch: 2 | total time: 119.64m | eta: 59.9m +step 11137/16704 (66.67%) | loss: 2.616231 | lrm: 0.67 | dt: 645.58ms | tok/sec: 812,124 | mfu: 50.76 | epoch: 2 | total time: 119.66m | eta: 59.9m +step 11138/16704 (66.68%) | loss: 2.603031 | lrm: 0.67 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 2 | total time: 119.67m | eta: 59.9m +step 11139/16704 (66.68%) | loss: 2.604931 | lrm: 0.67 | dt: 647.39ms | tok/sec: 809,850 | mfu: 50.62 | epoch: 2 | total time: 119.68m | eta: 59.8m +step 11140/16704 (66.69%) | loss: 2.606583 | lrm: 0.67 | dt: 644.35ms | tok/sec: 813,671 | mfu: 50.86 | epoch: 2 | total time: 119.69m | eta: 59.8m +step 11141/16704 (66.70%) | loss: 
2.607395 | lrm: 0.67 | dt: 646.21ms | tok/sec: 811,325 | mfu: 50.71 | epoch: 2 | total time: 119.70m | eta: 59.8m +step 11142/16704 (66.70%) | loss: 2.610393 | lrm: 0.67 | dt: 643.62ms | tok/sec: 814,591 | mfu: 50.91 | epoch: 2 | total time: 119.71m | eta: 59.8m +step 11143/16704 (66.71%) | loss: 2.614607 | lrm: 0.67 | dt: 646.88ms | tok/sec: 810,490 | mfu: 50.66 | epoch: 2 | total time: 119.72m | eta: 59.8m +step 11144/16704 (66.71%) | loss: 2.624527 | lrm: 0.67 | dt: 645.62ms | tok/sec: 812,073 | mfu: 50.76 | epoch: 2 | total time: 119.73m | eta: 59.8m +step 11145/16704 (66.72%) | loss: 2.623921 | lrm: 0.67 | dt: 643.93ms | tok/sec: 814,194 | mfu: 50.89 | epoch: 2 | total time: 119.74m | eta: 59.8m +step 11146/16704 (66.73%) | loss: 2.621916 | lrm: 0.67 | dt: 646.97ms | tok/sec: 810,368 | mfu: 50.65 | epoch: 2 | total time: 119.75m | eta: 59.8m +step 11147/16704 (66.73%) | loss: 2.626132 | lrm: 0.67 | dt: 645.11ms | tok/sec: 812,714 | mfu: 50.80 | epoch: 2 | total time: 119.76m | eta: 59.8m +step 11148/16704 (66.74%) | loss: 2.617780 | lrm: 0.67 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 2 | total time: 119.77m | eta: 59.7m +step 11149/16704 (66.74%) | loss: 2.624313 | lrm: 0.67 | dt: 644.68ms | tok/sec: 813,255 | mfu: 50.83 | epoch: 2 | total time: 119.78m | eta: 59.7m +step 11150/16704 (66.75%) | loss: 2.627946 | lrm: 0.66 | dt: 644.49ms | tok/sec: 813,491 | mfu: 50.84 | epoch: 2 | total time: 119.80m | eta: 59.7m +step 11151/16704 (66.76%) | loss: 2.618897 | lrm: 0.66 | dt: 644.38ms | tok/sec: 813,632 | mfu: 50.85 | epoch: 2 | total time: 119.81m | eta: 59.7m +step 11152/16704 (66.76%) | loss: 2.608197 | lrm: 0.66 | dt: 644.42ms | tok/sec: 813,587 | mfu: 50.85 | epoch: 2 | total time: 119.82m | eta: 59.7m +step 11153/16704 (66.77%) | loss: 2.604293 | lrm: 0.66 | dt: 645.08ms | tok/sec: 812,745 | mfu: 50.80 | epoch: 2 | total time: 119.83m | eta: 59.7m +step 11154/16704 (66.77%) | loss: 2.599420 | lrm: 0.66 | dt: 649.48ms | tok/sec: 807,240 | mfu: 
50.45 | epoch: 2 | total time: 119.84m | eta: 59.7m +step 11155/16704 (66.78%) | loss: 2.592707 | lrm: 0.66 | dt: 645.27ms | tok/sec: 812,509 | mfu: 50.78 | epoch: 2 | total time: 119.85m | eta: 59.7m +step 11156/16704 (66.79%) | loss: 2.593463 | lrm: 0.66 | dt: 644.99ms | tok/sec: 812,864 | mfu: 50.81 | epoch: 2 | total time: 119.86m | eta: 59.7m +step 11157/16704 (66.79%) | loss: 2.593980 | lrm: 0.66 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 2 | total time: 119.87m | eta: 59.7m +step 11158/16704 (66.80%) | loss: 2.593981 | lrm: 0.66 | dt: 646.12ms | tok/sec: 811,435 | mfu: 50.72 | epoch: 2 | total time: 119.88m | eta: 59.6m +step 11159/16704 (66.80%) | loss: 2.589765 | lrm: 0.66 | dt: 646.19ms | tok/sec: 811,352 | mfu: 50.71 | epoch: 2 | total time: 119.89m | eta: 59.6m +step 11160/16704 (66.81%) | loss: 2.591228 | lrm: 0.66 | dt: 643.93ms | tok/sec: 814,195 | mfu: 50.89 | epoch: 2 | total time: 119.90m | eta: 59.6m +step 11161/16704 (66.82%) | loss: 2.577801 | lrm: 0.66 | dt: 644.98ms | tok/sec: 812,873 | mfu: 50.81 | epoch: 2 | total time: 119.91m | eta: 59.6m +step 11162/16704 (66.82%) | loss: 2.588588 | lrm: 0.66 | dt: 646.62ms | tok/sec: 810,811 | mfu: 50.68 | epoch: 2 | total time: 119.92m | eta: 59.6m +step 11163/16704 (66.83%) | loss: 2.576256 | lrm: 0.66 | dt: 643.62ms | tok/sec: 814,596 | mfu: 50.91 | epoch: 2 | total time: 119.94m | eta: 59.6m +step 11164/16704 (66.83%) | loss: 2.586586 | lrm: 0.66 | dt: 645.61ms | tok/sec: 812,085 | mfu: 50.76 | epoch: 2 | total time: 119.95m | eta: 59.6m +step 11165/16704 (66.84%) | loss: 2.592291 | lrm: 0.66 | dt: 646.65ms | tok/sec: 810,773 | mfu: 50.67 | epoch: 2 | total time: 119.96m | eta: 59.6m +step 11166/16704 (66.85%) | loss: 2.600349 | lrm: 0.66 | dt: 645.24ms | tok/sec: 812,546 | mfu: 50.79 | epoch: 2 | total time: 119.97m | eta: 59.6m +step 11167/16704 (66.85%) | loss: 2.604472 | lrm: 0.66 | dt: 644.75ms | tok/sec: 813,166 | mfu: 50.82 | epoch: 2 | total time: 119.98m | eta: 59.5m +step 
11168/16704 (66.86%) | loss: 2.600133 | lrm: 0.66 | dt: 648.87ms | tok/sec: 808,006 | mfu: 50.50 | epoch: 2 | total time: 119.99m | eta: 59.5m +step 11169/16704 (66.86%) | loss: 2.597249 | lrm: 0.66 | dt: 646.02ms | tok/sec: 811,571 | mfu: 50.72 | epoch: 2 | total time: 120.00m | eta: 59.5m +step 11170/16704 (66.87%) | loss: 2.598338 | lrm: 0.66 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 2 | total time: 120.01m | eta: 59.5m +step 11171/16704 (66.88%) | loss: 2.607736 | lrm: 0.66 | dt: 647.20ms | tok/sec: 810,086 | mfu: 50.63 | epoch: 2 | total time: 120.02m | eta: 59.5m +step 11172/16704 (66.88%) | loss: 2.603673 | lrm: 0.66 | dt: 645.79ms | tok/sec: 811,852 | mfu: 50.74 | epoch: 2 | total time: 120.03m | eta: 59.5m +step 11173/16704 (66.89%) | loss: 2.600324 | lrm: 0.66 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 2 | total time: 120.04m | eta: 59.5m +step 11174/16704 (66.89%) | loss: 2.608623 | lrm: 0.66 | dt: 648.09ms | tok/sec: 808,975 | mfu: 50.56 | epoch: 2 | total time: 120.05m | eta: 59.5m +step 11175/16704 (66.90%) | loss: 2.612480 | lrm: 0.66 | dt: 644.22ms | tok/sec: 813,837 | mfu: 50.87 | epoch: 2 | total time: 120.06m | eta: 59.5m +step 11176/16704 (66.91%) | loss: 2.616485 | lrm: 0.66 | dt: 644.18ms | tok/sec: 813,882 | mfu: 50.87 | epoch: 2 | total time: 120.08m | eta: 59.4m +step 11177/16704 (66.91%) | loss: 2.619755 | lrm: 0.66 | dt: 646.09ms | tok/sec: 811,483 | mfu: 50.72 | epoch: 2 | total time: 120.09m | eta: 59.4m +step 11178/16704 (66.92%) | loss: 2.617977 | lrm: 0.66 | dt: 646.49ms | tok/sec: 810,972 | mfu: 50.69 | epoch: 2 | total time: 120.10m | eta: 59.4m +step 11179/16704 (66.92%) | loss: 2.612294 | lrm: 0.66 | dt: 645.41ms | tok/sec: 812,337 | mfu: 50.77 | epoch: 2 | total time: 120.11m | eta: 59.4m +step 11180/16704 (66.93%) | loss: 2.616559 | lrm: 0.66 | dt: 643.94ms | tok/sec: 814,193 | mfu: 50.89 | epoch: 2 | total time: 120.12m | eta: 59.4m +step 11181/16704 (66.94%) | loss: 2.614313 | lrm: 0.66 | dt: 
644.89ms | tok/sec: 812,986 | mfu: 50.81 | epoch: 2 | total time: 120.13m | eta: 59.4m +step 11182/16704 (66.94%) | loss: 2.608999 | lrm: 0.66 | dt: 648.03ms | tok/sec: 809,052 | mfu: 50.57 | epoch: 2 | total time: 120.14m | eta: 59.4m +step 11183/16704 (66.95%) | loss: 2.611723 | lrm: 0.66 | dt: 643.86ms | tok/sec: 814,292 | mfu: 50.89 | epoch: 2 | total time: 120.15m | eta: 59.4m +step 11184/16704 (66.95%) | loss: 2.609880 | lrm: 0.66 | dt: 647.76ms | tok/sec: 809,390 | mfu: 50.59 | epoch: 2 | total time: 120.16m | eta: 59.4m +step 11185/16704 (66.96%) | loss: 2.595097 | lrm: 0.66 | dt: 644.67ms | tok/sec: 813,269 | mfu: 50.83 | epoch: 2 | total time: 120.17m | eta: 59.3m +step 11186/16704 (66.97%) | loss: 2.599867 | lrm: 0.66 | dt: 644.38ms | tok/sec: 813,629 | mfu: 50.85 | epoch: 2 | total time: 120.18m | eta: 59.3m +step 11187/16704 (66.97%) | loss: 2.601164 | lrm: 0.66 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 2 | total time: 120.19m | eta: 59.3m +step 11188/16704 (66.98%) | loss: 2.607412 | lrm: 0.66 | dt: 645.15ms | tok/sec: 812,655 | mfu: 50.79 | epoch: 2 | total time: 120.20m | eta: 59.3m +step 11189/16704 (66.98%) | loss: 2.591670 | lrm: 0.66 | dt: 645.11ms | tok/sec: 812,712 | mfu: 50.80 | epoch: 2 | total time: 120.22m | eta: 59.3m +step 11190/16704 (66.99%) | loss: 2.580039 | lrm: 0.66 | dt: 643.89ms | tok/sec: 814,249 | mfu: 50.89 | epoch: 2 | total time: 120.23m | eta: 59.3m +step 11191/16704 (67.00%) | loss: 2.581756 | lrm: 0.66 | dt: 645.54ms | tok/sec: 812,172 | mfu: 50.76 | epoch: 2 | total time: 120.24m | eta: 59.3m +step 11192/16704 (67.00%) | loss: 2.585624 | lrm: 0.66 | dt: 645.95ms | tok/sec: 811,656 | mfu: 50.73 | epoch: 2 | total time: 120.25m | eta: 59.3m +step 11193/16704 (67.01%) | loss: 2.597974 | lrm: 0.66 | dt: 642.39ms | tok/sec: 816,146 | mfu: 51.01 | epoch: 2 | total time: 120.26m | eta: 59.3m +step 11194/16704 (67.01%) | loss: 2.600635 | lrm: 0.66 | dt: 644.63ms | tok/sec: 813,316 | mfu: 50.83 | epoch: 2 | total 
time: 120.27m | eta: 59.3m +step 11195/16704 (67.02%) | loss: 2.602529 | lrm: 0.66 | dt: 645.79ms | tok/sec: 811,850 | mfu: 50.74 | epoch: 2 | total time: 120.28m | eta: 59.2m +step 11196/16704 (67.03%) | loss: 2.596050 | lrm: 0.66 | dt: 647.60ms | tok/sec: 809,586 | mfu: 50.60 | epoch: 2 | total time: 120.29m | eta: 59.2m +step 11197/16704 (67.03%) | loss: 2.596118 | lrm: 0.66 | dt: 646.56ms | tok/sec: 810,883 | mfu: 50.68 | epoch: 2 | total time: 120.30m | eta: 59.2m +step 11198/16704 (67.04%) | loss: 2.598265 | lrm: 0.66 | dt: 643.30ms | tok/sec: 814,995 | mfu: 50.94 | epoch: 2 | total time: 120.31m | eta: 59.2m +step 11199/16704 (67.04%) | loss: 2.599173 | lrm: 0.66 | dt: 647.17ms | tok/sec: 810,126 | mfu: 50.63 | epoch: 2 | total time: 120.32m | eta: 59.2m +step 11200/16704 (67.05%) | loss: 2.603957 | lrm: 0.66 | dt: 647.57ms | tok/sec: 809,625 | mfu: 50.60 | epoch: 2 | total time: 120.33m | eta: 59.2m +step 11201/16704 (67.06%) | loss: 2.618733 | lrm: 0.66 | dt: 643.78ms | tok/sec: 814,390 | mfu: 50.90 | epoch: 2 | total time: 120.34m | eta: 59.2m +step 11202/16704 (67.06%) | loss: 2.614694 | lrm: 0.66 | dt: 648.91ms | tok/sec: 807,950 | mfu: 50.50 | epoch: 2 | total time: 120.35m | eta: 59.2m +step 11203/16704 (67.07%) | loss: 2.618846 | lrm: 0.66 | dt: 644.99ms | tok/sec: 812,860 | mfu: 50.80 | epoch: 2 | total time: 120.37m | eta: 59.2m +step 11204/16704 (67.07%) | loss: 2.615674 | lrm: 0.66 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 2 | total time: 120.38m | eta: 59.1m +step 11205/16704 (67.08%) | loss: 2.615933 | lrm: 0.66 | dt: 643.89ms | tok/sec: 814,245 | mfu: 50.89 | epoch: 2 | total time: 120.39m | eta: 59.1m +step 11206/16704 (67.09%) | loss: 2.611302 | lrm: 0.66 | dt: 644.71ms | tok/sec: 813,212 | mfu: 50.83 | epoch: 2 | total time: 120.40m | eta: 59.1m +step 11207/16704 (67.09%) | loss: 2.602160 | lrm: 0.66 | dt: 645.75ms | tok/sec: 811,906 | mfu: 50.75 | epoch: 2 | total time: 120.41m | eta: 59.1m +step 11208/16704 (67.10%) | loss: 
2.596990 | lrm: 0.66 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 2 | total time: 120.42m | eta: 59.1m +step 11209/16704 (67.10%) | loss: 2.591206 | lrm: 0.66 | dt: 645.60ms | tok/sec: 812,093 | mfu: 50.76 | epoch: 2 | total time: 120.43m | eta: 59.1m +step 11210/16704 (67.11%) | loss: 2.594564 | lrm: 0.66 | dt: 646.89ms | tok/sec: 810,476 | mfu: 50.66 | epoch: 2 | total time: 120.44m | eta: 59.1m +step 11211/16704 (67.12%) | loss: 2.580398 | lrm: 0.66 | dt: 644.95ms | tok/sec: 812,912 | mfu: 50.81 | epoch: 2 | total time: 120.45m | eta: 59.1m +step 11212/16704 (67.12%) | loss: 2.581750 | lrm: 0.66 | dt: 647.57ms | tok/sec: 809,629 | mfu: 50.60 | epoch: 2 | total time: 120.46m | eta: 59.1m +step 11213/16704 (67.13%) | loss: 2.585017 | lrm: 0.66 | dt: 646.43ms | tok/sec: 811,049 | mfu: 50.69 | epoch: 2 | total time: 120.47m | eta: 59.0m +step 11214/16704 (67.13%) | loss: 2.589099 | lrm: 0.66 | dt: 643.51ms | tok/sec: 814,734 | mfu: 50.92 | epoch: 2 | total time: 120.48m | eta: 59.0m +step 11215/16704 (67.14%) | loss: 2.601643 | lrm: 0.66 | dt: 645.18ms | tok/sec: 812,618 | mfu: 50.79 | epoch: 2 | total time: 120.49m | eta: 59.0m +step 11216/16704 (67.15%) | loss: 2.598443 | lrm: 0.66 | dt: 645.24ms | tok/sec: 812,541 | mfu: 50.79 | epoch: 2 | total time: 120.51m | eta: 59.0m +step 11217/16704 (67.15%) | loss: 2.594197 | lrm: 0.66 | dt: 644.61ms | tok/sec: 813,344 | mfu: 50.84 | epoch: 2 | total time: 120.52m | eta: 59.0m +step 11218/16704 (67.16%) | loss: 2.585550 | lrm: 0.66 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 120.53m | eta: 59.0m +step 11219/16704 (67.16%) | loss: 2.572034 | lrm: 0.66 | dt: 644.77ms | tok/sec: 813,135 | mfu: 50.82 | epoch: 2 | total time: 120.54m | eta: 59.0m +step 11220/16704 (67.17%) | loss: 2.575534 | lrm: 0.66 | dt: 645.80ms | tok/sec: 811,844 | mfu: 50.74 | epoch: 2 | total time: 120.55m | eta: 59.0m +step 11221/16704 (67.18%) | loss: 2.570279 | lrm: 0.66 | dt: 649.84ms | tok/sec: 806,789 | mfu: 
50.43 | epoch: 2 | total time: 120.56m | eta: 59.0m +step 11222/16704 (67.18%) | loss: 2.577836 | lrm: 0.66 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 2 | total time: 120.57m | eta: 59.0m +step 11223/16704 (67.19%) | loss: 2.578248 | lrm: 0.66 | dt: 644.84ms | tok/sec: 813,048 | mfu: 50.82 | epoch: 2 | total time: 120.58m | eta: 58.9m +step 11224/16704 (67.19%) | loss: 2.590285 | lrm: 0.66 | dt: 646.54ms | tok/sec: 810,916 | mfu: 50.68 | epoch: 2 | total time: 120.59m | eta: 58.9m +step 11225/16704 (67.20%) | loss: 2.607128 | lrm: 0.66 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 2 | total time: 120.60m | eta: 58.9m +step 11226/16704 (67.21%) | loss: 2.597740 | lrm: 0.66 | dt: 646.29ms | tok/sec: 811,229 | mfu: 50.70 | epoch: 2 | total time: 120.61m | eta: 58.9m +step 11227/16704 (67.21%) | loss: 2.591145 | lrm: 0.66 | dt: 644.19ms | tok/sec: 813,875 | mfu: 50.87 | epoch: 2 | total time: 120.62m | eta: 58.9m +step 11228/16704 (67.22%) | loss: 2.603581 | lrm: 0.66 | dt: 645.04ms | tok/sec: 812,798 | mfu: 50.80 | epoch: 2 | total time: 120.63m | eta: 58.9m +step 11229/16704 (67.22%) | loss: 2.605635 | lrm: 0.66 | dt: 645.46ms | tok/sec: 812,266 | mfu: 50.77 | epoch: 2 | total time: 120.65m | eta: 58.9m +step 11230/16704 (67.23%) | loss: 2.609661 | lrm: 0.66 | dt: 645.57ms | tok/sec: 812,136 | mfu: 50.76 | epoch: 2 | total time: 120.66m | eta: 58.9m +step 11231/16704 (67.24%) | loss: 2.609746 | lrm: 0.66 | dt: 645.63ms | tok/sec: 812,051 | mfu: 50.75 | epoch: 2 | total time: 120.67m | eta: 58.9m +step 11232/16704 (67.24%) | loss: 2.622457 | lrm: 0.66 | dt: 645.31ms | tok/sec: 812,457 | mfu: 50.78 | epoch: 2 | total time: 120.68m | eta: 58.8m +step 11233/16704 (67.25%) | loss: 2.621127 | lrm: 0.66 | dt: 645.03ms | tok/sec: 812,811 | mfu: 50.80 | epoch: 2 | total time: 120.69m | eta: 58.8m +step 11234/16704 (67.25%) | loss: 2.612404 | lrm: 0.65 | dt: 645.48ms | tok/sec: 812,242 | mfu: 50.77 | epoch: 2 | total time: 120.70m | eta: 58.8m +step 
11235/16704 (67.26%) | loss: 2.618084 | lrm: 0.65 | dt: 644.34ms | tok/sec: 813,682 | mfu: 50.86 | epoch: 2 | total time: 120.71m | eta: 58.8m +step 11236/16704 (67.27%) | loss: 2.613203 | lrm: 0.65 | dt: 645.46ms | tok/sec: 812,276 | mfu: 50.77 | epoch: 2 | total time: 120.72m | eta: 58.8m +step 11237/16704 (67.27%) | loss: 2.602655 | lrm: 0.65 | dt: 643.97ms | tok/sec: 814,151 | mfu: 50.89 | epoch: 2 | total time: 120.73m | eta: 58.8m +step 11238/16704 (67.28%) | loss: 2.594900 | lrm: 0.65 | dt: 645.67ms | tok/sec: 812,004 | mfu: 50.75 | epoch: 2 | total time: 120.74m | eta: 58.8m +step 11239/16704 (67.28%) | loss: 2.575259 | lrm: 0.65 | dt: 645.14ms | tok/sec: 812,668 | mfu: 50.79 | epoch: 2 | total time: 120.75m | eta: 58.8m +step 11240/16704 (67.29%) | loss: 2.575029 | lrm: 0.65 | dt: 644.79ms | tok/sec: 813,111 | mfu: 50.82 | epoch: 2 | total time: 120.76m | eta: 58.8m +step 11241/16704 (67.30%) | loss: 2.586982 | lrm: 0.65 | dt: 645.67ms | tok/sec: 812,007 | mfu: 50.75 | epoch: 2 | total time: 120.77m | eta: 58.7m +step 11242/16704 (67.30%) | loss: 2.580072 | lrm: 0.65 | dt: 644.03ms | tok/sec: 814,077 | mfu: 50.88 | epoch: 2 | total time: 120.79m | eta: 58.7m +step 11243/16704 (67.31%) | loss: 2.585052 | lrm: 0.65 | dt: 647.06ms | tok/sec: 810,255 | mfu: 50.64 | epoch: 2 | total time: 120.80m | eta: 58.7m +step 11244/16704 (67.31%) | loss: 2.593424 | lrm: 0.65 | dt: 644.17ms | tok/sec: 813,891 | mfu: 50.87 | epoch: 2 | total time: 120.81m | eta: 58.7m +step 11245/16704 (67.32%) | loss: 2.579529 | lrm: 0.65 | dt: 644.15ms | tok/sec: 813,921 | mfu: 50.87 | epoch: 2 | total time: 120.82m | eta: 58.7m +step 11246/16704 (67.33%) | loss: 2.594730 | lrm: 0.65 | dt: 644.75ms | tok/sec: 813,164 | mfu: 50.82 | epoch: 2 | total time: 120.83m | eta: 58.7m +step 11247/16704 (67.33%) | loss: 2.595277 | lrm: 0.65 | dt: 645.26ms | tok/sec: 812,520 | mfu: 50.78 | epoch: 2 | total time: 120.84m | eta: 58.7m +step 11248/16704 (67.34%) | loss: 2.600401 | lrm: 0.65 | dt: 
647.13ms | tok/sec: 810,170 | mfu: 50.64 | epoch: 2 | total time: 120.85m | eta: 58.7m +step 11249/16704 (67.34%) | loss: 2.598243 | lrm: 0.65 | dt: 645.41ms | tok/sec: 812,336 | mfu: 50.77 | epoch: 2 | total time: 120.86m | eta: 58.7m +Step 11250 | Validation bpb: 0.793223 +step 11250/16704 (67.35%) | loss: 2.592318 | lrm: 0.65 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 2 | total time: 120.87m | eta: 58.7m +step 11251/16704 (67.36%) | loss: 2.592113 | lrm: 0.65 | dt: 651.26ms | tok/sec: 805,042 | mfu: 50.32 | epoch: 2 | total time: 120.88m | eta: 58.6m +step 11252/16704 (67.36%) | loss: 2.582279 | lrm: 0.65 | dt: 641.88ms | tok/sec: 816,796 | mfu: 51.05 | epoch: 2 | total time: 120.89m | eta: 58.6m +step 11253/16704 (67.37%) | loss: 2.586443 | lrm: 0.65 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 2 | total time: 120.90m | eta: 58.6m +step 11254/16704 (67.37%) | loss: 2.586776 | lrm: 0.65 | dt: 648.04ms | tok/sec: 809,031 | mfu: 50.57 | epoch: 2 | total time: 120.91m | eta: 58.6m +step 11255/16704 (67.38%) | loss: 2.593233 | lrm: 0.65 | dt: 641.66ms | tok/sec: 817,081 | mfu: 51.07 | epoch: 2 | total time: 120.92m | eta: 58.6m +step 11256/16704 (67.39%) | loss: 2.597467 | lrm: 0.65 | dt: 646.33ms | tok/sec: 811,175 | mfu: 50.70 | epoch: 2 | total time: 120.94m | eta: 58.6m +step 11257/16704 (67.39%) | loss: 2.611028 | lrm: 0.65 | dt: 643.99ms | tok/sec: 814,124 | mfu: 50.88 | epoch: 2 | total time: 120.95m | eta: 58.6m +step 11258/16704 (67.40%) | loss: 2.623045 | lrm: 0.65 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 2 | total time: 120.96m | eta: 58.6m +step 11259/16704 (67.40%) | loss: 2.622917 | lrm: 0.65 | dt: 645.89ms | tok/sec: 811,727 | mfu: 50.73 | epoch: 2 | total time: 120.97m | eta: 58.6m +step 11260/16704 (67.41%) | loss: 2.622629 | lrm: 0.65 | dt: 643.07ms | tok/sec: 815,286 | mfu: 50.96 | epoch: 2 | total time: 120.98m | eta: 58.5m +step 11261/16704 (67.41%) | loss: 2.609175 | lrm: 0.65 | dt: 645.12ms | tok/sec: 
812,700 | mfu: 50.79 | epoch: 2 | total time: 120.99m | eta: 58.5m +step 11262/16704 (67.42%) | loss: 2.597389 | lrm: 0.65 | dt: 644.28ms | tok/sec: 813,757 | mfu: 50.86 | epoch: 2 | total time: 121.00m | eta: 58.5m +step 11263/16704 (67.43%) | loss: 2.588459 | lrm: 0.65 | dt: 646.27ms | tok/sec: 811,254 | mfu: 50.70 | epoch: 2 | total time: 121.01m | eta: 58.5m +step 11264/16704 (67.43%) | loss: 2.604011 | lrm: 0.65 | dt: 647.67ms | tok/sec: 809,493 | mfu: 50.59 | epoch: 2 | total time: 121.02m | eta: 58.5m +step 11265/16704 (67.44%) | loss: 2.606822 | lrm: 0.65 | dt: 644.56ms | tok/sec: 813,403 | mfu: 50.84 | epoch: 2 | total time: 121.03m | eta: 58.5m +step 11266/16704 (67.44%) | loss: 2.597958 | lrm: 0.65 | dt: 644.56ms | tok/sec: 813,408 | mfu: 50.84 | epoch: 2 | total time: 121.04m | eta: 58.5m +step 11267/16704 (67.45%) | loss: 2.600185 | lrm: 0.65 | dt: 647.24ms | tok/sec: 810,040 | mfu: 50.63 | epoch: 2 | total time: 121.05m | eta: 58.5m +step 11268/16704 (67.46%) | loss: 2.600550 | lrm: 0.65 | dt: 645.63ms | tok/sec: 812,059 | mfu: 50.75 | epoch: 2 | total time: 121.06m | eta: 58.5m +step 11269/16704 (67.46%) | loss: 2.607297 | lrm: 0.65 | dt: 644.55ms | tok/sec: 813,420 | mfu: 50.84 | epoch: 2 | total time: 121.08m | eta: 58.4m +step 11270/16704 (67.47%) | loss: 2.605820 | lrm: 0.65 | dt: 643.31ms | tok/sec: 814,985 | mfu: 50.94 | epoch: 2 | total time: 121.09m | eta: 58.4m +step 11271/16704 (67.47%) | loss: 2.601206 | lrm: 0.65 | dt: 644.05ms | tok/sec: 814,052 | mfu: 50.88 | epoch: 2 | total time: 121.10m | eta: 58.4m +step 11272/16704 (67.48%) | loss: 2.605262 | lrm: 0.65 | dt: 646.56ms | tok/sec: 810,889 | mfu: 50.68 | epoch: 2 | total time: 121.11m | eta: 58.4m +step 11273/16704 (67.49%) | loss: 2.601399 | lrm: 0.65 | dt: 646.07ms | tok/sec: 811,504 | mfu: 50.72 | epoch: 2 | total time: 121.12m | eta: 58.4m +step 11274/16704 (67.49%) | loss: 2.585358 | lrm: 0.65 | dt: 645.94ms | tok/sec: 811,670 | mfu: 50.73 | epoch: 2 | total time: 121.13m | eta: 
58.4m +step 11275/16704 (67.50%) | loss: 2.587052 | lrm: 0.65 | dt: 646.78ms | tok/sec: 810,617 | mfu: 50.66 | epoch: 2 | total time: 121.14m | eta: 58.4m +step 11276/16704 (67.50%) | loss: 2.592779 | lrm: 0.65 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 2 | total time: 121.15m | eta: 58.4m +step 11277/16704 (67.51%) | loss: 2.595048 | lrm: 0.65 | dt: 645.74ms | tok/sec: 811,913 | mfu: 50.75 | epoch: 2 | total time: 121.16m | eta: 58.4m +step 11278/16704 (67.52%) | loss: 2.598630 | lrm: 0.65 | dt: 644.13ms | tok/sec: 813,948 | mfu: 50.87 | epoch: 2 | total time: 121.17m | eta: 58.3m +step 11279/16704 (67.52%) | loss: 2.592560 | lrm: 0.65 | dt: 646.36ms | tok/sec: 811,137 | mfu: 50.70 | epoch: 2 | total time: 121.18m | eta: 58.3m +step 11280/16704 (67.53%) | loss: 2.602910 | lrm: 0.65 | dt: 646.55ms | tok/sec: 810,901 | mfu: 50.68 | epoch: 2 | total time: 121.19m | eta: 58.3m +step 11281/16704 (67.53%) | loss: 2.600167 | lrm: 0.65 | dt: 645.18ms | tok/sec: 812,617 | mfu: 50.79 | epoch: 2 | total time: 121.20m | eta: 58.3m +step 11282/16704 (67.54%) | loss: 2.592655 | lrm: 0.65 | dt: 646.95ms | tok/sec: 810,401 | mfu: 50.65 | epoch: 2 | total time: 121.22m | eta: 58.3m +step 11283/16704 (67.55%) | loss: 2.606574 | lrm: 0.65 | dt: 646.88ms | tok/sec: 810,482 | mfu: 50.66 | epoch: 2 | total time: 121.23m | eta: 58.3m +step 11284/16704 (67.55%) | loss: 2.606682 | lrm: 0.65 | dt: 645.71ms | tok/sec: 811,952 | mfu: 50.75 | epoch: 2 | total time: 121.24m | eta: 58.3m +step 11285/16704 (67.56%) | loss: 2.605629 | lrm: 0.65 | dt: 646.02ms | tok/sec: 811,568 | mfu: 50.72 | epoch: 2 | total time: 121.25m | eta: 58.3m +step 11286/16704 (67.56%) | loss: 2.603819 | lrm: 0.65 | dt: 645.47ms | tok/sec: 812,259 | mfu: 50.77 | epoch: 2 | total time: 121.26m | eta: 58.3m +step 11287/16704 (67.57%) | loss: 2.617908 | lrm: 0.65 | dt: 646.39ms | tok/sec: 811,096 | mfu: 50.69 | epoch: 2 | total time: 121.27m | eta: 58.3m +step 11288/16704 (67.58%) | loss: 2.608956 | lrm: 0.65 
| dt: 644.79ms | tok/sec: 813,110 | mfu: 50.82 | epoch: 2 | total time: 121.28m | eta: 58.2m +step 11289/16704 (67.58%) | loss: 2.610547 | lrm: 0.65 | dt: 644.79ms | tok/sec: 813,115 | mfu: 50.82 | epoch: 2 | total time: 121.29m | eta: 58.2m +step 11290/16704 (67.59%) | loss: 2.613788 | lrm: 0.65 | dt: 646.20ms | tok/sec: 811,336 | mfu: 50.71 | epoch: 2 | total time: 121.30m | eta: 58.2m +step 11291/16704 (67.59%) | loss: 2.602240 | lrm: 0.65 | dt: 647.29ms | tok/sec: 809,975 | mfu: 50.62 | epoch: 2 | total time: 121.31m | eta: 58.2m +step 11292/16704 (67.60%) | loss: 2.598269 | lrm: 0.65 | dt: 644.32ms | tok/sec: 813,710 | mfu: 50.86 | epoch: 2 | total time: 121.32m | eta: 58.2m +step 11293/16704 (67.61%) | loss: 2.607472 | lrm: 0.65 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 2 | total time: 121.33m | eta: 58.2m +step 11294/16704 (67.61%) | loss: 2.606210 | lrm: 0.65 | dt: 645.98ms | tok/sec: 811,613 | mfu: 50.73 | epoch: 2 | total time: 121.34m | eta: 58.2m +step 11295/16704 (67.62%) | loss: 2.603073 | lrm: 0.65 | dt: 644.22ms | tok/sec: 813,835 | mfu: 50.87 | epoch: 2 | total time: 121.36m | eta: 58.2m +step 11296/16704 (67.62%) | loss: 2.605866 | lrm: 0.65 | dt: 645.36ms | tok/sec: 812,395 | mfu: 50.78 | epoch: 2 | total time: 121.37m | eta: 58.2m +step 11297/16704 (67.63%) | loss: 2.609037 | lrm: 0.65 | dt: 645.76ms | tok/sec: 811,888 | mfu: 50.74 | epoch: 2 | total time: 121.38m | eta: 58.1m +step 11298/16704 (67.64%) | loss: 2.601390 | lrm: 0.65 | dt: 645.00ms | tok/sec: 812,846 | mfu: 50.80 | epoch: 2 | total time: 121.39m | eta: 58.1m +step 11299/16704 (67.64%) | loss: 2.600896 | lrm: 0.65 | dt: 647.34ms | tok/sec: 809,912 | mfu: 50.62 | epoch: 2 | total time: 121.40m | eta: 58.1m +step 11300/16704 (67.65%) | loss: 2.594741 | lrm: 0.65 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 2 | total time: 121.41m | eta: 58.1m +step 11301/16704 (67.65%) | loss: 2.594927 | lrm: 0.65 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 2 | 
total time: 121.42m | eta: 58.1m +step 11302/16704 (67.66%) | loss: 2.612686 | lrm: 0.65 | dt: 645.04ms | tok/sec: 812,796 | mfu: 50.80 | epoch: 2 | total time: 121.43m | eta: 58.1m +step 11303/16704 (67.67%) | loss: 2.615988 | lrm: 0.65 | dt: 643.03ms | tok/sec: 815,340 | mfu: 50.96 | epoch: 2 | total time: 121.44m | eta: 58.1m +step 11304/16704 (67.67%) | loss: 2.613846 | lrm: 0.65 | dt: 648.57ms | tok/sec: 808,380 | mfu: 50.52 | epoch: 2 | total time: 121.45m | eta: 58.1m +step 11305/16704 (67.68%) | loss: 2.617519 | lrm: 0.65 | dt: 644.72ms | tok/sec: 813,196 | mfu: 50.83 | epoch: 2 | total time: 121.46m | eta: 58.1m +step 11306/16704 (67.68%) | loss: 2.608338 | lrm: 0.65 | dt: 644.76ms | tok/sec: 813,154 | mfu: 50.82 | epoch: 2 | total time: 121.47m | eta: 58.0m +step 11307/16704 (67.69%) | loss: 2.604175 | lrm: 0.65 | dt: 645.85ms | tok/sec: 811,784 | mfu: 50.74 | epoch: 2 | total time: 121.48m | eta: 58.0m +step 11308/16704 (67.70%) | loss: 2.599484 | lrm: 0.65 | dt: 645.34ms | tok/sec: 812,422 | mfu: 50.78 | epoch: 2 | total time: 121.50m | eta: 58.0m +step 11309/16704 (67.70%) | loss: 2.594680 | lrm: 0.65 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 2 | total time: 121.51m | eta: 58.0m +step 11310/16704 (67.71%) | loss: 2.605839 | lrm: 0.65 | dt: 646.95ms | tok/sec: 810,397 | mfu: 50.65 | epoch: 2 | total time: 121.52m | eta: 58.0m +step 11311/16704 (67.71%) | loss: 2.598613 | lrm: 0.65 | dt: 646.10ms | tok/sec: 811,462 | mfu: 50.72 | epoch: 2 | total time: 121.53m | eta: 58.0m +step 11312/16704 (67.72%) | loss: 2.601604 | lrm: 0.65 | dt: 646.75ms | tok/sec: 810,652 | mfu: 50.67 | epoch: 2 | total time: 121.54m | eta: 58.0m +step 11313/16704 (67.73%) | loss: 2.588212 | lrm: 0.65 | dt: 646.28ms | tok/sec: 811,245 | mfu: 50.70 | epoch: 2 | total time: 121.55m | eta: 58.0m +step 11314/16704 (67.73%) | loss: 2.594144 | lrm: 0.65 | dt: 646.11ms | tok/sec: 811,456 | mfu: 50.72 | epoch: 2 | total time: 121.56m | eta: 58.0m +step 11315/16704 (67.74%) | 
loss: 2.599676 | lrm: 0.65 | dt: 647.06ms | tok/sec: 810,256 | mfu: 50.64 | epoch: 2 | total time: 121.57m | eta: 58.0m +step 11316/16704 (67.74%) | loss: 2.603400 | lrm: 0.65 | dt: 645.82ms | tok/sec: 811,822 | mfu: 50.74 | epoch: 2 | total time: 121.58m | eta: 57.9m +step 11317/16704 (67.75%) | loss: 2.601481 | lrm: 0.64 | dt: 647.43ms | tok/sec: 809,804 | mfu: 50.61 | epoch: 2 | total time: 121.59m | eta: 57.9m +step 11318/16704 (67.76%) | loss: 2.598639 | lrm: 0.64 | dt: 644.46ms | tok/sec: 813,534 | mfu: 50.85 | epoch: 2 | total time: 121.60m | eta: 57.9m +step 11319/16704 (67.76%) | loss: 2.597113 | lrm: 0.64 | dt: 648.06ms | tok/sec: 809,016 | mfu: 50.56 | epoch: 2 | total time: 121.61m | eta: 57.9m +step 11320/16704 (67.77%) | loss: 2.588023 | lrm: 0.64 | dt: 645.06ms | tok/sec: 812,774 | mfu: 50.80 | epoch: 2 | total time: 121.62m | eta: 57.9m +step 11321/16704 (67.77%) | loss: 2.581310 | lrm: 0.64 | dt: 645.19ms | tok/sec: 812,610 | mfu: 50.79 | epoch: 2 | total time: 121.64m | eta: 57.9m +step 11322/16704 (67.78%) | loss: 2.599523 | lrm: 0.64 | dt: 646.19ms | tok/sec: 811,357 | mfu: 50.71 | epoch: 2 | total time: 121.65m | eta: 57.9m +step 11323/16704 (67.79%) | loss: 2.598518 | lrm: 0.64 | dt: 645.58ms | tok/sec: 812,114 | mfu: 50.76 | epoch: 2 | total time: 121.66m | eta: 57.9m +step 11324/16704 (67.79%) | loss: 2.602673 | lrm: 0.64 | dt: 647.67ms | tok/sec: 809,492 | mfu: 50.59 | epoch: 2 | total time: 121.67m | eta: 57.9m +step 11325/16704 (67.80%) | loss: 2.608897 | lrm: 0.64 | dt: 645.07ms | tok/sec: 812,764 | mfu: 50.80 | epoch: 2 | total time: 121.68m | eta: 57.8m +step 11326/16704 (67.80%) | loss: 2.608477 | lrm: 0.64 | dt: 644.66ms | tok/sec: 813,279 | mfu: 50.83 | epoch: 2 | total time: 121.69m | eta: 57.8m +step 11327/16704 (67.81%) | loss: 2.605031 | lrm: 0.64 | dt: 646.79ms | tok/sec: 810,599 | mfu: 50.66 | epoch: 2 | total time: 121.70m | eta: 57.8m +step 11328/16704 (67.82%) | loss: 2.604030 | lrm: 0.64 | dt: 645.27ms | tok/sec: 812,515 | 
mfu: 50.78 | epoch: 2 | total time: 121.71m | eta: 57.8m +step 11329/16704 (67.82%) | loss: 2.612019 | lrm: 0.64 | dt: 647.02ms | tok/sec: 810,309 | mfu: 50.65 | epoch: 2 | total time: 121.72m | eta: 57.8m +step 11330/16704 (67.83%) | loss: 2.616094 | lrm: 0.64 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 2 | total time: 121.73m | eta: 57.8m +step 11331/16704 (67.83%) | loss: 2.615224 | lrm: 0.64 | dt: 644.87ms | tok/sec: 813,016 | mfu: 50.81 | epoch: 2 | total time: 121.74m | eta: 57.8m +step 11332/16704 (67.84%) | loss: 2.610410 | lrm: 0.64 | dt: 644.50ms | tok/sec: 813,474 | mfu: 50.84 | epoch: 2 | total time: 121.75m | eta: 57.8m +step 11333/16704 (67.85%) | loss: 2.613402 | lrm: 0.64 | dt: 646.19ms | tok/sec: 811,357 | mfu: 50.71 | epoch: 2 | total time: 121.76m | eta: 57.8m +step 11334/16704 (67.85%) | loss: 2.594479 | lrm: 0.64 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 2 | total time: 121.78m | eta: 57.7m +step 11335/16704 (67.86%) | loss: 2.583425 | lrm: 0.64 | dt: 646.74ms | tok/sec: 810,667 | mfu: 50.67 | epoch: 2 | total time: 121.79m | eta: 57.7m +step 11336/16704 (67.86%) | loss: 2.565541 | lrm: 0.64 | dt: 646.24ms | tok/sec: 811,285 | mfu: 50.71 | epoch: 2 | total time: 121.80m | eta: 57.7m +step 11337/16704 (67.87%) | loss: 2.574665 | lrm: 0.64 | dt: 646.42ms | tok/sec: 811,064 | mfu: 50.69 | epoch: 2 | total time: 121.81m | eta: 57.7m +step 11338/16704 (67.88%) | loss: 2.593989 | lrm: 0.64 | dt: 645.20ms | tok/sec: 812,603 | mfu: 50.79 | epoch: 2 | total time: 121.82m | eta: 57.7m +step 11339/16704 (67.88%) | loss: 2.582907 | lrm: 0.64 | dt: 646.74ms | tok/sec: 810,661 | mfu: 50.67 | epoch: 2 | total time: 121.83m | eta: 57.7m +step 11340/16704 (67.89%) | loss: 2.582384 | lrm: 0.64 | dt: 645.77ms | tok/sec: 811,880 | mfu: 50.74 | epoch: 2 | total time: 121.84m | eta: 57.7m +step 11341/16704 (67.89%) | loss: 2.588872 | lrm: 0.64 | dt: 647.92ms | tok/sec: 809,190 | mfu: 50.58 | epoch: 2 | total time: 121.85m | eta: 57.7m +step 
11342/16704 (67.90%) | loss: 2.578205 | lrm: 0.64 | dt: 644.22ms | tok/sec: 813,831 | mfu: 50.87 | epoch: 2 | total time: 121.86m | eta: 57.7m +step 11343/16704 (67.91%) | loss: 2.573769 | lrm: 0.64 | dt: 647.89ms | tok/sec: 809,220 | mfu: 50.58 | epoch: 2 | total time: 121.87m | eta: 57.7m +step 11344/16704 (67.91%) | loss: 2.583179 | lrm: 0.64 | dt: 646.08ms | tok/sec: 811,490 | mfu: 50.72 | epoch: 2 | total time: 121.88m | eta: 57.6m +step 11345/16704 (67.92%) | loss: 2.585589 | lrm: 0.64 | dt: 645.40ms | tok/sec: 812,350 | mfu: 50.77 | epoch: 2 | total time: 121.89m | eta: 57.6m +step 11346/16704 (67.92%) | loss: 2.584855 | lrm: 0.64 | dt: 645.38ms | tok/sec: 812,371 | mfu: 50.77 | epoch: 2 | total time: 121.90m | eta: 57.6m +step 11347/16704 (67.93%) | loss: 2.596630 | lrm: 0.64 | dt: 645.58ms | tok/sec: 812,123 | mfu: 50.76 | epoch: 2 | total time: 121.92m | eta: 57.6m +step 11348/16704 (67.94%) | loss: 2.597144 | lrm: 0.64 | dt: 645.81ms | tok/sec: 811,833 | mfu: 50.74 | epoch: 2 | total time: 121.93m | eta: 57.6m +step 11349/16704 (67.94%) | loss: 2.602815 | lrm: 0.64 | dt: 646.26ms | tok/sec: 811,269 | mfu: 50.71 | epoch: 2 | total time: 121.94m | eta: 57.6m +step 11350/16704 (67.95%) | loss: 2.603133 | lrm: 0.64 | dt: 644.73ms | tok/sec: 813,192 | mfu: 50.83 | epoch: 2 | total time: 121.95m | eta: 57.6m +step 11351/16704 (67.95%) | loss: 2.607793 | lrm: 0.64 | dt: 643.75ms | tok/sec: 814,431 | mfu: 50.90 | epoch: 2 | total time: 121.96m | eta: 57.6m +step 11352/16704 (67.96%) | loss: 2.600623 | lrm: 0.64 | dt: 648.00ms | tok/sec: 809,081 | mfu: 50.57 | epoch: 2 | total time: 121.97m | eta: 57.6m +step 11353/16704 (67.97%) | loss: 2.599629 | lrm: 0.64 | dt: 642.90ms | tok/sec: 815,504 | mfu: 50.97 | epoch: 2 | total time: 121.98m | eta: 57.5m +step 11354/16704 (67.97%) | loss: 2.597571 | lrm: 0.64 | dt: 647.21ms | tok/sec: 810,073 | mfu: 50.63 | epoch: 2 | total time: 121.99m | eta: 57.5m +step 11355/16704 (67.98%) | loss: 2.611576 | lrm: 0.64 | dt: 
648.40ms | tok/sec: 808,581 | mfu: 50.54 | epoch: 2 | total time: 122.00m | eta: 57.5m +step 11356/16704 (67.98%) | loss: 2.610432 | lrm: 0.64 | dt: 644.85ms | tok/sec: 813,033 | mfu: 50.82 | epoch: 2 | total time: 122.01m | eta: 57.5m +step 11357/16704 (67.99%) | loss: 2.613467 | lrm: 0.64 | dt: 644.77ms | tok/sec: 813,134 | mfu: 50.82 | epoch: 2 | total time: 122.02m | eta: 57.5m +step 11358/16704 (68.00%) | loss: 2.625671 | lrm: 0.64 | dt: 646.33ms | tok/sec: 811,177 | mfu: 50.70 | epoch: 2 | total time: 122.03m | eta: 57.5m +step 11359/16704 (68.00%) | loss: 2.622371 | lrm: 0.64 | dt: 644.96ms | tok/sec: 812,906 | mfu: 50.81 | epoch: 2 | total time: 122.04m | eta: 57.5m +step 11360/16704 (68.01%) | loss: 2.616931 | lrm: 0.64 | dt: 647.36ms | tok/sec: 809,886 | mfu: 50.62 | epoch: 2 | total time: 122.05m | eta: 57.5m +step 11361/16704 (68.01%) | loss: 2.589248 | lrm: 0.64 | dt: 645.00ms | tok/sec: 812,853 | mfu: 50.80 | epoch: 2 | total time: 122.07m | eta: 57.5m +step 11362/16704 (68.02%) | loss: 2.593616 | lrm: 0.64 | dt: 643.57ms | tok/sec: 814,658 | mfu: 50.92 | epoch: 2 | total time: 122.08m | eta: 57.4m +step 11363/16704 (68.03%) | loss: 2.580590 | lrm: 0.64 | dt: 646.28ms | tok/sec: 811,235 | mfu: 50.70 | epoch: 2 | total time: 122.09m | eta: 57.4m +step 11364/16704 (68.03%) | loss: 2.585939 | lrm: 0.64 | dt: 646.89ms | tok/sec: 810,476 | mfu: 50.66 | epoch: 2 | total time: 122.10m | eta: 57.4m +step 11365/16704 (68.04%) | loss: 2.575001 | lrm: 0.64 | dt: 647.46ms | tok/sec: 809,759 | mfu: 50.61 | epoch: 2 | total time: 122.11m | eta: 57.4m +step 11366/16704 (68.04%) | loss: 2.571514 | lrm: 0.64 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 2 | total time: 122.12m | eta: 57.4m +step 11367/16704 (68.05%) | loss: 2.591338 | lrm: 0.64 | dt: 645.50ms | tok/sec: 812,221 | mfu: 50.76 | epoch: 2 | total time: 122.13m | eta: 57.4m +step 11368/16704 (68.06%) | loss: 2.591768 | lrm: 0.64 | dt: 648.50ms | tok/sec: 808,459 | mfu: 50.53 | epoch: 2 | total 
time: 122.14m | eta: 57.4m +step 11369/16704 (68.06%) | loss: 2.587220 | lrm: 0.64 | dt: 644.54ms | tok/sec: 813,430 | mfu: 50.84 | epoch: 2 | total time: 122.15m | eta: 57.4m +step 11370/16704 (68.07%) | loss: 2.590290 | lrm: 0.64 | dt: 644.93ms | tok/sec: 812,934 | mfu: 50.81 | epoch: 2 | total time: 122.16m | eta: 57.4m +step 11371/16704 (68.07%) | loss: 2.574495 | lrm: 0.64 | dt: 645.02ms | tok/sec: 812,826 | mfu: 50.80 | epoch: 2 | total time: 122.17m | eta: 57.3m +step 11372/16704 (68.08%) | loss: 2.569295 | lrm: 0.64 | dt: 643.80ms | tok/sec: 814,362 | mfu: 50.90 | epoch: 2 | total time: 122.18m | eta: 57.3m +step 11373/16704 (68.09%) | loss: 2.575958 | lrm: 0.64 | dt: 645.24ms | tok/sec: 812,551 | mfu: 50.79 | epoch: 2 | total time: 122.19m | eta: 57.3m +step 11374/16704 (68.09%) | loss: 2.577896 | lrm: 0.64 | dt: 646.67ms | tok/sec: 810,751 | mfu: 50.67 | epoch: 2 | total time: 122.21m | eta: 57.3m +step 11375/16704 (68.10%) | loss: 2.582049 | lrm: 0.64 | dt: 645.03ms | tok/sec: 812,816 | mfu: 50.80 | epoch: 2 | total time: 122.22m | eta: 57.3m +step 11376/16704 (68.10%) | loss: 2.582868 | lrm: 0.64 | dt: 647.67ms | tok/sec: 809,500 | mfu: 50.59 | epoch: 2 | total time: 122.23m | eta: 57.3m +step 11377/16704 (68.11%) | loss: 2.570281 | lrm: 0.64 | dt: 643.39ms | tok/sec: 814,889 | mfu: 50.93 | epoch: 2 | total time: 122.24m | eta: 57.3m +step 11378/16704 (68.12%) | loss: 2.577294 | lrm: 0.64 | dt: 644.54ms | tok/sec: 813,426 | mfu: 50.84 | epoch: 2 | total time: 122.25m | eta: 57.3m +step 11379/16704 (68.12%) | loss: 2.577655 | lrm: 0.64 | dt: 644.04ms | tok/sec: 814,059 | mfu: 50.88 | epoch: 2 | total time: 122.26m | eta: 57.3m +step 11380/16704 (68.13%) | loss: 2.596805 | lrm: 0.64 | dt: 648.51ms | tok/sec: 808,455 | mfu: 50.53 | epoch: 2 | total time: 122.27m | eta: 57.3m +step 11381/16704 (68.13%) | loss: 2.597621 | lrm: 0.64 | dt: 646.23ms | tok/sec: 811,303 | mfu: 50.71 | epoch: 2 | total time: 122.28m | eta: 57.2m +step 11382/16704 (68.14%) | loss: 
2.584123 | lrm: 0.64 | dt: 645.24ms | tok/sec: 812,543 | mfu: 50.79 | epoch: 2 | total time: 122.29m | eta: 57.2m +step 11383/16704 (68.15%) | loss: 2.580185 | lrm: 0.64 | dt: 646.96ms | tok/sec: 810,390 | mfu: 50.65 | epoch: 2 | total time: 122.30m | eta: 57.2m +step 11384/16704 (68.15%) | loss: 2.577513 | lrm: 0.64 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 2 | total time: 122.31m | eta: 57.2m +step 11385/16704 (68.16%) | loss: 2.573440 | lrm: 0.64 | dt: 648.73ms | tok/sec: 808,178 | mfu: 50.51 | epoch: 2 | total time: 122.32m | eta: 57.2m +step 11386/16704 (68.16%) | loss: 2.573706 | lrm: 0.64 | dt: 643.37ms | tok/sec: 814,906 | mfu: 50.93 | epoch: 2 | total time: 122.33m | eta: 57.2m +step 11387/16704 (68.17%) | loss: 2.580140 | lrm: 0.64 | dt: 643.99ms | tok/sec: 814,119 | mfu: 50.88 | epoch: 2 | total time: 122.35m | eta: 57.2m +step 11388/16704 (68.18%) | loss: 2.582510 | lrm: 0.64 | dt: 644.39ms | tok/sec: 813,619 | mfu: 50.85 | epoch: 2 | total time: 122.36m | eta: 57.2m +step 11389/16704 (68.18%) | loss: 2.579145 | lrm: 0.64 | dt: 644.06ms | tok/sec: 814,039 | mfu: 50.88 | epoch: 2 | total time: 122.37m | eta: 57.2m +step 11390/16704 (68.19%) | loss: 2.576073 | lrm: 0.64 | dt: 645.45ms | tok/sec: 812,277 | mfu: 50.77 | epoch: 2 | total time: 122.38m | eta: 57.1m +step 11391/16704 (68.19%) | loss: 2.578025 | lrm: 0.64 | dt: 645.18ms | tok/sec: 812,618 | mfu: 50.79 | epoch: 2 | total time: 122.39m | eta: 57.1m +step 11392/16704 (68.20%) | loss: 2.582317 | lrm: 0.64 | dt: 645.95ms | tok/sec: 811,651 | mfu: 50.73 | epoch: 2 | total time: 122.40m | eta: 57.1m +step 11393/16704 (68.21%) | loss: 2.570672 | lrm: 0.64 | dt: 645.01ms | tok/sec: 812,838 | mfu: 50.80 | epoch: 2 | total time: 122.41m | eta: 57.1m +step 11394/16704 (68.21%) | loss: 2.570961 | lrm: 0.64 | dt: 645.69ms | tok/sec: 811,978 | mfu: 50.75 | epoch: 2 | total time: 122.42m | eta: 57.1m +step 11395/16704 (68.22%) | loss: 2.561269 | lrm: 0.64 | dt: 644.52ms | tok/sec: 813,452 | mfu: 
50.84 | epoch: 2 | total time: 122.43m | eta: 57.1m +step 11396/16704 (68.22%) | loss: 2.575287 | lrm: 0.64 | dt: 644.36ms | tok/sec: 813,651 | mfu: 50.85 | epoch: 2 | total time: 122.44m | eta: 57.1m +step 11397/16704 (68.23%) | loss: 2.587859 | lrm: 0.64 | dt: 645.43ms | tok/sec: 812,307 | mfu: 50.77 | epoch: 2 | total time: 122.45m | eta: 57.1m +step 11398/16704 (68.24%) | loss: 2.583414 | lrm: 0.64 | dt: 645.84ms | tok/sec: 811,795 | mfu: 50.74 | epoch: 2 | total time: 122.46m | eta: 57.1m +step 11399/16704 (68.24%) | loss: 2.601024 | lrm: 0.64 | dt: 644.98ms | tok/sec: 812,869 | mfu: 50.81 | epoch: 2 | total time: 122.47m | eta: 57.0m +step 11400/16704 (68.25%) | loss: 2.591573 | lrm: 0.64 | dt: 644.51ms | tok/sec: 813,471 | mfu: 50.84 | epoch: 2 | total time: 122.49m | eta: 57.0m +step 11401/16704 (68.25%) | loss: 2.597793 | lrm: 0.63 | dt: 644.43ms | tok/sec: 813,571 | mfu: 50.85 | epoch: 2 | total time: 122.50m | eta: 57.0m +step 11402/16704 (68.26%) | loss: 2.610461 | lrm: 0.63 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 2 | total time: 122.51m | eta: 57.0m +step 11403/16704 (68.27%) | loss: 2.600225 | lrm: 0.63 | dt: 644.77ms | tok/sec: 813,143 | mfu: 50.82 | epoch: 2 | total time: 122.52m | eta: 57.0m +step 11404/16704 (68.27%) | loss: 2.589294 | lrm: 0.63 | dt: 644.91ms | tok/sec: 812,966 | mfu: 50.81 | epoch: 2 | total time: 122.53m | eta: 57.0m +step 11405/16704 (68.28%) | loss: 2.601523 | lrm: 0.63 | dt: 643.52ms | tok/sec: 814,715 | mfu: 50.92 | epoch: 2 | total time: 122.54m | eta: 57.0m +step 11406/16704 (68.28%) | loss: 2.604149 | lrm: 0.63 | dt: 643.49ms | tok/sec: 814,761 | mfu: 50.92 | epoch: 2 | total time: 122.55m | eta: 57.0m +step 11407/16704 (68.29%) | loss: 2.607747 | lrm: 0.63 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 2 | total time: 122.56m | eta: 57.0m +step 11408/16704 (68.30%) | loss: 2.600880 | lrm: 0.63 | dt: 642.30ms | tok/sec: 816,270 | mfu: 51.02 | epoch: 2 | total time: 122.57m | eta: 57.0m +step 
11409/16704 (68.30%) | loss: 2.586649 | lrm: 0.63 | dt: 644.03ms | tok/sec: 814,077 | mfu: 50.88 | epoch: 2 | total time: 122.58m | eta: 56.9m +step 11410/16704 (68.31%) | loss: 2.589669 | lrm: 0.63 | dt: 644.21ms | tok/sec: 813,844 | mfu: 50.87 | epoch: 2 | total time: 122.59m | eta: 56.9m +step 11411/16704 (68.31%) | loss: 2.604750 | lrm: 0.63 | dt: 643.29ms | tok/sec: 815,008 | mfu: 50.94 | epoch: 2 | total time: 122.60m | eta: 56.9m +step 11412/16704 (68.32%) | loss: 2.604989 | lrm: 0.63 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 2 | total time: 122.61m | eta: 56.9m +step 11413/16704 (68.32%) | loss: 2.595799 | lrm: 0.63 | dt: 644.01ms | tok/sec: 814,094 | mfu: 50.88 | epoch: 2 | total time: 122.62m | eta: 56.9m +step 11414/16704 (68.33%) | loss: 2.594793 | lrm: 0.63 | dt: 645.72ms | tok/sec: 811,948 | mfu: 50.75 | epoch: 2 | total time: 122.64m | eta: 56.9m +step 11415/16704 (68.34%) | loss: 2.603963 | lrm: 0.63 | dt: 648.92ms | tok/sec: 807,936 | mfu: 50.50 | epoch: 2 | total time: 122.65m | eta: 56.9m +step 11416/16704 (68.34%) | loss: 2.607491 | lrm: 0.63 | dt: 642.83ms | tok/sec: 815,593 | mfu: 50.98 | epoch: 2 | total time: 122.66m | eta: 56.9m +step 11417/16704 (68.35%) | loss: 2.591291 | lrm: 0.63 | dt: 642.54ms | tok/sec: 815,963 | mfu: 51.00 | epoch: 2 | total time: 122.67m | eta: 56.9m +step 11418/16704 (68.35%) | loss: 2.593085 | lrm: 0.63 | dt: 645.75ms | tok/sec: 811,902 | mfu: 50.75 | epoch: 2 | total time: 122.68m | eta: 56.8m +step 11419/16704 (68.36%) | loss: 2.603085 | lrm: 0.63 | dt: 643.48ms | tok/sec: 814,775 | mfu: 50.92 | epoch: 2 | total time: 122.69m | eta: 56.8m +step 11420/16704 (68.37%) | loss: 2.601763 | lrm: 0.63 | dt: 644.96ms | tok/sec: 812,898 | mfu: 50.81 | epoch: 2 | total time: 122.70m | eta: 56.8m +step 11421/16704 (68.37%) | loss: 2.597337 | lrm: 0.63 | dt: 643.37ms | tok/sec: 814,910 | mfu: 50.93 | epoch: 2 | total time: 122.71m | eta: 56.8m +step 11422/16704 (68.38%) | loss: 2.596910 | lrm: 0.63 | dt: 
642.41ms | tok/sec: 816,123 | mfu: 51.01 | epoch: 2 | total time: 122.72m | eta: 56.8m +step 11423/16704 (68.38%) | loss: 2.588849 | lrm: 0.63 | dt: 643.21ms | tok/sec: 815,116 | mfu: 50.95 | epoch: 2 | total time: 122.73m | eta: 56.8m +step 11424/16704 (68.39%) | loss: 2.591045 | lrm: 0.63 | dt: 647.61ms | tok/sec: 809,568 | mfu: 50.60 | epoch: 2 | total time: 122.74m | eta: 56.8m +step 11425/16704 (68.40%) | loss: 2.585283 | lrm: 0.63 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 2 | total time: 122.75m | eta: 56.8m +step 11426/16704 (68.40%) | loss: 2.587225 | lrm: 0.63 | dt: 642.10ms | tok/sec: 816,527 | mfu: 51.03 | epoch: 2 | total time: 122.76m | eta: 56.8m +step 11427/16704 (68.41%) | loss: 2.581108 | lrm: 0.63 | dt: 643.46ms | tok/sec: 814,798 | mfu: 50.93 | epoch: 2 | total time: 122.78m | eta: 56.7m +step 11428/16704 (68.41%) | loss: 2.567605 | lrm: 0.63 | dt: 643.86ms | tok/sec: 814,285 | mfu: 50.89 | epoch: 2 | total time: 122.79m | eta: 56.7m +step 11429/16704 (68.42%) | loss: 2.579825 | lrm: 0.63 | dt: 643.83ms | tok/sec: 814,329 | mfu: 50.90 | epoch: 2 | total time: 122.80m | eta: 56.7m +step 11430/16704 (68.43%) | loss: 2.586584 | lrm: 0.63 | dt: 645.79ms | tok/sec: 811,852 | mfu: 50.74 | epoch: 2 | total time: 122.81m | eta: 56.7m +step 11431/16704 (68.43%) | loss: 2.596888 | lrm: 0.63 | dt: 641.97ms | tok/sec: 816,684 | mfu: 51.04 | epoch: 2 | total time: 122.82m | eta: 56.7m +step 11432/16704 (68.44%) | loss: 2.587052 | lrm: 0.63 | dt: 643.41ms | tok/sec: 814,855 | mfu: 50.93 | epoch: 2 | total time: 122.83m | eta: 56.7m +step 11433/16704 (68.44%) | loss: 2.588707 | lrm: 0.63 | dt: 644.52ms | tok/sec: 813,454 | mfu: 50.84 | epoch: 2 | total time: 122.84m | eta: 56.7m +step 11434/16704 (68.45%) | loss: 2.586112 | lrm: 0.63 | dt: 643.92ms | tok/sec: 814,212 | mfu: 50.89 | epoch: 2 | total time: 122.85m | eta: 56.7m +step 11435/16704 (68.46%) | loss: 2.590873 | lrm: 0.63 | dt: 643.16ms | tok/sec: 815,174 | mfu: 50.95 | epoch: 2 | total 
time: 122.86m | eta: 56.7m +step 11436/16704 (68.46%) | loss: 2.585000 | lrm: 0.63 | dt: 643.74ms | tok/sec: 814,441 | mfu: 50.90 | epoch: 2 | total time: 122.87m | eta: 56.7m +step 11437/16704 (68.47%) | loss: 2.583200 | lrm: 0.63 | dt: 642.87ms | tok/sec: 815,536 | mfu: 50.97 | epoch: 2 | total time: 122.88m | eta: 56.6m +step 11438/16704 (68.47%) | loss: 2.582846 | lrm: 0.63 | dt: 644.77ms | tok/sec: 813,140 | mfu: 50.82 | epoch: 2 | total time: 122.89m | eta: 56.6m +step 11439/16704 (68.48%) | loss: 2.585593 | lrm: 0.63 | dt: 645.42ms | tok/sec: 812,314 | mfu: 50.77 | epoch: 2 | total time: 122.90m | eta: 56.6m +step 11440/16704 (68.49%) | loss: 2.573696 | lrm: 0.63 | dt: 642.71ms | tok/sec: 815,745 | mfu: 50.99 | epoch: 2 | total time: 122.91m | eta: 56.6m +step 11441/16704 (68.49%) | loss: 2.579377 | lrm: 0.63 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 2 | total time: 122.93m | eta: 56.6m +step 11442/16704 (68.50%) | loss: 2.576351 | lrm: 0.63 | dt: 642.77ms | tok/sec: 815,670 | mfu: 50.98 | epoch: 2 | total time: 122.94m | eta: 56.6m +step 11443/16704 (68.50%) | loss: 2.566831 | lrm: 0.63 | dt: 642.54ms | tok/sec: 815,957 | mfu: 51.00 | epoch: 2 | total time: 122.95m | eta: 56.6m +step 11444/16704 (68.51%) | loss: 2.557342 | lrm: 0.63 | dt: 643.37ms | tok/sec: 814,902 | mfu: 50.93 | epoch: 2 | total time: 122.96m | eta: 56.6m +step 11445/16704 (68.52%) | loss: 2.579959 | lrm: 0.63 | dt: 645.28ms | tok/sec: 812,494 | mfu: 50.78 | epoch: 2 | total time: 122.97m | eta: 56.6m +step 11446/16704 (68.52%) | loss: 2.570664 | lrm: 0.63 | dt: 643.92ms | tok/sec: 814,214 | mfu: 50.89 | epoch: 2 | total time: 122.98m | eta: 56.5m +step 11447/16704 (68.53%) | loss: 2.567626 | lrm: 0.63 | dt: 643.49ms | tok/sec: 814,754 | mfu: 50.92 | epoch: 2 | total time: 122.99m | eta: 56.5m +step 11448/16704 (68.53%) | loss: 2.577724 | lrm: 0.63 | dt: 641.64ms | tok/sec: 817,109 | mfu: 51.07 | epoch: 2 | total time: 123.00m | eta: 56.5m +step 11449/16704 (68.54%) | loss: 
2.572202 | lrm: 0.63 | dt: 643.43ms | tok/sec: 814,835 | mfu: 50.93 | epoch: 2 | total time: 123.01m | eta: 56.5m +step 11450/16704 (68.55%) | loss: 2.568976 | lrm: 0.63 | dt: 645.55ms | tok/sec: 812,157 | mfu: 50.76 | epoch: 2 | total time: 123.02m | eta: 56.5m +step 11451/16704 (68.55%) | loss: 2.587151 | lrm: 0.63 | dt: 644.14ms | tok/sec: 813,939 | mfu: 50.87 | epoch: 2 | total time: 123.03m | eta: 56.5m +step 11452/16704 (68.56%) | loss: 2.585746 | lrm: 0.63 | dt: 644.67ms | tok/sec: 813,261 | mfu: 50.83 | epoch: 2 | total time: 123.04m | eta: 56.5m +step 11453/16704 (68.56%) | loss: 2.593207 | lrm: 0.63 | dt: 642.17ms | tok/sec: 816,427 | mfu: 51.03 | epoch: 2 | total time: 123.05m | eta: 56.5m +step 11454/16704 (68.57%) | loss: 2.606632 | lrm: 0.63 | dt: 642.63ms | tok/sec: 815,849 | mfu: 50.99 | epoch: 2 | total time: 123.06m | eta: 56.5m +step 11455/16704 (68.58%) | loss: 2.596014 | lrm: 0.63 | dt: 643.71ms | tok/sec: 814,483 | mfu: 50.91 | epoch: 2 | total time: 123.08m | eta: 56.4m +step 11456/16704 (68.58%) | loss: 2.590505 | lrm: 0.63 | dt: 644.77ms | tok/sec: 813,136 | mfu: 50.82 | epoch: 2 | total time: 123.09m | eta: 56.4m +step 11457/16704 (68.59%) | loss: 2.603881 | lrm: 0.63 | dt: 644.85ms | tok/sec: 813,034 | mfu: 50.82 | epoch: 2 | total time: 123.10m | eta: 56.4m +step 11458/16704 (68.59%) | loss: 2.617156 | lrm: 0.63 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 2 | total time: 123.11m | eta: 56.4m +step 11459/16704 (68.60%) | loss: 2.619437 | lrm: 0.63 | dt: 644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 123.12m | eta: 56.4m +step 11460/16704 (68.61%) | loss: 2.613538 | lrm: 0.63 | dt: 645.27ms | tok/sec: 812,513 | mfu: 50.78 | epoch: 2 | total time: 123.13m | eta: 56.4m +step 11461/16704 (68.61%) | loss: 2.609870 | lrm: 0.63 | dt: 642.72ms | tok/sec: 815,738 | mfu: 50.98 | epoch: 2 | total time: 123.14m | eta: 56.4m +step 11462/16704 (68.62%) | loss: 2.607526 | lrm: 0.63 | dt: 644.78ms | tok/sec: 813,132 | mfu: 
50.82 | epoch: 2 | total time: 123.15m | eta: 56.4m +step 11463/16704 (68.62%) | loss: 2.609743 | lrm: 0.63 | dt: 644.34ms | tok/sec: 813,682 | mfu: 50.86 | epoch: 2 | total time: 123.16m | eta: 56.4m +step 11464/16704 (68.63%) | loss: 2.601220 | lrm: 0.63 | dt: 643.02ms | tok/sec: 815,357 | mfu: 50.96 | epoch: 2 | total time: 123.17m | eta: 56.3m +step 11465/16704 (68.64%) | loss: 2.608699 | lrm: 0.63 | dt: 644.44ms | tok/sec: 813,560 | mfu: 50.85 | epoch: 2 | total time: 123.18m | eta: 56.3m +step 11466/16704 (68.64%) | loss: 2.589912 | lrm: 0.63 | dt: 646.19ms | tok/sec: 811,348 | mfu: 50.71 | epoch: 2 | total time: 123.19m | eta: 56.3m +step 11467/16704 (68.65%) | loss: 2.596508 | lrm: 0.63 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 2 | total time: 123.20m | eta: 56.3m +step 11468/16704 (68.65%) | loss: 2.579965 | lrm: 0.63 | dt: 642.21ms | tok/sec: 816,382 | mfu: 51.03 | epoch: 2 | total time: 123.22m | eta: 56.3m +step 11469/16704 (68.66%) | loss: 2.584143 | lrm: 0.63 | dt: 645.81ms | tok/sec: 811,828 | mfu: 50.74 | epoch: 2 | total time: 123.23m | eta: 56.3m +step 11470/16704 (68.67%) | loss: 2.588163 | lrm: 0.63 | dt: 646.58ms | tok/sec: 810,863 | mfu: 50.68 | epoch: 2 | total time: 123.24m | eta: 56.3m +step 11471/16704 (68.67%) | loss: 2.588382 | lrm: 0.63 | dt: 643.67ms | tok/sec: 814,524 | mfu: 50.91 | epoch: 2 | total time: 123.25m | eta: 56.3m +step 11472/16704 (68.68%) | loss: 2.586572 | lrm: 0.63 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 2 | total time: 123.26m | eta: 56.3m +step 11473/16704 (68.68%) | loss: 2.591152 | lrm: 0.63 | dt: 644.33ms | tok/sec: 813,696 | mfu: 50.86 | epoch: 2 | total time: 123.27m | eta: 56.3m +step 11474/16704 (68.69%) | loss: 2.592193 | lrm: 0.63 | dt: 644.67ms | tok/sec: 813,259 | mfu: 50.83 | epoch: 2 | total time: 123.28m | eta: 56.2m +step 11475/16704 (68.70%) | loss: 2.593772 | lrm: 0.63 | dt: 643.97ms | tok/sec: 814,149 | mfu: 50.89 | epoch: 2 | total time: 123.29m | eta: 56.2m +step 
11476/16704 (68.70%) | loss: 2.594557 | lrm: 0.63 | dt: 642.32ms | tok/sec: 816,240 | mfu: 51.02 | epoch: 2 | total time: 123.30m | eta: 56.2m +step 11477/16704 (68.71%) | loss: 2.587363 | lrm: 0.63 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 2 | total time: 123.31m | eta: 56.2m +step 11478/16704 (68.71%) | loss: 2.584633 | lrm: 0.63 | dt: 644.45ms | tok/sec: 813,548 | mfu: 50.85 | epoch: 2 | total time: 123.32m | eta: 56.2m +step 11479/16704 (68.72%) | loss: 2.604082 | lrm: 0.63 | dt: 643.06ms | tok/sec: 815,299 | mfu: 50.96 | epoch: 2 | total time: 123.33m | eta: 56.2m +step 11480/16704 (68.73%) | loss: 2.604411 | lrm: 0.63 | dt: 648.48ms | tok/sec: 808,488 | mfu: 50.53 | epoch: 2 | total time: 123.34m | eta: 56.2m +step 11481/16704 (68.73%) | loss: 2.596351 | lrm: 0.63 | dt: 643.89ms | tok/sec: 814,251 | mfu: 50.89 | epoch: 2 | total time: 123.35m | eta: 56.2m +step 11482/16704 (68.74%) | loss: 2.580941 | lrm: 0.63 | dt: 646.91ms | tok/sec: 810,446 | mfu: 50.65 | epoch: 2 | total time: 123.37m | eta: 56.2m +step 11483/16704 (68.74%) | loss: 2.579940 | lrm: 0.63 | dt: 642.95ms | tok/sec: 815,440 | mfu: 50.97 | epoch: 2 | total time: 123.38m | eta: 56.1m +step 11484/16704 (68.75%) | loss: 2.576535 | lrm: 0.62 | dt: 645.95ms | tok/sec: 811,652 | mfu: 50.73 | epoch: 2 | total time: 123.39m | eta: 56.1m +step 11485/16704 (68.76%) | loss: 2.586758 | lrm: 0.62 | dt: 645.22ms | tok/sec: 812,578 | mfu: 50.79 | epoch: 2 | total time: 123.40m | eta: 56.1m +step 11486/16704 (68.76%) | loss: 2.587935 | lrm: 0.62 | dt: 644.71ms | tok/sec: 813,209 | mfu: 50.83 | epoch: 2 | total time: 123.41m | eta: 56.1m +step 11487/16704 (68.77%) | loss: 2.592593 | lrm: 0.62 | dt: 645.78ms | tok/sec: 811,864 | mfu: 50.74 | epoch: 2 | total time: 123.42m | eta: 56.1m +step 11488/16704 (68.77%) | loss: 2.605488 | lrm: 0.62 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 2 | total time: 123.43m | eta: 56.1m +step 11489/16704 (68.78%) | loss: 2.602045 | lrm: 0.62 | dt: 
646.94ms | tok/sec: 810,415 | mfu: 50.65 | epoch: 2 | total time: 123.44m | eta: 56.1m +step 11490/16704 (68.79%) | loss: 2.603215 | lrm: 0.62 | dt: 644.34ms | tok/sec: 813,685 | mfu: 50.86 | epoch: 2 | total time: 123.45m | eta: 56.1m +step 11491/16704 (68.79%) | loss: 2.605043 | lrm: 0.62 | dt: 645.90ms | tok/sec: 811,720 | mfu: 50.73 | epoch: 2 | total time: 123.46m | eta: 56.1m +step 11492/16704 (68.80%) | loss: 2.621629 | lrm: 0.62 | dt: 643.20ms | tok/sec: 815,121 | mfu: 50.95 | epoch: 2 | total time: 123.47m | eta: 56.0m +step 11493/16704 (68.80%) | loss: 2.608717 | lrm: 0.62 | dt: 643.44ms | tok/sec: 814,814 | mfu: 50.93 | epoch: 2 | total time: 123.48m | eta: 56.0m +step 11494/16704 (68.81%) | loss: 2.606307 | lrm: 0.62 | dt: 645.22ms | tok/sec: 812,570 | mfu: 50.79 | epoch: 2 | total time: 123.49m | eta: 56.0m +step 11495/16704 (68.82%) | loss: 2.607361 | lrm: 0.62 | dt: 645.60ms | tok/sec: 812,100 | mfu: 50.76 | epoch: 2 | total time: 123.51m | eta: 56.0m +step 11496/16704 (68.82%) | loss: 2.608972 | lrm: 0.62 | dt: 642.70ms | tok/sec: 815,760 | mfu: 50.99 | epoch: 2 | total time: 123.52m | eta: 56.0m +step 11497/16704 (68.83%) | loss: 2.605225 | lrm: 0.62 | dt: 643.17ms | tok/sec: 815,166 | mfu: 50.95 | epoch: 2 | total time: 123.53m | eta: 56.0m +step 11498/16704 (68.83%) | loss: 2.606354 | lrm: 0.62 | dt: 641.40ms | tok/sec: 817,408 | mfu: 51.09 | epoch: 2 | total time: 123.54m | eta: 56.0m +step 11499/16704 (68.84%) | loss: 2.599900 | lrm: 0.62 | dt: 641.88ms | tok/sec: 816,805 | mfu: 51.05 | epoch: 2 | total time: 123.55m | eta: 56.0m +Step 11500 | Validation bpb: 0.791251 +step 11500/16704 (68.85%) | loss: 2.598299 | lrm: 0.62 | dt: 643.67ms | tok/sec: 814,525 | mfu: 50.91 | epoch: 2 | total time: 123.56m | eta: 56.0m +step 11501/16704 (68.85%) | loss: 2.607434 | lrm: 0.62 | dt: 649.35ms | tok/sec: 807,407 | mfu: 50.46 | epoch: 2 | total time: 123.57m | eta: 56.0m +step 11502/16704 (68.86%) | loss: 2.621567 | lrm: 0.62 | dt: 641.31ms | tok/sec: 
817,530 | mfu: 51.10 | epoch: 2 | total time: 123.58m | eta: 55.9m +step 11503/16704 (68.86%) | loss: 2.612296 | lrm: 0.62 | dt: 644.15ms | tok/sec: 813,920 | mfu: 50.87 | epoch: 2 | total time: 123.59m | eta: 55.9m +step 11504/16704 (68.87%) | loss: 2.592501 | lrm: 0.62 | dt: 647.59ms | tok/sec: 809,598 | mfu: 50.60 | epoch: 2 | total time: 123.60m | eta: 55.9m +step 11505/16704 (68.88%) | loss: 2.589213 | lrm: 0.62 | dt: 639.61ms | tok/sec: 819,701 | mfu: 51.23 | epoch: 2 | total time: 123.61m | eta: 55.9m +step 11506/16704 (68.88%) | loss: 2.591278 | lrm: 0.62 | dt: 644.91ms | tok/sec: 812,962 | mfu: 50.81 | epoch: 2 | total time: 123.62m | eta: 55.9m +step 11507/16704 (68.89%) | loss: 2.591004 | lrm: 0.62 | dt: 642.83ms | tok/sec: 815,588 | mfu: 50.98 | epoch: 2 | total time: 123.63m | eta: 55.9m +step 11508/16704 (68.89%) | loss: 2.598712 | lrm: 0.62 | dt: 643.72ms | tok/sec: 814,468 | mfu: 50.91 | epoch: 2 | total time: 123.64m | eta: 55.9m +step 11509/16704 (68.90%) | loss: 2.599982 | lrm: 0.62 | dt: 645.60ms | tok/sec: 812,094 | mfu: 50.76 | epoch: 2 | total time: 123.66m | eta: 55.9m +step 11510/16704 (68.91%) | loss: 2.601262 | lrm: 0.62 | dt: 643.28ms | tok/sec: 815,019 | mfu: 50.94 | epoch: 2 | total time: 123.67m | eta: 55.9m +step 11511/16704 (68.91%) | loss: 2.581313 | lrm: 0.62 | dt: 645.19ms | tok/sec: 812,615 | mfu: 50.79 | epoch: 2 | total time: 123.68m | eta: 55.8m +step 11512/16704 (68.92%) | loss: 2.577491 | lrm: 0.62 | dt: 644.36ms | tok/sec: 813,653 | mfu: 50.85 | epoch: 2 | total time: 123.69m | eta: 55.8m +step 11513/16704 (68.92%) | loss: 2.581661 | lrm: 0.62 | dt: 643.08ms | tok/sec: 815,272 | mfu: 50.96 | epoch: 2 | total time: 123.70m | eta: 55.8m +step 11514/16704 (68.93%) | loss: 2.598321 | lrm: 0.62 | dt: 646.30ms | tok/sec: 811,218 | mfu: 50.70 | epoch: 2 | total time: 123.71m | eta: 55.8m +step 11515/16704 (68.94%) | loss: 2.603855 | lrm: 0.62 | dt: 642.77ms | tok/sec: 815,665 | mfu: 50.98 | epoch: 2 | total time: 123.72m | eta: 
55.8m +step 11516/16704 (68.94%) | loss: 2.610399 | lrm: 0.62 | dt: 643.30ms | tok/sec: 814,994 | mfu: 50.94 | epoch: 2 | total time: 123.73m | eta: 55.8m +step 11517/16704 (68.95%) | loss: 2.614836 | lrm: 0.62 | dt: 644.34ms | tok/sec: 813,681 | mfu: 50.86 | epoch: 2 | total time: 123.74m | eta: 55.8m +step 11518/16704 (68.95%) | loss: 2.604892 | lrm: 0.62 | dt: 645.80ms | tok/sec: 811,844 | mfu: 50.74 | epoch: 2 | total time: 123.75m | eta: 55.8m +step 11519/16704 (68.96%) | loss: 2.600128 | lrm: 0.62 | dt: 643.32ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 2 | total time: 123.76m | eta: 55.8m +step 11520/16704 (68.97%) | loss: 2.583374 | lrm: 0.62 | dt: 643.82ms | tok/sec: 814,341 | mfu: 50.90 | epoch: 2 | total time: 123.77m | eta: 55.7m +step 11521/16704 (68.97%) | loss: 2.602242 | lrm: 0.62 | dt: 645.67ms | tok/sec: 812,010 | mfu: 50.75 | epoch: 2 | total time: 123.78m | eta: 55.7m +step 11522/16704 (68.98%) | loss: 2.597830 | lrm: 0.62 | dt: 642.04ms | tok/sec: 816,598 | mfu: 51.04 | epoch: 2 | total time: 123.80m | eta: 55.7m +step 11523/16704 (68.98%) | loss: 2.596741 | lrm: 0.62 | dt: 644.14ms | tok/sec: 813,928 | mfu: 50.87 | epoch: 2 | total time: 123.81m | eta: 55.7m +step 11524/16704 (68.99%) | loss: 2.590262 | lrm: 0.62 | dt: 641.69ms | tok/sec: 817,037 | mfu: 51.07 | epoch: 2 | total time: 123.82m | eta: 55.7m +step 11525/16704 (69.00%) | loss: 2.586081 | lrm: 0.62 | dt: 643.18ms | tok/sec: 815,152 | mfu: 50.95 | epoch: 2 | total time: 123.83m | eta: 55.7m +step 11526/16704 (69.00%) | loss: 2.574416 | lrm: 0.62 | dt: 643.56ms | tok/sec: 814,665 | mfu: 50.92 | epoch: 2 | total time: 123.84m | eta: 55.7m +step 11527/16704 (69.01%) | loss: 2.549932 | lrm: 0.62 | dt: 643.12ms | tok/sec: 815,230 | mfu: 50.95 | epoch: 2 | total time: 123.85m | eta: 55.7m +step 11528/16704 (69.01%) | loss: 2.554793 | lrm: 0.62 | dt: 645.76ms | tok/sec: 811,887 | mfu: 50.74 | epoch: 2 | total time: 123.86m | eta: 55.7m +step 11529/16704 (69.02%) | loss: 2.561106 | lrm: 0.62 
| dt: 644.01ms | tok/sec: 814,104 | mfu: 50.88 | epoch: 2 | total time: 123.87m | eta: 55.6m +step 11530/16704 (69.03%) | loss: 2.559561 | lrm: 0.62 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 2 | total time: 123.88m | eta: 55.6m +step 11531/16704 (69.03%) | loss: 2.568079 | lrm: 0.62 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 2 | total time: 123.89m | eta: 55.6m +step 11532/16704 (69.04%) | loss: 2.560193 | lrm: 0.62 | dt: 643.12ms | tok/sec: 815,223 | mfu: 50.95 | epoch: 2 | total time: 123.90m | eta: 55.6m +step 11533/16704 (69.04%) | loss: 2.565033 | lrm: 0.62 | dt: 645.09ms | tok/sec: 812,730 | mfu: 50.80 | epoch: 2 | total time: 123.91m | eta: 55.6m +step 11534/16704 (69.05%) | loss: 2.584820 | lrm: 0.62 | dt: 641.98ms | tok/sec: 816,675 | mfu: 51.04 | epoch: 2 | total time: 123.92m | eta: 55.6m +step 11535/16704 (69.06%) | loss: 2.591900 | lrm: 0.62 | dt: 645.10ms | tok/sec: 812,719 | mfu: 50.80 | epoch: 2 | total time: 123.93m | eta: 55.6m +step 11536/16704 (69.06%) | loss: 2.571715 | lrm: 0.62 | dt: 640.83ms | tok/sec: 818,136 | mfu: 51.13 | epoch: 2 | total time: 123.95m | eta: 55.6m +step 11537/16704 (69.07%) | loss: 2.563254 | lrm: 0.62 | dt: 643.61ms | tok/sec: 814,600 | mfu: 50.91 | epoch: 2 | total time: 123.96m | eta: 55.6m +step 11538/16704 (69.07%) | loss: 2.563695 | lrm: 0.62 | dt: 641.25ms | tok/sec: 817,602 | mfu: 51.10 | epoch: 2 | total time: 123.97m | eta: 55.6m +step 11539/16704 (69.08%) | loss: 2.566220 | lrm: 0.62 | dt: 645.28ms | tok/sec: 812,503 | mfu: 50.78 | epoch: 2 | total time: 123.98m | eta: 55.5m +step 11540/16704 (69.09%) | loss: 2.575886 | lrm: 0.62 | dt: 644.34ms | tok/sec: 813,683 | mfu: 50.86 | epoch: 2 | total time: 123.99m | eta: 55.5m +step 11541/16704 (69.09%) | loss: 2.577381 | lrm: 0.62 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 2 | total time: 124.00m | eta: 55.5m +step 11542/16704 (69.10%) | loss: 2.580569 | lrm: 0.62 | dt: 642.27ms | tok/sec: 816,299 | mfu: 51.02 | epoch: 2 | 
total time: 124.01m | eta: 55.5m +step 11543/16704 (69.10%) | loss: 2.568116 | lrm: 0.62 | dt: 645.08ms | tok/sec: 812,743 | mfu: 50.80 | epoch: 2 | total time: 124.02m | eta: 55.5m +step 11544/16704 (69.11%) | loss: 2.567119 | lrm: 0.62 | dt: 645.36ms | tok/sec: 812,394 | mfu: 50.78 | epoch: 2 | total time: 124.03m | eta: 55.5m +step 11545/16704 (69.12%) | loss: 2.567187 | lrm: 0.62 | dt: 645.72ms | tok/sec: 811,939 | mfu: 50.75 | epoch: 2 | total time: 124.04m | eta: 55.5m +step 11546/16704 (69.12%) | loss: 2.577522 | lrm: 0.62 | dt: 644.99ms | tok/sec: 812,862 | mfu: 50.81 | epoch: 2 | total time: 124.05m | eta: 55.5m +step 11547/16704 (69.13%) | loss: 2.572057 | lrm: 0.62 | dt: 645.20ms | tok/sec: 812,596 | mfu: 50.79 | epoch: 2 | total time: 124.06m | eta: 55.5m +step 11548/16704 (69.13%) | loss: 2.565741 | lrm: 0.62 | dt: 646.80ms | tok/sec: 810,590 | mfu: 50.66 | epoch: 2 | total time: 124.07m | eta: 55.4m +step 11549/16704 (69.14%) | loss: 2.552248 | lrm: 0.62 | dt: 642.22ms | tok/sec: 816,371 | mfu: 51.02 | epoch: 2 | total time: 124.08m | eta: 55.4m +step 11550/16704 (69.15%) | loss: 2.562054 | lrm: 0.62 | dt: 643.79ms | tok/sec: 814,377 | mfu: 50.90 | epoch: 2 | total time: 124.10m | eta: 55.4m +step 11551/16704 (69.15%) | loss: 2.542850 | lrm: 0.62 | dt: 644.05ms | tok/sec: 814,049 | mfu: 50.88 | epoch: 2 | total time: 124.11m | eta: 55.4m +step 11552/16704 (69.16%) | loss: 2.555340 | lrm: 0.62 | dt: 645.17ms | tok/sec: 812,639 | mfu: 50.79 | epoch: 2 | total time: 124.12m | eta: 55.4m +step 11553/16704 (69.16%) | loss: 2.543940 | lrm: 0.62 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 2 | total time: 124.13m | eta: 55.4m +step 11554/16704 (69.17%) | loss: 2.561627 | lrm: 0.62 | dt: 645.60ms | tok/sec: 812,092 | mfu: 50.76 | epoch: 2 | total time: 124.14m | eta: 55.4m +step 11555/16704 (69.18%) | loss: 2.571691 | lrm: 0.62 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 2 | total time: 124.15m | eta: 55.4m +step 11556/16704 (69.18%) | 
loss: 2.583896 | lrm: 0.62 | dt: 645.91ms | tok/sec: 811,707 | mfu: 50.73 | epoch: 2 | total time: 124.16m | eta: 55.4m +step 11557/16704 (69.19%) | loss: 2.567459 | lrm: 0.62 | dt: 645.53ms | tok/sec: 812,177 | mfu: 50.76 | epoch: 2 | total time: 124.17m | eta: 55.3m +step 11558/16704 (69.19%) | loss: 2.557874 | lrm: 0.62 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 2 | total time: 124.18m | eta: 55.3m +step 11559/16704 (69.20%) | loss: 2.563921 | lrm: 0.62 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 2 | total time: 124.19m | eta: 55.3m +step 11560/16704 (69.20%) | loss: 2.553317 | lrm: 0.62 | dt: 644.18ms | tok/sec: 813,886 | mfu: 50.87 | epoch: 2 | total time: 124.20m | eta: 55.3m +step 11561/16704 (69.21%) | loss: 2.550295 | lrm: 0.62 | dt: 643.85ms | tok/sec: 814,300 | mfu: 50.89 | epoch: 2 | total time: 124.21m | eta: 55.3m +step 11562/16704 (69.22%) | loss: 2.556858 | lrm: 0.62 | dt: 646.61ms | tok/sec: 810,819 | mfu: 50.68 | epoch: 2 | total time: 124.22m | eta: 55.3m +step 11563/16704 (69.22%) | loss: 2.555406 | lrm: 0.62 | dt: 643.79ms | tok/sec: 814,376 | mfu: 50.90 | epoch: 2 | total time: 124.24m | eta: 55.3m +step 11564/16704 (69.23%) | loss: 2.566505 | lrm: 0.62 | dt: 644.39ms | tok/sec: 813,617 | mfu: 50.85 | epoch: 2 | total time: 124.25m | eta: 55.3m +step 11565/16704 (69.23%) | loss: 2.568401 | lrm: 0.62 | dt: 643.26ms | tok/sec: 815,049 | mfu: 50.94 | epoch: 2 | total time: 124.26m | eta: 55.3m +step 11566/16704 (69.24%) | loss: 2.598348 | lrm: 0.62 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 2 | total time: 124.27m | eta: 55.3m +step 11567/16704 (69.25%) | loss: 2.599083 | lrm: 0.62 | dt: 645.12ms | tok/sec: 812,694 | mfu: 50.79 | epoch: 2 | total time: 124.28m | eta: 55.2m +step 11568/16704 (69.25%) | loss: 2.602128 | lrm: 0.61 | dt: 646.72ms | tok/sec: 810,690 | mfu: 50.67 | epoch: 2 | total time: 124.29m | eta: 55.2m +step 11569/16704 (69.26%) | loss: 2.606914 | lrm: 0.61 | dt: 643.19ms | tok/sec: 815,133 | 
mfu: 50.95 | epoch: 2 | total time: 124.30m | eta: 55.2m +step 11570/16704 (69.26%) | loss: 2.585183 | lrm: 0.61 | dt: 649.13ms | tok/sec: 807,676 | mfu: 50.48 | epoch: 2 | total time: 124.31m | eta: 55.2m +step 11571/16704 (69.27%) | loss: 2.588439 | lrm: 0.61 | dt: 642.69ms | tok/sec: 815,777 | mfu: 50.99 | epoch: 2 | total time: 124.32m | eta: 55.2m +step 11572/16704 (69.28%) | loss: 2.597209 | lrm: 0.61 | dt: 644.53ms | tok/sec: 813,447 | mfu: 50.84 | epoch: 2 | total time: 124.33m | eta: 55.2m +step 11573/16704 (69.28%) | loss: 2.588282 | lrm: 0.61 | dt: 649.35ms | tok/sec: 807,403 | mfu: 50.46 | epoch: 2 | total time: 124.34m | eta: 55.2m +step 11574/16704 (69.29%) | loss: 2.581707 | lrm: 0.61 | dt: 641.21ms | tok/sec: 817,654 | mfu: 51.10 | epoch: 2 | total time: 124.35m | eta: 55.2m +step 11575/16704 (69.29%) | loss: 2.601545 | lrm: 0.61 | dt: 644.57ms | tok/sec: 813,395 | mfu: 50.84 | epoch: 2 | total time: 124.36m | eta: 55.2m +step 11576/16704 (69.30%) | loss: 2.611198 | lrm: 0.61 | dt: 645.28ms | tok/sec: 812,501 | mfu: 50.78 | epoch: 2 | total time: 124.38m | eta: 55.1m +step 11577/16704 (69.31%) | loss: 2.600739 | lrm: 0.61 | dt: 644.53ms | tok/sec: 813,436 | mfu: 50.84 | epoch: 2 | total time: 124.39m | eta: 55.1m +step 11578/16704 (69.31%) | loss: 2.592507 | lrm: 0.61 | dt: 645.84ms | tok/sec: 811,787 | mfu: 50.74 | epoch: 2 | total time: 124.40m | eta: 55.1m +step 11579/16704 (69.32%) | loss: 2.591168 | lrm: 0.61 | dt: 642.95ms | tok/sec: 815,440 | mfu: 50.97 | epoch: 2 | total time: 124.41m | eta: 55.1m +step 11580/16704 (69.32%) | loss: 2.580071 | lrm: 0.61 | dt: 644.90ms | tok/sec: 812,974 | mfu: 50.81 | epoch: 2 | total time: 124.42m | eta: 55.1m +step 11581/16704 (69.33%) | loss: 2.582187 | lrm: 0.61 | dt: 643.56ms | tok/sec: 814,665 | mfu: 50.92 | epoch: 2 | total time: 124.43m | eta: 55.1m +step 11582/16704 (69.34%) | loss: 2.586267 | lrm: 0.61 | dt: 644.03ms | tok/sec: 814,075 | mfu: 50.88 | epoch: 2 | total time: 124.44m | eta: 55.1m +step 
11583/16704 (69.34%) | loss: 2.576730 | lrm: 0.61 | dt: 644.86ms | tok/sec: 813,023 | mfu: 50.82 | epoch: 2 | total time: 124.45m | eta: 55.1m +step 11584/16704 (69.35%) | loss: 2.574291 | lrm: 0.61 | dt: 645.03ms | tok/sec: 812,810 | mfu: 50.80 | epoch: 2 | total time: 124.46m | eta: 55.1m +step 11585/16704 (69.35%) | loss: 2.581358 | lrm: 0.61 | dt: 647.40ms | tok/sec: 809,838 | mfu: 50.62 | epoch: 2 | total time: 124.47m | eta: 55.0m +step 11586/16704 (69.36%) | loss: 2.579693 | lrm: 0.61 | dt: 644.86ms | tok/sec: 813,023 | mfu: 50.82 | epoch: 2 | total time: 124.48m | eta: 55.0m +step 11587/16704 (69.37%) | loss: 2.576152 | lrm: 0.61 | dt: 643.65ms | tok/sec: 814,557 | mfu: 50.91 | epoch: 2 | total time: 124.49m | eta: 55.0m +step 11588/16704 (69.37%) | loss: 2.568023 | lrm: 0.61 | dt: 646.34ms | tok/sec: 811,162 | mfu: 50.70 | epoch: 2 | total time: 124.50m | eta: 55.0m +step 11589/16704 (69.38%) | loss: 2.568230 | lrm: 0.61 | dt: 644.63ms | tok/sec: 813,310 | mfu: 50.83 | epoch: 2 | total time: 124.51m | eta: 55.0m +step 11590/16704 (69.38%) | loss: 2.564442 | lrm: 0.61 | dt: 644.53ms | tok/sec: 813,440 | mfu: 50.84 | epoch: 2 | total time: 124.53m | eta: 55.0m +step 11591/16704 (69.39%) | loss: 2.581171 | lrm: 0.61 | dt: 644.63ms | tok/sec: 813,317 | mfu: 50.83 | epoch: 2 | total time: 124.54m | eta: 55.0m +step 11592/16704 (69.40%) | loss: 2.589591 | lrm: 0.61 | dt: 645.31ms | tok/sec: 812,457 | mfu: 50.78 | epoch: 2 | total time: 124.55m | eta: 55.0m +step 11593/16704 (69.40%) | loss: 2.590171 | lrm: 0.61 | dt: 644.46ms | tok/sec: 813,527 | mfu: 50.85 | epoch: 2 | total time: 124.56m | eta: 55.0m +step 11594/16704 (69.41%) | loss: 2.586127 | lrm: 0.61 | dt: 644.09ms | tok/sec: 814,002 | mfu: 50.88 | epoch: 2 | total time: 124.57m | eta: 55.0m +step 11595/16704 (69.41%) | loss: 2.598660 | lrm: 0.61 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 2 | total time: 124.58m | eta: 54.9m +step 11596/16704 (69.42%) | loss: 2.594500 | lrm: 0.61 | dt: 
645.68ms | tok/sec: 811,998 | mfu: 50.75 | epoch: 2 | total time: 124.59m | eta: 54.9m +step 11597/16704 (69.43%) | loss: 2.593571 | lrm: 0.61 | dt: 644.64ms | tok/sec: 813,306 | mfu: 50.83 | epoch: 2 | total time: 124.60m | eta: 54.9m +step 11598/16704 (69.43%) | loss: 2.586749 | lrm: 0.61 | dt: 646.35ms | tok/sec: 811,150 | mfu: 50.70 | epoch: 2 | total time: 124.61m | eta: 54.9m +step 11599/16704 (69.44%) | loss: 2.597071 | lrm: 0.61 | dt: 643.91ms | tok/sec: 814,230 | mfu: 50.89 | epoch: 2 | total time: 124.62m | eta: 54.9m +step 11600/16704 (69.44%) | loss: 2.599351 | lrm: 0.61 | dt: 644.39ms | tok/sec: 813,614 | mfu: 50.85 | epoch: 2 | total time: 124.63m | eta: 54.9m +step 11601/16704 (69.45%) | loss: 2.604476 | lrm: 0.61 | dt: 645.88ms | tok/sec: 811,738 | mfu: 50.73 | epoch: 2 | total time: 124.64m | eta: 54.9m +step 11602/16704 (69.46%) | loss: 2.581797 | lrm: 0.61 | dt: 643.25ms | tok/sec: 815,060 | mfu: 50.94 | epoch: 2 | total time: 124.65m | eta: 54.9m +step 11603/16704 (69.46%) | loss: 2.581509 | lrm: 0.61 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 2 | total time: 124.67m | eta: 54.9m +step 11604/16704 (69.47%) | loss: 2.590183 | lrm: 0.61 | dt: 645.61ms | tok/sec: 812,078 | mfu: 50.76 | epoch: 2 | total time: 124.68m | eta: 54.8m +step 11605/16704 (69.47%) | loss: 2.601277 | lrm: 0.61 | dt: 643.91ms | tok/sec: 814,230 | mfu: 50.89 | epoch: 2 | total time: 124.69m | eta: 54.8m +step 11606/16704 (69.48%) | loss: 2.598223 | lrm: 0.61 | dt: 647.28ms | tok/sec: 809,989 | mfu: 50.63 | epoch: 2 | total time: 124.70m | eta: 54.8m +step 11607/16704 (69.49%) | loss: 2.586054 | lrm: 0.61 | dt: 642.50ms | tok/sec: 816,009 | mfu: 51.00 | epoch: 2 | total time: 124.71m | eta: 54.8m +step 11608/16704 (69.49%) | loss: 2.588027 | lrm: 0.61 | dt: 643.91ms | tok/sec: 814,219 | mfu: 50.89 | epoch: 2 | total time: 124.72m | eta: 54.8m +step 11609/16704 (69.50%) | loss: 2.600346 | lrm: 0.61 | dt: 646.34ms | tok/sec: 811,165 | mfu: 50.70 | epoch: 2 | total 
time: 124.73m | eta: 54.8m +step 11610/16704 (69.50%) | loss: 2.591339 | lrm: 0.61 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 2 | total time: 124.74m | eta: 54.8m +step 11611/16704 (69.51%) | loss: 2.592612 | lrm: 0.61 | dt: 646.92ms | tok/sec: 810,442 | mfu: 50.65 | epoch: 2 | total time: 124.75m | eta: 54.8m +step 11612/16704 (69.52%) | loss: 2.606215 | lrm: 0.61 | dt: 645.03ms | tok/sec: 812,816 | mfu: 50.80 | epoch: 2 | total time: 124.76m | eta: 54.8m +step 11613/16704 (69.52%) | loss: 2.612710 | lrm: 0.61 | dt: 643.37ms | tok/sec: 814,902 | mfu: 50.93 | epoch: 2 | total time: 124.77m | eta: 54.7m +step 11614/16704 (69.53%) | loss: 2.618447 | lrm: 0.61 | dt: 646.45ms | tok/sec: 811,027 | mfu: 50.69 | epoch: 2 | total time: 124.78m | eta: 54.7m +step 11615/16704 (69.53%) | loss: 2.619058 | lrm: 0.61 | dt: 646.52ms | tok/sec: 810,942 | mfu: 50.69 | epoch: 2 | total time: 124.79m | eta: 54.7m +step 11616/16704 (69.54%) | loss: 2.608127 | lrm: 0.61 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 2 | total time: 124.80m | eta: 54.7m +step 11617/16704 (69.55%) | loss: 2.604410 | lrm: 0.61 | dt: 646.07ms | tok/sec: 811,505 | mfu: 50.72 | epoch: 2 | total time: 124.82m | eta: 54.7m +step 11618/16704 (69.55%) | loss: 2.599959 | lrm: 0.61 | dt: 647.02ms | tok/sec: 810,313 | mfu: 50.65 | epoch: 2 | total time: 124.83m | eta: 54.7m +step 11619/16704 (69.56%) | loss: 2.607989 | lrm: 0.61 | dt: 644.68ms | tok/sec: 813,257 | mfu: 50.83 | epoch: 2 | total time: 124.84m | eta: 54.7m +step 11620/16704 (69.56%) | loss: 2.608395 | lrm: 0.61 | dt: 644.17ms | tok/sec: 813,897 | mfu: 50.87 | epoch: 2 | total time: 124.85m | eta: 54.7m +step 11621/16704 (69.57%) | loss: 2.617699 | lrm: 0.61 | dt: 641.92ms | tok/sec: 816,751 | mfu: 51.05 | epoch: 2 | total time: 124.86m | eta: 54.7m +step 11622/16704 (69.58%) | loss: 2.616308 | lrm: 0.61 | dt: 643.11ms | tok/sec: 815,235 | mfu: 50.95 | epoch: 2 | total time: 124.87m | eta: 54.6m +step 11623/16704 (69.58%) | loss: 
2.611848 | lrm: 0.61 | dt: 646.30ms | tok/sec: 811,213 | mfu: 50.70 | epoch: 2 | total time: 124.88m | eta: 54.6m +step 11624/16704 (69.59%) | loss: 2.616168 | lrm: 0.61 | dt: 643.46ms | tok/sec: 814,789 | mfu: 50.93 | epoch: 2 | total time: 124.89m | eta: 54.6m +step 11625/16704 (69.59%) | loss: 2.613578 | lrm: 0.61 | dt: 645.55ms | tok/sec: 812,161 | mfu: 50.76 | epoch: 2 | total time: 124.90m | eta: 54.6m +step 11626/16704 (69.60%) | loss: 2.608898 | lrm: 0.61 | dt: 645.07ms | tok/sec: 812,755 | mfu: 50.80 | epoch: 2 | total time: 124.91m | eta: 54.6m +step 11627/16704 (69.61%) | loss: 2.608382 | lrm: 0.61 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 2 | total time: 124.92m | eta: 54.6m +step 11628/16704 (69.61%) | loss: 2.597891 | lrm: 0.61 | dt: 644.51ms | tok/sec: 813,471 | mfu: 50.84 | epoch: 2 | total time: 124.93m | eta: 54.6m +step 11629/16704 (69.62%) | loss: 2.599012 | lrm: 0.61 | dt: 647.27ms | tok/sec: 810,002 | mfu: 50.63 | epoch: 2 | total time: 124.94m | eta: 54.6m +step 11630/16704 (69.62%) | loss: 2.594949 | lrm: 0.61 | dt: 644.51ms | tok/sec: 813,473 | mfu: 50.84 | epoch: 2 | total time: 124.96m | eta: 54.6m +step 11631/16704 (69.63%) | loss: 2.586138 | lrm: 0.61 | dt: 642.30ms | tok/sec: 816,266 | mfu: 51.02 | epoch: 2 | total time: 124.97m | eta: 54.6m +step 11632/16704 (69.64%) | loss: 2.588716 | lrm: 0.61 | dt: 644.25ms | tok/sec: 813,795 | mfu: 50.86 | epoch: 2 | total time: 124.98m | eta: 54.5m +step 11633/16704 (69.64%) | loss: 2.596612 | lrm: 0.61 | dt: 645.49ms | tok/sec: 812,227 | mfu: 50.77 | epoch: 2 | total time: 124.99m | eta: 54.5m +step 11634/16704 (69.65%) | loss: 2.605631 | lrm: 0.61 | dt: 645.46ms | tok/sec: 812,267 | mfu: 50.77 | epoch: 2 | total time: 125.00m | eta: 54.5m +step 11635/16704 (69.65%) | loss: 2.598993 | lrm: 0.61 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 2 | total time: 125.01m | eta: 54.5m +step 11636/16704 (69.66%) | loss: 2.605438 | lrm: 0.61 | dt: 645.23ms | tok/sec: 812,554 | mfu: 
50.79 | epoch: 2 | total time: 125.02m | eta: 54.5m +step 11637/16704 (69.67%) | loss: 2.605528 | lrm: 0.61 | dt: 644.79ms | tok/sec: 813,112 | mfu: 50.82 | epoch: 2 | total time: 125.03m | eta: 54.5m +step 11638/16704 (69.67%) | loss: 2.609284 | lrm: 0.61 | dt: 643.11ms | tok/sec: 815,235 | mfu: 50.95 | epoch: 2 | total time: 125.04m | eta: 54.5m +step 11639/16704 (69.68%) | loss: 2.611221 | lrm: 0.61 | dt: 645.95ms | tok/sec: 811,656 | mfu: 50.73 | epoch: 2 | total time: 125.05m | eta: 54.5m +step 11640/16704 (69.68%) | loss: 2.615188 | lrm: 0.61 | dt: 642.42ms | tok/sec: 816,118 | mfu: 51.01 | epoch: 2 | total time: 125.06m | eta: 54.5m +step 11641/16704 (69.69%) | loss: 2.618704 | lrm: 0.61 | dt: 645.18ms | tok/sec: 812,621 | mfu: 50.79 | epoch: 2 | total time: 125.07m | eta: 54.4m +step 11642/16704 (69.70%) | loss: 2.613509 | lrm: 0.61 | dt: 645.44ms | tok/sec: 812,296 | mfu: 50.77 | epoch: 2 | total time: 125.08m | eta: 54.4m +step 11643/16704 (69.70%) | loss: 2.602722 | lrm: 0.61 | dt: 646.08ms | tok/sec: 811,488 | mfu: 50.72 | epoch: 2 | total time: 125.10m | eta: 54.4m +step 11644/16704 (69.71%) | loss: 2.626141 | lrm: 0.61 | dt: 646.33ms | tok/sec: 811,176 | mfu: 50.70 | epoch: 2 | total time: 125.11m | eta: 54.4m +step 11645/16704 (69.71%) | loss: 2.623183 | lrm: 0.61 | dt: 641.99ms | tok/sec: 816,655 | mfu: 51.04 | epoch: 2 | total time: 125.12m | eta: 54.4m +step 11646/16704 (69.72%) | loss: 2.615111 | lrm: 0.61 | dt: 646.02ms | tok/sec: 811,567 | mfu: 50.72 | epoch: 2 | total time: 125.13m | eta: 54.4m +step 11647/16704 (69.73%) | loss: 2.611432 | lrm: 0.61 | dt: 644.35ms | tok/sec: 813,665 | mfu: 50.86 | epoch: 2 | total time: 125.14m | eta: 54.4m +step 11648/16704 (69.73%) | loss: 2.609487 | lrm: 0.61 | dt: 644.21ms | tok/sec: 813,841 | mfu: 50.87 | epoch: 2 | total time: 125.15m | eta: 54.4m +step 11649/16704 (69.74%) | loss: 2.616362 | lrm: 0.61 | dt: 647.33ms | tok/sec: 809,927 | mfu: 50.62 | epoch: 2 | total time: 125.16m | eta: 54.4m +step 
11650/16704 (69.74%) | loss: 2.585737 | lrm: 0.61 | dt: 641.73ms | tok/sec: 816,989 | mfu: 51.06 | epoch: 2 | total time: 125.17m | eta: 54.3m +step 11651/16704 (69.75%) | loss: 2.577106 | lrm: 0.61 | dt: 647.63ms | tok/sec: 809,542 | mfu: 50.60 | epoch: 2 | total time: 125.18m | eta: 54.3m +step 11652/16704 (69.76%) | loss: 2.575558 | lrm: 0.60 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 2 | total time: 125.19m | eta: 54.3m +step 11653/16704 (69.76%) | loss: 2.574803 | lrm: 0.60 | dt: 644.88ms | tok/sec: 813,001 | mfu: 50.81 | epoch: 2 | total time: 125.20m | eta: 54.3m +step 11654/16704 (69.77%) | loss: 2.584176 | lrm: 0.60 | dt: 644.52ms | tok/sec: 813,448 | mfu: 50.84 | epoch: 2 | total time: 125.21m | eta: 54.3m +step 11655/16704 (69.77%) | loss: 2.569982 | lrm: 0.60 | dt: 643.93ms | tok/sec: 814,203 | mfu: 50.89 | epoch: 2 | total time: 125.22m | eta: 54.3m +step 11656/16704 (69.78%) | loss: 2.576888 | lrm: 0.60 | dt: 645.65ms | tok/sec: 812,036 | mfu: 50.75 | epoch: 2 | total time: 125.23m | eta: 54.3m +step 11657/16704 (69.79%) | loss: 2.567517 | lrm: 0.60 | dt: 645.46ms | tok/sec: 812,267 | mfu: 50.77 | epoch: 2 | total time: 125.25m | eta: 54.3m +step 11658/16704 (69.79%) | loss: 2.565317 | lrm: 0.60 | dt: 641.24ms | tok/sec: 817,615 | mfu: 51.10 | epoch: 2 | total time: 125.26m | eta: 54.3m +step 11659/16704 (69.80%) | loss: 2.579940 | lrm: 0.60 | dt: 643.15ms | tok/sec: 815,186 | mfu: 50.95 | epoch: 2 | total time: 125.27m | eta: 54.3m +step 11660/16704 (69.80%) | loss: 2.574041 | lrm: 0.60 | dt: 643.91ms | tok/sec: 814,224 | mfu: 50.89 | epoch: 2 | total time: 125.28m | eta: 54.2m +step 11661/16704 (69.81%) | loss: 2.570470 | lrm: 0.60 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 2 | total time: 125.29m | eta: 54.2m +step 11662/16704 (69.82%) | loss: 2.571236 | lrm: 0.60 | dt: 641.74ms | tok/sec: 816,976 | mfu: 51.06 | epoch: 2 | total time: 125.30m | eta: 54.2m +step 11663/16704 (69.82%) | loss: 2.577034 | lrm: 0.60 | dt: 
645.39ms | tok/sec: 812,352 | mfu: 50.77 | epoch: 2 | total time: 125.31m | eta: 54.2m +step 11664/16704 (69.83%) | loss: 2.557852 | lrm: 0.60 | dt: 644.24ms | tok/sec: 813,808 | mfu: 50.86 | epoch: 2 | total time: 125.32m | eta: 54.2m +step 11665/16704 (69.83%) | loss: 2.581074 | lrm: 0.60 | dt: 643.66ms | tok/sec: 814,547 | mfu: 50.91 | epoch: 2 | total time: 125.33m | eta: 54.2m +step 11666/16704 (69.84%) | loss: 2.592750 | lrm: 0.60 | dt: 643.00ms | tok/sec: 815,376 | mfu: 50.96 | epoch: 2 | total time: 125.34m | eta: 54.2m +step 11667/16704 (69.85%) | loss: 2.575711 | lrm: 0.60 | dt: 644.83ms | tok/sec: 813,068 | mfu: 50.82 | epoch: 2 | total time: 125.35m | eta: 54.2m +step 11668/16704 (69.85%) | loss: 2.582002 | lrm: 0.60 | dt: 646.48ms | tok/sec: 810,989 | mfu: 50.69 | epoch: 2 | total time: 125.36m | eta: 54.2m +step 11669/16704 (69.86%) | loss: 2.580034 | lrm: 0.60 | dt: 644.79ms | tok/sec: 813,117 | mfu: 50.82 | epoch: 2 | total time: 125.37m | eta: 54.1m +step 11670/16704 (69.86%) | loss: 2.583821 | lrm: 0.60 | dt: 643.41ms | tok/sec: 814,861 | mfu: 50.93 | epoch: 2 | total time: 125.39m | eta: 54.1m +step 11671/16704 (69.87%) | loss: 2.584587 | lrm: 0.60 | dt: 642.60ms | tok/sec: 815,885 | mfu: 50.99 | epoch: 2 | total time: 125.40m | eta: 54.1m +step 11672/16704 (69.88%) | loss: 2.587902 | lrm: 0.60 | dt: 644.95ms | tok/sec: 812,916 | mfu: 50.81 | epoch: 2 | total time: 125.41m | eta: 54.1m +step 11673/16704 (69.88%) | loss: 2.590934 | lrm: 0.60 | dt: 643.88ms | tok/sec: 814,263 | mfu: 50.89 | epoch: 2 | total time: 125.42m | eta: 54.1m +step 11674/16704 (69.89%) | loss: 2.585014 | lrm: 0.60 | dt: 642.38ms | tok/sec: 816,166 | mfu: 51.01 | epoch: 2 | total time: 125.43m | eta: 54.1m +step 11675/16704 (69.89%) | loss: 2.589415 | lrm: 0.60 | dt: 643.62ms | tok/sec: 814,587 | mfu: 50.91 | epoch: 2 | total time: 125.44m | eta: 54.1m +step 11676/16704 (69.90%) | loss: 2.585945 | lrm: 0.60 | dt: 646.03ms | tok/sec: 811,548 | mfu: 50.72 | epoch: 2 | total 
time: 125.45m | eta: 54.1m +step 11677/16704 (69.91%) | loss: 2.597418 | lrm: 0.60 | dt: 646.86ms | tok/sec: 810,507 | mfu: 50.66 | epoch: 2 | total time: 125.46m | eta: 54.1m +step 11678/16704 (69.91%) | loss: 2.600410 | lrm: 0.60 | dt: 642.99ms | tok/sec: 815,388 | mfu: 50.96 | epoch: 2 | total time: 125.47m | eta: 54.0m +step 11679/16704 (69.92%) | loss: 2.591375 | lrm: 0.60 | dt: 644.64ms | tok/sec: 813,301 | mfu: 50.83 | epoch: 2 | total time: 125.48m | eta: 54.0m +step 11680/16704 (69.92%) | loss: 2.602404 | lrm: 0.60 | dt: 643.44ms | tok/sec: 814,814 | mfu: 50.93 | epoch: 2 | total time: 125.49m | eta: 54.0m +step 11681/16704 (69.93%) | loss: 2.595941 | lrm: 0.60 | dt: 643.20ms | tok/sec: 815,118 | mfu: 50.95 | epoch: 2 | total time: 125.50m | eta: 54.0m +step 11682/16704 (69.94%) | loss: 2.600699 | lrm: 0.60 | dt: 645.81ms | tok/sec: 811,830 | mfu: 50.74 | epoch: 2 | total time: 125.51m | eta: 54.0m +step 11683/16704 (69.94%) | loss: 2.590903 | lrm: 0.60 | dt: 641.17ms | tok/sec: 817,706 | mfu: 51.11 | epoch: 2 | total time: 125.52m | eta: 54.0m +step 11684/16704 (69.95%) | loss: 2.594316 | lrm: 0.60 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 2 | total time: 125.54m | eta: 54.0m +step 11685/16704 (69.95%) | loss: 2.577949 | lrm: 0.60 | dt: 642.55ms | tok/sec: 815,947 | mfu: 51.00 | epoch: 2 | total time: 125.55m | eta: 54.0m +step 11686/16704 (69.96%) | loss: 2.586517 | lrm: 0.60 | dt: 642.65ms | tok/sec: 815,825 | mfu: 50.99 | epoch: 2 | total time: 125.56m | eta: 54.0m +step 11687/16704 (69.97%) | loss: 2.587277 | lrm: 0.60 | dt: 644.70ms | tok/sec: 813,233 | mfu: 50.83 | epoch: 2 | total time: 125.57m | eta: 53.9m +step 11688/16704 (69.97%) | loss: 2.601449 | lrm: 0.60 | dt: 640.93ms | tok/sec: 818,008 | mfu: 51.13 | epoch: 2 | total time: 125.58m | eta: 53.9m +step 11689/16704 (69.98%) | loss: 2.591915 | lrm: 0.60 | dt: 642.50ms | tok/sec: 816,010 | mfu: 51.00 | epoch: 2 | total time: 125.59m | eta: 53.9m +step 11690/16704 (69.98%) | loss: 
2.587969 | lrm: 0.60 | dt: 642.11ms | tok/sec: 816,514 | mfu: 51.03 | epoch: 2 | total time: 125.60m | eta: 53.9m +step 11691/16704 (69.99%) | loss: 2.596122 | lrm: 0.60 | dt: 646.01ms | tok/sec: 811,575 | mfu: 50.72 | epoch: 2 | total time: 125.61m | eta: 53.9m +step 11692/16704 (70.00%) | loss: 2.580136 | lrm: 0.60 | dt: 644.51ms | tok/sec: 813,461 | mfu: 50.84 | epoch: 2 | total time: 125.62m | eta: 53.9m +step 11693/16704 (70.00%) | loss: 2.579445 | lrm: 0.60 | dt: 642.19ms | tok/sec: 816,404 | mfu: 51.03 | epoch: 2 | total time: 125.63m | eta: 53.9m +step 11694/16704 (70.01%) | loss: 2.565190 | lrm: 0.60 | dt: 642.94ms | tok/sec: 815,458 | mfu: 50.97 | epoch: 2 | total time: 125.64m | eta: 53.9m +step 11695/16704 (70.01%) | loss: 2.574027 | lrm: 0.60 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 2 | total time: 125.65m | eta: 53.9m +step 11696/16704 (70.02%) | loss: 2.580479 | lrm: 0.60 | dt: 644.56ms | tok/sec: 813,407 | mfu: 50.84 | epoch: 2 | total time: 125.66m | eta: 53.9m +step 11697/16704 (70.03%) | loss: 2.572519 | lrm: 0.60 | dt: 643.33ms | tok/sec: 814,965 | mfu: 50.94 | epoch: 2 | total time: 125.67m | eta: 53.8m +step 11698/16704 (70.03%) | loss: 2.569548 | lrm: 0.60 | dt: 645.52ms | tok/sec: 812,200 | mfu: 50.76 | epoch: 2 | total time: 125.69m | eta: 53.8m +step 11699/16704 (70.04%) | loss: 2.569418 | lrm: 0.60 | dt: 642.94ms | tok/sec: 815,451 | mfu: 50.97 | epoch: 2 | total time: 125.70m | eta: 53.8m +step 11700/16704 (70.04%) | loss: 2.569247 | lrm: 0.60 | dt: 642.13ms | tok/sec: 816,487 | mfu: 51.03 | epoch: 2 | total time: 125.71m | eta: 53.8m +step 11701/16704 (70.05%) | loss: 2.558254 | lrm: 0.60 | dt: 646.81ms | tok/sec: 810,574 | mfu: 50.66 | epoch: 2 | total time: 125.72m | eta: 53.8m +step 11702/16704 (70.06%) | loss: 2.555087 | lrm: 0.60 | dt: 646.07ms | tok/sec: 811,504 | mfu: 50.72 | epoch: 2 | total time: 125.73m | eta: 53.8m +step 11703/16704 (70.06%) | loss: 2.554652 | lrm: 0.60 | dt: 642.25ms | tok/sec: 816,329 | mfu: 
51.02 | epoch: 2 | total time: 125.74m | eta: 53.8m +step 11704/16704 (70.07%) | loss: 2.554176 | lrm: 0.60 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 2 | total time: 125.75m | eta: 53.8m +step 11705/16704 (70.07%) | loss: 2.567775 | lrm: 0.60 | dt: 642.41ms | tok/sec: 816,123 | mfu: 51.01 | epoch: 2 | total time: 125.76m | eta: 53.8m +step 11706/16704 (70.08%) | loss: 2.566909 | lrm: 0.60 | dt: 645.17ms | tok/sec: 812,630 | mfu: 50.79 | epoch: 2 | total time: 125.77m | eta: 53.7m +step 11707/16704 (70.09%) | loss: 2.572448 | lrm: 0.60 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 2 | total time: 125.78m | eta: 53.7m +step 11708/16704 (70.09%) | loss: 2.577861 | lrm: 0.60 | dt: 645.40ms | tok/sec: 812,343 | mfu: 50.77 | epoch: 2 | total time: 125.79m | eta: 53.7m +step 11709/16704 (70.10%) | loss: 2.579968 | lrm: 0.60 | dt: 643.15ms | tok/sec: 815,182 | mfu: 50.95 | epoch: 2 | total time: 125.80m | eta: 53.7m +step 11710/16704 (70.10%) | loss: 2.586796 | lrm: 0.60 | dt: 645.34ms | tok/sec: 812,422 | mfu: 50.78 | epoch: 2 | total time: 125.81m | eta: 53.7m +step 11711/16704 (70.11%) | loss: 2.588894 | lrm: 0.60 | dt: 642.87ms | tok/sec: 815,547 | mfu: 50.97 | epoch: 2 | total time: 125.83m | eta: 53.7m +step 11712/16704 (70.11%) | loss: 2.571638 | lrm: 0.60 | dt: 646.92ms | tok/sec: 810,433 | mfu: 50.65 | epoch: 2 | total time: 125.84m | eta: 53.7m +step 11713/16704 (70.12%) | loss: 2.580761 | lrm: 0.60 | dt: 643.69ms | tok/sec: 814,504 | mfu: 50.91 | epoch: 2 | total time: 125.85m | eta: 53.7m +step 11714/16704 (70.13%) | loss: 2.568800 | lrm: 0.60 | dt: 645.15ms | tok/sec: 812,658 | mfu: 50.79 | epoch: 2 | total time: 125.86m | eta: 53.7m +step 11715/16704 (70.13%) | loss: 2.563993 | lrm: 0.60 | dt: 645.55ms | tok/sec: 812,161 | mfu: 50.76 | epoch: 2 | total time: 125.87m | eta: 53.6m +step 11716/16704 (70.14%) | loss: 2.572404 | lrm: 0.60 | dt: 644.07ms | tok/sec: 814,024 | mfu: 50.88 | epoch: 2 | total time: 125.88m | eta: 53.6m +step 
11717/16704 (70.14%) | loss: 2.589954 | lrm: 0.60 | dt: 646.29ms | tok/sec: 811,231 | mfu: 50.70 | epoch: 2 | total time: 125.89m | eta: 53.6m +step 11718/16704 (70.15%) | loss: 2.581846 | lrm: 0.60 | dt: 643.16ms | tok/sec: 815,175 | mfu: 50.95 | epoch: 2 | total time: 125.90m | eta: 53.6m +step 11719/16704 (70.16%) | loss: 2.577743 | lrm: 0.60 | dt: 644.42ms | tok/sec: 813,576 | mfu: 50.85 | epoch: 2 | total time: 125.91m | eta: 53.6m +step 11720/16704 (70.16%) | loss: 2.558829 | lrm: 0.60 | dt: 643.20ms | tok/sec: 815,118 | mfu: 50.95 | epoch: 2 | total time: 125.92m | eta: 53.6m +step 11721/16704 (70.17%) | loss: 2.555236 | lrm: 0.60 | dt: 641.80ms | tok/sec: 816,906 | mfu: 51.06 | epoch: 2 | total time: 125.93m | eta: 53.6m +step 11722/16704 (70.17%) | loss: 2.559917 | lrm: 0.60 | dt: 643.59ms | tok/sec: 814,633 | mfu: 50.92 | epoch: 2 | total time: 125.94m | eta: 53.6m +step 11723/16704 (70.18%) | loss: 2.562952 | lrm: 0.60 | dt: 642.63ms | tok/sec: 815,853 | mfu: 50.99 | epoch: 2 | total time: 125.95m | eta: 53.6m +step 11724/16704 (70.19%) | loss: 2.558341 | lrm: 0.60 | dt: 642.82ms | tok/sec: 815,607 | mfu: 50.98 | epoch: 2 | total time: 125.96m | eta: 53.6m +step 11725/16704 (70.19%) | loss: 2.560505 | lrm: 0.60 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 2 | total time: 125.98m | eta: 53.5m +step 11726/16704 (70.20%) | loss: 2.553902 | lrm: 0.60 | dt: 642.31ms | tok/sec: 816,253 | mfu: 51.02 | epoch: 2 | total time: 125.99m | eta: 53.5m +step 11727/16704 (70.20%) | loss: 2.544160 | lrm: 0.60 | dt: 642.86ms | tok/sec: 815,555 | mfu: 50.97 | epoch: 2 | total time: 126.00m | eta: 53.5m +step 11728/16704 (70.21%) | loss: 2.550004 | lrm: 0.60 | dt: 643.25ms | tok/sec: 815,062 | mfu: 50.94 | epoch: 2 | total time: 126.01m | eta: 53.5m +step 11729/16704 (70.22%) | loss: 2.545904 | lrm: 0.60 | dt: 641.08ms | tok/sec: 817,825 | mfu: 51.12 | epoch: 2 | total time: 126.02m | eta: 53.5m +step 11730/16704 (70.22%) | loss: 2.548281 | lrm: 0.60 | dt: 
646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 2 | total time: 126.03m | eta: 53.5m +step 11731/16704 (70.23%) | loss: 2.552597 | lrm: 0.60 | dt: 644.12ms | tok/sec: 813,965 | mfu: 50.87 | epoch: 2 | total time: 126.04m | eta: 53.5m +step 11732/16704 (70.23%) | loss: 2.548320 | lrm: 0.60 | dt: 647.07ms | tok/sec: 810,245 | mfu: 50.64 | epoch: 2 | total time: 126.05m | eta: 53.5m +step 11733/16704 (70.24%) | loss: 2.561809 | lrm: 0.60 | dt: 643.14ms | tok/sec: 815,200 | mfu: 50.95 | epoch: 2 | total time: 126.06m | eta: 53.5m +step 11734/16704 (70.25%) | loss: 2.571233 | lrm: 0.60 | dt: 642.93ms | tok/sec: 815,472 | mfu: 50.97 | epoch: 2 | total time: 126.07m | eta: 53.4m +step 11735/16704 (70.25%) | loss: 2.579044 | lrm: 0.59 | dt: 642.74ms | tok/sec: 815,711 | mfu: 50.98 | epoch: 2 | total time: 126.08m | eta: 53.4m +step 11736/16704 (70.26%) | loss: 2.576653 | lrm: 0.59 | dt: 645.32ms | tok/sec: 812,443 | mfu: 50.78 | epoch: 2 | total time: 126.09m | eta: 53.4m +step 11737/16704 (70.26%) | loss: 2.572361 | lrm: 0.59 | dt: 644.11ms | tok/sec: 813,975 | mfu: 50.87 | epoch: 2 | total time: 126.10m | eta: 53.4m +step 11738/16704 (70.27%) | loss: 2.568203 | lrm: 0.59 | dt: 644.58ms | tok/sec: 813,379 | mfu: 50.84 | epoch: 2 | total time: 126.11m | eta: 53.4m +step 11739/16704 (70.28%) | loss: 2.570844 | lrm: 0.59 | dt: 643.94ms | tok/sec: 814,187 | mfu: 50.89 | epoch: 2 | total time: 126.13m | eta: 53.4m +step 11740/16704 (70.28%) | loss: 2.572419 | lrm: 0.59 | dt: 644.50ms | tok/sec: 813,479 | mfu: 50.84 | epoch: 2 | total time: 126.14m | eta: 53.4m +step 11741/16704 (70.29%) | loss: 2.560002 | lrm: 0.59 | dt: 643.85ms | tok/sec: 814,302 | mfu: 50.90 | epoch: 2 | total time: 126.15m | eta: 53.4m +step 11742/16704 (70.29%) | loss: 2.559014 | lrm: 0.59 | dt: 642.53ms | tok/sec: 815,979 | mfu: 51.00 | epoch: 2 | total time: 126.16m | eta: 53.4m +step 11743/16704 (70.30%) | loss: 2.555351 | lrm: 0.59 | dt: 643.34ms | tok/sec: 814,941 | mfu: 50.94 | epoch: 2 | total 
time: 126.17m | eta: 53.3m +step 11744/16704 (70.31%) | loss: 2.558361 | lrm: 0.59 | dt: 640.95ms | tok/sec: 817,985 | mfu: 51.13 | epoch: 2 | total time: 126.18m | eta: 53.3m +step 11745/16704 (70.31%) | loss: 2.564973 | lrm: 0.59 | dt: 643.02ms | tok/sec: 815,347 | mfu: 50.96 | epoch: 2 | total time: 126.19m | eta: 53.3m +step 11746/16704 (70.32%) | loss: 2.595978 | lrm: 0.59 | dt: 641.95ms | tok/sec: 816,710 | mfu: 51.05 | epoch: 2 | total time: 126.20m | eta: 53.3m +step 11747/16704 (70.32%) | loss: 2.595205 | lrm: 0.59 | dt: 643.58ms | tok/sec: 814,638 | mfu: 50.92 | epoch: 2 | total time: 126.21m | eta: 53.3m +step 11748/16704 (70.33%) | loss: 2.580022 | lrm: 0.59 | dt: 642.71ms | tok/sec: 815,748 | mfu: 50.99 | epoch: 2 | total time: 126.22m | eta: 53.3m +step 11749/16704 (70.34%) | loss: 2.581496 | lrm: 0.59 | dt: 647.06ms | tok/sec: 810,257 | mfu: 50.64 | epoch: 2 | total time: 126.23m | eta: 53.3m +Step 11750 | Validation bpb: 0.788936 +step 11750/16704 (70.34%) | loss: 2.582646 | lrm: 0.59 | dt: 646.44ms | tok/sec: 811,038 | mfu: 50.69 | epoch: 2 | total time: 126.24m | eta: 53.3m +step 11751/16704 (70.35%) | loss: 2.578352 | lrm: 0.59 | dt: 647.92ms | tok/sec: 809,182 | mfu: 50.58 | epoch: 2 | total time: 126.25m | eta: 53.3m +step 11752/16704 (70.35%) | loss: 2.580269 | lrm: 0.59 | dt: 641.18ms | tok/sec: 817,687 | mfu: 51.11 | epoch: 2 | total time: 126.27m | eta: 53.3m +step 11753/16704 (70.36%) | loss: 2.568285 | lrm: 0.59 | dt: 642.71ms | tok/sec: 815,751 | mfu: 50.99 | epoch: 2 | total time: 126.28m | eta: 53.2m +step 11754/16704 (70.37%) | loss: 2.570794 | lrm: 0.59 | dt: 647.12ms | tok/sec: 810,185 | mfu: 50.64 | epoch: 2 | total time: 126.29m | eta: 53.2m +step 11755/16704 (70.37%) | loss: 2.584596 | lrm: 0.59 | dt: 642.48ms | tok/sec: 816,031 | mfu: 51.00 | epoch: 2 | total time: 126.30m | eta: 53.2m +step 11756/16704 (70.38%) | loss: 2.608034 | lrm: 0.59 | dt: 643.74ms | tok/sec: 814,442 | mfu: 50.90 | epoch: 2 | total time: 126.31m | eta: 
53.2m +step 11757/16704 (70.38%) | loss: 2.615241 | lrm: 0.59 | dt: 645.90ms | tok/sec: 811,719 | mfu: 50.73 | epoch: 2 | total time: 126.32m | eta: 53.2m +step 11758/16704 (70.39%) | loss: 2.608907 | lrm: 0.59 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 2 | total time: 126.33m | eta: 53.2m +step 11759/16704 (70.40%) | loss: 2.601541 | lrm: 0.59 | dt: 645.77ms | tok/sec: 811,882 | mfu: 50.74 | epoch: 2 | total time: 126.34m | eta: 53.2m +step 11760/16704 (70.40%) | loss: 2.594111 | lrm: 0.59 | dt: 643.56ms | tok/sec: 814,663 | mfu: 50.92 | epoch: 2 | total time: 126.35m | eta: 53.2m +step 11761/16704 (70.41%) | loss: 2.586671 | lrm: 0.59 | dt: 643.83ms | tok/sec: 814,325 | mfu: 50.90 | epoch: 2 | total time: 126.36m | eta: 53.2m +step 11762/16704 (70.41%) | loss: 2.605426 | lrm: 0.59 | dt: 643.27ms | tok/sec: 815,031 | mfu: 50.94 | epoch: 2 | total time: 126.37m | eta: 53.1m +step 11763/16704 (70.42%) | loss: 2.606460 | lrm: 0.59 | dt: 642.62ms | tok/sec: 815,862 | mfu: 50.99 | epoch: 2 | total time: 126.38m | eta: 53.1m +step 11764/16704 (70.43%) | loss: 2.602495 | lrm: 0.59 | dt: 644.56ms | tok/sec: 813,406 | mfu: 50.84 | epoch: 2 | total time: 126.39m | eta: 53.1m +step 11765/16704 (70.43%) | loss: 2.611371 | lrm: 0.59 | dt: 642.74ms | tok/sec: 815,712 | mfu: 50.98 | epoch: 2 | total time: 126.40m | eta: 53.1m +step 11766/16704 (70.44%) | loss: 2.608451 | lrm: 0.59 | dt: 642.22ms | tok/sec: 816,373 | mfu: 51.02 | epoch: 2 | total time: 126.42m | eta: 53.1m +step 11767/16704 (70.44%) | loss: 2.590337 | lrm: 0.59 | dt: 642.64ms | tok/sec: 815,836 | mfu: 50.99 | epoch: 2 | total time: 126.43m | eta: 53.1m +step 11768/16704 (70.45%) | loss: 2.585390 | lrm: 0.59 | dt: 642.23ms | tok/sec: 816,360 | mfu: 51.02 | epoch: 2 | total time: 126.44m | eta: 53.1m +step 11769/16704 (70.46%) | loss: 2.579636 | lrm: 0.59 | dt: 643.81ms | tok/sec: 814,351 | mfu: 50.90 | epoch: 2 | total time: 126.45m | eta: 53.1m +step 11770/16704 (70.46%) | loss: 2.572973 | lrm: 0.59 
| dt: 640.99ms | tok/sec: 817,939 | mfu: 51.12 | epoch: 2 | total time: 126.46m | eta: 53.1m +step 11771/16704 (70.47%) | loss: 2.568954 | lrm: 0.59 | dt: 645.50ms | tok/sec: 812,221 | mfu: 50.76 | epoch: 2 | total time: 126.47m | eta: 53.0m +step 11772/16704 (70.47%) | loss: 2.570489 | lrm: 0.59 | dt: 642.42ms | tok/sec: 816,108 | mfu: 51.01 | epoch: 2 | total time: 126.48m | eta: 53.0m +step 11773/16704 (70.48%) | loss: 2.564394 | lrm: 0.59 | dt: 643.81ms | tok/sec: 814,352 | mfu: 50.90 | epoch: 2 | total time: 126.49m | eta: 53.0m +step 11774/16704 (70.49%) | loss: 2.570703 | lrm: 0.59 | dt: 641.77ms | tok/sec: 816,944 | mfu: 51.06 | epoch: 2 | total time: 126.50m | eta: 53.0m +step 11775/16704 (70.49%) | loss: 2.577982 | lrm: 0.59 | dt: 644.50ms | tok/sec: 813,486 | mfu: 50.84 | epoch: 2 | total time: 126.51m | eta: 53.0m +step 11776/16704 (70.50%) | loss: 2.565070 | lrm: 0.59 | dt: 643.62ms | tok/sec: 814,596 | mfu: 50.91 | epoch: 2 | total time: 126.52m | eta: 53.0m +step 11777/16704 (70.50%) | loss: 2.561957 | lrm: 0.59 | dt: 643.65ms | tok/sec: 814,548 | mfu: 50.91 | epoch: 2 | total time: 126.53m | eta: 53.0m +step 11778/16704 (70.51%) | loss: 2.565203 | lrm: 0.59 | dt: 642.19ms | tok/sec: 816,400 | mfu: 51.03 | epoch: 2 | total time: 126.54m | eta: 53.0m +step 11779/16704 (70.52%) | loss: 2.568998 | lrm: 0.59 | dt: 644.27ms | tok/sec: 813,765 | mfu: 50.86 | epoch: 2 | total time: 126.55m | eta: 53.0m +step 11780/16704 (70.52%) | loss: 2.564496 | lrm: 0.59 | dt: 643.14ms | tok/sec: 815,204 | mfu: 50.95 | epoch: 2 | total time: 126.57m | eta: 52.9m +step 11781/16704 (70.53%) | loss: 2.575215 | lrm: 0.59 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 2 | total time: 126.58m | eta: 52.9m +step 11782/16704 (70.53%) | loss: 2.575902 | lrm: 0.59 | dt: 642.72ms | tok/sec: 815,733 | mfu: 50.98 | epoch: 2 | total time: 126.59m | eta: 52.9m +step 11783/16704 (70.54%) | loss: 2.564166 | lrm: 0.59 | dt: 643.18ms | tok/sec: 815,151 | mfu: 50.95 | epoch: 2 | 
total time: 126.60m | eta: 52.9m +step 11784/16704 (70.55%) | loss: 2.573348 | lrm: 0.59 | dt: 640.38ms | tok/sec: 818,719 | mfu: 51.17 | epoch: 2 | total time: 126.61m | eta: 52.9m +step 11785/16704 (70.55%) | loss: 2.574928 | lrm: 0.59 | dt: 643.13ms | tok/sec: 815,211 | mfu: 50.95 | epoch: 2 | total time: 126.62m | eta: 52.9m +step 11786/16704 (70.56%) | loss: 2.565203 | lrm: 0.59 | dt: 641.91ms | tok/sec: 816,760 | mfu: 51.05 | epoch: 2 | total time: 126.63m | eta: 52.9m +step 11787/16704 (70.56%) | loss: 2.569654 | lrm: 0.59 | dt: 643.06ms | tok/sec: 815,297 | mfu: 50.96 | epoch: 2 | total time: 126.64m | eta: 52.9m +step 11788/16704 (70.57%) | loss: 2.573735 | lrm: 0.59 | dt: 643.95ms | tok/sec: 814,171 | mfu: 50.89 | epoch: 2 | total time: 126.65m | eta: 52.9m +step 11789/16704 (70.58%) | loss: 2.573159 | lrm: 0.59 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 2 | total time: 126.66m | eta: 52.9m +step 11790/16704 (70.58%) | loss: 2.575964 | lrm: 0.59 | dt: 643.39ms | tok/sec: 814,884 | mfu: 50.93 | epoch: 2 | total time: 126.67m | eta: 52.8m +step 11791/16704 (70.59%) | loss: 2.562224 | lrm: 0.59 | dt: 643.89ms | tok/sec: 814,247 | mfu: 50.89 | epoch: 2 | total time: 126.68m | eta: 52.8m +step 11792/16704 (70.59%) | loss: 2.567444 | lrm: 0.59 | dt: 644.96ms | tok/sec: 812,903 | mfu: 50.81 | epoch: 2 | total time: 126.69m | eta: 52.8m +step 11793/16704 (70.60%) | loss: 2.561450 | lrm: 0.59 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 2 | total time: 126.70m | eta: 52.8m +step 11794/16704 (70.61%) | loss: 2.555194 | lrm: 0.59 | dt: 642.32ms | tok/sec: 816,246 | mfu: 51.02 | epoch: 2 | total time: 126.72m | eta: 52.8m +step 11795/16704 (70.61%) | loss: 2.564171 | lrm: 0.59 | dt: 647.73ms | tok/sec: 809,428 | mfu: 50.59 | epoch: 2 | total time: 126.73m | eta: 52.8m +step 11796/16704 (70.62%) | loss: 2.574199 | lrm: 0.59 | dt: 642.24ms | tok/sec: 816,343 | mfu: 51.02 | epoch: 2 | total time: 126.74m | eta: 52.8m +step 11797/16704 (70.62%) | 
loss: 2.584718 | lrm: 0.59 | dt: 642.01ms | tok/sec: 816,634 | mfu: 51.04 | epoch: 2 | total time: 126.75m | eta: 52.8m +step 11798/16704 (70.63%) | loss: 2.590439 | lrm: 0.59 | dt: 646.03ms | tok/sec: 811,551 | mfu: 50.72 | epoch: 2 | total time: 126.76m | eta: 52.8m +step 11799/16704 (70.64%) | loss: 2.579942 | lrm: 0.59 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 2 | total time: 126.77m | eta: 52.7m +step 11800/16704 (70.64%) | loss: 2.590635 | lrm: 0.59 | dt: 644.56ms | tok/sec: 813,404 | mfu: 50.84 | epoch: 2 | total time: 126.78m | eta: 52.7m +step 11801/16704 (70.65%) | loss: 2.604737 | lrm: 0.59 | dt: 644.40ms | tok/sec: 813,611 | mfu: 50.85 | epoch: 2 | total time: 126.79m | eta: 52.7m +step 11802/16704 (70.65%) | loss: 2.608936 | lrm: 0.59 | dt: 646.02ms | tok/sec: 811,568 | mfu: 50.72 | epoch: 2 | total time: 126.80m | eta: 52.7m +step 11803/16704 (70.66%) | loss: 2.605502 | lrm: 0.59 | dt: 644.98ms | tok/sec: 812,869 | mfu: 50.81 | epoch: 2 | total time: 126.81m | eta: 52.7m +step 11804/16704 (70.67%) | loss: 2.605180 | lrm: 0.59 | dt: 642.04ms | tok/sec: 816,597 | mfu: 51.04 | epoch: 2 | total time: 126.82m | eta: 52.7m +step 11805/16704 (70.67%) | loss: 2.594180 | lrm: 0.59 | dt: 646.35ms | tok/sec: 811,148 | mfu: 50.70 | epoch: 2 | total time: 126.83m | eta: 52.7m +step 11806/16704 (70.68%) | loss: 2.590195 | lrm: 0.59 | dt: 642.72ms | tok/sec: 815,731 | mfu: 50.98 | epoch: 2 | total time: 126.84m | eta: 52.7m +step 11807/16704 (70.68%) | loss: 2.583629 | lrm: 0.59 | dt: 646.11ms | tok/sec: 811,457 | mfu: 50.72 | epoch: 2 | total time: 126.86m | eta: 52.7m +step 11808/16704 (70.69%) | loss: 2.588025 | lrm: 0.59 | dt: 643.94ms | tok/sec: 814,181 | mfu: 50.89 | epoch: 2 | total time: 126.87m | eta: 52.6m +step 11809/16704 (70.70%) | loss: 2.589970 | lrm: 0.59 | dt: 640.63ms | tok/sec: 818,391 | mfu: 51.15 | epoch: 2 | total time: 126.88m | eta: 52.6m +step 11810/16704 (70.70%) | loss: 2.585834 | lrm: 0.59 | dt: 645.00ms | tok/sec: 812,843 | 
mfu: 50.80 | epoch: 2 | total time: 126.89m | eta: 52.6m +step 11811/16704 (70.71%) | loss: 2.573850 | lrm: 0.59 | dt: 644.94ms | tok/sec: 812,929 | mfu: 50.81 | epoch: 2 | total time: 126.90m | eta: 52.6m +step 11812/16704 (70.71%) | loss: 2.579219 | lrm: 0.59 | dt: 645.13ms | tok/sec: 812,685 | mfu: 50.79 | epoch: 2 | total time: 126.91m | eta: 52.6m +step 11813/16704 (70.72%) | loss: 2.574516 | lrm: 0.59 | dt: 642.77ms | tok/sec: 815,674 | mfu: 50.98 | epoch: 2 | total time: 126.92m | eta: 52.6m +step 11814/16704 (70.73%) | loss: 2.563884 | lrm: 0.59 | dt: 643.50ms | tok/sec: 814,738 | mfu: 50.92 | epoch: 2 | total time: 126.93m | eta: 52.6m +step 11815/16704 (70.73%) | loss: 2.568230 | lrm: 0.59 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 2 | total time: 126.94m | eta: 52.6m +step 11816/16704 (70.74%) | loss: 2.588263 | lrm: 0.59 | dt: 644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 2 | total time: 126.95m | eta: 52.6m +step 11817/16704 (70.74%) | loss: 2.578283 | lrm: 0.59 | dt: 642.78ms | tok/sec: 815,654 | mfu: 50.98 | epoch: 2 | total time: 126.96m | eta: 52.6m +step 11818/16704 (70.75%) | loss: 2.585790 | lrm: 0.59 | dt: 644.79ms | tok/sec: 813,110 | mfu: 50.82 | epoch: 2 | total time: 126.97m | eta: 52.5m +step 11819/16704 (70.76%) | loss: 2.592046 | lrm: 0.58 | dt: 644.87ms | tok/sec: 813,009 | mfu: 50.81 | epoch: 2 | total time: 126.98m | eta: 52.5m +step 11820/16704 (70.76%) | loss: 2.583151 | lrm: 0.58 | dt: 644.12ms | tok/sec: 813,964 | mfu: 50.87 | epoch: 2 | total time: 126.99m | eta: 52.5m +step 11821/16704 (70.77%) | loss: 2.587417 | lrm: 0.58 | dt: 643.37ms | tok/sec: 814,913 | mfu: 50.93 | epoch: 2 | total time: 127.01m | eta: 52.5m +step 11822/16704 (70.77%) | loss: 2.579465 | lrm: 0.58 | dt: 644.64ms | tok/sec: 813,304 | mfu: 50.83 | epoch: 2 | total time: 127.02m | eta: 52.5m +step 11823/16704 (70.78%) | loss: 2.575864 | lrm: 0.58 | dt: 643.96ms | tok/sec: 814,166 | mfu: 50.89 | epoch: 2 | total time: 127.03m | eta: 52.5m +step 
11824/16704 (70.79%) | loss: 2.563118 | lrm: 0.58 | dt: 641.25ms | tok/sec: 817,605 | mfu: 51.10 | epoch: 2 | total time: 127.04m | eta: 52.5m +step 11825/16704 (70.79%) | loss: 2.556217 | lrm: 0.58 | dt: 644.41ms | tok/sec: 813,591 | mfu: 50.85 | epoch: 2 | total time: 127.05m | eta: 52.5m +step 11826/16704 (70.80%) | loss: 2.552749 | lrm: 0.58 | dt: 643.41ms | tok/sec: 814,860 | mfu: 50.93 | epoch: 2 | total time: 127.06m | eta: 52.5m +step 11827/16704 (70.80%) | loss: 2.554309 | lrm: 0.58 | dt: 643.43ms | tok/sec: 814,834 | mfu: 50.93 | epoch: 2 | total time: 127.07m | eta: 52.4m +step 11828/16704 (70.81%) | loss: 2.563909 | lrm: 0.58 | dt: 644.86ms | tok/sec: 813,027 | mfu: 50.82 | epoch: 2 | total time: 127.08m | eta: 52.4m +step 11829/16704 (70.82%) | loss: 2.562258 | lrm: 0.58 | dt: 642.80ms | tok/sec: 815,631 | mfu: 50.98 | epoch: 2 | total time: 127.09m | eta: 52.4m +step 11830/16704 (70.82%) | loss: 2.565077 | lrm: 0.58 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 2 | total time: 127.10m | eta: 52.4m +step 11831/16704 (70.83%) | loss: 2.565440 | lrm: 0.58 | dt: 644.95ms | tok/sec: 812,907 | mfu: 50.81 | epoch: 2 | total time: 127.11m | eta: 52.4m +step 11832/16704 (70.83%) | loss: 2.561981 | lrm: 0.58 | dt: 645.85ms | tok/sec: 811,777 | mfu: 50.74 | epoch: 2 | total time: 127.12m | eta: 52.4m +step 11833/16704 (70.84%) | loss: 2.554372 | lrm: 0.58 | dt: 643.91ms | tok/sec: 814,230 | mfu: 50.89 | epoch: 2 | total time: 127.13m | eta: 52.4m +step 11834/16704 (70.85%) | loss: 2.554928 | lrm: 0.58 | dt: 642.97ms | tok/sec: 815,411 | mfu: 50.96 | epoch: 2 | total time: 127.14m | eta: 52.4m +step 11835/16704 (70.85%) | loss: 2.555459 | lrm: 0.58 | dt: 644.27ms | tok/sec: 813,767 | mfu: 50.86 | epoch: 2 | total time: 127.16m | eta: 52.4m +step 11836/16704 (70.86%) | loss: 2.557326 | lrm: 0.58 | dt: 645.33ms | tok/sec: 812,434 | mfu: 50.78 | epoch: 2 | total time: 127.17m | eta: 52.3m +step 11837/16704 (70.86%) | loss: 2.549125 | lrm: 0.58 | dt: 
641.93ms | tok/sec: 816,731 | mfu: 51.05 | epoch: 2 | total time: 127.18m | eta: 52.3m +step 11838/16704 (70.87%) | loss: 2.558875 | lrm: 0.58 | dt: 645.00ms | tok/sec: 812,843 | mfu: 50.80 | epoch: 2 | total time: 127.19m | eta: 52.3m +step 11839/16704 (70.88%) | loss: 2.545621 | lrm: 0.58 | dt: 643.31ms | tok/sec: 814,982 | mfu: 50.94 | epoch: 2 | total time: 127.20m | eta: 52.3m +step 11840/16704 (70.88%) | loss: 2.550146 | lrm: 0.58 | dt: 644.18ms | tok/sec: 813,882 | mfu: 50.87 | epoch: 2 | total time: 127.21m | eta: 52.3m +step 11841/16704 (70.89%) | loss: 2.555788 | lrm: 0.58 | dt: 644.62ms | tok/sec: 813,322 | mfu: 50.83 | epoch: 2 | total time: 127.22m | eta: 52.3m +step 11842/16704 (70.89%) | loss: 2.563647 | lrm: 0.58 | dt: 643.69ms | tok/sec: 814,500 | mfu: 50.91 | epoch: 2 | total time: 127.23m | eta: 52.3m +step 11843/16704 (70.90%) | loss: 2.559409 | lrm: 0.58 | dt: 644.26ms | tok/sec: 813,784 | mfu: 50.86 | epoch: 2 | total time: 127.24m | eta: 52.3m +step 11844/16704 (70.91%) | loss: 2.557926 | lrm: 0.58 | dt: 644.77ms | tok/sec: 813,138 | mfu: 50.82 | epoch: 2 | total time: 127.25m | eta: 52.3m +step 11845/16704 (70.91%) | loss: 2.561739 | lrm: 0.58 | dt: 644.95ms | tok/sec: 812,911 | mfu: 50.81 | epoch: 2 | total time: 127.26m | eta: 52.2m +step 11846/16704 (70.92%) | loss: 2.562099 | lrm: 0.58 | dt: 643.48ms | tok/sec: 814,774 | mfu: 50.92 | epoch: 2 | total time: 127.27m | eta: 52.2m +step 11847/16704 (70.92%) | loss: 2.556852 | lrm: 0.58 | dt: 646.15ms | tok/sec: 811,399 | mfu: 50.71 | epoch: 2 | total time: 127.28m | eta: 52.2m +step 11848/16704 (70.93%) | loss: 2.564692 | lrm: 0.58 | dt: 641.97ms | tok/sec: 816,681 | mfu: 51.04 | epoch: 2 | total time: 127.30m | eta: 52.2m +step 11849/16704 (70.94%) | loss: 2.572921 | lrm: 0.58 | dt: 643.78ms | tok/sec: 814,386 | mfu: 50.90 | epoch: 2 | total time: 127.31m | eta: 52.2m +step 11850/16704 (70.94%) | loss: 2.574301 | lrm: 0.58 | dt: 642.60ms | tok/sec: 815,887 | mfu: 50.99 | epoch: 2 | total 
time: 127.32m | eta: 52.2m +step 11851/16704 (70.95%) | loss: 2.565885 | lrm: 0.58 | dt: 643.68ms | tok/sec: 814,517 | mfu: 50.91 | epoch: 2 | total time: 127.33m | eta: 52.2m +step 11852/16704 (70.95%) | loss: 2.566994 | lrm: 0.58 | dt: 644.45ms | tok/sec: 813,541 | mfu: 50.85 | epoch: 2 | total time: 127.34m | eta: 52.2m +step 11853/16704 (70.96%) | loss: 2.581312 | lrm: 0.58 | dt: 643.09ms | tok/sec: 815,268 | mfu: 50.96 | epoch: 2 | total time: 127.35m | eta: 52.2m +step 11854/16704 (70.97%) | loss: 2.586468 | lrm: 0.58 | dt: 643.68ms | tok/sec: 814,513 | mfu: 50.91 | epoch: 2 | total time: 127.36m | eta: 52.2m +step 11855/16704 (70.97%) | loss: 2.587767 | lrm: 0.58 | dt: 644.01ms | tok/sec: 814,105 | mfu: 50.88 | epoch: 2 | total time: 127.37m | eta: 52.1m +step 11856/16704 (70.98%) | loss: 2.596632 | lrm: 0.58 | dt: 644.31ms | tok/sec: 813,721 | mfu: 50.86 | epoch: 2 | total time: 127.38m | eta: 52.1m +step 11857/16704 (70.98%) | loss: 2.580199 | lrm: 0.58 | dt: 643.20ms | tok/sec: 815,119 | mfu: 50.95 | epoch: 2 | total time: 127.39m | eta: 52.1m +step 11858/16704 (70.99%) | loss: 2.563895 | lrm: 0.58 | dt: 644.21ms | tok/sec: 813,848 | mfu: 50.87 | epoch: 2 | total time: 127.40m | eta: 52.1m +step 11859/16704 (70.99%) | loss: 2.577251 | lrm: 0.58 | dt: 642.29ms | tok/sec: 816,273 | mfu: 51.02 | epoch: 2 | total time: 127.41m | eta: 52.1m +step 11860/16704 (71.00%) | loss: 2.582749 | lrm: 0.58 | dt: 645.06ms | tok/sec: 812,770 | mfu: 50.80 | epoch: 2 | total time: 127.42m | eta: 52.1m +step 11861/16704 (71.01%) | loss: 2.573962 | lrm: 0.58 | dt: 644.89ms | tok/sec: 812,986 | mfu: 50.81 | epoch: 2 | total time: 127.43m | eta: 52.1m +step 11862/16704 (71.01%) | loss: 2.565029 | lrm: 0.58 | dt: 642.17ms | tok/sec: 816,429 | mfu: 51.03 | epoch: 2 | total time: 127.45m | eta: 52.1m +step 11863/16704 (71.02%) | loss: 2.564105 | lrm: 0.58 | dt: 645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 2 | total time: 127.46m | eta: 52.1m +step 11864/16704 (71.02%) | loss: 
2.568107 | lrm: 0.58 | dt: 643.14ms | tok/sec: 815,196 | mfu: 50.95 | epoch: 2 | total time: 127.47m | eta: 52.0m +step 11865/16704 (71.03%) | loss: 2.572840 | lrm: 0.58 | dt: 644.82ms | tok/sec: 813,071 | mfu: 50.82 | epoch: 2 | total time: 127.48m | eta: 52.0m +step 11866/16704 (71.04%) | loss: 2.579332 | lrm: 0.58 | dt: 643.10ms | tok/sec: 815,248 | mfu: 50.95 | epoch: 2 | total time: 127.49m | eta: 52.0m +step 11867/16704 (71.04%) | loss: 2.584952 | lrm: 0.58 | dt: 645.14ms | tok/sec: 812,675 | mfu: 50.79 | epoch: 2 | total time: 127.50m | eta: 52.0m +step 11868/16704 (71.05%) | loss: 2.582042 | lrm: 0.58 | dt: 642.89ms | tok/sec: 815,515 | mfu: 50.97 | epoch: 2 | total time: 127.51m | eta: 52.0m +step 11869/16704 (71.05%) | loss: 2.577351 | lrm: 0.58 | dt: 644.91ms | tok/sec: 812,968 | mfu: 50.81 | epoch: 2 | total time: 127.52m | eta: 52.0m +step 11870/16704 (71.06%) | loss: 2.584685 | lrm: 0.58 | dt: 643.99ms | tok/sec: 814,121 | mfu: 50.88 | epoch: 2 | total time: 127.53m | eta: 52.0m +step 11871/16704 (71.07%) | loss: 2.581100 | lrm: 0.58 | dt: 644.41ms | tok/sec: 813,587 | mfu: 50.85 | epoch: 2 | total time: 127.54m | eta: 52.0m +step 11872/16704 (71.07%) | loss: 2.574645 | lrm: 0.58 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 2 | total time: 127.55m | eta: 52.0m +step 11873/16704 (71.08%) | loss: 2.576843 | lrm: 0.58 | dt: 642.17ms | tok/sec: 816,428 | mfu: 51.03 | epoch: 2 | total time: 127.56m | eta: 51.9m +step 11874/16704 (71.08%) | loss: 2.574382 | lrm: 0.58 | dt: 643.73ms | tok/sec: 814,447 | mfu: 50.90 | epoch: 2 | total time: 127.57m | eta: 51.9m +step 11875/16704 (71.09%) | loss: 2.580174 | lrm: 0.58 | dt: 641.73ms | tok/sec: 816,988 | mfu: 51.06 | epoch: 2 | total time: 127.58m | eta: 51.9m +step 11876/16704 (71.10%) | loss: 2.584150 | lrm: 0.58 | dt: 643.50ms | tok/sec: 814,743 | mfu: 50.92 | epoch: 2 | total time: 127.60m | eta: 51.9m +step 11877/16704 (71.10%) | loss: 2.585675 | lrm: 0.58 | dt: 645.42ms | tok/sec: 812,324 | mfu: 
50.77 | epoch: 2 | total time: 127.61m | eta: 51.9m +step 11878/16704 (71.11%) | loss: 2.578340 | lrm: 0.58 | dt: 643.53ms | tok/sec: 814,707 | mfu: 50.92 | epoch: 2 | total time: 127.62m | eta: 51.9m +step 11879/16704 (71.11%) | loss: 2.574515 | lrm: 0.58 | dt: 643.50ms | tok/sec: 814,744 | mfu: 50.92 | epoch: 2 | total time: 127.63m | eta: 51.9m +step 11880/16704 (71.12%) | loss: 2.570880 | lrm: 0.58 | dt: 641.28ms | tok/sec: 817,566 | mfu: 51.10 | epoch: 2 | total time: 127.64m | eta: 51.9m +step 11881/16704 (71.13%) | loss: 2.577100 | lrm: 0.58 | dt: 645.14ms | tok/sec: 812,669 | mfu: 50.79 | epoch: 2 | total time: 127.65m | eta: 51.9m +step 11882/16704 (71.13%) | loss: 2.568686 | lrm: 0.58 | dt: 643.21ms | tok/sec: 815,112 | mfu: 50.95 | epoch: 2 | total time: 127.66m | eta: 51.9m +step 11883/16704 (71.14%) | loss: 2.564246 | lrm: 0.58 | dt: 642.26ms | tok/sec: 816,311 | mfu: 51.02 | epoch: 2 | total time: 127.67m | eta: 51.8m +step 11884/16704 (71.14%) | loss: 2.563402 | lrm: 0.58 | dt: 644.85ms | tok/sec: 813,037 | mfu: 50.82 | epoch: 2 | total time: 127.68m | eta: 51.8m +step 11885/16704 (71.15%) | loss: 2.571897 | lrm: 0.58 | dt: 644.58ms | tok/sec: 813,381 | mfu: 50.84 | epoch: 2 | total time: 127.69m | eta: 51.8m +step 11886/16704 (71.16%) | loss: 2.576769 | lrm: 0.58 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 2 | total time: 127.70m | eta: 51.8m +step 11887/16704 (71.16%) | loss: 2.565972 | lrm: 0.58 | dt: 643.84ms | tok/sec: 814,317 | mfu: 50.90 | epoch: 2 | total time: 127.71m | eta: 51.8m +step 11888/16704 (71.17%) | loss: 2.555652 | lrm: 0.58 | dt: 644.13ms | tok/sec: 813,946 | mfu: 50.87 | epoch: 2 | total time: 127.72m | eta: 51.8m +step 11889/16704 (71.17%) | loss: 2.572027 | lrm: 0.58 | dt: 642.96ms | tok/sec: 815,422 | mfu: 50.97 | epoch: 2 | total time: 127.74m | eta: 51.8m +step 11890/16704 (71.18%) | loss: 2.565348 | lrm: 0.58 | dt: 643.97ms | tok/sec: 814,154 | mfu: 50.89 | epoch: 2 | total time: 127.75m | eta: 51.8m +step 
11891/16704 (71.19%) | loss: 2.559077 | lrm: 0.58 | dt: 643.25ms | tok/sec: 815,067 | mfu: 50.94 | epoch: 2 | total time: 127.76m | eta: 51.8m +step 11892/16704 (71.19%) | loss: 2.572290 | lrm: 0.58 | dt: 645.74ms | tok/sec: 811,915 | mfu: 50.75 | epoch: 2 | total time: 127.77m | eta: 51.7m +step 11893/16704 (71.20%) | loss: 2.575246 | lrm: 0.58 | dt: 642.42ms | tok/sec: 816,116 | mfu: 51.01 | epoch: 2 | total time: 127.78m | eta: 51.7m +step 11894/16704 (71.20%) | loss: 2.569682 | lrm: 0.58 | dt: 644.60ms | tok/sec: 813,352 | mfu: 50.84 | epoch: 2 | total time: 127.79m | eta: 51.7m +step 11895/16704 (71.21%) | loss: 2.568199 | lrm: 0.58 | dt: 644.03ms | tok/sec: 814,075 | mfu: 50.88 | epoch: 2 | total time: 127.80m | eta: 51.7m +step 11896/16704 (71.22%) | loss: 2.567723 | lrm: 0.58 | dt: 643.99ms | tok/sec: 814,126 | mfu: 50.88 | epoch: 2 | total time: 127.81m | eta: 51.7m +step 11897/16704 (71.22%) | loss: 2.570176 | lrm: 0.58 | dt: 644.86ms | tok/sec: 813,020 | mfu: 50.81 | epoch: 2 | total time: 127.82m | eta: 51.7m +step 11898/16704 (71.23%) | loss: 2.583789 | lrm: 0.58 | dt: 642.63ms | tok/sec: 815,847 | mfu: 50.99 | epoch: 2 | total time: 127.83m | eta: 51.7m +step 11899/16704 (71.23%) | loss: 2.584908 | lrm: 0.58 | dt: 644.09ms | tok/sec: 813,999 | mfu: 50.88 | epoch: 2 | total time: 127.84m | eta: 51.7m +step 11900/16704 (71.24%) | loss: 2.581392 | lrm: 0.58 | dt: 642.71ms | tok/sec: 815,743 | mfu: 50.99 | epoch: 2 | total time: 127.85m | eta: 51.7m +step 11901/16704 (71.25%) | loss: 2.573165 | lrm: 0.58 | dt: 644.62ms | tok/sec: 813,323 | mfu: 50.83 | epoch: 2 | total time: 127.86m | eta: 51.6m +step 11902/16704 (71.25%) | loss: 2.586599 | lrm: 0.57 | dt: 642.91ms | tok/sec: 815,490 | mfu: 50.97 | epoch: 2 | total time: 127.87m | eta: 51.6m +step 11903/16704 (71.26%) | loss: 2.580896 | lrm: 0.57 | dt: 643.43ms | tok/sec: 814,827 | mfu: 50.93 | epoch: 2 | total time: 127.89m | eta: 51.6m +step 11904/16704 (71.26%) | loss: 2.583078 | lrm: 0.57 | dt: 
642.92ms | tok/sec: 815,479 | mfu: 50.97 | epoch: 2 | total time: 127.90m | eta: 51.6m +step 11905/16704 (71.27%) | loss: 2.582582 | lrm: 0.57 | dt: 642.58ms | tok/sec: 815,908 | mfu: 51.00 | epoch: 2 | total time: 127.91m | eta: 51.6m +step 11906/16704 (71.28%) | loss: 2.580189 | lrm: 0.57 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 2 | total time: 127.92m | eta: 51.6m +step 11907/16704 (71.28%) | loss: 2.578407 | lrm: 0.57 | dt: 642.43ms | tok/sec: 816,096 | mfu: 51.01 | epoch: 2 | total time: 127.93m | eta: 51.6m +step 11908/16704 (71.29%) | loss: 2.568272 | lrm: 0.57 | dt: 644.58ms | tok/sec: 813,375 | mfu: 50.84 | epoch: 2 | total time: 127.94m | eta: 51.6m +step 11909/16704 (71.29%) | loss: 2.577986 | lrm: 0.57 | dt: 646.45ms | tok/sec: 811,032 | mfu: 50.69 | epoch: 2 | total time: 127.95m | eta: 51.6m +step 11910/16704 (71.30%) | loss: 2.561923 | lrm: 0.57 | dt: 642.86ms | tok/sec: 815,549 | mfu: 50.97 | epoch: 2 | total time: 127.96m | eta: 51.5m +step 11911/16704 (71.31%) | loss: 2.556762 | lrm: 0.57 | dt: 642.53ms | tok/sec: 815,979 | mfu: 51.00 | epoch: 2 | total time: 127.97m | eta: 51.5m +step 11912/16704 (71.31%) | loss: 2.562772 | lrm: 0.57 | dt: 644.22ms | tok/sec: 813,829 | mfu: 50.87 | epoch: 2 | total time: 127.98m | eta: 51.5m +step 11913/16704 (71.32%) | loss: 2.577218 | lrm: 0.57 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 2 | total time: 127.99m | eta: 51.5m +step 11914/16704 (71.32%) | loss: 2.583306 | lrm: 0.57 | dt: 646.55ms | tok/sec: 810,899 | mfu: 50.68 | epoch: 2 | total time: 128.00m | eta: 51.5m +step 11915/16704 (71.33%) | loss: 2.583575 | lrm: 0.57 | dt: 643.72ms | tok/sec: 814,463 | mfu: 50.91 | epoch: 2 | total time: 128.01m | eta: 51.5m +step 11916/16704 (71.34%) | loss: 2.580291 | lrm: 0.57 | dt: 643.53ms | tok/sec: 814,705 | mfu: 50.92 | epoch: 2 | total time: 128.02m | eta: 51.5m +step 11917/16704 (71.34%) | loss: 2.568728 | lrm: 0.57 | dt: 645.05ms | tok/sec: 812,791 | mfu: 50.80 | epoch: 2 | total 
time: 128.04m | eta: 51.5m +step 11918/16704 (71.35%) | loss: 2.579340 | lrm: 0.57 | dt: 644.89ms | tok/sec: 812,986 | mfu: 50.81 | epoch: 2 | total time: 128.05m | eta: 51.5m +step 11919/16704 (71.35%) | loss: 2.576055 | lrm: 0.57 | dt: 644.00ms | tok/sec: 814,109 | mfu: 50.88 | epoch: 2 | total time: 128.06m | eta: 51.5m +step 11920/16704 (71.36%) | loss: 2.569320 | lrm: 0.57 | dt: 644.38ms | tok/sec: 813,637 | mfu: 50.85 | epoch: 2 | total time: 128.07m | eta: 51.4m +step 11921/16704 (71.37%) | loss: 2.580680 | lrm: 0.57 | dt: 643.49ms | tok/sec: 814,760 | mfu: 50.92 | epoch: 2 | total time: 128.08m | eta: 51.4m +step 11922/16704 (71.37%) | loss: 2.567061 | lrm: 0.57 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 2 | total time: 128.09m | eta: 51.4m +step 11923/16704 (71.38%) | loss: 2.562607 | lrm: 0.57 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 2 | total time: 128.10m | eta: 51.4m +step 11924/16704 (71.38%) | loss: 2.564350 | lrm: 0.57 | dt: 644.58ms | tok/sec: 813,375 | mfu: 50.84 | epoch: 2 | total time: 128.11m | eta: 51.4m +step 11925/16704 (71.39%) | loss: 2.555732 | lrm: 0.57 | dt: 642.16ms | tok/sec: 816,446 | mfu: 51.03 | epoch: 2 | total time: 128.12m | eta: 51.4m +step 11926/16704 (71.40%) | loss: 2.552496 | lrm: 0.57 | dt: 642.26ms | tok/sec: 816,315 | mfu: 51.02 | epoch: 2 | total time: 128.13m | eta: 51.4m +step 11927/16704 (71.40%) | loss: 2.545849 | lrm: 0.57 | dt: 643.46ms | tok/sec: 814,789 | mfu: 50.93 | epoch: 2 | total time: 128.14m | eta: 51.4m +step 11928/16704 (71.41%) | loss: 2.557060 | lrm: 0.57 | dt: 643.55ms | tok/sec: 814,678 | mfu: 50.92 | epoch: 2 | total time: 128.15m | eta: 51.4m +step 11929/16704 (71.41%) | loss: 2.562570 | lrm: 0.57 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 2 | total time: 128.16m | eta: 51.3m +step 11930/16704 (71.42%) | loss: 2.554718 | lrm: 0.57 | dt: 645.17ms | tok/sec: 812,636 | mfu: 50.79 | epoch: 2 | total time: 128.18m | eta: 51.3m +step 11931/16704 (71.43%) | loss: 
2.559735 | lrm: 0.57 | dt: 642.39ms | tok/sec: 816,151 | mfu: 51.01 | epoch: 2 | total time: 128.19m | eta: 51.3m +step 11932/16704 (71.43%) | loss: 2.556940 | lrm: 0.57 | dt: 644.89ms | tok/sec: 812,984 | mfu: 50.81 | epoch: 2 | total time: 128.20m | eta: 51.3m +step 11933/16704 (71.44%) | loss: 2.559721 | lrm: 0.57 | dt: 645.20ms | tok/sec: 812,596 | mfu: 50.79 | epoch: 2 | total time: 128.21m | eta: 51.3m +step 11934/16704 (71.44%) | loss: 2.561253 | lrm: 0.57 | dt: 643.10ms | tok/sec: 815,254 | mfu: 50.95 | epoch: 2 | total time: 128.22m | eta: 51.3m +step 11935/16704 (71.45%) | loss: 2.552640 | lrm: 0.57 | dt: 647.01ms | tok/sec: 810,323 | mfu: 50.65 | epoch: 2 | total time: 128.23m | eta: 51.3m +step 11936/16704 (71.46%) | loss: 2.538695 | lrm: 0.57 | dt: 643.49ms | tok/sec: 814,760 | mfu: 50.92 | epoch: 2 | total time: 128.24m | eta: 51.3m +step 11937/16704 (71.46%) | loss: 2.547407 | lrm: 0.57 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 2 | total time: 128.25m | eta: 51.3m +step 11938/16704 (71.47%) | loss: 2.551685 | lrm: 0.57 | dt: 644.01ms | tok/sec: 814,095 | mfu: 50.88 | epoch: 2 | total time: 128.26m | eta: 51.2m +step 11939/16704 (71.47%) | loss: 2.546804 | lrm: 0.57 | dt: 643.63ms | tok/sec: 814,582 | mfu: 50.91 | epoch: 2 | total time: 128.27m | eta: 51.2m +step 11940/16704 (71.48%) | loss: 2.540505 | lrm: 0.57 | dt: 643.73ms | tok/sec: 814,455 | mfu: 50.90 | epoch: 2 | total time: 128.28m | eta: 51.2m +step 11941/16704 (71.49%) | loss: 2.547684 | lrm: 0.57 | dt: 646.09ms | tok/sec: 811,475 | mfu: 50.72 | epoch: 2 | total time: 128.29m | eta: 51.2m +step 11942/16704 (71.49%) | loss: 2.545953 | lrm: 0.57 | dt: 642.87ms | tok/sec: 815,548 | mfu: 50.97 | epoch: 2 | total time: 128.30m | eta: 51.2m +step 11943/16704 (71.50%) | loss: 2.558882 | lrm: 0.57 | dt: 642.76ms | tok/sec: 815,685 | mfu: 50.98 | epoch: 2 | total time: 128.31m | eta: 51.2m +step 11944/16704 (71.50%) | loss: 2.572436 | lrm: 0.57 | dt: 643.28ms | tok/sec: 815,024 | mfu: 
50.94 | epoch: 2 | total time: 128.33m | eta: 51.2m +step 11945/16704 (71.51%) | loss: 2.566369 | lrm: 0.57 | dt: 644.68ms | tok/sec: 813,252 | mfu: 50.83 | epoch: 2 | total time: 128.34m | eta: 51.2m +step 11946/16704 (71.52%) | loss: 2.557941 | lrm: 0.57 | dt: 643.93ms | tok/sec: 814,197 | mfu: 50.89 | epoch: 2 | total time: 128.35m | eta: 51.2m +step 11947/16704 (71.52%) | loss: 2.585949 | lrm: 0.57 | dt: 645.80ms | tok/sec: 811,836 | mfu: 50.74 | epoch: 2 | total time: 128.36m | eta: 51.2m +step 11948/16704 (71.53%) | loss: 2.584637 | lrm: 0.57 | dt: 643.77ms | tok/sec: 814,399 | mfu: 50.90 | epoch: 2 | total time: 128.37m | eta: 51.1m +step 11949/16704 (71.53%) | loss: 2.574483 | lrm: 0.57 | dt: 643.45ms | tok/sec: 814,812 | mfu: 50.93 | epoch: 2 | total time: 128.38m | eta: 51.1m +step 11950/16704 (71.54%) | loss: 2.569609 | lrm: 0.57 | dt: 645.01ms | tok/sec: 812,838 | mfu: 50.80 | epoch: 2 | total time: 128.39m | eta: 51.1m +step 11951/16704 (71.55%) | loss: 2.564556 | lrm: 0.57 | dt: 644.24ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 2 | total time: 128.40m | eta: 51.1m +step 11952/16704 (71.55%) | loss: 2.571071 | lrm: 0.57 | dt: 643.51ms | tok/sec: 814,735 | mfu: 50.92 | epoch: 2 | total time: 128.41m | eta: 51.1m +step 11953/16704 (71.56%) | loss: 2.564319 | lrm: 0.57 | dt: 642.16ms | tok/sec: 816,446 | mfu: 51.03 | epoch: 2 | total time: 128.42m | eta: 51.1m +step 11954/16704 (71.56%) | loss: 2.568960 | lrm: 0.57 | dt: 643.67ms | tok/sec: 814,530 | mfu: 50.91 | epoch: 2 | total time: 128.43m | eta: 51.1m +step 11955/16704 (71.57%) | loss: 2.565304 | lrm: 0.57 | dt: 644.28ms | tok/sec: 813,759 | mfu: 50.86 | epoch: 2 | total time: 128.44m | eta: 51.1m +step 11956/16704 (71.58%) | loss: 2.556990 | lrm: 0.57 | dt: 645.79ms | tok/sec: 811,861 | mfu: 50.74 | epoch: 2 | total time: 128.45m | eta: 51.1m +step 11957/16704 (71.58%) | loss: 2.568444 | lrm: 0.57 | dt: 642.87ms | tok/sec: 815,543 | mfu: 50.97 | epoch: 2 | total time: 128.46m | eta: 51.0m +step 
11958/16704 (71.59%) | loss: 2.568153 | lrm: 0.57 | dt: 644.01ms | tok/sec: 814,099 | mfu: 50.88 | epoch: 2 | total time: 128.48m | eta: 51.0m +step 11959/16704 (71.59%) | loss: 2.568879 | lrm: 0.57 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 2 | total time: 128.49m | eta: 51.0m +step 11960/16704 (71.60%) | loss: 2.567346 | lrm: 0.57 | dt: 642.57ms | tok/sec: 815,918 | mfu: 51.00 | epoch: 2 | total time: 128.50m | eta: 51.0m +step 11961/16704 (71.61%) | loss: 2.569896 | lrm: 0.57 | dt: 647.01ms | tok/sec: 810,326 | mfu: 50.65 | epoch: 2 | total time: 128.51m | eta: 51.0m +step 11962/16704 (71.61%) | loss: 2.576308 | lrm: 0.57 | dt: 643.25ms | tok/sec: 815,064 | mfu: 50.94 | epoch: 2 | total time: 128.52m | eta: 51.0m +step 11963/16704 (71.62%) | loss: 2.569569 | lrm: 0.57 | dt: 645.31ms | tok/sec: 812,464 | mfu: 50.78 | epoch: 2 | total time: 128.53m | eta: 51.0m +step 11964/16704 (71.62%) | loss: 2.572862 | lrm: 0.57 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 2 | total time: 128.54m | eta: 51.0m +step 11965/16704 (71.63%) | loss: 2.566979 | lrm: 0.57 | dt: 643.01ms | tok/sec: 815,362 | mfu: 50.96 | epoch: 2 | total time: 128.55m | eta: 51.0m +step 11966/16704 (71.64%) | loss: 2.538540 | lrm: 0.57 | dt: 643.56ms | tok/sec: 814,662 | mfu: 50.92 | epoch: 2 | total time: 128.56m | eta: 50.9m +step 11967/16704 (71.64%) | loss: 2.538446 | lrm: 0.57 | dt: 641.38ms | tok/sec: 817,437 | mfu: 51.09 | epoch: 2 | total time: 128.57m | eta: 50.9m +step 11968/16704 (71.65%) | loss: 2.541418 | lrm: 0.57 | dt: 642.31ms | tok/sec: 816,253 | mfu: 51.02 | epoch: 2 | total time: 128.58m | eta: 50.9m +step 11969/16704 (71.65%) | loss: 2.534313 | lrm: 0.57 | dt: 642.69ms | tok/sec: 815,772 | mfu: 50.99 | epoch: 2 | total time: 128.59m | eta: 50.9m +step 11970/16704 (71.66%) | loss: 2.533255 | lrm: 0.57 | dt: 641.73ms | tok/sec: 816,986 | mfu: 51.06 | epoch: 2 | total time: 128.60m | eta: 50.9m +step 11971/16704 (71.67%) | loss: 2.547106 | lrm: 0.57 | dt: 
642.60ms | tok/sec: 815,888 | mfu: 50.99 | epoch: 2 | total time: 128.62m | eta: 50.9m +step 11972/16704 (71.67%) | loss: 2.554303 | lrm: 0.57 | dt: 643.12ms | tok/sec: 815,222 | mfu: 50.95 | epoch: 2 | total time: 128.63m | eta: 50.9m +step 11973/16704 (71.68%) | loss: 2.545535 | lrm: 0.57 | dt: 645.67ms | tok/sec: 812,003 | mfu: 50.75 | epoch: 2 | total time: 128.64m | eta: 50.9m +step 11974/16704 (71.68%) | loss: 2.537645 | lrm: 0.57 | dt: 644.13ms | tok/sec: 813,949 | mfu: 50.87 | epoch: 2 | total time: 128.65m | eta: 50.9m +step 11975/16704 (71.69%) | loss: 2.554172 | lrm: 0.57 | dt: 642.17ms | tok/sec: 816,426 | mfu: 51.03 | epoch: 2 | total time: 128.66m | eta: 50.9m +step 11976/16704 (71.70%) | loss: 2.564787 | lrm: 0.57 | dt: 643.19ms | tok/sec: 815,136 | mfu: 50.95 | epoch: 2 | total time: 128.67m | eta: 50.8m +step 11977/16704 (71.70%) | loss: 2.571702 | lrm: 0.57 | dt: 643.23ms | tok/sec: 815,087 | mfu: 50.94 | epoch: 2 | total time: 128.68m | eta: 50.8m +step 11978/16704 (71.71%) | loss: 2.563422 | lrm: 0.57 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 2 | total time: 128.69m | eta: 50.8m +step 11979/16704 (71.71%) | loss: 2.558819 | lrm: 0.57 | dt: 641.77ms | tok/sec: 816,934 | mfu: 51.06 | epoch: 2 | total time: 128.70m | eta: 50.8m +step 11980/16704 (71.72%) | loss: 2.561814 | lrm: 0.57 | dt: 642.37ms | tok/sec: 816,178 | mfu: 51.01 | epoch: 2 | total time: 128.71m | eta: 50.8m +step 11981/16704 (71.73%) | loss: 2.556973 | lrm: 0.57 | dt: 642.83ms | tok/sec: 815,589 | mfu: 50.98 | epoch: 2 | total time: 128.72m | eta: 50.8m +step 11982/16704 (71.73%) | loss: 2.546495 | lrm: 0.57 | dt: 644.65ms | tok/sec: 813,286 | mfu: 50.83 | epoch: 2 | total time: 128.73m | eta: 50.8m +step 11983/16704 (71.74%) | loss: 2.542303 | lrm: 0.57 | dt: 644.66ms | tok/sec: 813,276 | mfu: 50.83 | epoch: 2 | total time: 128.74m | eta: 50.8m +step 11984/16704 (71.74%) | loss: 2.539567 | lrm: 0.57 | dt: 643.99ms | tok/sec: 814,126 | mfu: 50.88 | epoch: 2 | total 
time: 128.75m | eta: 50.8m +step 11985/16704 (71.75%) | loss: 2.542940 | lrm: 0.57 | dt: 648.40ms | tok/sec: 808,584 | mfu: 50.54 | epoch: 2 | total time: 128.77m | eta: 50.7m +step 11986/16704 (71.76%) | loss: 2.539702 | lrm: 0.56 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 2 | total time: 128.78m | eta: 50.7m +step 11987/16704 (71.76%) | loss: 2.546171 | lrm: 0.56 | dt: 642.00ms | tok/sec: 816,652 | mfu: 51.04 | epoch: 2 | total time: 128.79m | eta: 50.7m +step 11988/16704 (71.77%) | loss: 2.555641 | lrm: 0.56 | dt: 644.17ms | tok/sec: 813,893 | mfu: 50.87 | epoch: 2 | total time: 128.80m | eta: 50.7m +step 11989/16704 (71.77%) | loss: 2.553056 | lrm: 0.56 | dt: 645.54ms | tok/sec: 812,168 | mfu: 50.76 | epoch: 2 | total time: 128.81m | eta: 50.7m +step 11990/16704 (71.78%) | loss: 2.561671 | lrm: 0.56 | dt: 642.32ms | tok/sec: 816,243 | mfu: 51.02 | epoch: 2 | total time: 128.82m | eta: 50.7m +step 11991/16704 (71.79%) | loss: 2.563531 | lrm: 0.56 | dt: 644.06ms | tok/sec: 814,030 | mfu: 50.88 | epoch: 2 | total time: 128.83m | eta: 50.7m +step 11992/16704 (71.79%) | loss: 2.561304 | lrm: 0.56 | dt: 644.43ms | tok/sec: 813,564 | mfu: 50.85 | epoch: 2 | total time: 128.84m | eta: 50.7m +step 11993/16704 (71.80%) | loss: 2.573144 | lrm: 0.56 | dt: 643.58ms | tok/sec: 814,638 | mfu: 50.92 | epoch: 2 | total time: 128.85m | eta: 50.7m +step 11994/16704 (71.80%) | loss: 2.567287 | lrm: 0.56 | dt: 643.20ms | tok/sec: 815,121 | mfu: 50.95 | epoch: 2 | total time: 128.86m | eta: 50.6m +step 11995/16704 (71.81%) | loss: 2.561830 | lrm: 0.56 | dt: 641.98ms | tok/sec: 816,674 | mfu: 51.04 | epoch: 2 | total time: 128.87m | eta: 50.6m +step 11996/16704 (71.82%) | loss: 2.569631 | lrm: 0.56 | dt: 642.09ms | tok/sec: 816,539 | mfu: 51.03 | epoch: 2 | total time: 128.88m | eta: 50.6m +step 11997/16704 (71.82%) | loss: 2.570611 | lrm: 0.56 | dt: 643.16ms | tok/sec: 815,169 | mfu: 50.95 | epoch: 2 | total time: 128.89m | eta: 50.6m +step 11998/16704 (71.83%) | loss: 
2.579125 | lrm: 0.56 | dt: 644.20ms | tok/sec: 813,859 | mfu: 50.87 | epoch: 2 | total time: 128.90m | eta: 50.6m +step 11999/16704 (71.83%) | loss: 2.591368 | lrm: 0.56 | dt: 640.98ms | tok/sec: 817,945 | mfu: 51.12 | epoch: 2 | total time: 128.92m | eta: 50.6m +[GC rank7] gen2: 139.2ms collected 0 objects +[GC rank6] gen2: 145.0ms collected 0 objects +[GC rank5] gen2: 146.1ms collected 0 objects +[GC rank2] gen2: 147.2ms collected 0 objects +[GC rank1] gen2: 147.8ms collected 0 objects +[GC rank4] gen2: 187.4ms collected 0 objects +[GC rank0] gen2: 229.8ms collected 0 objects +[GC rank3] gen2: 233.7ms collected 0 objects +Step 12000 | Validation bpb: 0.786593 +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.4731 | centered: 0.2975 | time: 22.39s +Evaluating: jeopardy (10-shot, type: language_modeling)... accuracy: 0.1389 | centered: 0.1389 | time: 4.76s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.4886 | centered: 0.4886 | time: 46.64s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.6498 | centered: 0.5331 | time: 5.88s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.3515 | centered: 0.1354 | time: 2.91s +Evaluating: copa (0-shot, type: multiple_choice)... accuracy: 0.6500 | centered: 0.3000 | time: 0.23s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.3382 | centered: 0.1728 | time: 3.07s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.6942 | centered: 0.3885 | time: 4.32s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... accuracy: 0.3780 | centered: 0.1707 | time: 1.11s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.3866 | centered: 0.3866 | time: 11.38s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.4716 | centered: 0.2955 | time: 35.35s +Evaluating: winograd (0-shot, type: schema)... 
accuracy: 0.6337 | centered: 0.2674 | time: 0.61s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5478 | centered: 0.0955 | time: 2.75s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.1290 | centered: 0.1290 | time: 2.35s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.2739 | centered: 0.0924 | time: 0.80s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.4083 | centered: 0.4083 | time: 3.03s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... accuracy: 0.1857 | centered: 0.1857 | time: 0.49s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0312 | centered: 0.0312 | time: 0.08s +Evaluating: squad (10-shot, type: language_modeling)... accuracy: 0.2893 | centered: 0.2893 | time: 28.53s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.2373 | centered: 0.2373 | time: 18.32s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.5966 | centered: -0.0615 | time: 10.72s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... 
accuracy: 0.2523 | centered: 0.1774 | time: 59.40s +Step 12000 | CORE metric: 0.2345 +step 12000/16704 (71.84%) | loss: 2.593040 | lrm: 0.56 | dt: 630.03ms | tok/sec: 832,160 | mfu: 52.01 | epoch: 2 | total time: 128.93m | eta: 50.6m +step 12001/16704 (71.85%) | loss: 2.587367 | lrm: 0.56 | dt: 651.15ms | tok/sec: 805,178 | mfu: 50.32 | epoch: 2 | total time: 128.94m | eta: 50.6m +step 12002/16704 (71.85%) | loss: 2.571997 | lrm: 0.56 | dt: 643.76ms | tok/sec: 814,415 | mfu: 50.90 | epoch: 2 | total time: 128.95m | eta: 50.6m +step 12003/16704 (71.86%) | loss: 2.577289 | lrm: 0.56 | dt: 642.03ms | tok/sec: 816,613 | mfu: 51.04 | epoch: 2 | total time: 128.96m | eta: 50.5m +step 12004/16704 (71.86%) | loss: 2.580910 | lrm: 0.56 | dt: 647.54ms | tok/sec: 809,664 | mfu: 50.61 | epoch: 2 | total time: 128.97m | eta: 50.5m +step 12005/16704 (71.87%) | loss: 2.586658 | lrm: 0.56 | dt: 636.93ms | tok/sec: 823,145 | mfu: 51.45 | epoch: 2 | total time: 128.98m | eta: 50.5m +step 12006/16704 (71.88%) | loss: 2.579821 | lrm: 0.56 | dt: 643.33ms | tok/sec: 814,962 | mfu: 50.94 | epoch: 2 | total time: 128.99m | eta: 50.5m +step 12007/16704 (71.88%) | loss: 2.583066 | lrm: 0.56 | dt: 644.07ms | tok/sec: 814,028 | mfu: 50.88 | epoch: 2 | total time: 129.00m | eta: 50.5m +step 12008/16704 (71.89%) | loss: 2.581828 | lrm: 0.56 | dt: 640.55ms | tok/sec: 818,500 | mfu: 51.16 | epoch: 2 | total time: 129.01m | eta: 50.5m +step 12009/16704 (71.89%) | loss: 2.585219 | lrm: 0.56 | dt: 645.61ms | tok/sec: 812,079 | mfu: 50.76 | epoch: 2 | total time: 129.02m | eta: 50.5m +step 12010/16704 (71.90%) | loss: 2.599060 | lrm: 0.56 | dt: 641.78ms | tok/sec: 816,925 | mfu: 51.06 | epoch: 2 | total time: 129.03m | eta: 50.5m +step 12011/16704 (71.90%) | loss: 2.594742 | lrm: 0.56 | dt: 641.87ms | tok/sec: 816,809 | mfu: 51.05 | epoch: 2 | total time: 129.04m | eta: 50.5m +step 12012/16704 (71.91%) | loss: 2.603197 | lrm: 0.56 | dt: 643.64ms | tok/sec: 814,565 | mfu: 50.91 | epoch: 2 | total 
time: 129.05m | eta: 50.5m +step 12013/16704 (71.92%) | loss: 2.593126 | lrm: 0.56 | dt: 639.79ms | tok/sec: 819,471 | mfu: 51.22 | epoch: 2 | total time: 129.07m | eta: 50.4m +step 12014/16704 (71.92%) | loss: 2.593896 | lrm: 0.56 | dt: 642.39ms | tok/sec: 816,150 | mfu: 51.01 | epoch: 2 | total time: 129.08m | eta: 50.4m +step 12015/16704 (71.93%) | loss: 2.592924 | lrm: 0.56 | dt: 642.43ms | tok/sec: 816,098 | mfu: 51.01 | epoch: 2 | total time: 129.09m | eta: 50.4m +step 12016/16704 (71.93%) | loss: 2.591262 | lrm: 0.56 | dt: 642.09ms | tok/sec: 816,536 | mfu: 51.03 | epoch: 2 | total time: 129.10m | eta: 50.4m +step 12017/16704 (71.94%) | loss: 2.588053 | lrm: 0.56 | dt: 642.71ms | tok/sec: 815,741 | mfu: 50.98 | epoch: 2 | total time: 129.11m | eta: 50.4m +step 12018/16704 (71.95%) | loss: 2.597074 | lrm: 0.56 | dt: 640.74ms | tok/sec: 818,250 | mfu: 51.14 | epoch: 2 | total time: 129.12m | eta: 50.4m +step 12019/16704 (71.95%) | loss: 2.599700 | lrm: 0.56 | dt: 645.78ms | tok/sec: 811,865 | mfu: 50.74 | epoch: 2 | total time: 129.13m | eta: 50.4m +step 12020/16704 (71.96%) | loss: 2.610170 | lrm: 0.56 | dt: 642.88ms | tok/sec: 815,523 | mfu: 50.97 | epoch: 2 | total time: 129.14m | eta: 50.4m +step 12021/16704 (71.96%) | loss: 2.611063 | lrm: 0.56 | dt: 641.43ms | tok/sec: 817,377 | mfu: 51.09 | epoch: 2 | total time: 129.15m | eta: 50.4m +step 12022/16704 (71.97%) | loss: 2.603236 | lrm: 0.56 | dt: 643.50ms | tok/sec: 814,743 | mfu: 50.92 | epoch: 2 | total time: 129.16m | eta: 50.3m +step 12023/16704 (71.98%) | loss: 2.596609 | lrm: 0.56 | dt: 640.18ms | tok/sec: 818,967 | mfu: 51.19 | epoch: 2 | total time: 129.17m | eta: 50.3m +step 12024/16704 (71.98%) | loss: 2.583896 | lrm: 0.56 | dt: 642.56ms | tok/sec: 815,931 | mfu: 51.00 | epoch: 2 | total time: 129.18m | eta: 50.3m +step 12025/16704 (71.99%) | loss: 2.580118 | lrm: 0.56 | dt: 642.31ms | tok/sec: 816,257 | mfu: 51.02 | epoch: 2 | total time: 129.19m | eta: 50.3m +step 12026/16704 (71.99%) | loss: 
2.585697 | lrm: 0.56 | dt: 643.43ms | tok/sec: 814,835 | mfu: 50.93 | epoch: 2 | total time: 129.20m | eta: 50.3m +step 12027/16704 (72.00%) | loss: 2.586323 | lrm: 0.56 | dt: 640.04ms | tok/sec: 819,148 | mfu: 51.20 | epoch: 2 | total time: 129.22m | eta: 50.3m +step 12028/16704 (72.01%) | loss: 2.590313 | lrm: 0.56 | dt: 642.22ms | tok/sec: 816,364 | mfu: 51.02 | epoch: 2 | total time: 129.23m | eta: 50.3m +step 12029/16704 (72.01%) | loss: 2.584717 | lrm: 0.56 | dt: 641.85ms | tok/sec: 816,843 | mfu: 51.05 | epoch: 2 | total time: 129.24m | eta: 50.3m +step 12030/16704 (72.02%) | loss: 2.584974 | lrm: 0.56 | dt: 643.42ms | tok/sec: 814,842 | mfu: 50.93 | epoch: 2 | total time: 129.25m | eta: 50.3m +step 12031/16704 (72.02%) | loss: 2.576818 | lrm: 0.56 | dt: 641.27ms | tok/sec: 817,581 | mfu: 51.10 | epoch: 2 | total time: 129.26m | eta: 50.2m +step 12032/16704 (72.03%) | loss: 2.570127 | lrm: 0.56 | dt: 642.17ms | tok/sec: 816,437 | mfu: 51.03 | epoch: 2 | total time: 129.27m | eta: 50.2m +step 12033/16704 (72.04%) | loss: 2.571523 | lrm: 0.56 | dt: 643.03ms | tok/sec: 815,336 | mfu: 50.96 | epoch: 2 | total time: 129.28m | eta: 50.2m +step 12034/16704 (72.04%) | loss: 2.573762 | lrm: 0.56 | dt: 643.05ms | tok/sec: 815,319 | mfu: 50.96 | epoch: 2 | total time: 129.29m | eta: 50.2m +step 12035/16704 (72.05%) | loss: 2.568940 | lrm: 0.56 | dt: 641.79ms | tok/sec: 816,918 | mfu: 51.06 | epoch: 2 | total time: 129.30m | eta: 50.2m +step 12036/16704 (72.05%) | loss: 2.559362 | lrm: 0.56 | dt: 641.68ms | tok/sec: 817,058 | mfu: 51.07 | epoch: 2 | total time: 129.31m | eta: 50.2m +step 12037/16704 (72.06%) | loss: 2.558759 | lrm: 0.56 | dt: 641.82ms | tok/sec: 816,871 | mfu: 51.06 | epoch: 2 | total time: 129.32m | eta: 50.2m +step 12038/16704 (72.07%) | loss: 2.550722 | lrm: 0.56 | dt: 642.09ms | tok/sec: 816,537 | mfu: 51.03 | epoch: 2 | total time: 129.33m | eta: 50.2m +step 12039/16704 (72.07%) | loss: 2.551090 | lrm: 0.56 | dt: 642.44ms | tok/sec: 816,086 | mfu: 
51.01 | epoch: 2 | total time: 129.34m | eta: 50.2m +step 12040/16704 (72.08%) | loss: 2.557971 | lrm: 0.56 | dt: 640.79ms | tok/sec: 818,189 | mfu: 51.14 | epoch: 2 | total time: 129.35m | eta: 50.2m +step 12041/16704 (72.08%) | loss: 2.545150 | lrm: 0.56 | dt: 642.04ms | tok/sec: 816,601 | mfu: 51.04 | epoch: 2 | total time: 129.37m | eta: 50.1m +step 12042/16704 (72.09%) | loss: 2.545625 | lrm: 0.56 | dt: 640.70ms | tok/sec: 818,303 | mfu: 51.15 | epoch: 2 | total time: 129.38m | eta: 50.1m +step 12043/16704 (72.10%) | loss: 2.543878 | lrm: 0.56 | dt: 641.43ms | tok/sec: 817,368 | mfu: 51.09 | epoch: 2 | total time: 129.39m | eta: 50.1m +step 12044/16704 (72.10%) | loss: 2.537691 | lrm: 0.56 | dt: 643.32ms | tok/sec: 814,967 | mfu: 50.94 | epoch: 2 | total time: 129.40m | eta: 50.1m +step 12045/16704 (72.11%) | loss: 2.548350 | lrm: 0.56 | dt: 642.53ms | tok/sec: 815,973 | mfu: 51.00 | epoch: 2 | total time: 129.41m | eta: 50.1m +step 12046/16704 (72.11%) | loss: 2.554308 | lrm: 0.56 | dt: 642.40ms | tok/sec: 816,139 | mfu: 51.01 | epoch: 2 | total time: 129.42m | eta: 50.1m +step 12047/16704 (72.12%) | loss: 2.556540 | lrm: 0.56 | dt: 642.33ms | tok/sec: 816,228 | mfu: 51.02 | epoch: 2 | total time: 129.43m | eta: 50.1m +step 12048/16704 (72.13%) | loss: 2.559955 | lrm: 0.56 | dt: 642.22ms | tok/sec: 816,368 | mfu: 51.02 | epoch: 2 | total time: 129.44m | eta: 50.1m +step 12049/16704 (72.13%) | loss: 2.557810 | lrm: 0.56 | dt: 643.12ms | tok/sec: 815,230 | mfu: 50.95 | epoch: 2 | total time: 129.45m | eta: 50.1m +step 12050/16704 (72.14%) | loss: 2.566193 | lrm: 0.56 | dt: 641.66ms | tok/sec: 817,081 | mfu: 51.07 | epoch: 2 | total time: 129.46m | eta: 50.0m +step 12051/16704 (72.14%) | loss: 2.558129 | lrm: 0.56 | dt: 642.61ms | tok/sec: 815,878 | mfu: 50.99 | epoch: 2 | total time: 129.47m | eta: 50.0m +step 12052/16704 (72.15%) | loss: 2.563816 | lrm: 0.56 | dt: 644.20ms | tok/sec: 813,863 | mfu: 50.87 | epoch: 2 | total time: 129.48m | eta: 50.0m +step 
12053/16704 (72.16%) | loss: 2.567222 | lrm: 0.56 | dt: 640.19ms | tok/sec: 818,953 | mfu: 51.19 | epoch: 2 | total time: 129.49m | eta: 50.0m +step 12054/16704 (72.16%) | loss: 2.573877 | lrm: 0.56 | dt: 642.31ms | tok/sec: 816,255 | mfu: 51.02 | epoch: 2 | total time: 129.50m | eta: 50.0m +step 12055/16704 (72.17%) | loss: 2.571877 | lrm: 0.56 | dt: 643.73ms | tok/sec: 814,449 | mfu: 50.90 | epoch: 2 | total time: 129.51m | eta: 50.0m +step 12056/16704 (72.17%) | loss: 2.565043 | lrm: 0.56 | dt: 641.95ms | tok/sec: 816,710 | mfu: 51.05 | epoch: 2 | total time: 129.53m | eta: 50.0m +step 12057/16704 (72.18%) | loss: 2.561694 | lrm: 0.56 | dt: 642.58ms | tok/sec: 815,916 | mfu: 51.00 | epoch: 2 | total time: 129.54m | eta: 50.0m +step 12058/16704 (72.19%) | loss: 2.558364 | lrm: 0.56 | dt: 645.25ms | tok/sec: 812,535 | mfu: 50.78 | epoch: 2 | total time: 129.55m | eta: 50.0m +step 12059/16704 (72.19%) | loss: 2.562213 | lrm: 0.56 | dt: 641.45ms | tok/sec: 817,349 | mfu: 51.09 | epoch: 2 | total time: 129.56m | eta: 49.9m +step 12060/16704 (72.20%) | loss: 2.560055 | lrm: 0.56 | dt: 642.24ms | tok/sec: 816,336 | mfu: 51.02 | epoch: 2 | total time: 129.57m | eta: 49.9m +step 12061/16704 (72.20%) | loss: 2.556701 | lrm: 0.56 | dt: 641.46ms | tok/sec: 817,337 | mfu: 51.08 | epoch: 2 | total time: 129.58m | eta: 49.9m +step 12062/16704 (72.21%) | loss: 2.553602 | lrm: 0.56 | dt: 642.94ms | tok/sec: 815,452 | mfu: 50.97 | epoch: 2 | total time: 129.59m | eta: 49.9m +step 12063/16704 (72.22%) | loss: 2.545727 | lrm: 0.56 | dt: 642.83ms | tok/sec: 815,587 | mfu: 50.98 | epoch: 2 | total time: 129.60m | eta: 49.9m +step 12064/16704 (72.22%) | loss: 2.540409 | lrm: 0.56 | dt: 641.34ms | tok/sec: 817,483 | mfu: 51.09 | epoch: 2 | total time: 129.61m | eta: 49.9m +step 12065/16704 (72.23%) | loss: 2.536739 | lrm: 0.56 | dt: 641.91ms | tok/sec: 816,765 | mfu: 51.05 | epoch: 2 | total time: 129.62m | eta: 49.9m +step 12066/16704 (72.23%) | loss: 2.538856 | lrm: 0.56 | dt: 
642.16ms | tok/sec: 816,443 | mfu: 51.03 | epoch: 2 | total time: 129.63m | eta: 49.9m +step 12067/16704 (72.24%) | loss: 2.548343 | lrm: 0.56 | dt: 641.70ms | tok/sec: 817,036 | mfu: 51.07 | epoch: 2 | total time: 129.64m | eta: 49.9m +step 12068/16704 (72.25%) | loss: 2.541271 | lrm: 0.56 | dt: 643.07ms | tok/sec: 815,291 | mfu: 50.96 | epoch: 2 | total time: 129.65m | eta: 49.8m +step 12069/16704 (72.25%) | loss: 2.539503 | lrm: 0.55 | dt: 643.30ms | tok/sec: 814,995 | mfu: 50.94 | epoch: 2 | total time: 129.66m | eta: 49.8m +step 12070/16704 (72.26%) | loss: 2.552370 | lrm: 0.55 | dt: 641.26ms | tok/sec: 817,590 | mfu: 51.10 | epoch: 2 | total time: 129.68m | eta: 49.8m +step 12071/16704 (72.26%) | loss: 2.564224 | lrm: 0.55 | dt: 641.81ms | tok/sec: 816,891 | mfu: 51.06 | epoch: 2 | total time: 129.69m | eta: 49.8m +step 12072/16704 (72.27%) | loss: 2.551692 | lrm: 0.55 | dt: 644.53ms | tok/sec: 813,441 | mfu: 50.84 | epoch: 2 | total time: 129.70m | eta: 49.8m +step 12073/16704 (72.28%) | loss: 2.556060 | lrm: 0.55 | dt: 642.03ms | tok/sec: 816,607 | mfu: 51.04 | epoch: 2 | total time: 129.71m | eta: 49.8m +step 12074/16704 (72.28%) | loss: 2.560826 | lrm: 0.55 | dt: 643.65ms | tok/sec: 814,559 | mfu: 50.91 | epoch: 2 | total time: 129.72m | eta: 49.8m +step 12075/16704 (72.29%) | loss: 2.560556 | lrm: 0.55 | dt: 641.62ms | tok/sec: 817,128 | mfu: 51.07 | epoch: 2 | total time: 129.73m | eta: 49.8m +step 12076/16704 (72.29%) | loss: 2.567783 | lrm: 0.55 | dt: 643.44ms | tok/sec: 814,815 | mfu: 50.93 | epoch: 2 | total time: 129.74m | eta: 49.8m +step 12077/16704 (72.30%) | loss: 2.564869 | lrm: 0.55 | dt: 643.88ms | tok/sec: 814,261 | mfu: 50.89 | epoch: 2 | total time: 129.75m | eta: 49.8m +step 12078/16704 (72.31%) | loss: 2.560474 | lrm: 0.55 | dt: 641.34ms | tok/sec: 817,486 | mfu: 51.09 | epoch: 2 | total time: 129.76m | eta: 49.7m +step 12079/16704 (72.31%) | loss: 2.564264 | lrm: 0.55 | dt: 643.06ms | tok/sec: 815,297 | mfu: 50.96 | epoch: 2 | total 
time: 129.77m | eta: 49.7m +step 12080/16704 (72.32%) | loss: 2.552767 | lrm: 0.55 | dt: 641.78ms | tok/sec: 816,923 | mfu: 51.06 | epoch: 2 | total time: 129.78m | eta: 49.7m +step 12081/16704 (72.32%) | loss: 2.560320 | lrm: 0.55 | dt: 643.46ms | tok/sec: 814,795 | mfu: 50.93 | epoch: 2 | total time: 129.79m | eta: 49.7m +step 12082/16704 (72.33%) | loss: 2.559182 | lrm: 0.55 | dt: 642.34ms | tok/sec: 816,212 | mfu: 51.01 | epoch: 2 | total time: 129.80m | eta: 49.7m +step 12083/16704 (72.34%) | loss: 2.560012 | lrm: 0.55 | dt: 642.86ms | tok/sec: 815,552 | mfu: 50.97 | epoch: 2 | total time: 129.81m | eta: 49.7m +step 12084/16704 (72.34%) | loss: 2.571653 | lrm: 0.55 | dt: 643.72ms | tok/sec: 814,466 | mfu: 50.91 | epoch: 2 | total time: 129.83m | eta: 49.7m +step 12085/16704 (72.35%) | loss: 2.562941 | lrm: 0.55 | dt: 643.32ms | tok/sec: 814,973 | mfu: 50.94 | epoch: 2 | total time: 129.84m | eta: 49.7m +step 12086/16704 (72.35%) | loss: 2.554473 | lrm: 0.55 | dt: 644.21ms | tok/sec: 813,841 | mfu: 50.87 | epoch: 2 | total time: 129.85m | eta: 49.7m +step 12087/16704 (72.36%) | loss: 2.564191 | lrm: 0.55 | dt: 641.19ms | tok/sec: 817,685 | mfu: 51.11 | epoch: 2 | total time: 129.86m | eta: 49.6m +step 12088/16704 (72.37%) | loss: 2.557390 | lrm: 0.55 | dt: 641.60ms | tok/sec: 817,155 | mfu: 51.07 | epoch: 2 | total time: 129.87m | eta: 49.6m +step 12089/16704 (72.37%) | loss: 2.555657 | lrm: 0.55 | dt: 641.66ms | tok/sec: 817,083 | mfu: 51.07 | epoch: 2 | total time: 129.88m | eta: 49.6m +step 12090/16704 (72.38%) | loss: 2.561996 | lrm: 0.55 | dt: 642.47ms | tok/sec: 816,054 | mfu: 51.00 | epoch: 2 | total time: 129.89m | eta: 49.6m +step 12091/16704 (72.38%) | loss: 2.567115 | lrm: 0.55 | dt: 641.43ms | tok/sec: 817,370 | mfu: 51.09 | epoch: 2 | total time: 129.90m | eta: 49.6m +step 12092/16704 (72.39%) | loss: 2.562773 | lrm: 0.55 | dt: 642.78ms | tok/sec: 815,657 | mfu: 50.98 | epoch: 2 | total time: 129.91m | eta: 49.6m +step 12093/16704 (72.40%) | loss: 
2.569247 | lrm: 0.55 | dt: 641.02ms | tok/sec: 817,890 | mfu: 51.12 | epoch: 2 | total time: 129.92m | eta: 49.6m +step 12094/16704 (72.40%) | loss: 2.565120 | lrm: 0.55 | dt: 643.25ms | tok/sec: 815,066 | mfu: 50.94 | epoch: 2 | total time: 129.93m | eta: 49.6m +step 12095/16704 (72.41%) | loss: 2.564249 | lrm: 0.55 | dt: 643.27ms | tok/sec: 815,039 | mfu: 50.94 | epoch: 2 | total time: 129.94m | eta: 49.6m +step 12096/16704 (72.41%) | loss: 2.557859 | lrm: 0.55 | dt: 642.69ms | tok/sec: 815,771 | mfu: 50.99 | epoch: 2 | total time: 129.95m | eta: 49.5m +step 12097/16704 (72.42%) | loss: 2.555808 | lrm: 0.55 | dt: 642.43ms | tok/sec: 816,096 | mfu: 51.01 | epoch: 2 | total time: 129.96m | eta: 49.5m +step 12098/16704 (72.43%) | loss: 2.561318 | lrm: 0.55 | dt: 641.98ms | tok/sec: 816,672 | mfu: 51.04 | epoch: 2 | total time: 129.98m | eta: 49.5m +step 12099/16704 (72.43%) | loss: 2.570457 | lrm: 0.55 | dt: 641.88ms | tok/sec: 816,796 | mfu: 51.05 | epoch: 2 | total time: 129.99m | eta: 49.5m +step 12100/16704 (72.44%) | loss: 2.573698 | lrm: 0.55 | dt: 641.53ms | tok/sec: 817,251 | mfu: 51.08 | epoch: 2 | total time: 130.00m | eta: 49.5m +step 12101/16704 (72.44%) | loss: 2.575152 | lrm: 0.55 | dt: 643.49ms | tok/sec: 814,758 | mfu: 50.92 | epoch: 2 | total time: 130.01m | eta: 49.5m +step 12102/16704 (72.45%) | loss: 2.565077 | lrm: 0.55 | dt: 644.34ms | tok/sec: 813,682 | mfu: 50.86 | epoch: 2 | total time: 130.02m | eta: 49.5m +step 12103/16704 (72.46%) | loss: 2.552588 | lrm: 0.55 | dt: 642.46ms | tok/sec: 816,064 | mfu: 51.01 | epoch: 2 | total time: 130.03m | eta: 49.5m +step 12104/16704 (72.46%) | loss: 2.556739 | lrm: 0.55 | dt: 643.09ms | tok/sec: 815,261 | mfu: 50.95 | epoch: 2 | total time: 130.04m | eta: 49.5m +step 12105/16704 (72.47%) | loss: 2.563524 | lrm: 0.55 | dt: 641.08ms | tok/sec: 817,818 | mfu: 51.11 | epoch: 2 | total time: 130.05m | eta: 49.5m +step 12106/16704 (72.47%) | loss: 2.574982 | lrm: 0.55 | dt: 645.48ms | tok/sec: 812,250 | mfu: 
50.77 | epoch: 2 | total time: 130.06m | eta: 49.4m +step 12107/16704 (72.48%) | loss: 2.578516 | lrm: 0.55 | dt: 642.30ms | tok/sec: 816,266 | mfu: 51.02 | epoch: 2 | total time: 130.07m | eta: 49.4m +step 12108/16704 (72.49%) | loss: 2.581528 | lrm: 0.55 | dt: 643.87ms | tok/sec: 814,272 | mfu: 50.89 | epoch: 2 | total time: 130.08m | eta: 49.4m +step 12109/16704 (72.49%) | loss: 2.575223 | lrm: 0.55 | dt: 645.91ms | tok/sec: 811,700 | mfu: 50.73 | epoch: 2 | total time: 130.09m | eta: 49.4m +step 12110/16704 (72.50%) | loss: 2.570812 | lrm: 0.55 | dt: 641.06ms | tok/sec: 817,845 | mfu: 51.12 | epoch: 2 | total time: 130.10m | eta: 49.4m +step 12111/16704 (72.50%) | loss: 2.565519 | lrm: 0.55 | dt: 641.51ms | tok/sec: 817,269 | mfu: 51.08 | epoch: 2 | total time: 130.11m | eta: 49.4m +step 12112/16704 (72.51%) | loss: 2.563644 | lrm: 0.55 | dt: 643.28ms | tok/sec: 815,022 | mfu: 50.94 | epoch: 2 | total time: 130.13m | eta: 49.4m +step 12113/16704 (72.52%) | loss: 2.557293 | lrm: 0.55 | dt: 640.29ms | tok/sec: 818,831 | mfu: 51.18 | epoch: 2 | total time: 130.14m | eta: 49.4m +step 12114/16704 (72.52%) | loss: 2.558307 | lrm: 0.55 | dt: 642.83ms | tok/sec: 815,590 | mfu: 50.98 | epoch: 2 | total time: 130.15m | eta: 49.4m +step 12115/16704 (72.53%) | loss: 2.557344 | lrm: 0.55 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 2 | total time: 130.16m | eta: 49.3m +step 12116/16704 (72.53%) | loss: 2.560796 | lrm: 0.55 | dt: 643.03ms | tok/sec: 815,335 | mfu: 50.96 | epoch: 2 | total time: 130.17m | eta: 49.3m +step 12117/16704 (72.54%) | loss: 2.536302 | lrm: 0.55 | dt: 643.70ms | tok/sec: 814,487 | mfu: 50.91 | epoch: 2 | total time: 130.18m | eta: 49.3m +step 12118/16704 (72.55%) | loss: 2.544769 | lrm: 0.55 | dt: 640.66ms | tok/sec: 818,357 | mfu: 51.15 | epoch: 2 | total time: 130.19m | eta: 49.3m +step 12119/16704 (72.55%) | loss: 2.554203 | lrm: 0.55 | dt: 642.83ms | tok/sec: 815,594 | mfu: 50.98 | epoch: 2 | total time: 130.20m | eta: 49.3m +step 
12120/16704 (72.56%) | loss: 2.548439 | lrm: 0.55 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 2 | total time: 130.21m | eta: 49.3m +step 12121/16704 (72.56%) | loss: 2.531509 | lrm: 0.55 | dt: 643.72ms | tok/sec: 814,471 | mfu: 50.91 | epoch: 2 | total time: 130.22m | eta: 49.3m +step 12122/16704 (72.57%) | loss: 2.541543 | lrm: 0.55 | dt: 643.99ms | tok/sec: 814,126 | mfu: 50.88 | epoch: 2 | total time: 130.23m | eta: 49.3m +step 12123/16704 (72.58%) | loss: 2.545630 | lrm: 0.55 | dt: 642.47ms | tok/sec: 816,055 | mfu: 51.00 | epoch: 2 | total time: 130.24m | eta: 49.3m +step 12124/16704 (72.58%) | loss: 2.538001 | lrm: 0.55 | dt: 643.20ms | tok/sec: 815,127 | mfu: 50.95 | epoch: 2 | total time: 130.25m | eta: 49.2m +step 12125/16704 (72.59%) | loss: 2.537287 | lrm: 0.55 | dt: 645.01ms | tok/sec: 812,837 | mfu: 50.80 | epoch: 2 | total time: 130.26m | eta: 49.2m +step 12126/16704 (72.59%) | loss: 2.538172 | lrm: 0.55 | dt: 642.58ms | tok/sec: 815,905 | mfu: 51.00 | epoch: 2 | total time: 130.28m | eta: 49.2m +step 12127/16704 (72.60%) | loss: 2.543154 | lrm: 0.55 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 2 | total time: 130.29m | eta: 49.2m +step 12128/16704 (72.61%) | loss: 2.550824 | lrm: 0.55 | dt: 643.40ms | tok/sec: 814,871 | mfu: 50.93 | epoch: 2 | total time: 130.30m | eta: 49.2m +step 12129/16704 (72.61%) | loss: 2.558738 | lrm: 0.55 | dt: 643.75ms | tok/sec: 814,423 | mfu: 50.90 | epoch: 2 | total time: 130.31m | eta: 49.2m +step 12130/16704 (72.62%) | loss: 2.564305 | lrm: 0.55 | dt: 642.47ms | tok/sec: 816,051 | mfu: 51.00 | epoch: 2 | total time: 130.32m | eta: 49.2m +step 12131/16704 (72.62%) | loss: 2.562682 | lrm: 0.55 | dt: 642.45ms | tok/sec: 816,076 | mfu: 51.01 | epoch: 2 | total time: 130.33m | eta: 49.2m +step 12132/16704 (72.63%) | loss: 2.562419 | lrm: 0.55 | dt: 644.05ms | tok/sec: 814,044 | mfu: 50.88 | epoch: 2 | total time: 130.34m | eta: 49.2m +step 12133/16704 (72.64%) | loss: 2.558003 | lrm: 0.55 | dt: 
642.56ms | tok/sec: 815,931 | mfu: 51.00 | epoch: 2 | total time: 130.35m | eta: 49.1m +step 12134/16704 (72.64%) | loss: 2.548526 | lrm: 0.55 | dt: 642.22ms | tok/sec: 816,372 | mfu: 51.02 | epoch: 2 | total time: 130.36m | eta: 49.1m +step 12135/16704 (72.65%) | loss: 2.550425 | lrm: 0.55 | dt: 642.18ms | tok/sec: 816,423 | mfu: 51.03 | epoch: 2 | total time: 130.37m | eta: 49.1m +step 12136/16704 (72.65%) | loss: 2.548096 | lrm: 0.55 | dt: 642.00ms | tok/sec: 816,653 | mfu: 51.04 | epoch: 2 | total time: 130.38m | eta: 49.1m +step 12137/16704 (72.66%) | loss: 2.549281 | lrm: 0.55 | dt: 642.41ms | tok/sec: 816,127 | mfu: 51.01 | epoch: 2 | total time: 130.39m | eta: 49.1m +step 12138/16704 (72.67%) | loss: 2.554744 | lrm: 0.55 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 2 | total time: 130.40m | eta: 49.1m +step 12139/16704 (72.67%) | loss: 2.552300 | lrm: 0.55 | dt: 641.21ms | tok/sec: 817,657 | mfu: 51.10 | epoch: 2 | total time: 130.41m | eta: 49.1m +step 12140/16704 (72.68%) | loss: 2.566636 | lrm: 0.55 | dt: 642.47ms | tok/sec: 816,055 | mfu: 51.00 | epoch: 2 | total time: 130.43m | eta: 49.1m +step 12141/16704 (72.68%) | loss: 2.562076 | lrm: 0.55 | dt: 644.12ms | tok/sec: 813,964 | mfu: 50.87 | epoch: 2 | total time: 130.44m | eta: 49.1m +step 12142/16704 (72.69%) | loss: 2.557159 | lrm: 0.55 | dt: 642.60ms | tok/sec: 815,890 | mfu: 50.99 | epoch: 2 | total time: 130.45m | eta: 49.1m +step 12143/16704 (72.70%) | loss: 2.544281 | lrm: 0.55 | dt: 642.89ms | tok/sec: 815,515 | mfu: 50.97 | epoch: 2 | total time: 130.46m | eta: 49.0m +step 12144/16704 (72.70%) | loss: 2.536567 | lrm: 0.55 | dt: 641.83ms | tok/sec: 816,858 | mfu: 51.05 | epoch: 2 | total time: 130.47m | eta: 49.0m +step 12145/16704 (72.71%) | loss: 2.527656 | lrm: 0.55 | dt: 642.15ms | tok/sec: 816,451 | mfu: 51.03 | epoch: 2 | total time: 130.48m | eta: 49.0m +step 12146/16704 (72.71%) | loss: 2.533230 | lrm: 0.55 | dt: 643.13ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 2 | total 
time: 130.49m | eta: 49.0m +step 12147/16704 (72.72%) | loss: 2.533566 | lrm: 0.55 | dt: 643.93ms | tok/sec: 814,195 | mfu: 50.89 | epoch: 2 | total time: 130.50m | eta: 49.0m +step 12148/16704 (72.73%) | loss: 2.541071 | lrm: 0.55 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 2 | total time: 130.51m | eta: 49.0m +step 12149/16704 (72.73%) | loss: 2.560534 | lrm: 0.55 | dt: 641.79ms | tok/sec: 816,920 | mfu: 51.06 | epoch: 2 | total time: 130.52m | eta: 49.0m +step 12150/16704 (72.74%) | loss: 2.555439 | lrm: 0.55 | dt: 642.62ms | tok/sec: 815,862 | mfu: 50.99 | epoch: 2 | total time: 130.53m | eta: 49.0m +step 12151/16704 (72.74%) | loss: 2.552364 | lrm: 0.55 | dt: 642.24ms | tok/sec: 816,340 | mfu: 51.02 | epoch: 2 | total time: 130.54m | eta: 49.0m +step 12152/16704 (72.75%) | loss: 2.551091 | lrm: 0.55 | dt: 643.33ms | tok/sec: 814,957 | mfu: 50.94 | epoch: 2 | total time: 130.55m | eta: 48.9m +step 12153/16704 (72.76%) | loss: 2.558296 | lrm: 0.54 | dt: 642.67ms | tok/sec: 815,793 | mfu: 50.99 | epoch: 2 | total time: 130.56m | eta: 48.9m +step 12154/16704 (72.76%) | loss: 2.553939 | lrm: 0.54 | dt: 642.27ms | tok/sec: 816,306 | mfu: 51.02 | epoch: 2 | total time: 130.58m | eta: 48.9m +step 12155/16704 (72.77%) | loss: 2.548882 | lrm: 0.54 | dt: 645.42ms | tok/sec: 812,322 | mfu: 50.77 | epoch: 2 | total time: 130.59m | eta: 48.9m +step 12156/16704 (72.77%) | loss: 2.568822 | lrm: 0.54 | dt: 641.17ms | tok/sec: 817,700 | mfu: 51.11 | epoch: 2 | total time: 130.60m | eta: 48.9m +step 12157/16704 (72.78%) | loss: 2.575417 | lrm: 0.54 | dt: 643.90ms | tok/sec: 814,240 | mfu: 50.89 | epoch: 2 | total time: 130.61m | eta: 48.9m +step 12158/16704 (72.78%) | loss: 2.600225 | lrm: 0.54 | dt: 642.08ms | tok/sec: 816,545 | mfu: 51.04 | epoch: 2 | total time: 130.62m | eta: 48.9m +step 12159/16704 (72.79%) | loss: 2.599588 | lrm: 0.54 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 2 | total time: 130.63m | eta: 48.9m +step 12160/16704 (72.80%) | loss: 
2.587334 | lrm: 0.54 | dt: 644.28ms | tok/sec: 813,762 | mfu: 50.86 | epoch: 2 | total time: 130.64m | eta: 48.9m +step 12161/16704 (72.80%) | loss: 2.593214 | lrm: 0.54 | dt: 643.47ms | tok/sec: 814,780 | mfu: 50.92 | epoch: 2 | total time: 130.65m | eta: 48.8m +step 12162/16704 (72.81%) | loss: 2.584624 | lrm: 0.54 | dt: 645.31ms | tok/sec: 812,464 | mfu: 50.78 | epoch: 2 | total time: 130.66m | eta: 48.8m +step 12163/16704 (72.81%) | loss: 2.577380 | lrm: 0.54 | dt: 640.39ms | tok/sec: 818,703 | mfu: 51.17 | epoch: 2 | total time: 130.67m | eta: 48.8m +step 12164/16704 (72.82%) | loss: 2.589817 | lrm: 0.54 | dt: 642.77ms | tok/sec: 815,665 | mfu: 50.98 | epoch: 2 | total time: 130.68m | eta: 48.8m +step 12165/16704 (72.83%) | loss: 2.583616 | lrm: 0.54 | dt: 642.33ms | tok/sec: 816,224 | mfu: 51.02 | epoch: 2 | total time: 130.69m | eta: 48.8m +step 12166/16704 (72.83%) | loss: 2.575319 | lrm: 0.54 | dt: 640.16ms | tok/sec: 819,000 | mfu: 51.19 | epoch: 2 | total time: 130.70m | eta: 48.8m +step 12167/16704 (72.84%) | loss: 2.568055 | lrm: 0.54 | dt: 644.56ms | tok/sec: 813,407 | mfu: 50.84 | epoch: 2 | total time: 130.71m | eta: 48.8m +step 12168/16704 (72.84%) | loss: 2.544527 | lrm: 0.54 | dt: 641.83ms | tok/sec: 816,863 | mfu: 51.06 | epoch: 2 | total time: 130.73m | eta: 48.8m +step 12169/16704 (72.85%) | loss: 2.558908 | lrm: 0.54 | dt: 643.47ms | tok/sec: 814,784 | mfu: 50.93 | epoch: 2 | total time: 130.74m | eta: 48.8m +step 12170/16704 (72.86%) | loss: 2.553560 | lrm: 0.54 | dt: 644.61ms | tok/sec: 813,336 | mfu: 50.83 | epoch: 2 | total time: 130.75m | eta: 48.8m +step 12171/16704 (72.86%) | loss: 2.562829 | lrm: 0.54 | dt: 640.68ms | tok/sec: 818,329 | mfu: 51.15 | epoch: 2 | total time: 130.76m | eta: 48.7m +step 12172/16704 (72.87%) | loss: 2.561647 | lrm: 0.54 | dt: 642.65ms | tok/sec: 815,823 | mfu: 50.99 | epoch: 2 | total time: 130.77m | eta: 48.7m +step 12173/16704 (72.87%) | loss: 2.555294 | lrm: 0.54 | dt: 643.42ms | tok/sec: 814,846 | mfu: 
50.93 | epoch: 2 | total time: 130.78m | eta: 48.7m +step 12174/16704 (72.88%) | loss: 2.557382 | lrm: 0.54 | dt: 641.33ms | tok/sec: 817,502 | mfu: 51.10 | epoch: 2 | total time: 130.79m | eta: 48.7m +step 12175/16704 (72.89%) | loss: 2.567428 | lrm: 0.54 | dt: 643.71ms | tok/sec: 814,474 | mfu: 50.91 | epoch: 2 | total time: 130.80m | eta: 48.7m +step 12176/16704 (72.89%) | loss: 2.571593 | lrm: 0.54 | dt: 642.41ms | tok/sec: 816,130 | mfu: 51.01 | epoch: 2 | total time: 130.81m | eta: 48.7m +step 12177/16704 (72.90%) | loss: 2.565110 | lrm: 0.54 | dt: 641.22ms | tok/sec: 817,645 | mfu: 51.10 | epoch: 2 | total time: 130.82m | eta: 48.7m +step 12178/16704 (72.90%) | loss: 2.560568 | lrm: 0.54 | dt: 642.61ms | tok/sec: 815,870 | mfu: 50.99 | epoch: 2 | total time: 130.83m | eta: 48.7m +step 12179/16704 (72.91%) | loss: 2.555333 | lrm: 0.54 | dt: 641.84ms | tok/sec: 816,851 | mfu: 51.05 | epoch: 2 | total time: 130.84m | eta: 48.7m +step 12180/16704 (72.92%) | loss: 2.563888 | lrm: 0.54 | dt: 641.20ms | tok/sec: 817,671 | mfu: 51.11 | epoch: 2 | total time: 130.85m | eta: 48.6m +step 12181/16704 (72.92%) | loss: 2.554576 | lrm: 0.54 | dt: 643.33ms | tok/sec: 814,964 | mfu: 50.94 | epoch: 2 | total time: 130.86m | eta: 48.6m +step 12182/16704 (72.93%) | loss: 2.565388 | lrm: 0.54 | dt: 642.21ms | tok/sec: 816,378 | mfu: 51.02 | epoch: 2 | total time: 130.88m | eta: 48.6m +step 12183/16704 (72.93%) | loss: 2.564719 | lrm: 0.54 | dt: 644.25ms | tok/sec: 813,792 | mfu: 50.86 | epoch: 2 | total time: 130.89m | eta: 48.6m +step 12184/16704 (72.94%) | loss: 2.581628 | lrm: 0.54 | dt: 641.17ms | tok/sec: 817,708 | mfu: 51.11 | epoch: 2 | total time: 130.90m | eta: 48.6m +step 12185/16704 (72.95%) | loss: 2.566742 | lrm: 0.54 | dt: 641.92ms | tok/sec: 816,751 | mfu: 51.05 | epoch: 2 | total time: 130.91m | eta: 48.6m +step 12186/16704 (72.95%) | loss: 2.569830 | lrm: 0.54 | dt: 642.40ms | tok/sec: 816,133 | mfu: 51.01 | epoch: 2 | total time: 130.92m | eta: 48.6m +step 
12187/16704 (72.96%) | loss: 2.559042 | lrm: 0.54 | dt: 642.21ms | tok/sec: 816,375 | mfu: 51.02 | epoch: 2 | total time: 130.93m | eta: 48.6m +step 12188/16704 (72.96%) | loss: 2.544548 | lrm: 0.54 | dt: 642.43ms | tok/sec: 816,099 | mfu: 51.01 | epoch: 2 | total time: 130.94m | eta: 48.6m +step 12189/16704 (72.97%) | loss: 2.552507 | lrm: 0.54 | dt: 643.46ms | tok/sec: 814,796 | mfu: 50.93 | epoch: 2 | total time: 130.95m | eta: 48.5m +step 12190/16704 (72.98%) | loss: 2.545700 | lrm: 0.54 | dt: 641.71ms | tok/sec: 817,013 | mfu: 51.06 | epoch: 2 | total time: 130.96m | eta: 48.5m +step 12191/16704 (72.98%) | loss: 2.546594 | lrm: 0.54 | dt: 642.59ms | tok/sec: 815,901 | mfu: 50.99 | epoch: 2 | total time: 130.97m | eta: 48.5m +step 12192/16704 (72.99%) | loss: 2.550637 | lrm: 0.54 | dt: 644.27ms | tok/sec: 813,772 | mfu: 50.86 | epoch: 2 | total time: 130.98m | eta: 48.5m +step 12193/16704 (72.99%) | loss: 2.563715 | lrm: 0.54 | dt: 641.08ms | tok/sec: 817,816 | mfu: 51.11 | epoch: 2 | total time: 130.99m | eta: 48.5m +step 12194/16704 (73.00%) | loss: 2.544586 | lrm: 0.54 | dt: 642.65ms | tok/sec: 815,820 | mfu: 50.99 | epoch: 2 | total time: 131.00m | eta: 48.5m +step 12195/16704 (73.01%) | loss: 2.553227 | lrm: 0.54 | dt: 641.40ms | tok/sec: 817,416 | mfu: 51.09 | epoch: 2 | total time: 131.01m | eta: 48.5m +step 12196/16704 (73.01%) | loss: 2.551452 | lrm: 0.54 | dt: 639.72ms | tok/sec: 819,552 | mfu: 51.22 | epoch: 2 | total time: 131.03m | eta: 48.5m +step 12197/16704 (73.02%) | loss: 2.550201 | lrm: 0.54 | dt: 642.90ms | tok/sec: 815,498 | mfu: 50.97 | epoch: 2 | total time: 131.04m | eta: 48.5m +step 12198/16704 (73.02%) | loss: 2.555478 | lrm: 0.54 | dt: 641.21ms | tok/sec: 817,653 | mfu: 51.10 | epoch: 2 | total time: 131.05m | eta: 48.4m +step 12199/16704 (73.03%) | loss: 2.556282 | lrm: 0.54 | dt: 644.90ms | tok/sec: 812,974 | mfu: 50.81 | epoch: 2 | total time: 131.06m | eta: 48.4m +step 12200/16704 (73.04%) | loss: 2.548698 | lrm: 0.54 | dt: 
643.15ms | tok/sec: 815,186 | mfu: 50.95 | epoch: 2 | total time: 131.07m | eta: 48.4m +step 12201/16704 (73.04%) | loss: 2.550054 | lrm: 0.54 | dt: 642.65ms | tok/sec: 815,828 | mfu: 50.99 | epoch: 2 | total time: 131.08m | eta: 48.4m +step 12202/16704 (73.05%) | loss: 2.545183 | lrm: 0.54 | dt: 643.09ms | tok/sec: 815,269 | mfu: 50.96 | epoch: 2 | total time: 131.09m | eta: 48.4m +step 12203/16704 (73.05%) | loss: 2.556848 | lrm: 0.54 | dt: 641.89ms | tok/sec: 816,794 | mfu: 51.05 | epoch: 2 | total time: 131.10m | eta: 48.4m +step 12204/16704 (73.06%) | loss: 2.562577 | lrm: 0.54 | dt: 644.46ms | tok/sec: 813,529 | mfu: 50.85 | epoch: 2 | total time: 131.11m | eta: 48.4m +step 12205/16704 (73.07%) | loss: 2.554153 | lrm: 0.54 | dt: 643.98ms | tok/sec: 814,135 | mfu: 50.88 | epoch: 2 | total time: 131.12m | eta: 48.4m +step 12206/16704 (73.07%) | loss: 2.562243 | lrm: 0.54 | dt: 643.41ms | tok/sec: 814,862 | mfu: 50.93 | epoch: 2 | total time: 131.13m | eta: 48.4m +step 12207/16704 (73.08%) | loss: 2.551729 | lrm: 0.54 | dt: 644.78ms | tok/sec: 813,123 | mfu: 50.82 | epoch: 2 | total time: 131.14m | eta: 48.4m +step 12208/16704 (73.08%) | loss: 2.547311 | lrm: 0.54 | dt: 640.96ms | tok/sec: 817,967 | mfu: 51.12 | epoch: 2 | total time: 131.15m | eta: 48.3m +step 12209/16704 (73.09%) | loss: 2.556234 | lrm: 0.54 | dt: 644.63ms | tok/sec: 813,320 | mfu: 50.83 | epoch: 2 | total time: 131.16m | eta: 48.3m +step 12210/16704 (73.10%) | loss: 2.549777 | lrm: 0.54 | dt: 641.63ms | tok/sec: 817,118 | mfu: 51.07 | epoch: 2 | total time: 131.18m | eta: 48.3m +step 12211/16704 (73.10%) | loss: 2.544678 | lrm: 0.54 | dt: 644.16ms | tok/sec: 813,908 | mfu: 50.87 | epoch: 2 | total time: 131.19m | eta: 48.3m +step 12212/16704 (73.11%) | loss: 2.540997 | lrm: 0.54 | dt: 643.76ms | tok/sec: 814,419 | mfu: 50.90 | epoch: 2 | total time: 131.20m | eta: 48.3m +step 12213/16704 (73.11%) | loss: 2.549269 | lrm: 0.54 | dt: 644.98ms | tok/sec: 812,874 | mfu: 50.81 | epoch: 2 | total 
time: 131.21m | eta: 48.3m +step 12214/16704 (73.12%) | loss: 2.550091 | lrm: 0.54 | dt: 642.12ms | tok/sec: 816,495 | mfu: 51.03 | epoch: 2 | total time: 131.22m | eta: 48.3m +step 12215/16704 (73.13%) | loss: 2.545980 | lrm: 0.54 | dt: 641.24ms | tok/sec: 817,617 | mfu: 51.10 | epoch: 2 | total time: 131.23m | eta: 48.3m +step 12216/16704 (73.13%) | loss: 2.539455 | lrm: 0.54 | dt: 641.21ms | tok/sec: 817,651 | mfu: 51.10 | epoch: 2 | total time: 131.24m | eta: 48.3m +step 12217/16704 (73.14%) | loss: 2.548607 | lrm: 0.54 | dt: 642.64ms | tok/sec: 815,833 | mfu: 50.99 | epoch: 2 | total time: 131.25m | eta: 48.2m +step 12218/16704 (73.14%) | loss: 2.537530 | lrm: 0.54 | dt: 644.40ms | tok/sec: 813,602 | mfu: 50.85 | epoch: 2 | total time: 131.26m | eta: 48.2m +step 12219/16704 (73.15%) | loss: 2.542145 | lrm: 0.54 | dt: 642.20ms | tok/sec: 816,394 | mfu: 51.03 | epoch: 2 | total time: 131.27m | eta: 48.2m +step 12220/16704 (73.16%) | loss: 2.540696 | lrm: 0.54 | dt: 642.72ms | tok/sec: 815,733 | mfu: 50.98 | epoch: 2 | total time: 131.28m | eta: 48.2m +step 12221/16704 (73.16%) | loss: 2.550443 | lrm: 0.54 | dt: 642.06ms | tok/sec: 816,568 | mfu: 51.04 | epoch: 2 | total time: 131.29m | eta: 48.2m +step 12222/16704 (73.17%) | loss: 2.562464 | lrm: 0.54 | dt: 643.31ms | tok/sec: 814,981 | mfu: 50.94 | epoch: 2 | total time: 131.30m | eta: 48.2m +step 12223/16704 (73.17%) | loss: 2.561296 | lrm: 0.54 | dt: 643.33ms | tok/sec: 814,954 | mfu: 50.94 | epoch: 2 | total time: 131.31m | eta: 48.2m +step 12224/16704 (73.18%) | loss: 2.552593 | lrm: 0.54 | dt: 643.01ms | tok/sec: 815,362 | mfu: 50.96 | epoch: 2 | total time: 131.33m | eta: 48.2m +step 12225/16704 (73.19%) | loss: 2.537365 | lrm: 0.54 | dt: 643.71ms | tok/sec: 814,474 | mfu: 50.91 | epoch: 2 | total time: 131.34m | eta: 48.2m +step 12226/16704 (73.19%) | loss: 2.547516 | lrm: 0.54 | dt: 642.04ms | tok/sec: 816,594 | mfu: 51.04 | epoch: 2 | total time: 131.35m | eta: 48.1m +step 12227/16704 (73.20%) | loss: 
2.545707 | lrm: 0.54 | dt: 642.65ms | tok/sec: 815,821 | mfu: 50.99 | epoch: 2 | total time: 131.36m | eta: 48.1m +step 12228/16704 (73.20%) | loss: 2.559010 | lrm: 0.54 | dt: 642.50ms | tok/sec: 816,007 | mfu: 51.00 | epoch: 2 | total time: 131.37m | eta: 48.1m +step 12229/16704 (73.21%) | loss: 2.549587 | lrm: 0.54 | dt: 641.96ms | tok/sec: 816,692 | mfu: 51.04 | epoch: 2 | total time: 131.38m | eta: 48.1m +step 12230/16704 (73.22%) | loss: 2.545296 | lrm: 0.54 | dt: 643.86ms | tok/sec: 814,291 | mfu: 50.89 | epoch: 2 | total time: 131.39m | eta: 48.1m +step 12231/16704 (73.22%) | loss: 2.538065 | lrm: 0.54 | dt: 644.30ms | tok/sec: 813,732 | mfu: 50.86 | epoch: 2 | total time: 131.40m | eta: 48.1m +step 12232/16704 (73.23%) | loss: 2.549802 | lrm: 0.54 | dt: 644.63ms | tok/sec: 813,317 | mfu: 50.83 | epoch: 2 | total time: 131.41m | eta: 48.1m +step 12233/16704 (73.23%) | loss: 2.543218 | lrm: 0.54 | dt: 641.86ms | tok/sec: 816,831 | mfu: 51.05 | epoch: 2 | total time: 131.42m | eta: 48.1m +step 12234/16704 (73.24%) | loss: 2.544995 | lrm: 0.54 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 2 | total time: 131.43m | eta: 48.1m +step 12235/16704 (73.25%) | loss: 2.538793 | lrm: 0.54 | dt: 640.34ms | tok/sec: 818,760 | mfu: 51.17 | epoch: 2 | total time: 131.44m | eta: 48.1m +step 12236/16704 (73.25%) | loss: 2.547167 | lrm: 0.53 | dt: 641.84ms | tok/sec: 816,849 | mfu: 51.05 | epoch: 2 | total time: 131.45m | eta: 48.0m +step 12237/16704 (73.26%) | loss: 2.555640 | lrm: 0.53 | dt: 643.52ms | tok/sec: 814,716 | mfu: 50.92 | epoch: 2 | total time: 131.46m | eta: 48.0m +step 12238/16704 (73.26%) | loss: 2.551290 | lrm: 0.53 | dt: 644.28ms | tok/sec: 813,756 | mfu: 50.86 | epoch: 2 | total time: 131.48m | eta: 48.0m +step 12239/16704 (73.27%) | loss: 2.554946 | lrm: 0.53 | dt: 643.45ms | tok/sec: 814,801 | mfu: 50.93 | epoch: 2 | total time: 131.49m | eta: 48.0m +step 12240/16704 (73.28%) | loss: 2.565937 | lrm: 0.53 | dt: 642.68ms | tok/sec: 815,785 | mfu: 
50.99 | epoch: 2 | total time: 131.50m | eta: 48.0m +step 12241/16704 (73.28%) | loss: 2.554829 | lrm: 0.53 | dt: 642.58ms | tok/sec: 815,915 | mfu: 51.00 | epoch: 2 | total time: 131.51m | eta: 48.0m +step 12242/16704 (73.29%) | loss: 2.552346 | lrm: 0.53 | dt: 640.53ms | tok/sec: 818,520 | mfu: 51.16 | epoch: 2 | total time: 131.52m | eta: 48.0m +step 12243/16704 (73.29%) | loss: 2.565352 | lrm: 0.53 | dt: 643.44ms | tok/sec: 814,821 | mfu: 50.93 | epoch: 2 | total time: 131.53m | eta: 48.0m +step 12244/16704 (73.30%) | loss: 2.564021 | lrm: 0.53 | dt: 643.24ms | tok/sec: 815,076 | mfu: 50.94 | epoch: 2 | total time: 131.54m | eta: 48.0m +step 12245/16704 (73.31%) | loss: 2.552766 | lrm: 0.53 | dt: 643.32ms | tok/sec: 814,967 | mfu: 50.94 | epoch: 2 | total time: 131.55m | eta: 47.9m +step 12246/16704 (73.31%) | loss: 2.547408 | lrm: 0.53 | dt: 643.22ms | tok/sec: 815,094 | mfu: 50.94 | epoch: 2 | total time: 131.56m | eta: 47.9m +step 12247/16704 (73.32%) | loss: 2.541981 | lrm: 0.53 | dt: 642.67ms | tok/sec: 815,799 | mfu: 50.99 | epoch: 2 | total time: 131.57m | eta: 47.9m +step 12248/16704 (73.32%) | loss: 2.548218 | lrm: 0.53 | dt: 642.84ms | tok/sec: 815,579 | mfu: 50.97 | epoch: 2 | total time: 131.58m | eta: 47.9m +step 12249/16704 (73.33%) | loss: 2.547133 | lrm: 0.53 | dt: 641.27ms | tok/sec: 817,572 | mfu: 51.10 | epoch: 2 | total time: 131.59m | eta: 47.9m +Step 12250 | Validation bpb: 0.784341 +step 12250/16704 (73.34%) | loss: 2.561419 | lrm: 0.53 | dt: 647.13ms | tok/sec: 810,176 | mfu: 50.64 | epoch: 2 | total time: 131.60m | eta: 47.9m +step 12251/16704 (73.34%) | loss: 2.554893 | lrm: 0.53 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 2 | total time: 131.61m | eta: 47.9m +step 12252/16704 (73.35%) | loss: 2.559002 | lrm: 0.53 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 2 | total time: 131.63m | eta: 47.9m +step 12253/16704 (73.35%) | loss: 2.571338 | lrm: 0.53 | dt: 640.55ms | tok/sec: 818,494 | mfu: 51.16 | epoch: 2 | 
total time: 131.64m | eta: 47.9m +step 12254/16704 (73.36%) | loss: 2.571382 | lrm: 0.53 | dt: 645.05ms | tok/sec: 812,789 | mfu: 50.80 | epoch: 2 | total time: 131.65m | eta: 47.8m +step 12255/16704 (73.37%) | loss: 2.566281 | lrm: 0.53 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 2 | total time: 131.66m | eta: 47.8m +step 12256/16704 (73.37%) | loss: 2.562549 | lrm: 0.53 | dt: 639.77ms | tok/sec: 819,498 | mfu: 51.22 | epoch: 2 | total time: 131.67m | eta: 47.8m +step 12257/16704 (73.38%) | loss: 2.557876 | lrm: 0.53 | dt: 645.12ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 2 | total time: 131.68m | eta: 47.8m +step 12258/16704 (73.38%) | loss: 2.553337 | lrm: 0.53 | dt: 638.51ms | tok/sec: 821,112 | mfu: 51.32 | epoch: 2 | total time: 131.69m | eta: 47.8m +step 12259/16704 (73.39%) | loss: 2.555399 | lrm: 0.53 | dt: 643.50ms | tok/sec: 814,738 | mfu: 50.92 | epoch: 2 | total time: 131.70m | eta: 47.8m +step 12260/16704 (73.40%) | loss: 2.552780 | lrm: 0.53 | dt: 643.70ms | tok/sec: 814,489 | mfu: 50.91 | epoch: 2 | total time: 131.71m | eta: 47.8m +step 12261/16704 (73.40%) | loss: 2.556963 | lrm: 0.53 | dt: 639.51ms | tok/sec: 819,828 | mfu: 51.24 | epoch: 2 | total time: 131.72m | eta: 47.8m +step 12262/16704 (73.41%) | loss: 2.540460 | lrm: 0.53 | dt: 645.42ms | tok/sec: 812,318 | mfu: 50.77 | epoch: 2 | total time: 131.73m | eta: 47.8m +step 12263/16704 (73.41%) | loss: 2.534835 | lrm: 0.53 | dt: 642.87ms | tok/sec: 815,547 | mfu: 50.97 | epoch: 2 | total time: 131.74m | eta: 47.7m +step 12264/16704 (73.42%) | loss: 2.548279 | lrm: 0.53 | dt: 641.67ms | tok/sec: 817,073 | mfu: 51.07 | epoch: 2 | total time: 131.75m | eta: 47.7m +step 12265/16704 (73.43%) | loss: 2.551348 | lrm: 0.53 | dt: 642.91ms | tok/sec: 815,495 | mfu: 50.97 | epoch: 2 | total time: 131.76m | eta: 47.7m +step 12266/16704 (73.43%) | loss: 2.548019 | lrm: 0.53 | dt: 640.97ms | tok/sec: 817,959 | mfu: 51.12 | epoch: 2 | total time: 131.78m | eta: 47.7m +step 12267/16704 (73.44%) | 
loss: 2.544536 | lrm: 0.53 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 2 | total time: 131.79m | eta: 47.7m +step 12268/16704 (73.44%) | loss: 2.543889 | lrm: 0.53 | dt: 641.80ms | tok/sec: 816,904 | mfu: 51.06 | epoch: 2 | total time: 131.80m | eta: 47.7m +step 12269/16704 (73.45%) | loss: 2.547326 | lrm: 0.53 | dt: 641.82ms | tok/sec: 816,880 | mfu: 51.06 | epoch: 2 | total time: 131.81m | eta: 47.7m +step 12270/16704 (73.46%) | loss: 2.533774 | lrm: 0.53 | dt: 642.88ms | tok/sec: 815,524 | mfu: 50.97 | epoch: 2 | total time: 131.82m | eta: 47.7m +step 12271/16704 (73.46%) | loss: 2.521427 | lrm: 0.53 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 2 | total time: 131.83m | eta: 47.7m +step 12272/16704 (73.47%) | loss: 2.526819 | lrm: 0.53 | dt: 642.94ms | tok/sec: 815,458 | mfu: 50.97 | epoch: 2 | total time: 131.84m | eta: 47.7m +step 12273/16704 (73.47%) | loss: 2.516670 | lrm: 0.53 | dt: 642.00ms | tok/sec: 816,650 | mfu: 51.04 | epoch: 2 | total time: 131.85m | eta: 47.6m +step 12274/16704 (73.48%) | loss: 2.521513 | lrm: 0.53 | dt: 641.78ms | tok/sec: 816,925 | mfu: 51.06 | epoch: 2 | total time: 131.86m | eta: 47.6m +step 12275/16704 (73.49%) | loss: 2.515931 | lrm: 0.53 | dt: 643.34ms | tok/sec: 814,943 | mfu: 50.94 | epoch: 2 | total time: 131.87m | eta: 47.6m +step 12276/16704 (73.49%) | loss: 2.515847 | lrm: 0.53 | dt: 642.34ms | tok/sec: 816,221 | mfu: 51.01 | epoch: 2 | total time: 131.88m | eta: 47.6m +step 12277/16704 (73.50%) | loss: 2.515101 | lrm: 0.53 | dt: 643.23ms | tok/sec: 815,091 | mfu: 50.94 | epoch: 2 | total time: 131.89m | eta: 47.6m +step 12278/16704 (73.50%) | loss: 2.516876 | lrm: 0.53 | dt: 642.97ms | tok/sec: 815,422 | mfu: 50.97 | epoch: 2 | total time: 131.90m | eta: 47.6m +step 12279/16704 (73.51%) | loss: 2.533554 | lrm: 0.53 | dt: 643.75ms | tok/sec: 814,424 | mfu: 50.90 | epoch: 2 | total time: 131.91m | eta: 47.6m +step 12280/16704 (73.52%) | loss: 2.537822 | lrm: 0.53 | dt: 642.27ms | tok/sec: 816,300 | 
mfu: 51.02 | epoch: 2 | total time: 131.93m | eta: 47.6m +step 12281/16704 (73.52%) | loss: 2.547521 | lrm: 0.53 | dt: 643.13ms | tok/sec: 815,213 | mfu: 50.95 | epoch: 2 | total time: 131.94m | eta: 47.6m +step 12282/16704 (73.53%) | loss: 2.545987 | lrm: 0.53 | dt: 642.28ms | tok/sec: 816,291 | mfu: 51.02 | epoch: 2 | total time: 131.95m | eta: 47.5m +step 12283/16704 (73.53%) | loss: 2.544488 | lrm: 0.53 | dt: 643.46ms | tok/sec: 814,801 | mfu: 50.93 | epoch: 2 | total time: 131.96m | eta: 47.5m +step 12284/16704 (73.54%) | loss: 2.552240 | lrm: 0.53 | dt: 644.09ms | tok/sec: 813,995 | mfu: 50.88 | epoch: 2 | total time: 131.97m | eta: 47.5m +step 12285/16704 (73.55%) | loss: 2.551207 | lrm: 0.53 | dt: 643.44ms | tok/sec: 814,824 | mfu: 50.93 | epoch: 2 | total time: 131.98m | eta: 47.5m +step 12286/16704 (73.55%) | loss: 2.558369 | lrm: 0.53 | dt: 644.58ms | tok/sec: 813,382 | mfu: 50.84 | epoch: 2 | total time: 131.99m | eta: 47.5m +step 12287/16704 (73.56%) | loss: 2.548348 | lrm: 0.53 | dt: 642.31ms | tok/sec: 816,255 | mfu: 51.02 | epoch: 2 | total time: 132.00m | eta: 47.5m +step 12288/16704 (73.56%) | loss: 2.555660 | lrm: 0.53 | dt: 641.89ms | tok/sec: 816,788 | mfu: 51.05 | epoch: 2 | total time: 132.01m | eta: 47.5m +step 12289/16704 (73.57%) | loss: 2.558633 | lrm: 0.53 | dt: 641.38ms | tok/sec: 817,433 | mfu: 51.09 | epoch: 2 | total time: 132.02m | eta: 47.5m +step 12290/16704 (73.58%) | loss: 2.554287 | lrm: 0.53 | dt: 645.44ms | tok/sec: 812,289 | mfu: 50.77 | epoch: 2 | total time: 132.03m | eta: 47.5m +step 12291/16704 (73.58%) | loss: 2.556772 | lrm: 0.53 | dt: 643.68ms | tok/sec: 814,519 | mfu: 50.91 | epoch: 2 | total time: 132.04m | eta: 47.4m +step 12292/16704 (73.59%) | loss: 2.556689 | lrm: 0.53 | dt: 641.95ms | tok/sec: 816,713 | mfu: 51.05 | epoch: 2 | total time: 132.05m | eta: 47.4m +step 12293/16704 (73.59%) | loss: 2.560653 | lrm: 0.53 | dt: 643.66ms | tok/sec: 814,537 | mfu: 50.91 | epoch: 2 | total time: 132.06m | eta: 47.4m +step 
12294/16704 (73.60%) | loss: 2.544674 | lrm: 0.53 | dt: 645.02ms | tok/sec: 812,826 | mfu: 50.80 | epoch: 2 | total time: 132.08m | eta: 47.4m +step 12295/16704 (73.61%) | loss: 2.549466 | lrm: 0.53 | dt: 645.27ms | tok/sec: 812,514 | mfu: 50.78 | epoch: 2 | total time: 132.09m | eta: 47.4m +step 12296/16704 (73.61%) | loss: 2.547402 | lrm: 0.53 | dt: 642.52ms | tok/sec: 815,987 | mfu: 51.00 | epoch: 2 | total time: 132.10m | eta: 47.4m +step 12297/16704 (73.62%) | loss: 2.550808 | lrm: 0.53 | dt: 644.13ms | tok/sec: 813,953 | mfu: 50.87 | epoch: 2 | total time: 132.11m | eta: 47.4m +step 12298/16704 (73.62%) | loss: 2.547947 | lrm: 0.53 | dt: 642.22ms | tok/sec: 816,367 | mfu: 51.02 | epoch: 2 | total time: 132.12m | eta: 47.4m +step 12299/16704 (73.63%) | loss: 2.551432 | lrm: 0.53 | dt: 643.93ms | tok/sec: 814,202 | mfu: 50.89 | epoch: 2 | total time: 132.13m | eta: 47.4m +step 12300/16704 (73.64%) | loss: 2.556863 | lrm: 0.53 | dt: 642.52ms | tok/sec: 815,989 | mfu: 51.00 | epoch: 2 | total time: 132.14m | eta: 47.4m +step 12301/16704 (73.64%) | loss: 2.569888 | lrm: 0.53 | dt: 642.76ms | tok/sec: 815,686 | mfu: 50.98 | epoch: 2 | total time: 132.15m | eta: 47.3m +step 12302/16704 (73.65%) | loss: 2.567873 | lrm: 0.53 | dt: 644.26ms | tok/sec: 813,778 | mfu: 50.86 | epoch: 2 | total time: 132.16m | eta: 47.3m +step 12303/16704 (73.65%) | loss: 2.569455 | lrm: 0.53 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 2 | total time: 132.17m | eta: 47.3m +step 12304/16704 (73.66%) | loss: 2.580335 | lrm: 0.53 | dt: 646.15ms | tok/sec: 811,407 | mfu: 50.71 | epoch: 2 | total time: 132.18m | eta: 47.3m +step 12305/16704 (73.66%) | loss: 2.581646 | lrm: 0.53 | dt: 642.41ms | tok/sec: 816,129 | mfu: 51.01 | epoch: 2 | total time: 132.19m | eta: 47.3m +step 12306/16704 (73.67%) | loss: 2.589064 | lrm: 0.53 | dt: 643.11ms | tok/sec: 815,242 | mfu: 50.95 | epoch: 2 | total time: 132.20m | eta: 47.3m +step 12307/16704 (73.68%) | loss: 2.587848 | lrm: 0.53 | dt: 
643.95ms | tok/sec: 814,171 | mfu: 50.89 | epoch: 2 | total time: 132.21m | eta: 47.3m +step 12308/16704 (73.68%) | loss: 2.590043 | lrm: 0.53 | dt: 643.76ms | tok/sec: 814,415 | mfu: 50.90 | epoch: 2 | total time: 132.23m | eta: 47.3m +step 12309/16704 (73.69%) | loss: 2.596577 | lrm: 0.53 | dt: 645.38ms | tok/sec: 812,370 | mfu: 50.77 | epoch: 2 | total time: 132.24m | eta: 47.3m +step 12310/16704 (73.69%) | loss: 2.598846 | lrm: 0.53 | dt: 642.03ms | tok/sec: 816,609 | mfu: 51.04 | epoch: 2 | total time: 132.25m | eta: 47.2m +step 12311/16704 (73.70%) | loss: 2.591130 | lrm: 0.53 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 2 | total time: 132.26m | eta: 47.2m +step 12312/16704 (73.71%) | loss: 2.575196 | lrm: 0.53 | dt: 646.83ms | tok/sec: 810,547 | mfu: 50.66 | epoch: 2 | total time: 132.27m | eta: 47.2m +step 12313/16704 (73.71%) | loss: 2.578962 | lrm: 0.53 | dt: 641.83ms | tok/sec: 816,863 | mfu: 51.06 | epoch: 2 | total time: 132.28m | eta: 47.2m +step 12314/16704 (73.72%) | loss: 2.571303 | lrm: 0.53 | dt: 642.28ms | tok/sec: 816,290 | mfu: 51.02 | epoch: 2 | total time: 132.29m | eta: 47.2m +step 12315/16704 (73.72%) | loss: 2.577145 | lrm: 0.53 | dt: 644.31ms | tok/sec: 813,717 | mfu: 50.86 | epoch: 2 | total time: 132.30m | eta: 47.2m +step 12316/16704 (73.73%) | loss: 2.574371 | lrm: 0.53 | dt: 643.00ms | tok/sec: 815,374 | mfu: 50.96 | epoch: 2 | total time: 132.31m | eta: 47.2m +step 12317/16704 (73.74%) | loss: 2.567128 | lrm: 0.53 | dt: 644.81ms | tok/sec: 813,091 | mfu: 50.82 | epoch: 2 | total time: 132.32m | eta: 47.2m +step 12318/16704 (73.74%) | loss: 2.574390 | lrm: 0.53 | dt: 644.48ms | tok/sec: 813,506 | mfu: 50.85 | epoch: 2 | total time: 132.33m | eta: 47.2m +step 12319/16704 (73.75%) | loss: 2.574157 | lrm: 0.53 | dt: 642.73ms | tok/sec: 815,717 | mfu: 50.98 | epoch: 2 | total time: 132.34m | eta: 47.1m +step 12320/16704 (73.75%) | loss: 2.579041 | lrm: 0.52 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 2 | total 
time: 132.35m | eta: 47.1m +step 12321/16704 (73.76%) | loss: 2.564344 | lrm: 0.52 | dt: 644.55ms | tok/sec: 813,416 | mfu: 50.84 | epoch: 2 | total time: 132.37m | eta: 47.1m +step 12322/16704 (73.77%) | loss: 2.559528 | lrm: 0.52 | dt: 643.21ms | tok/sec: 815,110 | mfu: 50.95 | epoch: 2 | total time: 132.38m | eta: 47.1m +step 12323/16704 (73.77%) | loss: 2.565541 | lrm: 0.52 | dt: 643.26ms | tok/sec: 815,042 | mfu: 50.94 | epoch: 2 | total time: 132.39m | eta: 47.1m +step 12324/16704 (73.78%) | loss: 2.566650 | lrm: 0.52 | dt: 640.97ms | tok/sec: 817,955 | mfu: 51.12 | epoch: 2 | total time: 132.40m | eta: 47.1m +step 12325/16704 (73.78%) | loss: 2.552853 | lrm: 0.52 | dt: 642.22ms | tok/sec: 816,365 | mfu: 51.02 | epoch: 2 | total time: 132.41m | eta: 47.1m +step 12326/16704 (73.79%) | loss: 2.555878 | lrm: 0.52 | dt: 644.41ms | tok/sec: 813,596 | mfu: 50.85 | epoch: 2 | total time: 132.42m | eta: 47.1m +step 12327/16704 (73.80%) | loss: 2.565293 | lrm: 0.52 | dt: 642.85ms | tok/sec: 815,562 | mfu: 50.97 | epoch: 2 | total time: 132.43m | eta: 47.1m +step 12328/16704 (73.80%) | loss: 2.565046 | lrm: 0.52 | dt: 643.51ms | tok/sec: 814,733 | mfu: 50.92 | epoch: 2 | total time: 132.44m | eta: 47.0m +step 12329/16704 (73.81%) | loss: 2.558523 | lrm: 0.52 | dt: 643.75ms | tok/sec: 814,423 | mfu: 50.90 | epoch: 2 | total time: 132.45m | eta: 47.0m +step 12330/16704 (73.81%) | loss: 2.573236 | lrm: 0.52 | dt: 640.64ms | tok/sec: 818,381 | mfu: 51.15 | epoch: 2 | total time: 132.46m | eta: 47.0m +step 12331/16704 (73.82%) | loss: 2.566433 | lrm: 0.52 | dt: 642.81ms | tok/sec: 815,613 | mfu: 50.98 | epoch: 2 | total time: 132.47m | eta: 47.0m +step 12332/16704 (73.83%) | loss: 2.565787 | lrm: 0.52 | dt: 643.82ms | tok/sec: 814,335 | mfu: 50.90 | epoch: 2 | total time: 132.48m | eta: 47.0m +step 12333/16704 (73.83%) | loss: 2.567358 | lrm: 0.52 | dt: 644.34ms | tok/sec: 813,684 | mfu: 50.86 | epoch: 2 | total time: 132.49m | eta: 47.0m +step 12334/16704 (73.84%) | loss: 
2.569220 | lrm: 0.52 | dt: 644.33ms | tok/sec: 813,700 | mfu: 50.86 | epoch: 2 | total time: 132.50m | eta: 47.0m +step 12335/16704 (73.84%) | loss: 2.567397 | lrm: 0.52 | dt: 641.93ms | tok/sec: 816,741 | mfu: 51.05 | epoch: 2 | total time: 132.52m | eta: 47.0m +step 12336/16704 (73.85%) | loss: 2.579293 | lrm: 0.52 | dt: 644.51ms | tok/sec: 813,473 | mfu: 50.84 | epoch: 2 | total time: 132.53m | eta: 47.0m +step 12337/16704 (73.86%) | loss: 2.573146 | lrm: 0.52 | dt: 640.32ms | tok/sec: 818,790 | mfu: 51.18 | epoch: 2 | total time: 132.54m | eta: 47.0m +step 12338/16704 (73.86%) | loss: 2.561555 | lrm: 0.52 | dt: 644.77ms | tok/sec: 813,141 | mfu: 50.82 | epoch: 2 | total time: 132.55m | eta: 46.9m +step 12339/16704 (73.87%) | loss: 2.551524 | lrm: 0.52 | dt: 642.59ms | tok/sec: 815,894 | mfu: 50.99 | epoch: 2 | total time: 132.56m | eta: 46.9m +step 12340/16704 (73.87%) | loss: 2.541006 | lrm: 0.52 | dt: 643.44ms | tok/sec: 814,821 | mfu: 50.93 | epoch: 2 | total time: 132.57m | eta: 46.9m +step 12341/16704 (73.88%) | loss: 2.549838 | lrm: 0.52 | dt: 644.68ms | tok/sec: 813,259 | mfu: 50.83 | epoch: 2 | total time: 132.58m | eta: 46.9m +step 12342/16704 (73.89%) | loss: 2.536454 | lrm: 0.52 | dt: 641.48ms | tok/sec: 817,306 | mfu: 51.08 | epoch: 2 | total time: 132.59m | eta: 46.9m +step 12343/16704 (73.89%) | loss: 2.551327 | lrm: 0.52 | dt: 641.16ms | tok/sec: 817,724 | mfu: 51.11 | epoch: 2 | total time: 132.60m | eta: 46.9m +step 12344/16704 (73.90%) | loss: 2.553961 | lrm: 0.52 | dt: 641.62ms | tok/sec: 817,125 | mfu: 51.07 | epoch: 2 | total time: 132.61m | eta: 46.9m +step 12345/16704 (73.90%) | loss: 2.546828 | lrm: 0.52 | dt: 641.72ms | tok/sec: 817,007 | mfu: 51.06 | epoch: 2 | total time: 132.62m | eta: 46.9m +step 12346/16704 (73.91%) | loss: 2.548483 | lrm: 0.52 | dt: 642.67ms | tok/sec: 815,793 | mfu: 50.99 | epoch: 2 | total time: 132.63m | eta: 46.9m +step 12347/16704 (73.92%) | loss: 2.550694 | lrm: 0.52 | dt: 644.86ms | tok/sec: 813,021 | mfu: 
50.81 | epoch: 2 | total time: 132.64m | eta: 46.8m +step 12348/16704 (73.92%) | loss: 2.549468 | lrm: 0.52 | dt: 646.07ms | tok/sec: 811,500 | mfu: 50.72 | epoch: 2 | total time: 132.65m | eta: 46.8m +step 12349/16704 (73.93%) | loss: 2.547688 | lrm: 0.52 | dt: 644.47ms | tok/sec: 813,522 | mfu: 50.85 | epoch: 2 | total time: 132.67m | eta: 46.8m +step 12350/16704 (73.93%) | loss: 2.538696 | lrm: 0.52 | dt: 643.06ms | tok/sec: 815,297 | mfu: 50.96 | epoch: 2 | total time: 132.68m | eta: 46.8m +step 12351/16704 (73.94%) | loss: 2.541384 | lrm: 0.52 | dt: 643.30ms | tok/sec: 815,000 | mfu: 50.94 | epoch: 2 | total time: 132.69m | eta: 46.8m +step 12352/16704 (73.95%) | loss: 2.538753 | lrm: 0.52 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 2 | total time: 132.70m | eta: 46.8m +step 12353/16704 (73.95%) | loss: 2.545158 | lrm: 0.52 | dt: 645.02ms | tok/sec: 812,830 | mfu: 50.80 | epoch: 2 | total time: 132.71m | eta: 46.8m +step 12354/16704 (73.96%) | loss: 2.542899 | lrm: 0.52 | dt: 641.71ms | tok/sec: 817,011 | mfu: 51.06 | epoch: 2 | total time: 132.72m | eta: 46.8m +step 12355/16704 (73.96%) | loss: 2.531204 | lrm: 0.52 | dt: 643.50ms | tok/sec: 814,747 | mfu: 50.92 | epoch: 2 | total time: 132.73m | eta: 46.8m +step 12356/16704 (73.97%) | loss: 2.535857 | lrm: 0.52 | dt: 642.51ms | tok/sec: 816,001 | mfu: 51.00 | epoch: 2 | total time: 132.74m | eta: 46.7m +step 12357/16704 (73.98%) | loss: 2.520470 | lrm: 0.52 | dt: 644.62ms | tok/sec: 813,328 | mfu: 50.83 | epoch: 2 | total time: 132.75m | eta: 46.7m +step 12358/16704 (73.98%) | loss: 2.517197 | lrm: 0.52 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 2 | total time: 132.76m | eta: 46.7m +step 12359/16704 (73.99%) | loss: 2.526074 | lrm: 0.52 | dt: 646.38ms | tok/sec: 811,110 | mfu: 50.70 | epoch: 2 | total time: 132.77m | eta: 46.7m +step 12360/16704 (73.99%) | loss: 2.534966 | lrm: 0.52 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 2 | total time: 132.78m | eta: 46.7m +step 
12361/16704 (74.00%) | loss: 2.539238 | lrm: 0.52 | dt: 643.12ms | tok/sec: 815,227 | mfu: 50.95 | epoch: 2 | total time: 132.79m | eta: 46.7m +step 12362/16704 (74.01%) | loss: 2.540172 | lrm: 0.52 | dt: 644.10ms | tok/sec: 813,979 | mfu: 50.87 | epoch: 2 | total time: 132.80m | eta: 46.7m +step 12363/16704 (74.01%) | loss: 2.534095 | lrm: 0.52 | dt: 642.80ms | tok/sec: 815,633 | mfu: 50.98 | epoch: 2 | total time: 132.82m | eta: 46.7m +step 12364/16704 (74.02%) | loss: 2.542659 | lrm: 0.52 | dt: 645.43ms | tok/sec: 812,312 | mfu: 50.77 | epoch: 2 | total time: 132.83m | eta: 46.7m +step 12365/16704 (74.02%) | loss: 2.548674 | lrm: 0.52 | dt: 642.99ms | tok/sec: 815,396 | mfu: 50.96 | epoch: 2 | total time: 132.84m | eta: 46.7m +step 12366/16704 (74.03%) | loss: 2.555325 | lrm: 0.52 | dt: 643.37ms | tok/sec: 814,909 | mfu: 50.93 | epoch: 2 | total time: 132.85m | eta: 46.6m +step 12367/16704 (74.04%) | loss: 2.559374 | lrm: 0.52 | dt: 641.91ms | tok/sec: 816,761 | mfu: 51.05 | epoch: 2 | total time: 132.86m | eta: 46.6m +step 12368/16704 (74.04%) | loss: 2.552894 | lrm: 0.52 | dt: 643.26ms | tok/sec: 815,052 | mfu: 50.94 | epoch: 2 | total time: 132.87m | eta: 46.6m +step 12369/16704 (74.05%) | loss: 2.548979 | lrm: 0.52 | dt: 644.30ms | tok/sec: 813,738 | mfu: 50.86 | epoch: 2 | total time: 132.88m | eta: 46.6m +step 12370/16704 (74.05%) | loss: 2.544080 | lrm: 0.52 | dt: 642.09ms | tok/sec: 816,534 | mfu: 51.03 | epoch: 2 | total time: 132.89m | eta: 46.6m +step 12371/16704 (74.06%) | loss: 2.551294 | lrm: 0.52 | dt: 643.62ms | tok/sec: 814,590 | mfu: 50.91 | epoch: 2 | total time: 132.90m | eta: 46.6m +step 12372/16704 (74.07%) | loss: 2.555288 | lrm: 0.52 | dt: 644.20ms | tok/sec: 813,864 | mfu: 50.87 | epoch: 2 | total time: 132.91m | eta: 46.6m +step 12373/16704 (74.07%) | loss: 2.579354 | lrm: 0.52 | dt: 643.19ms | tok/sec: 815,137 | mfu: 50.95 | epoch: 2 | total time: 132.92m | eta: 46.6m +step 12374/16704 (74.08%) | loss: 2.588217 | lrm: 0.52 | dt: 
641.79ms | tok/sec: 816,910 | mfu: 51.06 | epoch: 2 | total time: 132.93m | eta: 46.6m +step 12375/16704 (74.08%) | loss: 2.577284 | lrm: 0.52 | dt: 644.40ms | tok/sec: 813,607 | mfu: 50.85 | epoch: 2 | total time: 132.94m | eta: 46.5m +step 12376/16704 (74.09%) | loss: 2.566263 | lrm: 0.52 | dt: 642.78ms | tok/sec: 815,654 | mfu: 50.98 | epoch: 2 | total time: 132.95m | eta: 46.5m +step 12377/16704 (74.10%) | loss: 2.560395 | lrm: 0.52 | dt: 643.22ms | tok/sec: 815,102 | mfu: 50.95 | epoch: 2 | total time: 132.97m | eta: 46.5m +step 12378/16704 (74.10%) | loss: 2.554239 | lrm: 0.52 | dt: 643.22ms | tok/sec: 815,099 | mfu: 50.94 | epoch: 2 | total time: 132.98m | eta: 46.5m +step 12379/16704 (74.11%) | loss: 2.543296 | lrm: 0.52 | dt: 644.63ms | tok/sec: 813,318 | mfu: 50.83 | epoch: 2 | total time: 132.99m | eta: 46.5m +step 12380/16704 (74.11%) | loss: 2.535022 | lrm: 0.52 | dt: 641.59ms | tok/sec: 817,172 | mfu: 51.07 | epoch: 2 | total time: 133.00m | eta: 46.5m +step 12381/16704 (74.12%) | loss: 2.545623 | lrm: 0.52 | dt: 645.48ms | tok/sec: 812,246 | mfu: 50.77 | epoch: 2 | total time: 133.01m | eta: 46.5m +step 12382/16704 (74.13%) | loss: 2.538815 | lrm: 0.52 | dt: 642.05ms | tok/sec: 816,589 | mfu: 51.04 | epoch: 2 | total time: 133.02m | eta: 46.5m +step 12383/16704 (74.13%) | loss: 2.543560 | lrm: 0.52 | dt: 646.44ms | tok/sec: 811,040 | mfu: 50.69 | epoch: 2 | total time: 133.03m | eta: 46.5m +step 12384/16704 (74.14%) | loss: 2.539579 | lrm: 0.52 | dt: 642.14ms | tok/sec: 816,465 | mfu: 51.03 | epoch: 2 | total time: 133.04m | eta: 46.4m +step 12385/16704 (74.14%) | loss: 2.547782 | lrm: 0.52 | dt: 644.55ms | tok/sec: 813,416 | mfu: 50.84 | epoch: 2 | total time: 133.05m | eta: 46.4m +step 12386/16704 (74.15%) | loss: 2.547250 | lrm: 0.52 | dt: 642.62ms | tok/sec: 815,855 | mfu: 50.99 | epoch: 2 | total time: 133.06m | eta: 46.4m +step 12387/16704 (74.16%) | loss: 2.547582 | lrm: 0.52 | dt: 642.39ms | tok/sec: 816,153 | mfu: 51.01 | epoch: 2 | total 
time: 133.07m | eta: 46.4m +step 12388/16704 (74.16%) | loss: 2.543124 | lrm: 0.52 | dt: 644.33ms | tok/sec: 813,689 | mfu: 50.86 | epoch: 2 | total time: 133.08m | eta: 46.4m +step 12389/16704 (74.17%) | loss: 2.546826 | lrm: 0.52 | dt: 641.79ms | tok/sec: 816,916 | mfu: 51.06 | epoch: 2 | total time: 133.09m | eta: 46.4m +step 12390/16704 (74.17%) | loss: 2.559387 | lrm: 0.52 | dt: 642.95ms | tok/sec: 815,440 | mfu: 50.97 | epoch: 2 | total time: 133.10m | eta: 46.4m +step 12391/16704 (74.18%) | loss: 2.557784 | lrm: 0.52 | dt: 644.51ms | tok/sec: 813,463 | mfu: 50.84 | epoch: 2 | total time: 133.12m | eta: 46.4m +step 12392/16704 (74.19%) | loss: 2.558780 | lrm: 0.52 | dt: 640.63ms | tok/sec: 818,388 | mfu: 51.15 | epoch: 2 | total time: 133.13m | eta: 46.4m +step 12393/16704 (74.19%) | loss: 2.561726 | lrm: 0.52 | dt: 642.94ms | tok/sec: 815,454 | mfu: 50.97 | epoch: 2 | total time: 133.14m | eta: 46.4m +step 12394/16704 (74.20%) | loss: 2.561886 | lrm: 0.52 | dt: 644.08ms | tok/sec: 814,014 | mfu: 50.88 | epoch: 2 | total time: 133.15m | eta: 46.3m +step 12395/16704 (74.20%) | loss: 2.553790 | lrm: 0.52 | dt: 642.29ms | tok/sec: 816,275 | mfu: 51.02 | epoch: 2 | total time: 133.16m | eta: 46.3m +step 12396/16704 (74.21%) | loss: 2.545223 | lrm: 0.52 | dt: 642.97ms | tok/sec: 815,414 | mfu: 50.96 | epoch: 2 | total time: 133.17m | eta: 46.3m +step 12397/16704 (74.22%) | loss: 2.556498 | lrm: 0.52 | dt: 641.92ms | tok/sec: 816,750 | mfu: 51.05 | epoch: 2 | total time: 133.18m | eta: 46.3m +step 12398/16704 (74.22%) | loss: 2.562945 | lrm: 0.52 | dt: 643.84ms | tok/sec: 814,310 | mfu: 50.90 | epoch: 2 | total time: 133.19m | eta: 46.3m +step 12399/16704 (74.23%) | loss: 2.548751 | lrm: 0.52 | dt: 642.82ms | tok/sec: 815,603 | mfu: 50.98 | epoch: 2 | total time: 133.20m | eta: 46.3m +step 12400/16704 (74.23%) | loss: 2.551591 | lrm: 0.52 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 2 | total time: 133.21m | eta: 46.3m +step 12401/16704 (74.24%) | loss: 
2.541986 | lrm: 0.52 | dt: 643.03ms | tok/sec: 815,344 | mfu: 50.96 | epoch: 2 | total time: 133.22m | eta: 46.3m +step 12402/16704 (74.25%) | loss: 2.529971 | lrm: 0.52 | dt: 643.90ms | tok/sec: 814,240 | mfu: 50.89 | epoch: 2 | total time: 133.23m | eta: 46.3m +step 12403/16704 (74.25%) | loss: 2.528027 | lrm: 0.51 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 2 | total time: 133.24m | eta: 46.2m +step 12404/16704 (74.26%) | loss: 2.546602 | lrm: 0.51 | dt: 643.68ms | tok/sec: 814,521 | mfu: 50.91 | epoch: 2 | total time: 133.25m | eta: 46.2m +step 12405/16704 (74.26%) | loss: 2.556703 | lrm: 0.51 | dt: 642.85ms | tok/sec: 815,562 | mfu: 50.97 | epoch: 2 | total time: 133.27m | eta: 46.2m +step 12406/16704 (74.27%) | loss: 2.560481 | lrm: 0.51 | dt: 644.06ms | tok/sec: 814,030 | mfu: 50.88 | epoch: 2 | total time: 133.28m | eta: 46.2m +step 12407/16704 (74.28%) | loss: 2.552707 | lrm: 0.51 | dt: 643.32ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 2 | total time: 133.29m | eta: 46.2m +step 12408/16704 (74.28%) | loss: 2.553420 | lrm: 0.51 | dt: 642.77ms | tok/sec: 815,675 | mfu: 50.98 | epoch: 2 | total time: 133.30m | eta: 46.2m +step 12409/16704 (74.29%) | loss: 2.556070 | lrm: 0.51 | dt: 644.17ms | tok/sec: 813,893 | mfu: 50.87 | epoch: 2 | total time: 133.31m | eta: 46.2m +step 12410/16704 (74.29%) | loss: 2.572310 | lrm: 0.51 | dt: 643.63ms | tok/sec: 814,576 | mfu: 50.91 | epoch: 2 | total time: 133.32m | eta: 46.2m +step 12411/16704 (74.30%) | loss: 2.574820 | lrm: 0.51 | dt: 642.47ms | tok/sec: 816,049 | mfu: 51.00 | epoch: 2 | total time: 133.33m | eta: 46.2m +step 12412/16704 (74.31%) | loss: 2.571966 | lrm: 0.51 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 2 | total time: 133.34m | eta: 46.1m +step 12413/16704 (74.31%) | loss: 2.577839 | lrm: 0.51 | dt: 642.40ms | tok/sec: 816,144 | mfu: 51.01 | epoch: 2 | total time: 133.35m | eta: 46.1m +step 12414/16704 (74.32%) | loss: 2.565081 | lrm: 0.51 | dt: 644.14ms | tok/sec: 813,932 | mfu: 
50.87 | epoch: 2 | total time: 133.36m | eta: 46.1m +step 12415/16704 (74.32%) | loss: 2.562835 | lrm: 0.51 | dt: 642.73ms | tok/sec: 815,720 | mfu: 50.98 | epoch: 2 | total time: 133.37m | eta: 46.1m +step 12416/16704 (74.33%) | loss: 2.559275 | lrm: 0.51 | dt: 643.39ms | tok/sec: 814,888 | mfu: 50.93 | epoch: 2 | total time: 133.38m | eta: 46.1m +step 12417/16704 (74.34%) | loss: 2.558382 | lrm: 0.51 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 2 | total time: 133.39m | eta: 46.1m +step 12418/16704 (74.34%) | loss: 2.548580 | lrm: 0.51 | dt: 641.45ms | tok/sec: 817,344 | mfu: 51.09 | epoch: 2 | total time: 133.41m | eta: 46.1m +step 12419/16704 (74.35%) | loss: 2.544013 | lrm: 0.51 | dt: 644.71ms | tok/sec: 813,221 | mfu: 50.83 | epoch: 2 | total time: 133.42m | eta: 46.1m +step 12420/16704 (74.35%) | loss: 2.546578 | lrm: 0.51 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 2 | total time: 133.43m | eta: 46.1m +step 12421/16704 (74.36%) | loss: 2.548253 | lrm: 0.51 | dt: 645.32ms | tok/sec: 812,447 | mfu: 50.78 | epoch: 2 | total time: 133.44m | eta: 46.0m +step 12422/16704 (74.37%) | loss: 2.547873 | lrm: 0.51 | dt: 642.04ms | tok/sec: 816,603 | mfu: 51.04 | epoch: 2 | total time: 133.45m | eta: 46.0m +step 12423/16704 (74.37%) | loss: 2.550607 | lrm: 0.51 | dt: 643.14ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 2 | total time: 133.46m | eta: 46.0m +step 12424/16704 (74.38%) | loss: 2.555611 | lrm: 0.51 | dt: 642.32ms | tok/sec: 816,241 | mfu: 51.02 | epoch: 2 | total time: 133.47m | eta: 46.0m +step 12425/16704 (74.38%) | loss: 2.565966 | lrm: 0.51 | dt: 643.09ms | tok/sec: 815,264 | mfu: 50.96 | epoch: 2 | total time: 133.48m | eta: 46.0m +step 12426/16704 (74.39%) | loss: 2.559899 | lrm: 0.51 | dt: 644.70ms | tok/sec: 813,229 | mfu: 50.83 | epoch: 2 | total time: 133.49m | eta: 46.0m +step 12427/16704 (74.40%) | loss: 2.561054 | lrm: 0.51 | dt: 644.71ms | tok/sec: 813,215 | mfu: 50.83 | epoch: 2 | total time: 133.50m | eta: 46.0m +step 
12428/16704 (74.40%) | loss: 2.564181 | lrm: 0.51 | dt: 642.78ms | tok/sec: 815,661 | mfu: 50.98 | epoch: 2 | total time: 133.51m | eta: 46.0m +step 12429/16704 (74.41%) | loss: 2.563824 | lrm: 0.51 | dt: 642.65ms | tok/sec: 815,818 | mfu: 50.99 | epoch: 2 | total time: 133.52m | eta: 46.0m +step 12430/16704 (74.41%) | loss: 2.552937 | lrm: 0.51 | dt: 643.56ms | tok/sec: 814,667 | mfu: 50.92 | epoch: 2 | total time: 133.53m | eta: 46.0m +step 12431/16704 (74.42%) | loss: 2.542033 | lrm: 0.51 | dt: 645.40ms | tok/sec: 812,350 | mfu: 50.77 | epoch: 2 | total time: 133.54m | eta: 45.9m +step 12432/16704 (74.43%) | loss: 2.526118 | lrm: 0.51 | dt: 643.75ms | tok/sec: 814,433 | mfu: 50.90 | epoch: 2 | total time: 133.56m | eta: 45.9m +step 12433/16704 (74.43%) | loss: 2.522881 | lrm: 0.51 | dt: 644.86ms | tok/sec: 813,029 | mfu: 50.82 | epoch: 2 | total time: 133.57m | eta: 45.9m +step 12434/16704 (74.44%) | loss: 2.521384 | lrm: 0.51 | dt: 640.76ms | tok/sec: 818,229 | mfu: 51.14 | epoch: 2 | total time: 133.58m | eta: 45.9m +step 12435/16704 (74.44%) | loss: 2.518865 | lrm: 0.51 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 2 | total time: 133.59m | eta: 45.9m +step 12436/16704 (74.45%) | loss: 2.523002 | lrm: 0.51 | dt: 641.10ms | tok/sec: 817,793 | mfu: 51.11 | epoch: 2 | total time: 133.60m | eta: 45.9m +step 12437/16704 (74.46%) | loss: 2.532280 | lrm: 0.51 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 2 | total time: 133.61m | eta: 45.9m +step 12438/16704 (74.46%) | loss: 2.528218 | lrm: 0.51 | dt: 644.69ms | tok/sec: 813,244 | mfu: 50.83 | epoch: 2 | total time: 133.62m | eta: 45.9m +step 12439/16704 (74.47%) | loss: 2.522358 | lrm: 0.51 | dt: 641.37ms | tok/sec: 817,454 | mfu: 51.09 | epoch: 2 | total time: 133.63m | eta: 45.9m +step 12440/16704 (74.47%) | loss: 2.535799 | lrm: 0.51 | dt: 645.22ms | tok/sec: 812,574 | mfu: 50.79 | epoch: 2 | total time: 133.64m | eta: 45.8m +step 12441/16704 (74.48%) | loss: 2.554684 | lrm: 0.51 | dt: 
641.37ms | tok/sec: 817,455 | mfu: 51.09 | epoch: 2 | total time: 133.65m | eta: 45.8m +step 12442/16704 (74.49%) | loss: 2.555276 | lrm: 0.51 | dt: 646.36ms | tok/sec: 811,144 | mfu: 50.70 | epoch: 2 | total time: 133.66m | eta: 45.8m +step 12443/16704 (74.49%) | loss: 2.549630 | lrm: 0.51 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 2 | total time: 133.67m | eta: 45.8m +step 12444/16704 (74.50%) | loss: 2.568840 | lrm: 0.51 | dt: 643.94ms | tok/sec: 814,190 | mfu: 50.89 | epoch: 2 | total time: 133.68m | eta: 45.8m +step 12445/16704 (74.50%) | loss: 2.567776 | lrm: 0.51 | dt: 643.04ms | tok/sec: 815,332 | mfu: 50.96 | epoch: 2 | total time: 133.69m | eta: 45.8m +step 12446/16704 (74.51%) | loss: 2.573129 | lrm: 0.51 | dt: 643.80ms | tok/sec: 814,366 | mfu: 50.90 | epoch: 2 | total time: 133.71m | eta: 45.8m +step 12447/16704 (74.52%) | loss: 2.573109 | lrm: 0.51 | dt: 642.44ms | tok/sec: 816,093 | mfu: 51.01 | epoch: 2 | total time: 133.72m | eta: 45.8m +step 12448/16704 (74.52%) | loss: 2.571025 | lrm: 0.51 | dt: 644.96ms | tok/sec: 812,897 | mfu: 50.81 | epoch: 2 | total time: 133.73m | eta: 45.8m +step 12449/16704 (74.53%) | loss: 2.577920 | lrm: 0.51 | dt: 642.32ms | tok/sec: 816,239 | mfu: 51.02 | epoch: 2 | total time: 133.74m | eta: 45.7m +step 12450/16704 (74.53%) | loss: 2.567519 | lrm: 0.51 | dt: 643.89ms | tok/sec: 814,247 | mfu: 50.89 | epoch: 2 | total time: 133.75m | eta: 45.7m +step 12451/16704 (74.54%) | loss: 2.569742 | lrm: 0.51 | dt: 643.58ms | tok/sec: 814,641 | mfu: 50.92 | epoch: 2 | total time: 133.76m | eta: 45.7m +step 12452/16704 (74.55%) | loss: 2.585268 | lrm: 0.51 | dt: 640.78ms | tok/sec: 818,208 | mfu: 51.14 | epoch: 2 | total time: 133.77m | eta: 45.7m +step 12453/16704 (74.55%) | loss: 2.578305 | lrm: 0.51 | dt: 644.64ms | tok/sec: 813,299 | mfu: 50.83 | epoch: 2 | total time: 133.78m | eta: 45.7m +step 12454/16704 (74.56%) | loss: 2.576910 | lrm: 0.51 | dt: 643.21ms | tok/sec: 815,113 | mfu: 50.95 | epoch: 2 | total 
time: 133.79m | eta: 45.7m +step 12455/16704 (74.56%) | loss: 2.593457 | lrm: 0.51 | dt: 643.08ms | tok/sec: 815,278 | mfu: 50.96 | epoch: 2 | total time: 133.80m | eta: 45.7m +step 12456/16704 (74.57%) | loss: 2.602084 | lrm: 0.51 | dt: 641.67ms | tok/sec: 817,067 | mfu: 51.07 | epoch: 2 | total time: 133.81m | eta: 45.7m +step 12457/16704 (74.57%) | loss: 2.586838 | lrm: 0.51 | dt: 642.93ms | tok/sec: 815,460 | mfu: 50.97 | epoch: 2 | total time: 133.82m | eta: 45.7m +step 12458/16704 (74.58%) | loss: 2.593522 | lrm: 0.51 | dt: 642.89ms | tok/sec: 815,518 | mfu: 50.97 | epoch: 2 | total time: 133.83m | eta: 45.7m +step 12459/16704 (74.59%) | loss: 2.594613 | lrm: 0.51 | dt: 641.98ms | tok/sec: 816,671 | mfu: 51.04 | epoch: 2 | total time: 133.84m | eta: 45.6m +step 12460/16704 (74.59%) | loss: 2.596960 | lrm: 0.51 | dt: 644.55ms | tok/sec: 813,420 | mfu: 50.84 | epoch: 2 | total time: 133.86m | eta: 45.6m +step 12461/16704 (74.60%) | loss: 2.589289 | lrm: 0.51 | dt: 644.79ms | tok/sec: 813,114 | mfu: 50.82 | epoch: 2 | total time: 133.87m | eta: 45.6m +step 12462/16704 (74.60%) | loss: 2.589810 | lrm: 0.51 | dt: 644.80ms | tok/sec: 813,106 | mfu: 50.82 | epoch: 2 | total time: 133.88m | eta: 45.6m +step 12463/16704 (74.61%) | loss: 2.579310 | lrm: 0.51 | dt: 644.70ms | tok/sec: 813,224 | mfu: 50.83 | epoch: 2 | total time: 133.89m | eta: 45.6m +step 12464/16704 (74.62%) | loss: 2.566880 | lrm: 0.51 | dt: 641.60ms | tok/sec: 817,153 | mfu: 51.07 | epoch: 2 | total time: 133.90m | eta: 45.6m +step 12465/16704 (74.62%) | loss: 2.554566 | lrm: 0.51 | dt: 644.19ms | tok/sec: 813,872 | mfu: 50.87 | epoch: 2 | total time: 133.91m | eta: 45.6m +step 12466/16704 (74.63%) | loss: 2.549383 | lrm: 0.51 | dt: 643.64ms | tok/sec: 814,567 | mfu: 50.91 | epoch: 2 | total time: 133.92m | eta: 45.6m +step 12467/16704 (74.63%) | loss: 2.546948 | lrm: 0.51 | dt: 647.09ms | tok/sec: 810,221 | mfu: 50.64 | epoch: 2 | total time: 133.93m | eta: 45.6m +step 12468/16704 (74.64%) | loss: 
2.530405 | lrm: 0.51 | dt: 644.11ms | tok/sec: 813,969 | mfu: 50.87 | epoch: 2 | total time: 133.94m | eta: 45.5m +step 12469/16704 (74.65%) | loss: 2.542895 | lrm: 0.51 | dt: 642.96ms | tok/sec: 815,427 | mfu: 50.97 | epoch: 2 | total time: 133.95m | eta: 45.5m +step 12470/16704 (74.65%) | loss: 2.539472 | lrm: 0.51 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 2 | total time: 133.96m | eta: 45.5m +step 12471/16704 (74.66%) | loss: 2.531482 | lrm: 0.51 | dt: 642.84ms | tok/sec: 815,577 | mfu: 50.97 | epoch: 2 | total time: 133.97m | eta: 45.5m +step 12472/16704 (74.66%) | loss: 2.537586 | lrm: 0.51 | dt: 643.01ms | tok/sec: 815,362 | mfu: 50.96 | epoch: 2 | total time: 133.98m | eta: 45.5m +step 12473/16704 (74.67%) | loss: 2.544673 | lrm: 0.51 | dt: 642.26ms | tok/sec: 816,320 | mfu: 51.02 | epoch: 2 | total time: 133.99m | eta: 45.5m +step 12474/16704 (74.68%) | loss: 2.555100 | lrm: 0.51 | dt: 642.49ms | tok/sec: 816,028 | mfu: 51.00 | epoch: 2 | total time: 134.01m | eta: 45.5m +step 12475/16704 (74.68%) | loss: 2.550114 | lrm: 0.51 | dt: 643.93ms | tok/sec: 814,200 | mfu: 50.89 | epoch: 2 | total time: 134.02m | eta: 45.5m +step 12476/16704 (74.69%) | loss: 2.562610 | lrm: 0.51 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 2 | total time: 134.03m | eta: 45.5m +step 12477/16704 (74.69%) | loss: 2.562811 | lrm: 0.51 | dt: 642.93ms | tok/sec: 815,462 | mfu: 50.97 | epoch: 2 | total time: 134.04m | eta: 45.4m +step 12478/16704 (74.70%) | loss: 2.586540 | lrm: 0.51 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 2 | total time: 134.05m | eta: 45.4m +step 12479/16704 (74.71%) | loss: 2.579446 | lrm: 0.51 | dt: 641.83ms | tok/sec: 816,864 | mfu: 51.06 | epoch: 2 | total time: 134.06m | eta: 45.4m +step 12480/16704 (74.71%) | loss: 2.577429 | lrm: 0.51 | dt: 641.88ms | tok/sec: 816,801 | mfu: 51.05 | epoch: 2 | total time: 134.07m | eta: 45.4m +step 12481/16704 (74.72%) | loss: 2.569083 | lrm: 0.51 | dt: 643.50ms | tok/sec: 814,746 | mfu: 
50.92 | epoch: 2 | total time: 134.08m | eta: 45.4m +step 12482/16704 (74.72%) | loss: 2.562767 | lrm: 0.51 | dt: 644.53ms | tok/sec: 813,443 | mfu: 50.84 | epoch: 2 | total time: 134.09m | eta: 45.4m +step 12483/16704 (74.73%) | loss: 2.550984 | lrm: 0.51 | dt: 646.49ms | tok/sec: 810,972 | mfu: 50.69 | epoch: 2 | total time: 134.10m | eta: 45.4m +step 12484/16704 (74.74%) | loss: 2.557798 | lrm: 0.51 | dt: 644.20ms | tok/sec: 813,863 | mfu: 50.87 | epoch: 2 | total time: 134.11m | eta: 45.4m +step 12485/16704 (74.74%) | loss: 2.567475 | lrm: 0.51 | dt: 643.61ms | tok/sec: 814,603 | mfu: 50.91 | epoch: 2 | total time: 134.12m | eta: 45.4m +step 12486/16704 (74.75%) | loss: 2.564775 | lrm: 0.51 | dt: 645.12ms | tok/sec: 812,700 | mfu: 50.79 | epoch: 2 | total time: 134.13m | eta: 45.3m +step 12487/16704 (74.75%) | loss: 2.567576 | lrm: 0.50 | dt: 643.97ms | tok/sec: 814,143 | mfu: 50.89 | epoch: 2 | total time: 134.15m | eta: 45.3m +step 12488/16704 (74.76%) | loss: 2.577867 | lrm: 0.50 | dt: 647.45ms | tok/sec: 809,768 | mfu: 50.61 | epoch: 2 | total time: 134.16m | eta: 45.3m +step 12489/16704 (74.77%) | loss: 2.573994 | lrm: 0.50 | dt: 641.57ms | tok/sec: 817,192 | mfu: 51.08 | epoch: 2 | total time: 134.17m | eta: 45.3m +step 12490/16704 (74.77%) | loss: 2.558712 | lrm: 0.50 | dt: 644.17ms | tok/sec: 813,891 | mfu: 50.87 | epoch: 2 | total time: 134.18m | eta: 45.3m +step 12491/16704 (74.78%) | loss: 2.554090 | lrm: 0.50 | dt: 642.96ms | tok/sec: 815,432 | mfu: 50.97 | epoch: 2 | total time: 134.19m | eta: 45.3m +step 12492/16704 (74.78%) | loss: 2.565907 | lrm: 0.50 | dt: 642.45ms | tok/sec: 816,082 | mfu: 51.01 | epoch: 2 | total time: 134.20m | eta: 45.3m +step 12493/16704 (74.79%) | loss: 2.578848 | lrm: 0.50 | dt: 644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 134.21m | eta: 45.3m +step 12494/16704 (74.80%) | loss: 2.567560 | lrm: 0.50 | dt: 642.88ms | tok/sec: 815,525 | mfu: 50.97 | epoch: 2 | total time: 134.22m | eta: 45.3m +step 
12495/16704 (74.80%) | loss: 2.568590 | lrm: 0.50 | dt: 641.73ms | tok/sec: 816,989 | mfu: 51.06 | epoch: 2 | total time: 134.23m | eta: 45.3m +step 12496/16704 (74.81%) | loss: 2.567271 | lrm: 0.50 | dt: 643.22ms | tok/sec: 815,102 | mfu: 50.95 | epoch: 2 | total time: 134.24m | eta: 45.2m +step 12497/16704 (74.81%) | loss: 2.581397 | lrm: 0.50 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 2 | total time: 134.25m | eta: 45.2m +step 12498/16704 (74.82%) | loss: 2.579850 | lrm: 0.50 | dt: 644.43ms | tok/sec: 813,562 | mfu: 50.85 | epoch: 2 | total time: 134.26m | eta: 45.2m +step 12499/16704 (74.83%) | loss: 2.581352 | lrm: 0.50 | dt: 650.42ms | tok/sec: 806,071 | mfu: 50.38 | epoch: 2 | total time: 134.27m | eta: 45.2m +Step 12500 | Validation bpb: 0.782226 +step 12500/16704 (74.83%) | loss: 2.594213 | lrm: 0.50 | dt: 651.11ms | tok/sec: 805,226 | mfu: 50.33 | epoch: 2 | total time: 134.28m | eta: 45.2m +step 12501/16704 (74.84%) | loss: 2.595585 | lrm: 0.50 | dt: 644.99ms | tok/sec: 812,858 | mfu: 50.80 | epoch: 2 | total time: 134.30m | eta: 45.2m +step 12502/16704 (74.84%) | loss: 2.600144 | lrm: 0.50 | dt: 646.92ms | tok/sec: 810,437 | mfu: 50.65 | epoch: 2 | total time: 134.31m | eta: 45.2m +step 12503/16704 (74.85%) | loss: 2.594450 | lrm: 0.50 | dt: 641.02ms | tok/sec: 817,900 | mfu: 51.12 | epoch: 2 | total time: 134.32m | eta: 45.2m +step 12504/16704 (74.86%) | loss: 2.588478 | lrm: 0.50 | dt: 644.92ms | tok/sec: 812,953 | mfu: 50.81 | epoch: 2 | total time: 134.33m | eta: 45.2m +step 12505/16704 (74.86%) | loss: 2.581809 | lrm: 0.50 | dt: 643.82ms | tok/sec: 814,341 | mfu: 50.90 | epoch: 2 | total time: 134.34m | eta: 45.1m +step 12506/16704 (74.87%) | loss: 2.560174 | lrm: 0.50 | dt: 642.38ms | tok/sec: 816,168 | mfu: 51.01 | epoch: 2 | total time: 134.35m | eta: 45.1m +step 12507/16704 (74.87%) | loss: 2.545532 | lrm: 0.50 | dt: 644.72ms | tok/sec: 813,199 | mfu: 50.83 | epoch: 2 | total time: 134.36m | eta: 45.1m +step 12508/16704 (74.88%) | 
loss: 2.543626 | lrm: 0.50 | dt: 642.62ms | tok/sec: 815,857 | mfu: 50.99 | epoch: 2 | total time: 134.37m | eta: 45.1m +step 12509/16704 (74.89%) | loss: 2.552551 | lrm: 0.50 | dt: 644.42ms | tok/sec: 813,586 | mfu: 50.85 | epoch: 2 | total time: 134.38m | eta: 45.1m +step 12510/16704 (74.89%) | loss: 2.559598 | lrm: 0.50 | dt: 642.72ms | tok/sec: 815,738 | mfu: 50.98 | epoch: 2 | total time: 134.39m | eta: 45.1m +step 12511/16704 (74.90%) | loss: 2.555678 | lrm: 0.50 | dt: 642.60ms | tok/sec: 815,886 | mfu: 50.99 | epoch: 2 | total time: 134.40m | eta: 45.1m +step 12512/16704 (74.90%) | loss: 2.563463 | lrm: 0.50 | dt: 645.13ms | tok/sec: 812,685 | mfu: 50.79 | epoch: 2 | total time: 134.41m | eta: 45.1m +step 12513/16704 (74.91%) | loss: 2.553201 | lrm: 0.50 | dt: 643.27ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 134.42m | eta: 45.1m +step 12514/16704 (74.92%) | loss: 2.547633 | lrm: 0.50 | dt: 644.32ms | tok/sec: 813,710 | mfu: 50.86 | epoch: 2 | total time: 134.44m | eta: 45.0m +step 12515/16704 (74.92%) | loss: 2.542002 | lrm: 0.50 | dt: 645.10ms | tok/sec: 812,721 | mfu: 50.80 | epoch: 2 | total time: 134.45m | eta: 45.0m +step 12516/16704 (74.93%) | loss: 2.538941 | lrm: 0.50 | dt: 642.23ms | tok/sec: 816,350 | mfu: 51.02 | epoch: 2 | total time: 134.46m | eta: 45.0m +step 12517/16704 (74.93%) | loss: 2.534325 | lrm: 0.50 | dt: 643.36ms | tok/sec: 814,924 | mfu: 50.93 | epoch: 2 | total time: 134.47m | eta: 45.0m +step 12518/16704 (74.94%) | loss: 2.541264 | lrm: 0.50 | dt: 642.54ms | tok/sec: 815,966 | mfu: 51.00 | epoch: 2 | total time: 134.48m | eta: 45.0m +step 12519/16704 (74.95%) | loss: 2.540559 | lrm: 0.50 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.85 | epoch: 2 | total time: 134.49m | eta: 45.0m +step 12520/16704 (74.95%) | loss: 2.542131 | lrm: 0.50 | dt: 643.09ms | tok/sec: 815,257 | mfu: 50.95 | epoch: 2 | total time: 134.50m | eta: 45.0m +step 12521/16704 (74.96%) | loss: 2.546549 | lrm: 0.50 | dt: 644.58ms | tok/sec: 813,373 | 
mfu: 50.84 | epoch: 2 | total time: 134.51m | eta: 45.0m +step 12522/16704 (74.96%) | loss: 2.560256 | lrm: 0.50 | dt: 644.04ms | tok/sec: 814,056 | mfu: 50.88 | epoch: 2 | total time: 134.52m | eta: 45.0m +step 12523/16704 (74.97%) | loss: 2.547038 | lrm: 0.50 | dt: 643.58ms | tok/sec: 814,643 | mfu: 50.92 | epoch: 2 | total time: 134.53m | eta: 45.0m +step 12524/16704 (74.98%) | loss: 2.535167 | lrm: 0.50 | dt: 644.38ms | tok/sec: 813,631 | mfu: 50.85 | epoch: 2 | total time: 134.54m | eta: 44.9m +step 12525/16704 (74.98%) | loss: 2.546083 | lrm: 0.50 | dt: 643.66ms | tok/sec: 814,537 | mfu: 50.91 | epoch: 2 | total time: 134.55m | eta: 44.9m +step 12526/16704 (74.99%) | loss: 2.551794 | lrm: 0.50 | dt: 642.75ms | tok/sec: 815,700 | mfu: 50.98 | epoch: 2 | total time: 134.56m | eta: 44.9m +step 12527/16704 (74.99%) | loss: 2.549936 | lrm: 0.50 | dt: 643.48ms | tok/sec: 814,773 | mfu: 50.92 | epoch: 2 | total time: 134.57m | eta: 44.9m +step 12528/16704 (75.00%) | loss: 2.554225 | lrm: 0.50 | dt: 643.59ms | tok/sec: 814,631 | mfu: 50.92 | epoch: 2 | total time: 134.59m | eta: 44.9m +step 12529/16704 (75.01%) | loss: 2.566377 | lrm: 0.50 | dt: 644.18ms | tok/sec: 813,889 | mfu: 50.87 | epoch: 2 | total time: 134.60m | eta: 44.9m +step 12530/16704 (75.01%) | loss: 2.556054 | lrm: 0.50 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 2 | total time: 134.61m | eta: 44.9m +step 12531/16704 (75.02%) | loss: 2.544464 | lrm: 0.50 | dt: 641.67ms | tok/sec: 817,067 | mfu: 51.07 | epoch: 2 | total time: 134.62m | eta: 44.9m +step 12532/16704 (75.02%) | loss: 2.556887 | lrm: 0.50 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 2 | total time: 134.63m | eta: 44.9m +step 12533/16704 (75.03%) | loss: 2.557297 | lrm: 0.50 | dt: 642.61ms | tok/sec: 815,875 | mfu: 50.99 | epoch: 2 | total time: 134.64m | eta: 44.8m +step 12534/16704 (75.04%) | loss: 2.557766 | lrm: 0.50 | dt: 643.75ms | tok/sec: 814,428 | mfu: 50.90 | epoch: 2 | total time: 134.65m | eta: 44.8m +step 
12535/16704 (75.04%) | loss: 2.540857 | lrm: 0.50 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 2 | total time: 134.66m | eta: 44.8m +step 12536/16704 (75.05%) | loss: 2.535191 | lrm: 0.50 | dt: 642.97ms | tok/sec: 815,410 | mfu: 50.96 | epoch: 2 | total time: 134.67m | eta: 44.8m +step 12537/16704 (75.05%) | loss: 2.540253 | lrm: 0.50 | dt: 642.53ms | tok/sec: 815,971 | mfu: 51.00 | epoch: 2 | total time: 134.68m | eta: 44.8m +step 12538/16704 (75.06%) | loss: 2.539544 | lrm: 0.50 | dt: 644.87ms | tok/sec: 813,017 | mfu: 50.81 | epoch: 2 | total time: 134.69m | eta: 44.8m +step 12539/16704 (75.07%) | loss: 2.549628 | lrm: 0.50 | dt: 643.80ms | tok/sec: 814,362 | mfu: 50.90 | epoch: 2 | total time: 134.70m | eta: 44.8m +step 12540/16704 (75.07%) | loss: 2.541746 | lrm: 0.50 | dt: 642.60ms | tok/sec: 815,883 | mfu: 50.99 | epoch: 2 | total time: 134.71m | eta: 44.8m +step 12541/16704 (75.08%) | loss: 2.540188 | lrm: 0.50 | dt: 644.50ms | tok/sec: 813,479 | mfu: 50.84 | epoch: 2 | total time: 134.72m | eta: 44.8m +step 12542/16704 (75.08%) | loss: 2.534242 | lrm: 0.50 | dt: 642.31ms | tok/sec: 816,254 | mfu: 51.02 | epoch: 2 | total time: 134.74m | eta: 44.7m +step 12543/16704 (75.09%) | loss: 2.534545 | lrm: 0.50 | dt: 643.49ms | tok/sec: 814,754 | mfu: 50.92 | epoch: 2 | total time: 134.75m | eta: 44.7m +step 12544/16704 (75.10%) | loss: 2.538698 | lrm: 0.50 | dt: 643.18ms | tok/sec: 815,147 | mfu: 50.95 | epoch: 2 | total time: 134.76m | eta: 44.7m +step 12545/16704 (75.10%) | loss: 2.534545 | lrm: 0.50 | dt: 642.01ms | tok/sec: 816,629 | mfu: 51.04 | epoch: 2 | total time: 134.77m | eta: 44.7m +step 12546/16704 (75.11%) | loss: 2.548942 | lrm: 0.50 | dt: 643.28ms | tok/sec: 815,023 | mfu: 50.94 | epoch: 2 | total time: 134.78m | eta: 44.7m +step 12547/16704 (75.11%) | loss: 2.537945 | lrm: 0.50 | dt: 644.24ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 2 | total time: 134.79m | eta: 44.7m +step 12548/16704 (75.12%) | loss: 2.543635 | lrm: 0.50 | dt: 
643.42ms | tok/sec: 814,849 | mfu: 50.93 | epoch: 2 | total time: 134.80m | eta: 44.7m +step 12549/16704 (75.13%) | loss: 2.559137 | lrm: 0.50 | dt: 642.38ms | tok/sec: 816,165 | mfu: 51.01 | epoch: 2 | total time: 134.81m | eta: 44.7m +step 12550/16704 (75.13%) | loss: 2.560694 | lrm: 0.50 | dt: 641.99ms | tok/sec: 816,661 | mfu: 51.04 | epoch: 2 | total time: 134.82m | eta: 44.7m +step 12551/16704 (75.14%) | loss: 2.556365 | lrm: 0.50 | dt: 644.83ms | tok/sec: 813,067 | mfu: 50.82 | epoch: 2 | total time: 134.83m | eta: 44.7m +step 12552/16704 (75.14%) | loss: 2.551941 | lrm: 0.50 | dt: 642.33ms | tok/sec: 816,223 | mfu: 51.02 | epoch: 2 | total time: 134.84m | eta: 44.6m +step 12553/16704 (75.15%) | loss: 2.546329 | lrm: 0.50 | dt: 644.44ms | tok/sec: 813,561 | mfu: 50.85 | epoch: 2 | total time: 134.85m | eta: 44.6m +step 12554/16704 (75.16%) | loss: 2.554561 | lrm: 0.50 | dt: 643.63ms | tok/sec: 814,582 | mfu: 50.91 | epoch: 2 | total time: 134.86m | eta: 44.6m +step 12555/16704 (75.16%) | loss: 2.562751 | lrm: 0.50 | dt: 643.99ms | tok/sec: 814,130 | mfu: 50.88 | epoch: 2 | total time: 134.87m | eta: 44.6m +step 12556/16704 (75.17%) | loss: 2.563540 | lrm: 0.50 | dt: 644.53ms | tok/sec: 813,445 | mfu: 50.84 | epoch: 2 | total time: 134.89m | eta: 44.6m +step 12557/16704 (75.17%) | loss: 2.564000 | lrm: 0.50 | dt: 642.80ms | tok/sec: 815,635 | mfu: 50.98 | epoch: 2 | total time: 134.90m | eta: 44.6m +step 12558/16704 (75.18%) | loss: 2.550956 | lrm: 0.50 | dt: 643.34ms | tok/sec: 814,946 | mfu: 50.94 | epoch: 2 | total time: 134.91m | eta: 44.6m +step 12559/16704 (75.19%) | loss: 2.552910 | lrm: 0.50 | dt: 643.28ms | tok/sec: 815,028 | mfu: 50.94 | epoch: 2 | total time: 134.92m | eta: 44.6m +step 12560/16704 (75.19%) | loss: 2.563398 | lrm: 0.50 | dt: 643.65ms | tok/sec: 814,559 | mfu: 50.91 | epoch: 2 | total time: 134.93m | eta: 44.6m +step 12561/16704 (75.20%) | loss: 2.561772 | lrm: 0.50 | dt: 644.42ms | tok/sec: 813,575 | mfu: 50.85 | epoch: 2 | total 
time: 134.94m | eta: 44.5m +step 12562/16704 (75.20%) | loss: 2.553894 | lrm: 0.50 | dt: 642.05ms | tok/sec: 816,587 | mfu: 51.04 | epoch: 2 | total time: 134.95m | eta: 44.5m +step 12563/16704 (75.21%) | loss: 2.570525 | lrm: 0.50 | dt: 645.82ms | tok/sec: 811,816 | mfu: 50.74 | epoch: 2 | total time: 134.96m | eta: 44.5m +step 12564/16704 (75.22%) | loss: 2.585721 | lrm: 0.50 | dt: 643.20ms | tok/sec: 815,118 | mfu: 50.95 | epoch: 2 | total time: 134.97m | eta: 44.5m +step 12565/16704 (75.22%) | loss: 2.585782 | lrm: 0.50 | dt: 643.42ms | tok/sec: 814,846 | mfu: 50.93 | epoch: 2 | total time: 134.98m | eta: 44.5m +step 12566/16704 (75.23%) | loss: 2.601390 | lrm: 0.50 | dt: 643.63ms | tok/sec: 814,579 | mfu: 50.91 | epoch: 2 | total time: 134.99m | eta: 44.5m +step 12567/16704 (75.23%) | loss: 2.592203 | lrm: 0.50 | dt: 642.79ms | tok/sec: 815,650 | mfu: 50.98 | epoch: 2 | total time: 135.00m | eta: 44.5m +step 12568/16704 (75.24%) | loss: 2.582447 | lrm: 0.50 | dt: 643.21ms | tok/sec: 815,111 | mfu: 50.95 | epoch: 2 | total time: 135.01m | eta: 44.5m +step 12569/16704 (75.25%) | loss: 2.571780 | lrm: 0.50 | dt: 643.09ms | tok/sec: 815,259 | mfu: 50.95 | epoch: 2 | total time: 135.02m | eta: 44.5m +step 12570/16704 (75.25%) | loss: 2.571201 | lrm: 0.49 | dt: 643.10ms | tok/sec: 815,257 | mfu: 50.95 | epoch: 2 | total time: 135.04m | eta: 44.4m +step 12571/16704 (75.26%) | loss: 2.559620 | lrm: 0.49 | dt: 643.44ms | tok/sec: 814,823 | mfu: 50.93 | epoch: 2 | total time: 135.05m | eta: 44.4m +step 12572/16704 (75.26%) | loss: 2.549053 | lrm: 0.49 | dt: 643.42ms | tok/sec: 814,850 | mfu: 50.93 | epoch: 2 | total time: 135.06m | eta: 44.4m +step 12573/16704 (75.27%) | loss: 2.542675 | lrm: 0.49 | dt: 646.45ms | tok/sec: 811,024 | mfu: 50.69 | epoch: 2 | total time: 135.07m | eta: 44.4m +step 12574/16704 (75.28%) | loss: 2.550050 | lrm: 0.49 | dt: 643.20ms | tok/sec: 815,121 | mfu: 50.95 | epoch: 2 | total time: 135.08m | eta: 44.4m +step 12575/16704 (75.28%) | loss: 
2.556019 | lrm: 0.49 | dt: 642.86ms | tok/sec: 815,550 | mfu: 50.97 | epoch: 2 | total time: 135.09m | eta: 44.4m +step 12576/16704 (75.29%) | loss: 2.550736 | lrm: 0.49 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 2 | total time: 135.10m | eta: 44.4m +step 12577/16704 (75.29%) | loss: 2.530612 | lrm: 0.49 | dt: 643.84ms | tok/sec: 814,313 | mfu: 50.90 | epoch: 2 | total time: 135.11m | eta: 44.4m +step 12578/16704 (75.30%) | loss: 2.527632 | lrm: 0.49 | dt: 643.17ms | tok/sec: 815,164 | mfu: 50.95 | epoch: 2 | total time: 135.12m | eta: 44.4m +step 12579/16704 (75.31%) | loss: 2.547965 | lrm: 0.49 | dt: 642.87ms | tok/sec: 815,540 | mfu: 50.97 | epoch: 2 | total time: 135.13m | eta: 44.3m +step 12580/16704 (75.31%) | loss: 2.539182 | lrm: 0.49 | dt: 647.20ms | tok/sec: 810,084 | mfu: 50.63 | epoch: 2 | total time: 135.14m | eta: 44.3m +step 12581/16704 (75.32%) | loss: 2.545090 | lrm: 0.49 | dt: 641.69ms | tok/sec: 817,037 | mfu: 51.07 | epoch: 2 | total time: 135.15m | eta: 44.3m +step 12582/16704 (75.32%) | loss: 2.549421 | lrm: 0.49 | dt: 644.86ms | tok/sec: 813,028 | mfu: 50.82 | epoch: 2 | total time: 135.16m | eta: 44.3m +step 12583/16704 (75.33%) | loss: 2.547529 | lrm: 0.49 | dt: 642.73ms | tok/sec: 815,718 | mfu: 50.98 | epoch: 2 | total time: 135.18m | eta: 44.3m +step 12584/16704 (75.34%) | loss: 2.554527 | lrm: 0.49 | dt: 643.81ms | tok/sec: 814,350 | mfu: 50.90 | epoch: 2 | total time: 135.19m | eta: 44.3m +step 12585/16704 (75.34%) | loss: 2.548771 | lrm: 0.49 | dt: 643.03ms | tok/sec: 815,345 | mfu: 50.96 | epoch: 2 | total time: 135.20m | eta: 44.3m +step 12586/16704 (75.35%) | loss: 2.540725 | lrm: 0.49 | dt: 642.90ms | tok/sec: 815,499 | mfu: 50.97 | epoch: 2 | total time: 135.21m | eta: 44.3m +step 12587/16704 (75.35%) | loss: 2.538758 | lrm: 0.49 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 2 | total time: 135.22m | eta: 44.3m +step 12588/16704 (75.36%) | loss: 2.540889 | lrm: 0.49 | dt: 644.78ms | tok/sec: 813,120 | mfu: 
50.82 | epoch: 2 | total time: 135.23m | eta: 44.3m +step 12589/16704 (75.37%) | loss: 2.545767 | lrm: 0.49 | dt: 642.65ms | tok/sec: 815,826 | mfu: 50.99 | epoch: 2 | total time: 135.24m | eta: 44.2m +step 12590/16704 (75.37%) | loss: 2.545765 | lrm: 0.49 | dt: 641.74ms | tok/sec: 816,982 | mfu: 51.06 | epoch: 2 | total time: 135.25m | eta: 44.2m +step 12591/16704 (75.38%) | loss: 2.538691 | lrm: 0.49 | dt: 644.97ms | tok/sec: 812,889 | mfu: 50.81 | epoch: 2 | total time: 135.26m | eta: 44.2m +step 12592/16704 (75.38%) | loss: 2.541015 | lrm: 0.49 | dt: 641.66ms | tok/sec: 817,076 | mfu: 51.07 | epoch: 2 | total time: 135.27m | eta: 44.2m +step 12593/16704 (75.39%) | loss: 2.545442 | lrm: 0.49 | dt: 643.45ms | tok/sec: 814,804 | mfu: 50.93 | epoch: 2 | total time: 135.28m | eta: 44.2m +step 12594/16704 (75.40%) | loss: 2.547688 | lrm: 0.49 | dt: 642.42ms | tok/sec: 816,119 | mfu: 51.01 | epoch: 2 | total time: 135.29m | eta: 44.2m +step 12595/16704 (75.40%) | loss: 2.510733 | lrm: 0.49 | dt: 640.80ms | tok/sec: 818,182 | mfu: 51.14 | epoch: 2 | total time: 135.30m | eta: 44.2m +step 12596/16704 (75.41%) | loss: 2.528372 | lrm: 0.49 | dt: 643.88ms | tok/sec: 814,260 | mfu: 50.89 | epoch: 2 | total time: 135.31m | eta: 44.2m +step 12597/16704 (75.41%) | loss: 2.523590 | lrm: 0.49 | dt: 643.54ms | tok/sec: 814,696 | mfu: 50.92 | epoch: 2 | total time: 135.33m | eta: 44.2m +step 12598/16704 (75.42%) | loss: 2.524622 | lrm: 0.49 | dt: 642.70ms | tok/sec: 815,754 | mfu: 50.99 | epoch: 2 | total time: 135.34m | eta: 44.1m +step 12599/16704 (75.43%) | loss: 2.532418 | lrm: 0.49 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 2 | total time: 135.35m | eta: 44.1m +step 12600/16704 (75.43%) | loss: 2.528466 | lrm: 0.49 | dt: 642.17ms | tok/sec: 816,437 | mfu: 51.03 | epoch: 2 | total time: 135.36m | eta: 44.1m +step 12601/16704 (75.44%) | loss: 2.531453 | lrm: 0.49 | dt: 644.05ms | tok/sec: 814,052 | mfu: 50.88 | epoch: 2 | total time: 135.37m | eta: 44.1m +step 
12602/16704 (75.44%) | loss: 2.522167 | lrm: 0.49 | dt: 643.36ms | tok/sec: 814,918 | mfu: 50.93 | epoch: 2 | total time: 135.38m | eta: 44.1m +step 12603/16704 (75.45%) | loss: 2.538986 | lrm: 0.49 | dt: 642.59ms | tok/sec: 815,898 | mfu: 50.99 | epoch: 2 | total time: 135.39m | eta: 44.1m +step 12604/16704 (75.45%) | loss: 2.541111 | lrm: 0.49 | dt: 643.70ms | tok/sec: 814,490 | mfu: 50.91 | epoch: 2 | total time: 135.40m | eta: 44.1m +step 12605/16704 (75.46%) | loss: 2.541289 | lrm: 0.49 | dt: 644.01ms | tok/sec: 814,093 | mfu: 50.88 | epoch: 2 | total time: 135.41m | eta: 44.1m +step 12606/16704 (75.47%) | loss: 2.527199 | lrm: 0.49 | dt: 645.25ms | tok/sec: 812,533 | mfu: 50.78 | epoch: 2 | total time: 135.42m | eta: 44.1m +step 12607/16704 (75.47%) | loss: 2.534107 | lrm: 0.49 | dt: 641.41ms | tok/sec: 817,399 | mfu: 51.09 | epoch: 2 | total time: 135.43m | eta: 44.0m +step 12608/16704 (75.48%) | loss: 2.540413 | lrm: 0.49 | dt: 643.87ms | tok/sec: 814,280 | mfu: 50.89 | epoch: 2 | total time: 135.44m | eta: 44.0m +step 12609/16704 (75.48%) | loss: 2.543376 | lrm: 0.49 | dt: 643.64ms | tok/sec: 814,561 | mfu: 50.91 | epoch: 2 | total time: 135.45m | eta: 44.0m +step 12610/16704 (75.49%) | loss: 2.553038 | lrm: 0.49 | dt: 643.57ms | tok/sec: 814,652 | mfu: 50.92 | epoch: 2 | total time: 135.46m | eta: 44.0m +step 12611/16704 (75.50%) | loss: 2.555887 | lrm: 0.49 | dt: 645.55ms | tok/sec: 812,160 | mfu: 50.76 | epoch: 2 | total time: 135.48m | eta: 44.0m +step 12612/16704 (75.50%) | loss: 2.545054 | lrm: 0.49 | dt: 643.63ms | tok/sec: 814,575 | mfu: 50.91 | epoch: 2 | total time: 135.49m | eta: 44.0m +step 12613/16704 (75.51%) | loss: 2.542495 | lrm: 0.49 | dt: 644.71ms | tok/sec: 813,216 | mfu: 50.83 | epoch: 2 | total time: 135.50m | eta: 44.0m +step 12614/16704 (75.51%) | loss: 2.551830 | lrm: 0.49 | dt: 644.47ms | tok/sec: 813,521 | mfu: 50.85 | epoch: 2 | total time: 135.51m | eta: 44.0m +step 12615/16704 (75.52%) | loss: 2.531051 | lrm: 0.49 | dt: 
646.04ms | tok/sec: 811,540 | mfu: 50.72 | epoch: 2 | total time: 135.52m | eta: 44.0m +step 12616/16704 (75.53%) | loss: 2.534092 | lrm: 0.49 | dt: 643.46ms | tok/sec: 814,796 | mfu: 50.93 | epoch: 2 | total time: 135.53m | eta: 44.0m +step 12617/16704 (75.53%) | loss: 2.546424 | lrm: 0.49 | dt: 641.82ms | tok/sec: 816,877 | mfu: 51.06 | epoch: 2 | total time: 135.54m | eta: 43.9m +step 12618/16704 (75.54%) | loss: 2.544246 | lrm: 0.49 | dt: 645.08ms | tok/sec: 812,751 | mfu: 50.80 | epoch: 2 | total time: 135.55m | eta: 43.9m +step 12619/16704 (75.54%) | loss: 2.566067 | lrm: 0.49 | dt: 643.71ms | tok/sec: 814,480 | mfu: 50.91 | epoch: 2 | total time: 135.56m | eta: 43.9m +step 12620/16704 (75.55%) | loss: 2.563816 | lrm: 0.49 | dt: 643.26ms | tok/sec: 815,050 | mfu: 50.94 | epoch: 2 | total time: 135.57m | eta: 43.9m +step 12621/16704 (75.56%) | loss: 2.562159 | lrm: 0.49 | dt: 646.24ms | tok/sec: 811,291 | mfu: 50.71 | epoch: 2 | total time: 135.58m | eta: 43.9m +step 12622/16704 (75.56%) | loss: 2.562963 | lrm: 0.49 | dt: 640.76ms | tok/sec: 818,229 | mfu: 51.14 | epoch: 2 | total time: 135.59m | eta: 43.9m +step 12623/16704 (75.57%) | loss: 2.569800 | lrm: 0.49 | dt: 643.97ms | tok/sec: 814,149 | mfu: 50.89 | epoch: 2 | total time: 135.60m | eta: 43.9m +step 12624/16704 (75.57%) | loss: 2.570138 | lrm: 0.49 | dt: 644.81ms | tok/sec: 813,090 | mfu: 50.82 | epoch: 2 | total time: 135.61m | eta: 43.9m +step 12625/16704 (75.58%) | loss: 2.555966 | lrm: 0.49 | dt: 641.93ms | tok/sec: 816,735 | mfu: 51.05 | epoch: 2 | total time: 135.63m | eta: 43.9m +step 12626/16704 (75.59%) | loss: 2.544619 | lrm: 0.49 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 135.64m | eta: 43.8m +step 12627/16704 (75.59%) | loss: 2.543743 | lrm: 0.49 | dt: 642.74ms | tok/sec: 815,703 | mfu: 50.98 | epoch: 2 | total time: 135.65m | eta: 43.8m +step 12628/16704 (75.60%) | loss: 2.545710 | lrm: 0.49 | dt: 643.67ms | tok/sec: 814,525 | mfu: 50.91 | epoch: 2 | total 
time: 135.66m | eta: 43.8m +step 12629/16704 (75.60%) | loss: 2.534862 | lrm: 0.49 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 2 | total time: 135.67m | eta: 43.8m +step 12630/16704 (75.61%) | loss: 2.545146 | lrm: 0.49 | dt: 642.72ms | tok/sec: 815,739 | mfu: 50.98 | epoch: 2 | total time: 135.68m | eta: 43.8m +step 12631/16704 (75.62%) | loss: 2.547991 | lrm: 0.49 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 2 | total time: 135.69m | eta: 43.8m +step 12632/16704 (75.62%) | loss: 2.547352 | lrm: 0.49 | dt: 642.11ms | tok/sec: 816,513 | mfu: 51.03 | epoch: 2 | total time: 135.70m | eta: 43.8m +step 12633/16704 (75.63%) | loss: 2.550938 | lrm: 0.49 | dt: 641.80ms | tok/sec: 816,908 | mfu: 51.06 | epoch: 2 | total time: 135.71m | eta: 43.8m +step 12634/16704 (75.63%) | loss: 2.564796 | lrm: 0.49 | dt: 645.79ms | tok/sec: 811,860 | mfu: 50.74 | epoch: 2 | total time: 135.72m | eta: 43.8m +step 12635/16704 (75.64%) | loss: 2.564097 | lrm: 0.49 | dt: 643.47ms | tok/sec: 814,787 | mfu: 50.93 | epoch: 2 | total time: 135.73m | eta: 43.7m +step 12636/16704 (75.65%) | loss: 2.575580 | lrm: 0.49 | dt: 645.61ms | tok/sec: 812,082 | mfu: 50.76 | epoch: 2 | total time: 135.74m | eta: 43.7m +step 12637/16704 (75.65%) | loss: 2.573945 | lrm: 0.49 | dt: 641.47ms | tok/sec: 817,318 | mfu: 51.08 | epoch: 2 | total time: 135.75m | eta: 43.7m +step 12638/16704 (75.66%) | loss: 2.570352 | lrm: 0.49 | dt: 642.83ms | tok/sec: 815,597 | mfu: 50.98 | epoch: 2 | total time: 135.76m | eta: 43.7m +step 12639/16704 (75.66%) | loss: 2.566691 | lrm: 0.49 | dt: 644.26ms | tok/sec: 813,787 | mfu: 50.86 | epoch: 2 | total time: 135.78m | eta: 43.7m +step 12640/16704 (75.67%) | loss: 2.576424 | lrm: 0.49 | dt: 644.76ms | tok/sec: 813,157 | mfu: 50.82 | epoch: 2 | total time: 135.79m | eta: 43.7m +step 12641/16704 (75.68%) | loss: 2.579602 | lrm: 0.49 | dt: 642.81ms | tok/sec: 815,618 | mfu: 50.98 | epoch: 2 | total time: 135.80m | eta: 43.7m +step 12642/16704 (75.68%) | loss: 
2.573812 | lrm: 0.49 | dt: 645.36ms | tok/sec: 812,393 | mfu: 50.78 | epoch: 2 | total time: 135.81m | eta: 43.7m +step 12643/16704 (75.69%) | loss: 2.559997 | lrm: 0.49 | dt: 642.79ms | tok/sec: 815,644 | mfu: 50.98 | epoch: 2 | total time: 135.82m | eta: 43.7m +step 12644/16704 (75.69%) | loss: 2.557840 | lrm: 0.49 | dt: 643.36ms | tok/sec: 814,921 | mfu: 50.93 | epoch: 2 | total time: 135.83m | eta: 43.6m +step 12645/16704 (75.70%) | loss: 2.553712 | lrm: 0.49 | dt: 644.14ms | tok/sec: 813,930 | mfu: 50.87 | epoch: 2 | total time: 135.84m | eta: 43.6m +step 12646/16704 (75.71%) | loss: 2.564597 | lrm: 0.49 | dt: 643.59ms | tok/sec: 814,625 | mfu: 50.92 | epoch: 2 | total time: 135.85m | eta: 43.6m +step 12647/16704 (75.71%) | loss: 2.551136 | lrm: 0.49 | dt: 644.57ms | tok/sec: 813,391 | mfu: 50.84 | epoch: 2 | total time: 135.86m | eta: 43.6m +step 12648/16704 (75.72%) | loss: 2.551279 | lrm: 0.49 | dt: 644.05ms | tok/sec: 814,045 | mfu: 50.88 | epoch: 2 | total time: 135.87m | eta: 43.6m +step 12649/16704 (75.72%) | loss: 2.552009 | lrm: 0.49 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 135.88m | eta: 43.6m +step 12650/16704 (75.73%) | loss: 2.545316 | lrm: 0.49 | dt: 644.99ms | tok/sec: 812,856 | mfu: 50.80 | epoch: 2 | total time: 135.89m | eta: 43.6m +step 12651/16704 (75.74%) | loss: 2.534734 | lrm: 0.49 | dt: 643.78ms | tok/sec: 814,394 | mfu: 50.90 | epoch: 2 | total time: 135.90m | eta: 43.6m +step 12652/16704 (75.74%) | loss: 2.542273 | lrm: 0.49 | dt: 643.04ms | tok/sec: 815,323 | mfu: 50.96 | epoch: 2 | total time: 135.92m | eta: 43.6m +step 12653/16704 (75.75%) | loss: 2.546617 | lrm: 0.49 | dt: 642.42ms | tok/sec: 816,110 | mfu: 51.01 | epoch: 2 | total time: 135.93m | eta: 43.6m +step 12654/16704 (75.75%) | loss: 2.546447 | lrm: 0.48 | dt: 642.12ms | tok/sec: 816,499 | mfu: 51.03 | epoch: 2 | total time: 135.94m | eta: 43.5m +step 12655/16704 (75.76%) | loss: 2.551162 | lrm: 0.48 | dt: 644.62ms | tok/sec: 813,323 | mfu: 
50.83 | epoch: 2 | total time: 135.95m | eta: 43.5m +step 12656/16704 (75.77%) | loss: 2.558613 | lrm: 0.48 | dt: 642.32ms | tok/sec: 816,239 | mfu: 51.02 | epoch: 2 | total time: 135.96m | eta: 43.5m +step 12657/16704 (75.77%) | loss: 2.552505 | lrm: 0.48 | dt: 643.78ms | tok/sec: 814,392 | mfu: 50.90 | epoch: 2 | total time: 135.97m | eta: 43.5m +step 12658/16704 (75.78%) | loss: 2.547343 | lrm: 0.48 | dt: 643.93ms | tok/sec: 814,199 | mfu: 50.89 | epoch: 2 | total time: 135.98m | eta: 43.5m +step 12659/16704 (75.78%) | loss: 2.551231 | lrm: 0.48 | dt: 642.53ms | tok/sec: 815,968 | mfu: 51.00 | epoch: 2 | total time: 135.99m | eta: 43.5m +step 12660/16704 (75.79%) | loss: 2.547546 | lrm: 0.48 | dt: 644.26ms | tok/sec: 813,784 | mfu: 50.86 | epoch: 2 | total time: 136.00m | eta: 43.5m +step 12661/16704 (75.80%) | loss: 2.544640 | lrm: 0.48 | dt: 642.88ms | tok/sec: 815,524 | mfu: 50.97 | epoch: 2 | total time: 136.01m | eta: 43.5m +step 12662/16704 (75.80%) | loss: 2.559115 | lrm: 0.48 | dt: 644.18ms | tok/sec: 813,878 | mfu: 50.87 | epoch: 2 | total time: 136.02m | eta: 43.5m +step 12663/16704 (75.81%) | loss: 2.564569 | lrm: 0.48 | dt: 644.74ms | tok/sec: 813,179 | mfu: 50.82 | epoch: 2 | total time: 136.03m | eta: 43.4m +step 12664/16704 (75.81%) | loss: 2.565158 | lrm: 0.48 | dt: 642.43ms | tok/sec: 816,102 | mfu: 51.01 | epoch: 2 | total time: 136.04m | eta: 43.4m +step 12665/16704 (75.82%) | loss: 2.562185 | lrm: 0.48 | dt: 644.90ms | tok/sec: 812,977 | mfu: 50.81 | epoch: 2 | total time: 136.05m | eta: 43.4m +step 12666/16704 (75.83%) | loss: 2.561406 | lrm: 0.48 | dt: 645.39ms | tok/sec: 812,352 | mfu: 50.77 | epoch: 2 | total time: 136.07m | eta: 43.4m +step 12667/16704 (75.83%) | loss: 2.567767 | lrm: 0.48 | dt: 647.03ms | tok/sec: 810,296 | mfu: 50.64 | epoch: 2 | total time: 136.08m | eta: 43.4m +step 12668/16704 (75.84%) | loss: 2.548870 | lrm: 0.48 | dt: 641.34ms | tok/sec: 817,490 | mfu: 51.09 | epoch: 2 | total time: 136.09m | eta: 43.4m +step 
12669/16704 (75.84%) | loss: 2.538463 | lrm: 0.48 | dt: 644.71ms | tok/sec: 813,210 | mfu: 50.83 | epoch: 2 | total time: 136.10m | eta: 43.4m +step 12670/16704 (75.85%) | loss: 2.533439 | lrm: 0.48 | dt: 642.87ms | tok/sec: 815,548 | mfu: 50.97 | epoch: 2 | total time: 136.11m | eta: 43.4m +step 12671/16704 (75.86%) | loss: 2.522221 | lrm: 0.48 | dt: 644.43ms | tok/sec: 813,566 | mfu: 50.85 | epoch: 2 | total time: 136.12m | eta: 43.4m +step 12672/16704 (75.86%) | loss: 2.514696 | lrm: 0.48 | dt: 646.53ms | tok/sec: 810,926 | mfu: 50.68 | epoch: 2 | total time: 136.13m | eta: 43.3m +step 12673/16704 (75.87%) | loss: 2.514018 | lrm: 0.48 | dt: 643.38ms | tok/sec: 814,894 | mfu: 50.93 | epoch: 2 | total time: 136.14m | eta: 43.3m +step 12674/16704 (75.87%) | loss: 2.523828 | lrm: 0.48 | dt: 643.32ms | tok/sec: 814,973 | mfu: 50.94 | epoch: 2 | total time: 136.15m | eta: 43.3m +step 12675/16704 (75.88%) | loss: 2.530624 | lrm: 0.48 | dt: 643.05ms | tok/sec: 815,316 | mfu: 50.96 | epoch: 2 | total time: 136.16m | eta: 43.3m +step 12676/16704 (75.89%) | loss: 2.528083 | lrm: 0.48 | dt: 644.95ms | tok/sec: 812,907 | mfu: 50.81 | epoch: 2 | total time: 136.17m | eta: 43.3m +step 12677/16704 (75.89%) | loss: 2.535521 | lrm: 0.48 | dt: 643.13ms | tok/sec: 815,214 | mfu: 50.95 | epoch: 2 | total time: 136.18m | eta: 43.3m +step 12678/16704 (75.90%) | loss: 2.540879 | lrm: 0.48 | dt: 645.62ms | tok/sec: 812,071 | mfu: 50.76 | epoch: 2 | total time: 136.19m | eta: 43.3m +step 12679/16704 (75.90%) | loss: 2.547330 | lrm: 0.48 | dt: 644.60ms | tok/sec: 813,351 | mfu: 50.84 | epoch: 2 | total time: 136.20m | eta: 43.3m +step 12680/16704 (75.91%) | loss: 2.540079 | lrm: 0.48 | dt: 643.87ms | tok/sec: 814,279 | mfu: 50.89 | epoch: 2 | total time: 136.22m | eta: 43.3m +step 12681/16704 (75.92%) | loss: 2.545354 | lrm: 0.48 | dt: 643.87ms | tok/sec: 814,276 | mfu: 50.89 | epoch: 2 | total time: 136.23m | eta: 43.3m +step 12682/16704 (75.92%) | loss: 2.546440 | lrm: 0.48 | dt: 
642.46ms | tok/sec: 816,064 | mfu: 51.01 | epoch: 2 | total time: 136.24m | eta: 43.2m +step 12683/16704 (75.93%) | loss: 2.532519 | lrm: 0.48 | dt: 643.19ms | tok/sec: 815,142 | mfu: 50.95 | epoch: 2 | total time: 136.25m | eta: 43.2m +step 12684/16704 (75.93%) | loss: 2.530607 | lrm: 0.48 | dt: 644.75ms | tok/sec: 813,162 | mfu: 50.82 | epoch: 2 | total time: 136.26m | eta: 43.2m +step 12685/16704 (75.94%) | loss: 2.528589 | lrm: 0.48 | dt: 644.47ms | tok/sec: 813,519 | mfu: 50.85 | epoch: 2 | total time: 136.27m | eta: 43.2m +step 12686/16704 (75.95%) | loss: 2.517479 | lrm: 0.48 | dt: 644.47ms | tok/sec: 813,520 | mfu: 50.85 | epoch: 2 | total time: 136.28m | eta: 43.2m +step 12687/16704 (75.95%) | loss: 2.516910 | lrm: 0.48 | dt: 642.26ms | tok/sec: 816,320 | mfu: 51.02 | epoch: 2 | total time: 136.29m | eta: 43.2m +step 12688/16704 (75.96%) | loss: 2.525851 | lrm: 0.48 | dt: 642.97ms | tok/sec: 815,415 | mfu: 50.96 | epoch: 2 | total time: 136.30m | eta: 43.2m +step 12689/16704 (75.96%) | loss: 2.536814 | lrm: 0.48 | dt: 643.43ms | tok/sec: 814,833 | mfu: 50.93 | epoch: 2 | total time: 136.31m | eta: 43.2m +step 12690/16704 (75.97%) | loss: 2.533977 | lrm: 0.48 | dt: 645.19ms | tok/sec: 812,615 | mfu: 50.79 | epoch: 2 | total time: 136.32m | eta: 43.2m +step 12691/16704 (75.98%) | loss: 2.525752 | lrm: 0.48 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 2 | total time: 136.33m | eta: 43.1m +step 12692/16704 (75.98%) | loss: 2.523382 | lrm: 0.48 | dt: 643.37ms | tok/sec: 814,909 | mfu: 50.93 | epoch: 2 | total time: 136.34m | eta: 43.1m +step 12693/16704 (75.99%) | loss: 2.517213 | lrm: 0.48 | dt: 641.77ms | tok/sec: 816,937 | mfu: 51.06 | epoch: 2 | total time: 136.36m | eta: 43.1m +step 12694/16704 (75.99%) | loss: 2.509832 | lrm: 0.48 | dt: 644.07ms | tok/sec: 814,028 | mfu: 50.88 | epoch: 2 | total time: 136.37m | eta: 43.1m +step 12695/16704 (76.00%) | loss: 2.523079 | lrm: 0.48 | dt: 643.02ms | tok/sec: 815,356 | mfu: 50.96 | epoch: 2 | total 
time: 136.38m | eta: 43.1m +step 12696/16704 (76.01%) | loss: 2.530290 | lrm: 0.48 | dt: 646.11ms | tok/sec: 811,455 | mfu: 50.72 | epoch: 2 | total time: 136.39m | eta: 43.1m +step 12697/16704 (76.01%) | loss: 2.541687 | lrm: 0.48 | dt: 644.67ms | tok/sec: 813,270 | mfu: 50.83 | epoch: 2 | total time: 136.40m | eta: 43.1m +step 12698/16704 (76.02%) | loss: 2.545301 | lrm: 0.48 | dt: 641.31ms | tok/sec: 817,525 | mfu: 51.10 | epoch: 2 | total time: 136.41m | eta: 43.1m +step 12699/16704 (76.02%) | loss: 2.538183 | lrm: 0.48 | dt: 643.88ms | tok/sec: 814,264 | mfu: 50.89 | epoch: 2 | total time: 136.42m | eta: 43.1m +step 12700/16704 (76.03%) | loss: 2.544275 | lrm: 0.48 | dt: 643.83ms | tok/sec: 814,325 | mfu: 50.90 | epoch: 2 | total time: 136.43m | eta: 43.0m +step 12701/16704 (76.04%) | loss: 2.542604 | lrm: 0.48 | dt: 643.55ms | tok/sec: 814,675 | mfu: 50.92 | epoch: 2 | total time: 136.44m | eta: 43.0m +step 12702/16704 (76.04%) | loss: 2.544810 | lrm: 0.48 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 2 | total time: 136.45m | eta: 43.0m +step 12703/16704 (76.05%) | loss: 2.542173 | lrm: 0.48 | dt: 642.16ms | tok/sec: 816,447 | mfu: 51.03 | epoch: 2 | total time: 136.46m | eta: 43.0m +step 12704/16704 (76.05%) | loss: 2.534931 | lrm: 0.48 | dt: 641.12ms | tok/sec: 817,767 | mfu: 51.11 | epoch: 2 | total time: 136.47m | eta: 43.0m +step 12705/16704 (76.06%) | loss: 2.540407 | lrm: 0.48 | dt: 644.15ms | tok/sec: 813,921 | mfu: 50.87 | epoch: 2 | total time: 136.48m | eta: 43.0m +step 12706/16704 (76.07%) | loss: 2.535490 | lrm: 0.48 | dt: 643.43ms | tok/sec: 814,826 | mfu: 50.93 | epoch: 2 | total time: 136.49m | eta: 43.0m +step 12707/16704 (76.07%) | loss: 2.534332 | lrm: 0.48 | dt: 643.72ms | tok/sec: 814,465 | mfu: 50.91 | epoch: 2 | total time: 136.51m | eta: 43.0m +step 12708/16704 (76.08%) | loss: 2.522784 | lrm: 0.48 | dt: 643.93ms | tok/sec: 814,195 | mfu: 50.89 | epoch: 2 | total time: 136.52m | eta: 43.0m +step 12709/16704 (76.08%) | loss: 
2.534002 | lrm: 0.48 | dt: 641.44ms | tok/sec: 817,361 | mfu: 51.09 | epoch: 2 | total time: 136.53m | eta: 43.0m +step 12710/16704 (76.09%) | loss: 2.539126 | lrm: 0.48 | dt: 642.62ms | tok/sec: 815,863 | mfu: 50.99 | epoch: 2 | total time: 136.54m | eta: 42.9m +step 12711/16704 (76.10%) | loss: 2.545865 | lrm: 0.48 | dt: 643.29ms | tok/sec: 815,013 | mfu: 50.94 | epoch: 2 | total time: 136.55m | eta: 42.9m +step 12712/16704 (76.10%) | loss: 2.560709 | lrm: 0.48 | dt: 642.41ms | tok/sec: 816,124 | mfu: 51.01 | epoch: 2 | total time: 136.56m | eta: 42.9m +step 12713/16704 (76.11%) | loss: 2.553365 | lrm: 0.48 | dt: 643.51ms | tok/sec: 814,728 | mfu: 50.92 | epoch: 2 | total time: 136.57m | eta: 42.9m +step 12714/16704 (76.11%) | loss: 2.546097 | lrm: 0.48 | dt: 643.81ms | tok/sec: 814,353 | mfu: 50.90 | epoch: 2 | total time: 136.58m | eta: 42.9m +step 12715/16704 (76.12%) | loss: 2.550753 | lrm: 0.48 | dt: 644.05ms | tok/sec: 814,054 | mfu: 50.88 | epoch: 2 | total time: 136.59m | eta: 42.9m +step 12716/16704 (76.13%) | loss: 2.559020 | lrm: 0.48 | dt: 645.42ms | tok/sec: 812,325 | mfu: 50.77 | epoch: 2 | total time: 136.60m | eta: 42.9m +step 12717/16704 (76.13%) | loss: 2.567640 | lrm: 0.48 | dt: 641.12ms | tok/sec: 817,765 | mfu: 51.11 | epoch: 2 | total time: 136.61m | eta: 42.9m +step 12718/16704 (76.14%) | loss: 2.569558 | lrm: 0.48 | dt: 645.77ms | tok/sec: 811,883 | mfu: 50.74 | epoch: 2 | total time: 136.62m | eta: 42.9m +step 12719/16704 (76.14%) | loss: 2.566422 | lrm: 0.48 | dt: 643.60ms | tok/sec: 814,621 | mfu: 50.92 | epoch: 2 | total time: 136.63m | eta: 42.8m +step 12720/16704 (76.15%) | loss: 2.576274 | lrm: 0.48 | dt: 645.46ms | tok/sec: 812,276 | mfu: 50.77 | epoch: 2 | total time: 136.64m | eta: 42.8m +step 12721/16704 (76.16%) | loss: 2.571635 | lrm: 0.48 | dt: 641.77ms | tok/sec: 816,942 | mfu: 51.06 | epoch: 2 | total time: 136.66m | eta: 42.8m +step 12722/16704 (76.16%) | loss: 2.578812 | lrm: 0.48 | dt: 643.48ms | tok/sec: 814,773 | mfu: 
50.92 | epoch: 2 | total time: 136.67m | eta: 42.8m +step 12723/16704 (76.17%) | loss: 2.595568 | lrm: 0.48 | dt: 642.64ms | tok/sec: 815,834 | mfu: 50.99 | epoch: 2 | total time: 136.68m | eta: 42.8m +step 12724/16704 (76.17%) | loss: 2.594732 | lrm: 0.48 | dt: 641.79ms | tok/sec: 816,917 | mfu: 51.06 | epoch: 2 | total time: 136.69m | eta: 42.8m +step 12725/16704 (76.18%) | loss: 2.585120 | lrm: 0.48 | dt: 642.87ms | tok/sec: 815,542 | mfu: 50.97 | epoch: 2 | total time: 136.70m | eta: 42.8m +step 12726/16704 (76.19%) | loss: 2.584467 | lrm: 0.48 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 2 | total time: 136.71m | eta: 42.8m +step 12727/16704 (76.19%) | loss: 2.582761 | lrm: 0.48 | dt: 646.14ms | tok/sec: 811,410 | mfu: 50.71 | epoch: 2 | total time: 136.72m | eta: 42.8m +step 12728/16704 (76.20%) | loss: 2.574682 | lrm: 0.48 | dt: 643.15ms | tok/sec: 815,185 | mfu: 50.95 | epoch: 2 | total time: 136.73m | eta: 42.7m +step 12729/16704 (76.20%) | loss: 2.565807 | lrm: 0.48 | dt: 641.07ms | tok/sec: 817,838 | mfu: 51.12 | epoch: 2 | total time: 136.74m | eta: 42.7m +step 12730/16704 (76.21%) | loss: 2.556102 | lrm: 0.48 | dt: 645.84ms | tok/sec: 811,787 | mfu: 50.74 | epoch: 2 | total time: 136.75m | eta: 42.7m +step 12731/16704 (76.22%) | loss: 2.554669 | lrm: 0.48 | dt: 644.84ms | tok/sec: 813,054 | mfu: 50.82 | epoch: 2 | total time: 136.76m | eta: 42.7m +step 12732/16704 (76.22%) | loss: 2.576248 | lrm: 0.48 | dt: 643.93ms | tok/sec: 814,196 | mfu: 50.89 | epoch: 2 | total time: 136.77m | eta: 42.7m +step 12733/16704 (76.23%) | loss: 2.581470 | lrm: 0.48 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 2 | total time: 136.78m | eta: 42.7m +step 12734/16704 (76.23%) | loss: 2.565468 | lrm: 0.48 | dt: 643.68ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 2 | total time: 136.79m | eta: 42.7m +step 12735/16704 (76.24%) | loss: 2.563205 | lrm: 0.48 | dt: 644.23ms | tok/sec: 813,818 | mfu: 50.86 | epoch: 2 | total time: 136.81m | eta: 42.7m +step 
12736/16704 (76.25%) | loss: 2.554182 | lrm: 0.48 | dt: 643.55ms | tok/sec: 814,683 | mfu: 50.92 | epoch: 2 | total time: 136.82m | eta: 42.7m +step 12737/16704 (76.25%) | loss: 2.549812 | lrm: 0.47 | dt: 645.14ms | tok/sec: 812,677 | mfu: 50.79 | epoch: 2 | total time: 136.83m | eta: 42.6m +step 12738/16704 (76.26%) | loss: 2.557914 | lrm: 0.47 | dt: 642.22ms | tok/sec: 816,364 | mfu: 51.02 | epoch: 2 | total time: 136.84m | eta: 42.6m +step 12739/16704 (76.26%) | loss: 2.558849 | lrm: 0.47 | dt: 645.23ms | tok/sec: 812,562 | mfu: 50.79 | epoch: 2 | total time: 136.85m | eta: 42.6m +step 12740/16704 (76.27%) | loss: 2.544166 | lrm: 0.47 | dt: 643.30ms | tok/sec: 814,999 | mfu: 50.94 | epoch: 2 | total time: 136.86m | eta: 42.6m +step 12741/16704 (76.28%) | loss: 2.548601 | lrm: 0.47 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 2 | total time: 136.87m | eta: 42.6m +step 12742/16704 (76.28%) | loss: 2.556722 | lrm: 0.47 | dt: 643.09ms | tok/sec: 815,266 | mfu: 50.96 | epoch: 2 | total time: 136.88m | eta: 42.6m +step 12743/16704 (76.29%) | loss: 2.560289 | lrm: 0.47 | dt: 644.86ms | tok/sec: 813,030 | mfu: 50.82 | epoch: 2 | total time: 136.89m | eta: 42.6m +step 12744/16704 (76.29%) | loss: 2.559250 | lrm: 0.47 | dt: 643.15ms | tok/sec: 815,184 | mfu: 50.95 | epoch: 2 | total time: 136.90m | eta: 42.6m +step 12745/16704 (76.30%) | loss: 2.567059 | lrm: 0.47 | dt: 643.83ms | tok/sec: 814,321 | mfu: 50.90 | epoch: 2 | total time: 136.91m | eta: 42.6m +step 12746/16704 (76.31%) | loss: 2.572079 | lrm: 0.47 | dt: 643.46ms | tok/sec: 814,793 | mfu: 50.93 | epoch: 2 | total time: 136.92m | eta: 42.6m +step 12747/16704 (76.31%) | loss: 2.584047 | lrm: 0.47 | dt: 642.88ms | tok/sec: 815,528 | mfu: 50.97 | epoch: 2 | total time: 136.93m | eta: 42.5m +step 12748/16704 (76.32%) | loss: 2.584514 | lrm: 0.47 | dt: 642.55ms | tok/sec: 815,954 | mfu: 51.00 | epoch: 2 | total time: 136.95m | eta: 42.5m +step 12749/16704 (76.32%) | loss: 2.564904 | lrm: 0.47 | dt: 
646.05ms | tok/sec: 811,532 | mfu: 50.72 | epoch: 2 | total time: 136.96m | eta: 42.5m +Step 12750 | Validation bpb: 0.779982 +step 12750/16704 (76.33%) | loss: 2.576991 | lrm: 0.47 | dt: 644.12ms | tok/sec: 813,961 | mfu: 50.87 | epoch: 2 | total time: 136.97m | eta: 42.5m +step 12751/16704 (76.34%) | loss: 2.577768 | lrm: 0.47 | dt: 644.62ms | tok/sec: 813,331 | mfu: 50.83 | epoch: 2 | total time: 136.98m | eta: 42.5m +step 12752/16704 (76.34%) | loss: 2.575774 | lrm: 0.47 | dt: 646.63ms | tok/sec: 810,800 | mfu: 50.68 | epoch: 2 | total time: 136.99m | eta: 42.5m +step 12753/16704 (76.35%) | loss: 2.571030 | lrm: 0.47 | dt: 642.18ms | tok/sec: 816,418 | mfu: 51.03 | epoch: 2 | total time: 137.00m | eta: 42.5m +step 12754/16704 (76.35%) | loss: 2.566279 | lrm: 0.47 | dt: 645.00ms | tok/sec: 812,851 | mfu: 50.80 | epoch: 2 | total time: 137.01m | eta: 42.5m +step 12755/16704 (76.36%) | loss: 2.561716 | lrm: 0.47 | dt: 643.00ms | tok/sec: 815,377 | mfu: 50.96 | epoch: 2 | total time: 137.02m | eta: 42.5m +step 12756/16704 (76.36%) | loss: 2.548111 | lrm: 0.47 | dt: 642.03ms | tok/sec: 816,606 | mfu: 51.04 | epoch: 2 | total time: 137.03m | eta: 42.4m +step 12757/16704 (76.37%) | loss: 2.547339 | lrm: 0.47 | dt: 641.41ms | tok/sec: 817,398 | mfu: 51.09 | epoch: 2 | total time: 137.04m | eta: 42.4m +step 12758/16704 (76.38%) | loss: 2.557950 | lrm: 0.47 | dt: 643.03ms | tok/sec: 815,345 | mfu: 50.96 | epoch: 2 | total time: 137.05m | eta: 42.4m +step 12759/16704 (76.38%) | loss: 2.541414 | lrm: 0.47 | dt: 642.03ms | tok/sec: 816,614 | mfu: 51.04 | epoch: 2 | total time: 137.06m | eta: 42.4m +step 12760/16704 (76.39%) | loss: 2.542113 | lrm: 0.47 | dt: 643.91ms | tok/sec: 814,226 | mfu: 50.89 | epoch: 2 | total time: 137.07m | eta: 42.4m +step 12761/16704 (76.39%) | loss: 2.538028 | lrm: 0.47 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 2 | total time: 137.08m | eta: 42.4m +step 12762/16704 (76.40%) | loss: 2.545466 | lrm: 0.47 | dt: 642.95ms | tok/sec: 
815,435 | mfu: 50.97 | epoch: 2 | total time: 137.10m | eta: 42.4m +step 12763/16704 (76.41%) | loss: 2.541307 | lrm: 0.47 | dt: 642.06ms | tok/sec: 816,573 | mfu: 51.04 | epoch: 2 | total time: 137.11m | eta: 42.4m +step 12764/16704 (76.41%) | loss: 2.541282 | lrm: 0.47 | dt: 643.17ms | tok/sec: 815,161 | mfu: 50.95 | epoch: 2 | total time: 137.12m | eta: 42.4m +step 12765/16704 (76.42%) | loss: 2.557553 | lrm: 0.47 | dt: 641.68ms | tok/sec: 817,051 | mfu: 51.07 | epoch: 2 | total time: 137.13m | eta: 42.3m +step 12766/16704 (76.42%) | loss: 2.551144 | lrm: 0.47 | dt: 644.99ms | tok/sec: 812,867 | mfu: 50.81 | epoch: 2 | total time: 137.14m | eta: 42.3m +step 12767/16704 (76.43%) | loss: 2.552464 | lrm: 0.47 | dt: 643.15ms | tok/sec: 815,193 | mfu: 50.95 | epoch: 2 | total time: 137.15m | eta: 42.3m +step 12768/16704 (76.44%) | loss: 2.547154 | lrm: 0.47 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 2 | total time: 137.16m | eta: 42.3m +step 12769/16704 (76.44%) | loss: 2.546338 | lrm: 0.47 | dt: 644.79ms | tok/sec: 813,115 | mfu: 50.82 | epoch: 2 | total time: 137.17m | eta: 42.3m +step 12770/16704 (76.45%) | loss: 2.549490 | lrm: 0.47 | dt: 640.35ms | tok/sec: 818,756 | mfu: 51.17 | epoch: 2 | total time: 137.18m | eta: 42.3m +step 12771/16704 (76.45%) | loss: 2.547858 | lrm: 0.47 | dt: 643.94ms | tok/sec: 814,182 | mfu: 50.89 | epoch: 2 | total time: 137.19m | eta: 42.3m +step 12772/16704 (76.46%) | loss: 2.550234 | lrm: 0.47 | dt: 644.22ms | tok/sec: 813,831 | mfu: 50.87 | epoch: 2 | total time: 137.20m | eta: 42.3m +step 12773/16704 (76.47%) | loss: 2.549487 | lrm: 0.47 | dt: 642.13ms | tok/sec: 816,477 | mfu: 51.03 | epoch: 2 | total time: 137.21m | eta: 42.3m +step 12774/16704 (76.47%) | loss: 2.547548 | lrm: 0.47 | dt: 645.51ms | tok/sec: 812,204 | mfu: 50.76 | epoch: 2 | total time: 137.22m | eta: 42.3m +step 12775/16704 (76.48%) | loss: 2.542864 | lrm: 0.47 | dt: 643.31ms | tok/sec: 814,980 | mfu: 50.94 | epoch: 2 | total time: 137.23m | eta: 
42.2m +step 12776/16704 (76.48%) | loss: 2.544560 | lrm: 0.47 | dt: 644.05ms | tok/sec: 814,044 | mfu: 50.88 | epoch: 2 | total time: 137.25m | eta: 42.2m +step 12777/16704 (76.49%) | loss: 2.546159 | lrm: 0.47 | dt: 642.32ms | tok/sec: 816,241 | mfu: 51.02 | epoch: 2 | total time: 137.26m | eta: 42.2m +step 12778/16704 (76.50%) | loss: 2.552892 | lrm: 0.47 | dt: 642.73ms | tok/sec: 815,719 | mfu: 50.98 | epoch: 2 | total time: 137.27m | eta: 42.2m +step 12779/16704 (76.50%) | loss: 2.550826 | lrm: 0.47 | dt: 645.55ms | tok/sec: 812,157 | mfu: 50.76 | epoch: 2 | total time: 137.28m | eta: 42.2m +step 12780/16704 (76.51%) | loss: 2.555642 | lrm: 0.47 | dt: 641.86ms | tok/sec: 816,831 | mfu: 51.05 | epoch: 2 | total time: 137.29m | eta: 42.2m +step 12781/16704 (76.51%) | loss: 2.566862 | lrm: 0.47 | dt: 643.89ms | tok/sec: 814,251 | mfu: 50.89 | epoch: 2 | total time: 137.30m | eta: 42.2m +step 12782/16704 (76.52%) | loss: 2.557318 | lrm: 0.47 | dt: 643.00ms | tok/sec: 815,383 | mfu: 50.96 | epoch: 2 | total time: 137.31m | eta: 42.2m +step 12783/16704 (76.53%) | loss: 2.559013 | lrm: 0.47 | dt: 643.18ms | tok/sec: 815,153 | mfu: 50.95 | epoch: 2 | total time: 137.32m | eta: 42.2m +step 12784/16704 (76.53%) | loss: 2.540702 | lrm: 0.47 | dt: 642.30ms | tok/sec: 816,270 | mfu: 51.02 | epoch: 2 | total time: 137.33m | eta: 42.1m +step 12785/16704 (76.54%) | loss: 2.529600 | lrm: 0.47 | dt: 642.86ms | tok/sec: 815,561 | mfu: 50.97 | epoch: 2 | total time: 137.34m | eta: 42.1m +step 12786/16704 (76.54%) | loss: 2.532075 | lrm: 0.47 | dt: 643.09ms | tok/sec: 815,267 | mfu: 50.96 | epoch: 2 | total time: 137.35m | eta: 42.1m +step 12787/16704 (76.55%) | loss: 2.549713 | lrm: 0.47 | dt: 642.73ms | tok/sec: 815,721 | mfu: 50.98 | epoch: 2 | total time: 137.36m | eta: 42.1m +step 12788/16704 (76.56%) | loss: 2.547012 | lrm: 0.47 | dt: 642.57ms | tok/sec: 815,919 | mfu: 51.00 | epoch: 2 | total time: 137.37m | eta: 42.1m +step 12789/16704 (76.56%) | loss: 2.531768 | lrm: 0.47 
| dt: 642.15ms | tok/sec: 816,462 | mfu: 51.03 | epoch: 2 | total time: 137.38m | eta: 42.1m +step 12790/16704 (76.57%) | loss: 2.558945 | lrm: 0.47 | dt: 642.71ms | tok/sec: 815,748 | mfu: 50.99 | epoch: 2 | total time: 137.40m | eta: 42.1m +step 12791/16704 (76.57%) | loss: 2.563907 | lrm: 0.47 | dt: 641.79ms | tok/sec: 816,909 | mfu: 51.06 | epoch: 2 | total time: 137.41m | eta: 42.1m +step 12792/16704 (76.58%) | loss: 2.570324 | lrm: 0.47 | dt: 644.86ms | tok/sec: 813,029 | mfu: 50.82 | epoch: 2 | total time: 137.42m | eta: 42.1m +step 12793/16704 (76.59%) | loss: 2.562176 | lrm: 0.47 | dt: 642.55ms | tok/sec: 815,949 | mfu: 51.00 | epoch: 2 | total time: 137.43m | eta: 42.0m +step 12794/16704 (76.59%) | loss: 2.566841 | lrm: 0.47 | dt: 642.26ms | tok/sec: 816,315 | mfu: 51.02 | epoch: 2 | total time: 137.44m | eta: 42.0m +step 12795/16704 (76.60%) | loss: 2.566808 | lrm: 0.47 | dt: 643.53ms | tok/sec: 814,706 | mfu: 50.92 | epoch: 2 | total time: 137.45m | eta: 42.0m +step 12796/16704 (76.60%) | loss: 2.548909 | lrm: 0.47 | dt: 642.02ms | tok/sec: 816,623 | mfu: 51.04 | epoch: 2 | total time: 137.46m | eta: 42.0m +step 12797/16704 (76.61%) | loss: 2.540008 | lrm: 0.47 | dt: 645.23ms | tok/sec: 812,562 | mfu: 50.79 | epoch: 2 | total time: 137.47m | eta: 42.0m +step 12798/16704 (76.62%) | loss: 2.535587 | lrm: 0.47 | dt: 644.57ms | tok/sec: 813,397 | mfu: 50.84 | epoch: 2 | total time: 137.48m | eta: 42.0m +step 12799/16704 (76.62%) | loss: 2.528718 | lrm: 0.47 | dt: 642.94ms | tok/sec: 815,451 | mfu: 50.97 | epoch: 2 | total time: 137.49m | eta: 42.0m +step 12800/16704 (76.63%) | loss: 2.523034 | lrm: 0.47 | dt: 643.17ms | tok/sec: 815,167 | mfu: 50.95 | epoch: 2 | total time: 137.50m | eta: 42.0m +step 12801/16704 (76.63%) | loss: 2.525446 | lrm: 0.47 | dt: 641.52ms | tok/sec: 817,256 | mfu: 51.08 | epoch: 2 | total time: 137.51m | eta: 42.0m +step 12802/16704 (76.64%) | loss: 2.527611 | lrm: 0.47 | dt: 643.08ms | tok/sec: 815,277 | mfu: 50.96 | epoch: 2 | 
total time: 137.52m | eta: 41.9m +step 12803/16704 (76.65%) | loss: 2.519261 | lrm: 0.47 | dt: 642.60ms | tok/sec: 815,887 | mfu: 50.99 | epoch: 2 | total time: 137.53m | eta: 41.9m +step 12804/16704 (76.65%) | loss: 2.521772 | lrm: 0.47 | dt: 643.05ms | tok/sec: 815,319 | mfu: 50.96 | epoch: 2 | total time: 137.55m | eta: 41.9m +step 12805/16704 (76.66%) | loss: 2.532174 | lrm: 0.47 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | epoch: 2 | total time: 137.56m | eta: 41.9m +step 12806/16704 (76.66%) | loss: 2.532664 | lrm: 0.47 | dt: 641.71ms | tok/sec: 817,013 | mfu: 51.06 | epoch: 2 | total time: 137.57m | eta: 41.9m +step 12807/16704 (76.67%) | loss: 2.541189 | lrm: 0.47 | dt: 646.09ms | tok/sec: 811,482 | mfu: 50.72 | epoch: 2 | total time: 137.58m | eta: 41.9m +step 12808/16704 (76.68%) | loss: 2.537421 | lrm: 0.47 | dt: 643.00ms | tok/sec: 815,379 | mfu: 50.96 | epoch: 2 | total time: 137.59m | eta: 41.9m +step 12809/16704 (76.68%) | loss: 2.528626 | lrm: 0.47 | dt: 643.77ms | tok/sec: 814,402 | mfu: 50.90 | epoch: 2 | total time: 137.60m | eta: 41.9m +step 12810/16704 (76.69%) | loss: 2.525894 | lrm: 0.47 | dt: 644.99ms | tok/sec: 812,858 | mfu: 50.80 | epoch: 2 | total time: 137.61m | eta: 41.9m +step 12811/16704 (76.69%) | loss: 2.525216 | lrm: 0.47 | dt: 642.49ms | tok/sec: 816,027 | mfu: 51.00 | epoch: 2 | total time: 137.62m | eta: 41.9m +step 12812/16704 (76.70%) | loss: 2.540519 | lrm: 0.47 | dt: 642.03ms | tok/sec: 816,607 | mfu: 51.04 | epoch: 2 | total time: 137.63m | eta: 41.8m +step 12813/16704 (76.71%) | loss: 2.553395 | lrm: 0.47 | dt: 642.84ms | tok/sec: 815,583 | mfu: 50.98 | epoch: 2 | total time: 137.64m | eta: 41.8m +step 12814/16704 (76.71%) | loss: 2.547789 | lrm: 0.47 | dt: 643.07ms | tok/sec: 815,286 | mfu: 50.96 | epoch: 2 | total time: 137.65m | eta: 41.8m +step 12815/16704 (76.72%) | loss: 2.551069 | lrm: 0.47 | dt: 644.30ms | tok/sec: 813,731 | mfu: 50.86 | epoch: 2 | total time: 137.66m | eta: 41.8m +step 12816/16704 (76.72%) | 
loss: 2.553690 | lrm: 0.47 | dt: 644.33ms | tok/sec: 813,700 | mfu: 50.86 | epoch: 2 | total time: 137.67m | eta: 41.8m +step 12817/16704 (76.73%) | loss: 2.546856 | lrm: 0.47 | dt: 644.07ms | tok/sec: 814,017 | mfu: 50.88 | epoch: 2 | total time: 137.68m | eta: 41.8m +step 12818/16704 (76.74%) | loss: 2.547913 | lrm: 0.47 | dt: 643.24ms | tok/sec: 815,076 | mfu: 50.94 | epoch: 2 | total time: 137.70m | eta: 41.8m +step 12819/16704 (76.74%) | loss: 2.543058 | lrm: 0.47 | dt: 643.29ms | tok/sec: 815,013 | mfu: 50.94 | epoch: 2 | total time: 137.71m | eta: 41.8m +step 12820/16704 (76.75%) | loss: 2.548960 | lrm: 0.47 | dt: 642.13ms | tok/sec: 816,488 | mfu: 51.03 | epoch: 2 | total time: 137.72m | eta: 41.8m +step 12821/16704 (76.75%) | loss: 2.557191 | lrm: 0.46 | dt: 645.10ms | tok/sec: 812,729 | mfu: 50.80 | epoch: 2 | total time: 137.73m | eta: 41.7m +step 12822/16704 (76.76%) | loss: 2.545956 | lrm: 0.46 | dt: 644.58ms | tok/sec: 813,374 | mfu: 50.84 | epoch: 2 | total time: 137.74m | eta: 41.7m +step 12823/16704 (76.77%) | loss: 2.540525 | lrm: 0.46 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 137.75m | eta: 41.7m +step 12824/16704 (76.77%) | loss: 2.538741 | lrm: 0.46 | dt: 645.19ms | tok/sec: 812,614 | mfu: 50.79 | epoch: 2 | total time: 137.76m | eta: 41.7m +step 12825/16704 (76.78%) | loss: 2.549741 | lrm: 0.46 | dt: 642.81ms | tok/sec: 815,616 | mfu: 50.98 | epoch: 2 | total time: 137.77m | eta: 41.7m +step 12826/16704 (76.78%) | loss: 2.545548 | lrm: 0.46 | dt: 643.55ms | tok/sec: 814,682 | mfu: 50.92 | epoch: 2 | total time: 137.78m | eta: 41.7m +step 12827/16704 (76.79%) | loss: 2.547805 | lrm: 0.46 | dt: 643.08ms | tok/sec: 815,274 | mfu: 50.96 | epoch: 2 | total time: 137.79m | eta: 41.7m +step 12828/16704 (76.80%) | loss: 2.552845 | lrm: 0.46 | dt: 643.39ms | tok/sec: 814,883 | mfu: 50.93 | epoch: 2 | total time: 137.80m | eta: 41.7m +step 12829/16704 (76.80%) | loss: 2.554151 | lrm: 0.46 | dt: 643.04ms | tok/sec: 815,324 | 
mfu: 50.96 | epoch: 2 | total time: 137.81m | eta: 41.7m +step 12830/16704 (76.81%) | loss: 2.552009 | lrm: 0.46 | dt: 642.95ms | tok/sec: 815,436 | mfu: 50.97 | epoch: 2 | total time: 137.82m | eta: 41.6m +step 12831/16704 (76.81%) | loss: 2.560444 | lrm: 0.46 | dt: 644.83ms | tok/sec: 813,060 | mfu: 50.82 | epoch: 2 | total time: 137.84m | eta: 41.6m +step 12832/16704 (76.82%) | loss: 2.575299 | lrm: 0.46 | dt: 645.77ms | tok/sec: 811,882 | mfu: 50.74 | epoch: 2 | total time: 137.85m | eta: 41.6m +step 12833/16704 (76.83%) | loss: 2.570587 | lrm: 0.46 | dt: 641.05ms | tok/sec: 817,858 | mfu: 51.12 | epoch: 2 | total time: 137.86m | eta: 41.6m +step 12834/16704 (76.83%) | loss: 2.575541 | lrm: 0.46 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 2 | total time: 137.87m | eta: 41.6m +step 12835/16704 (76.84%) | loss: 2.565532 | lrm: 0.46 | dt: 646.72ms | tok/sec: 810,685 | mfu: 50.67 | epoch: 2 | total time: 137.88m | eta: 41.6m +step 12836/16704 (76.84%) | loss: 2.561212 | lrm: 0.46 | dt: 644.17ms | tok/sec: 813,895 | mfu: 50.87 | epoch: 2 | total time: 137.89m | eta: 41.6m +step 12837/16704 (76.85%) | loss: 2.558247 | lrm: 0.46 | dt: 644.52ms | tok/sec: 813,452 | mfu: 50.84 | epoch: 2 | total time: 137.90m | eta: 41.6m +step 12838/16704 (76.86%) | loss: 2.561029 | lrm: 0.46 | dt: 643.62ms | tok/sec: 814,595 | mfu: 50.91 | epoch: 2 | total time: 137.91m | eta: 41.6m +step 12839/16704 (76.86%) | loss: 2.540681 | lrm: 0.46 | dt: 644.44ms | tok/sec: 813,550 | mfu: 50.85 | epoch: 2 | total time: 137.92m | eta: 41.6m +step 12840/16704 (76.87%) | loss: 2.538058 | lrm: 0.46 | dt: 646.28ms | tok/sec: 811,245 | mfu: 50.70 | epoch: 2 | total time: 137.93m | eta: 41.5m +step 12841/16704 (76.87%) | loss: 2.537823 | lrm: 0.46 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 2 | total time: 137.94m | eta: 41.5m +step 12842/16704 (76.88%) | loss: 2.520595 | lrm: 0.46 | dt: 640.61ms | tok/sec: 818,416 | mfu: 51.15 | epoch: 2 | total time: 137.95m | eta: 41.5m +step 
12843/16704 (76.89%) | loss: 2.530951 | lrm: 0.46 | dt: 643.65ms | tok/sec: 814,549 | mfu: 50.91 | epoch: 2 | total time: 137.96m | eta: 41.5m +step 12844/16704 (76.89%) | loss: 2.542579 | lrm: 0.46 | dt: 642.62ms | tok/sec: 815,860 | mfu: 50.99 | epoch: 2 | total time: 137.97m | eta: 41.5m +step 12845/16704 (76.90%) | loss: 2.549331 | lrm: 0.46 | dt: 645.38ms | tok/sec: 812,366 | mfu: 50.77 | epoch: 2 | total time: 137.99m | eta: 41.5m +step 12846/16704 (76.90%) | loss: 2.553807 | lrm: 0.46 | dt: 644.71ms | tok/sec: 813,212 | mfu: 50.83 | epoch: 2 | total time: 138.00m | eta: 41.5m +step 12847/16704 (76.91%) | loss: 2.548033 | lrm: 0.46 | dt: 645.00ms | tok/sec: 812,846 | mfu: 50.80 | epoch: 2 | total time: 138.01m | eta: 41.5m +step 12848/16704 (76.92%) | loss: 2.540438 | lrm: 0.46 | dt: 644.90ms | tok/sec: 812,981 | mfu: 50.81 | epoch: 2 | total time: 138.02m | eta: 41.5m +step 12849/16704 (76.92%) | loss: 2.533493 | lrm: 0.46 | dt: 644.20ms | tok/sec: 813,860 | mfu: 50.87 | epoch: 2 | total time: 138.03m | eta: 41.4m +step 12850/16704 (76.93%) | loss: 2.527859 | lrm: 0.46 | dt: 645.79ms | tok/sec: 811,855 | mfu: 50.74 | epoch: 2 | total time: 138.04m | eta: 41.4m +step 12851/16704 (76.93%) | loss: 2.549772 | lrm: 0.46 | dt: 642.37ms | tok/sec: 816,172 | mfu: 51.01 | epoch: 2 | total time: 138.05m | eta: 41.4m +step 12852/16704 (76.94%) | loss: 2.548022 | lrm: 0.46 | dt: 643.27ms | tok/sec: 815,039 | mfu: 50.94 | epoch: 2 | total time: 138.06m | eta: 41.4m +step 12853/16704 (76.95%) | loss: 2.525140 | lrm: 0.46 | dt: 645.78ms | tok/sec: 811,864 | mfu: 50.74 | epoch: 2 | total time: 138.07m | eta: 41.4m +step 12854/16704 (76.95%) | loss: 2.513066 | lrm: 0.46 | dt: 642.27ms | tok/sec: 816,307 | mfu: 51.02 | epoch: 2 | total time: 138.08m | eta: 41.4m +step 12855/16704 (76.96%) | loss: 2.495021 | lrm: 0.46 | dt: 644.10ms | tok/sec: 813,981 | mfu: 50.87 | epoch: 2 | total time: 138.09m | eta: 41.4m +step 12856/16704 (76.96%) | loss: 2.502600 | lrm: 0.46 | dt: 
645.83ms | tok/sec: 811,802 | mfu: 50.74 | epoch: 2 | total time: 138.10m | eta: 41.4m +step 12857/16704 (76.97%) | loss: 2.485714 | lrm: 0.46 | dt: 642.36ms | tok/sec: 816,184 | mfu: 51.01 | epoch: 2 | total time: 138.11m | eta: 41.4m +step 12858/16704 (76.98%) | loss: 2.486194 | lrm: 0.46 | dt: 643.77ms | tok/sec: 814,403 | mfu: 50.90 | epoch: 2 | total time: 138.13m | eta: 41.3m +step 12859/16704 (76.98%) | loss: 2.495077 | lrm: 0.46 | dt: 643.85ms | tok/sec: 814,298 | mfu: 50.89 | epoch: 2 | total time: 138.14m | eta: 41.3m +step 12860/16704 (76.99%) | loss: 2.492237 | lrm: 0.46 | dt: 645.40ms | tok/sec: 812,345 | mfu: 50.77 | epoch: 2 | total time: 138.15m | eta: 41.3m +step 12861/16704 (76.99%) | loss: 2.508334 | lrm: 0.46 | dt: 645.20ms | tok/sec: 812,601 | mfu: 50.79 | epoch: 2 | total time: 138.16m | eta: 41.3m +step 12862/16704 (77.00%) | loss: 2.522018 | lrm: 0.46 | dt: 644.92ms | tok/sec: 812,950 | mfu: 50.81 | epoch: 2 | total time: 138.17m | eta: 41.3m +step 12863/16704 (77.01%) | loss: 2.508151 | lrm: 0.46 | dt: 642.96ms | tok/sec: 815,424 | mfu: 50.97 | epoch: 2 | total time: 138.18m | eta: 41.3m +step 12864/16704 (77.01%) | loss: 2.503502 | lrm: 0.46 | dt: 647.32ms | tok/sec: 809,936 | mfu: 50.62 | epoch: 2 | total time: 138.19m | eta: 41.3m +step 12865/16704 (77.02%) | loss: 2.527925 | lrm: 0.46 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 2 | total time: 138.20m | eta: 41.3m +step 12866/16704 (77.02%) | loss: 2.528278 | lrm: 0.46 | dt: 643.58ms | tok/sec: 814,645 | mfu: 50.92 | epoch: 2 | total time: 138.21m | eta: 41.3m +step 12867/16704 (77.03%) | loss: 2.535458 | lrm: 0.46 | dt: 645.09ms | tok/sec: 812,740 | mfu: 50.80 | epoch: 2 | total time: 138.22m | eta: 41.3m +step 12868/16704 (77.04%) | loss: 2.536318 | lrm: 0.46 | dt: 643.92ms | tok/sec: 814,218 | mfu: 50.89 | epoch: 2 | total time: 138.23m | eta: 41.2m +step 12869/16704 (77.04%) | loss: 2.535189 | lrm: 0.46 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 2 | total 
time: 138.24m | eta: 41.2m +step 12870/16704 (77.05%) | loss: 2.533659 | lrm: 0.46 | dt: 644.55ms | tok/sec: 813,414 | mfu: 50.84 | epoch: 2 | total time: 138.25m | eta: 41.2m +step 12871/16704 (77.05%) | loss: 2.530039 | lrm: 0.46 | dt: 644.65ms | tok/sec: 813,293 | mfu: 50.83 | epoch: 2 | total time: 138.26m | eta: 41.2m +step 12872/16704 (77.06%) | loss: 2.523115 | lrm: 0.46 | dt: 645.21ms | tok/sec: 812,590 | mfu: 50.79 | epoch: 2 | total time: 138.28m | eta: 41.2m +step 12873/16704 (77.07%) | loss: 2.517418 | lrm: 0.46 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 2 | total time: 138.29m | eta: 41.2m +step 12874/16704 (77.07%) | loss: 2.522260 | lrm: 0.46 | dt: 643.16ms | tok/sec: 815,170 | mfu: 50.95 | epoch: 2 | total time: 138.30m | eta: 41.2m +step 12875/16704 (77.08%) | loss: 2.538228 | lrm: 0.46 | dt: 643.33ms | tok/sec: 814,959 | mfu: 50.94 | epoch: 2 | total time: 138.31m | eta: 41.2m +step 12876/16704 (77.08%) | loss: 2.531043 | lrm: 0.46 | dt: 645.86ms | tok/sec: 811,765 | mfu: 50.74 | epoch: 2 | total time: 138.32m | eta: 41.2m +step 12877/16704 (77.09%) | loss: 2.523550 | lrm: 0.46 | dt: 644.39ms | tok/sec: 813,619 | mfu: 50.85 | epoch: 2 | total time: 138.33m | eta: 41.1m +step 12878/16704 (77.10%) | loss: 2.515156 | lrm: 0.46 | dt: 643.42ms | tok/sec: 814,847 | mfu: 50.93 | epoch: 2 | total time: 138.34m | eta: 41.1m +step 12879/16704 (77.10%) | loss: 2.524649 | lrm: 0.46 | dt: 644.69ms | tok/sec: 813,235 | mfu: 50.83 | epoch: 2 | total time: 138.35m | eta: 41.1m +step 12880/16704 (77.11%) | loss: 2.521195 | lrm: 0.46 | dt: 643.64ms | tok/sec: 814,563 | mfu: 50.91 | epoch: 2 | total time: 138.36m | eta: 41.1m +step 12881/16704 (77.11%) | loss: 2.513361 | lrm: 0.46 | dt: 644.69ms | tok/sec: 813,234 | mfu: 50.83 | epoch: 2 | total time: 138.37m | eta: 41.1m +step 12882/16704 (77.12%) | loss: 2.520284 | lrm: 0.46 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 2 | total time: 138.38m | eta: 41.1m +step 12883/16704 (77.13%) | loss: 
2.518368 | lrm: 0.46 | dt: 643.99ms | tok/sec: 814,122 | mfu: 50.88 | epoch: 2 | total time: 138.39m | eta: 41.1m +step 12884/16704 (77.13%) | loss: 2.520084 | lrm: 0.46 | dt: 643.27ms | tok/sec: 815,040 | mfu: 50.94 | epoch: 2 | total time: 138.40m | eta: 41.1m +step 12885/16704 (77.14%) | loss: 2.515746 | lrm: 0.46 | dt: 645.52ms | tok/sec: 812,193 | mfu: 50.76 | epoch: 2 | total time: 138.42m | eta: 41.1m +step 12886/16704 (77.14%) | loss: 2.514125 | lrm: 0.46 | dt: 643.53ms | tok/sec: 814,702 | mfu: 50.92 | epoch: 2 | total time: 138.43m | eta: 41.0m +step 12887/16704 (77.15%) | loss: 2.521564 | lrm: 0.46 | dt: 643.66ms | tok/sec: 814,546 | mfu: 50.91 | epoch: 2 | total time: 138.44m | eta: 41.0m +step 12888/16704 (77.16%) | loss: 2.516612 | lrm: 0.46 | dt: 643.84ms | tok/sec: 814,314 | mfu: 50.90 | epoch: 2 | total time: 138.45m | eta: 41.0m +step 12889/16704 (77.16%) | loss: 2.509948 | lrm: 0.46 | dt: 643.62ms | tok/sec: 814,588 | mfu: 50.91 | epoch: 2 | total time: 138.46m | eta: 41.0m +step 12890/16704 (77.17%) | loss: 2.501521 | lrm: 0.46 | dt: 646.80ms | tok/sec: 810,591 | mfu: 50.66 | epoch: 2 | total time: 138.47m | eta: 41.0m +step 12891/16704 (77.17%) | loss: 2.512418 | lrm: 0.46 | dt: 642.39ms | tok/sec: 816,157 | mfu: 51.01 | epoch: 2 | total time: 138.48m | eta: 41.0m +step 12892/16704 (77.18%) | loss: 2.513903 | lrm: 0.46 | dt: 642.27ms | tok/sec: 816,299 | mfu: 51.02 | epoch: 2 | total time: 138.49m | eta: 41.0m +step 12893/16704 (77.19%) | loss: 2.511933 | lrm: 0.46 | dt: 645.61ms | tok/sec: 812,079 | mfu: 50.76 | epoch: 2 | total time: 138.50m | eta: 41.0m +step 12894/16704 (77.19%) | loss: 2.514740 | lrm: 0.46 | dt: 645.72ms | tok/sec: 811,940 | mfu: 50.75 | epoch: 2 | total time: 138.51m | eta: 41.0m +step 12895/16704 (77.20%) | loss: 2.519305 | lrm: 0.46 | dt: 646.51ms | tok/sec: 810,956 | mfu: 50.69 | epoch: 2 | total time: 138.52m | eta: 40.9m +step 12896/16704 (77.20%) | loss: 2.535743 | lrm: 0.46 | dt: 642.23ms | tok/sec: 816,361 | mfu: 
51.02 | epoch: 2 | total time: 138.53m | eta: 40.9m +step 12897/16704 (77.21%) | loss: 2.533965 | lrm: 0.46 | dt: 642.51ms | tok/sec: 816,002 | mfu: 51.00 | epoch: 2 | total time: 138.54m | eta: 40.9m +step 12898/16704 (77.22%) | loss: 2.536601 | lrm: 0.46 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 2 | total time: 138.55m | eta: 40.9m +step 12899/16704 (77.22%) | loss: 2.543721 | lrm: 0.46 | dt: 643.20ms | tok/sec: 815,130 | mfu: 50.95 | epoch: 2 | total time: 138.57m | eta: 40.9m +step 12900/16704 (77.23%) | loss: 2.516819 | lrm: 0.46 | dt: 645.38ms | tok/sec: 812,375 | mfu: 50.77 | epoch: 2 | total time: 138.58m | eta: 40.9m +step 12901/16704 (77.23%) | loss: 2.507507 | lrm: 0.46 | dt: 643.51ms | tok/sec: 814,737 | mfu: 50.92 | epoch: 2 | total time: 138.59m | eta: 40.9m +step 12902/16704 (77.24%) | loss: 2.510374 | lrm: 0.46 | dt: 645.84ms | tok/sec: 811,787 | mfu: 50.74 | epoch: 2 | total time: 138.60m | eta: 40.9m +step 12903/16704 (77.24%) | loss: 2.504727 | lrm: 0.46 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | epoch: 2 | total time: 138.61m | eta: 40.9m +step 12904/16704 (77.25%) | loss: 2.520072 | lrm: 0.45 | dt: 644.47ms | tok/sec: 813,512 | mfu: 50.85 | epoch: 2 | total time: 138.62m | eta: 40.9m +step 12905/16704 (77.26%) | loss: 2.516843 | lrm: 0.45 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | epoch: 2 | total time: 138.63m | eta: 40.8m +step 12906/16704 (77.26%) | loss: 2.519528 | lrm: 0.45 | dt: 644.81ms | tok/sec: 813,090 | mfu: 50.82 | epoch: 2 | total time: 138.64m | eta: 40.8m +step 12907/16704 (77.27%) | loss: 2.522859 | lrm: 0.45 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 2 | total time: 138.65m | eta: 40.8m +step 12908/16704 (77.27%) | loss: 2.522752 | lrm: 0.45 | dt: 642.49ms | tok/sec: 816,023 | mfu: 51.00 | epoch: 2 | total time: 138.66m | eta: 40.8m +step 12909/16704 (77.28%) | loss: 2.523147 | lrm: 0.45 | dt: 644.73ms | tok/sec: 813,184 | mfu: 50.83 | epoch: 2 | total time: 138.67m | eta: 40.8m +step 
12910/16704 (77.29%) | loss: 2.527578 | lrm: 0.45 | dt: 644.27ms | tok/sec: 813,765 | mfu: 50.86 | epoch: 2 | total time: 138.68m | eta: 40.8m +step 12911/16704 (77.29%) | loss: 2.525931 | lrm: 0.45 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 2 | total time: 138.69m | eta: 40.8m +step 12912/16704 (77.30%) | loss: 2.517954 | lrm: 0.45 | dt: 645.99ms | tok/sec: 811,607 | mfu: 50.73 | epoch: 2 | total time: 138.70m | eta: 40.8m +step 12913/16704 (77.30%) | loss: 2.514797 | lrm: 0.45 | dt: 645.35ms | tok/sec: 812,405 | mfu: 50.78 | epoch: 2 | total time: 138.72m | eta: 40.8m +step 12914/16704 (77.31%) | loss: 2.500808 | lrm: 0.45 | dt: 642.80ms | tok/sec: 815,634 | mfu: 50.98 | epoch: 2 | total time: 138.73m | eta: 40.7m +step 12915/16704 (77.32%) | loss: 2.511356 | lrm: 0.45 | dt: 643.79ms | tok/sec: 814,378 | mfu: 50.90 | epoch: 2 | total time: 138.74m | eta: 40.7m +step 12916/16704 (77.32%) | loss: 2.507376 | lrm: 0.45 | dt: 645.63ms | tok/sec: 812,055 | mfu: 50.75 | epoch: 2 | total time: 138.75m | eta: 40.7m +step 12917/16704 (77.33%) | loss: 2.500323 | lrm: 0.45 | dt: 644.84ms | tok/sec: 813,055 | mfu: 50.82 | epoch: 2 | total time: 138.76m | eta: 40.7m +step 12918/16704 (77.33%) | loss: 2.502856 | lrm: 0.45 | dt: 642.93ms | tok/sec: 815,461 | mfu: 50.97 | epoch: 2 | total time: 138.77m | eta: 40.7m +step 12919/16704 (77.34%) | loss: 2.506973 | lrm: 0.45 | dt: 645.34ms | tok/sec: 812,427 | mfu: 50.78 | epoch: 2 | total time: 138.78m | eta: 40.7m +step 12920/16704 (77.35%) | loss: 2.502888 | lrm: 0.45 | dt: 642.79ms | tok/sec: 815,639 | mfu: 50.98 | epoch: 2 | total time: 138.79m | eta: 40.7m +step 12921/16704 (77.35%) | loss: 2.512437 | lrm: 0.45 | dt: 645.44ms | tok/sec: 812,293 | mfu: 50.77 | epoch: 2 | total time: 138.80m | eta: 40.7m +step 12922/16704 (77.36%) | loss: 2.511475 | lrm: 0.45 | dt: 646.76ms | tok/sec: 810,638 | mfu: 50.67 | epoch: 2 | total time: 138.81m | eta: 40.7m +step 12923/16704 (77.36%) | loss: 2.505614 | lrm: 0.45 | dt: 
642.88ms | tok/sec: 815,529 | mfu: 50.97 | epoch: 2 | total time: 138.82m | eta: 40.6m +step 12924/16704 (77.37%) | loss: 2.507784 | lrm: 0.45 | dt: 647.02ms | tok/sec: 810,310 | mfu: 50.65 | epoch: 2 | total time: 138.83m | eta: 40.6m +step 12925/16704 (77.38%) | loss: 2.509621 | lrm: 0.45 | dt: 641.73ms | tok/sec: 816,985 | mfu: 51.06 | epoch: 2 | total time: 138.84m | eta: 40.6m +step 12926/16704 (77.38%) | loss: 2.500363 | lrm: 0.45 | dt: 642.42ms | tok/sec: 816,112 | mfu: 51.01 | epoch: 2 | total time: 138.86m | eta: 40.6m +step 12927/16704 (77.39%) | loss: 2.500493 | lrm: 0.45 | dt: 645.52ms | tok/sec: 812,190 | mfu: 50.76 | epoch: 2 | total time: 138.87m | eta: 40.6m +step 12928/16704 (77.39%) | loss: 2.509365 | lrm: 0.45 | dt: 642.16ms | tok/sec: 816,439 | mfu: 51.03 | epoch: 2 | total time: 138.88m | eta: 40.6m +step 12929/16704 (77.40%) | loss: 2.500590 | lrm: 0.45 | dt: 644.79ms | tok/sec: 813,117 | mfu: 50.82 | epoch: 2 | total time: 138.89m | eta: 40.6m +step 12930/16704 (77.41%) | loss: 2.494312 | lrm: 0.45 | dt: 644.85ms | tok/sec: 813,037 | mfu: 50.82 | epoch: 2 | total time: 138.90m | eta: 40.6m +step 12931/16704 (77.41%) | loss: 2.502843 | lrm: 0.45 | dt: 644.01ms | tok/sec: 814,102 | mfu: 50.88 | epoch: 2 | total time: 138.91m | eta: 40.6m +step 12932/16704 (77.42%) | loss: 2.499810 | lrm: 0.45 | dt: 644.53ms | tok/sec: 813,443 | mfu: 50.84 | epoch: 2 | total time: 138.92m | eta: 40.6m +step 12933/16704 (77.42%) | loss: 2.507102 | lrm: 0.45 | dt: 641.37ms | tok/sec: 817,448 | mfu: 51.09 | epoch: 2 | total time: 138.93m | eta: 40.5m +step 12934/16704 (77.43%) | loss: 2.516342 | lrm: 0.45 | dt: 644.81ms | tok/sec: 813,082 | mfu: 50.82 | epoch: 2 | total time: 138.94m | eta: 40.5m +step 12935/16704 (77.44%) | loss: 2.524799 | lrm: 0.45 | dt: 642.45ms | tok/sec: 816,075 | mfu: 51.01 | epoch: 2 | total time: 138.95m | eta: 40.5m +step 12936/16704 (77.44%) | loss: 2.538568 | lrm: 0.45 | dt: 643.53ms | tok/sec: 814,702 | mfu: 50.92 | epoch: 2 | total 
time: 138.96m | eta: 40.5m +step 12937/16704 (77.45%) | loss: 2.547466 | lrm: 0.45 | dt: 643.48ms | tok/sec: 814,766 | mfu: 50.92 | epoch: 2 | total time: 138.97m | eta: 40.5m +step 12938/16704 (77.45%) | loss: 2.536159 | lrm: 0.45 | dt: 644.92ms | tok/sec: 812,947 | mfu: 50.81 | epoch: 2 | total time: 138.98m | eta: 40.5m +step 12939/16704 (77.46%) | loss: 2.531715 | lrm: 0.45 | dt: 642.72ms | tok/sec: 815,734 | mfu: 50.98 | epoch: 2 | total time: 138.99m | eta: 40.5m +step 12940/16704 (77.47%) | loss: 2.542624 | lrm: 0.45 | dt: 645.20ms | tok/sec: 812,594 | mfu: 50.79 | epoch: 2 | total time: 139.01m | eta: 40.5m +step 12941/16704 (77.47%) | loss: 2.533334 | lrm: 0.45 | dt: 644.31ms | tok/sec: 813,714 | mfu: 50.86 | epoch: 2 | total time: 139.02m | eta: 40.5m +step 12942/16704 (77.48%) | loss: 2.540445 | lrm: 0.45 | dt: 644.88ms | tok/sec: 812,997 | mfu: 50.81 | epoch: 2 | total time: 139.03m | eta: 40.4m +step 12943/16704 (77.48%) | loss: 2.550624 | lrm: 0.45 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 2 | total time: 139.04m | eta: 40.4m +step 12944/16704 (77.49%) | loss: 2.551916 | lrm: 0.45 | dt: 642.45ms | tok/sec: 816,073 | mfu: 51.01 | epoch: 2 | total time: 139.05m | eta: 40.4m +step 12945/16704 (77.50%) | loss: 2.545085 | lrm: 0.45 | dt: 646.46ms | tok/sec: 811,010 | mfu: 50.69 | epoch: 2 | total time: 139.06m | eta: 40.4m +step 12946/16704 (77.50%) | loss: 2.534306 | lrm: 0.45 | dt: 641.65ms | tok/sec: 817,098 | mfu: 51.07 | epoch: 2 | total time: 139.07m | eta: 40.4m +step 12947/16704 (77.51%) | loss: 2.526670 | lrm: 0.45 | dt: 647.23ms | tok/sec: 810,044 | mfu: 50.63 | epoch: 2 | total time: 139.08m | eta: 40.4m +step 12948/16704 (77.51%) | loss: 2.535102 | lrm: 0.45 | dt: 642.28ms | tok/sec: 816,296 | mfu: 51.02 | epoch: 2 | total time: 139.09m | eta: 40.4m +step 12949/16704 (77.52%) | loss: 2.532139 | lrm: 0.45 | dt: 642.88ms | tok/sec: 815,528 | mfu: 50.97 | epoch: 2 | total time: 139.10m | eta: 40.4m +step 12950/16704 (77.53%) | loss: 
2.533623 | lrm: 0.45 | dt: 644.64ms | tok/sec: 813,297 | mfu: 50.83 | epoch: 2 | total time: 139.11m | eta: 40.4m +step 12951/16704 (77.53%) | loss: 2.529591 | lrm: 0.45 | dt: 642.54ms | tok/sec: 815,962 | mfu: 51.00 | epoch: 2 | total time: 139.12m | eta: 40.3m +step 12952/16704 (77.54%) | loss: 2.535174 | lrm: 0.45 | dt: 642.57ms | tok/sec: 815,923 | mfu: 51.00 | epoch: 2 | total time: 139.13m | eta: 40.3m +step 12953/16704 (77.54%) | loss: 2.528025 | lrm: 0.45 | dt: 643.09ms | tok/sec: 815,266 | mfu: 50.96 | epoch: 2 | total time: 139.15m | eta: 40.3m +step 12954/16704 (77.55%) | loss: 2.519996 | lrm: 0.45 | dt: 643.02ms | tok/sec: 815,350 | mfu: 50.96 | epoch: 2 | total time: 139.16m | eta: 40.3m +step 12955/16704 (77.56%) | loss: 2.535772 | lrm: 0.45 | dt: 644.11ms | tok/sec: 813,968 | mfu: 50.87 | epoch: 2 | total time: 139.17m | eta: 40.3m +step 12956/16704 (77.56%) | loss: 2.538471 | lrm: 0.45 | dt: 642.78ms | tok/sec: 815,651 | mfu: 50.98 | epoch: 2 | total time: 139.18m | eta: 40.3m +step 12957/16704 (77.57%) | loss: 2.534020 | lrm: 0.45 | dt: 641.18ms | tok/sec: 817,696 | mfu: 51.11 | epoch: 2 | total time: 139.19m | eta: 40.3m +step 12958/16704 (77.57%) | loss: 2.535629 | lrm: 0.45 | dt: 642.25ms | tok/sec: 816,329 | mfu: 51.02 | epoch: 2 | total time: 139.20m | eta: 40.3m +step 12959/16704 (77.58%) | loss: 2.529726 | lrm: 0.45 | dt: 643.03ms | tok/sec: 815,343 | mfu: 50.96 | epoch: 2 | total time: 139.21m | eta: 40.3m +step 12960/16704 (77.59%) | loss: 2.525478 | lrm: 0.45 | dt: 643.65ms | tok/sec: 814,550 | mfu: 50.91 | epoch: 2 | total time: 139.22m | eta: 40.3m +step 12961/16704 (77.59%) | loss: 2.532680 | lrm: 0.45 | dt: 641.12ms | tok/sec: 817,772 | mfu: 51.11 | epoch: 2 | total time: 139.23m | eta: 40.2m +step 12962/16704 (77.60%) | loss: 2.525112 | lrm: 0.45 | dt: 643.62ms | tok/sec: 814,598 | mfu: 50.91 | epoch: 2 | total time: 139.24m | eta: 40.2m +step 12963/16704 (77.60%) | loss: 2.534061 | lrm: 0.45 | dt: 644.16ms | tok/sec: 813,906 | mfu: 
50.87 | epoch: 2 | total time: 139.25m | eta: 40.2m +step 12964/16704 (77.61%) | loss: 2.534092 | lrm: 0.45 | dt: 642.52ms | tok/sec: 815,982 | mfu: 51.00 | epoch: 2 | total time: 139.26m | eta: 40.2m +step 12965/16704 (77.62%) | loss: 2.533050 | lrm: 0.45 | dt: 642.75ms | tok/sec: 815,698 | mfu: 50.98 | epoch: 2 | total time: 139.27m | eta: 40.2m +step 12966/16704 (77.62%) | loss: 2.534230 | lrm: 0.45 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 2 | total time: 139.28m | eta: 40.2m +step 12967/16704 (77.63%) | loss: 2.538328 | lrm: 0.45 | dt: 644.44ms | tok/sec: 813,554 | mfu: 50.85 | epoch: 2 | total time: 139.30m | eta: 40.2m +step 12968/16704 (77.63%) | loss: 2.530311 | lrm: 0.45 | dt: 643.94ms | tok/sec: 814,183 | mfu: 50.89 | epoch: 2 | total time: 139.31m | eta: 40.2m +step 12969/16704 (77.64%) | loss: 2.530503 | lrm: 0.45 | dt: 642.40ms | tok/sec: 816,136 | mfu: 51.01 | epoch: 2 | total time: 139.32m | eta: 40.2m +step 12970/16704 (77.65%) | loss: 2.519475 | lrm: 0.45 | dt: 645.33ms | tok/sec: 812,429 | mfu: 50.78 | epoch: 2 | total time: 139.33m | eta: 40.1m +step 12971/16704 (77.65%) | loss: 2.517038 | lrm: 0.45 | dt: 642.17ms | tok/sec: 816,433 | mfu: 51.03 | epoch: 2 | total time: 139.34m | eta: 40.1m +step 12972/16704 (77.66%) | loss: 2.519879 | lrm: 0.45 | dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 2 | total time: 139.35m | eta: 40.1m +step 12973/16704 (77.66%) | loss: 2.520704 | lrm: 0.45 | dt: 645.49ms | tok/sec: 812,230 | mfu: 50.77 | epoch: 2 | total time: 139.36m | eta: 40.1m +step 12974/16704 (77.67%) | loss: 2.524969 | lrm: 0.45 | dt: 644.38ms | tok/sec: 813,627 | mfu: 50.85 | epoch: 2 | total time: 139.37m | eta: 40.1m +step 12975/16704 (77.68%) | loss: 2.537383 | lrm: 0.45 | dt: 645.88ms | tok/sec: 811,740 | mfu: 50.73 | epoch: 2 | total time: 139.38m | eta: 40.1m +step 12976/16704 (77.68%) | loss: 2.552682 | lrm: 0.45 | dt: 642.19ms | tok/sec: 816,400 | mfu: 51.03 | epoch: 2 | total time: 139.39m | eta: 40.1m +step 
12977/16704 (77.69%) | loss: 2.551927 | lrm: 0.45 | dt: 644.90ms | tok/sec: 812,977 | mfu: 50.81 | epoch: 2 | total time: 139.40m | eta: 40.1m +step 12978/16704 (77.69%) | loss: 2.558289 | lrm: 0.45 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 2 | total time: 139.41m | eta: 40.1m +step 12979/16704 (77.70%) | loss: 2.561264 | lrm: 0.45 | dt: 644.26ms | tok/sec: 813,778 | mfu: 50.86 | epoch: 2 | total time: 139.42m | eta: 40.0m +step 12980/16704 (77.71%) | loss: 2.568182 | lrm: 0.45 | dt: 645.29ms | tok/sec: 812,486 | mfu: 50.78 | epoch: 2 | total time: 139.43m | eta: 40.0m +step 12981/16704 (77.71%) | loss: 2.562957 | lrm: 0.45 | dt: 641.90ms | tok/sec: 816,778 | mfu: 51.05 | epoch: 2 | total time: 139.45m | eta: 40.0m +step 12982/16704 (77.72%) | loss: 2.557283 | lrm: 0.45 | dt: 644.20ms | tok/sec: 813,854 | mfu: 50.87 | epoch: 2 | total time: 139.46m | eta: 40.0m +step 12983/16704 (77.72%) | loss: 2.568733 | lrm: 0.45 | dt: 645.72ms | tok/sec: 811,943 | mfu: 50.75 | epoch: 2 | total time: 139.47m | eta: 40.0m +step 12984/16704 (77.73%) | loss: 2.559668 | lrm: 0.45 | dt: 641.80ms | tok/sec: 816,897 | mfu: 51.06 | epoch: 2 | total time: 139.48m | eta: 40.0m +step 12985/16704 (77.74%) | loss: 2.552881 | lrm: 0.45 | dt: 644.01ms | tok/sec: 814,094 | mfu: 50.88 | epoch: 2 | total time: 139.49m | eta: 40.0m +step 12986/16704 (77.74%) | loss: 2.557130 | lrm: 0.45 | dt: 645.28ms | tok/sec: 812,501 | mfu: 50.78 | epoch: 2 | total time: 139.50m | eta: 40.0m +step 12987/16704 (77.75%) | loss: 2.555470 | lrm: 0.45 | dt: 643.38ms | tok/sec: 814,896 | mfu: 50.93 | epoch: 2 | total time: 139.51m | eta: 40.0m +step 12988/16704 (77.75%) | loss: 2.543824 | lrm: 0.44 | dt: 644.22ms | tok/sec: 813,832 | mfu: 50.87 | epoch: 2 | total time: 139.52m | eta: 39.9m +step 12989/16704 (77.76%) | loss: 2.540056 | lrm: 0.44 | dt: 643.55ms | tok/sec: 814,679 | mfu: 50.92 | epoch: 2 | total time: 139.53m | eta: 39.9m +step 12990/16704 (77.77%) | loss: 2.554933 | lrm: 0.44 | dt: 
643.43ms | tok/sec: 814,831 | mfu: 50.93 | epoch: 2 | total time: 139.54m | eta: 39.9m +step 12991/16704 (77.77%) | loss: 2.560875 | lrm: 0.44 | dt: 643.25ms | tok/sec: 815,060 | mfu: 50.94 | epoch: 2 | total time: 139.55m | eta: 39.9m +step 12992/16704 (77.78%) | loss: 2.556099 | lrm: 0.44 | dt: 644.03ms | tok/sec: 814,074 | mfu: 50.88 | epoch: 2 | total time: 139.56m | eta: 39.9m +step 12993/16704 (77.78%) | loss: 2.560276 | lrm: 0.44 | dt: 644.28ms | tok/sec: 813,754 | mfu: 50.86 | epoch: 2 | total time: 139.57m | eta: 39.9m +step 12994/16704 (77.79%) | loss: 2.564704 | lrm: 0.44 | dt: 644.78ms | tok/sec: 813,132 | mfu: 50.82 | epoch: 2 | total time: 139.58m | eta: 39.9m +step 12995/16704 (77.80%) | loss: 2.561363 | lrm: 0.44 | dt: 645.11ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 2 | total time: 139.60m | eta: 39.9m +step 12996/16704 (77.80%) | loss: 2.563557 | lrm: 0.44 | dt: 642.54ms | tok/sec: 815,965 | mfu: 51.00 | epoch: 2 | total time: 139.61m | eta: 39.9m +step 12997/16704 (77.81%) | loss: 2.557451 | lrm: 0.44 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 2 | total time: 139.62m | eta: 39.9m +step 12998/16704 (77.81%) | loss: 2.558719 | lrm: 0.44 | dt: 645.33ms | tok/sec: 812,440 | mfu: 50.78 | epoch: 2 | total time: 139.63m | eta: 39.8m +step 12999/16704 (77.82%) | loss: 2.545784 | lrm: 0.44 | dt: 642.47ms | tok/sec: 816,055 | mfu: 51.00 | epoch: 2 | total time: 139.64m | eta: 39.8m +Step 13000 | Validation bpb: 0.778009 +step 13000/16704 (77.83%) | loss: 2.541194 | lrm: 0.44 | dt: 644.53ms | tok/sec: 813,445 | mfu: 50.84 | epoch: 2 | total time: 139.65m | eta: 39.8m +step 13001/16704 (77.83%) | loss: 2.535927 | lrm: 0.44 | dt: 649.73ms | tok/sec: 806,937 | mfu: 50.43 | epoch: 2 | total time: 139.66m | eta: 39.8m +step 13002/16704 (77.84%) | loss: 2.529191 | lrm: 0.44 | dt: 642.11ms | tok/sec: 816,512 | mfu: 51.03 | epoch: 2 | total time: 139.67m | eta: 39.8m +step 13003/16704 (77.84%) | loss: 2.543715 | lrm: 0.44 | dt: 645.72ms | tok/sec: 
811,949 | mfu: 50.75 | epoch: 2 | total time: 139.68m | eta: 39.8m +step 13004/16704 (77.85%) | loss: 2.553886 | lrm: 0.44 | dt: 650.10ms | tok/sec: 806,468 | mfu: 50.41 | epoch: 2 | total time: 139.69m | eta: 39.8m +step 13005/16704 (77.86%) | loss: 2.553859 | lrm: 0.44 | dt: 641.10ms | tok/sec: 817,793 | mfu: 51.11 | epoch: 2 | total time: 139.70m | eta: 39.8m +step 13006/16704 (77.86%) | loss: 2.558387 | lrm: 0.44 | dt: 645.33ms | tok/sec: 812,438 | mfu: 50.78 | epoch: 2 | total time: 139.71m | eta: 39.8m +step 13007/16704 (77.87%) | loss: 2.551751 | lrm: 0.44 | dt: 644.11ms | tok/sec: 813,971 | mfu: 50.87 | epoch: 2 | total time: 139.72m | eta: 39.7m +step 13008/16704 (77.87%) | loss: 2.541449 | lrm: 0.44 | dt: 642.77ms | tok/sec: 815,669 | mfu: 50.98 | epoch: 2 | total time: 139.74m | eta: 39.7m +step 13009/16704 (77.88%) | loss: 2.531826 | lrm: 0.44 | dt: 644.93ms | tok/sec: 812,934 | mfu: 50.81 | epoch: 2 | total time: 139.75m | eta: 39.7m +step 13010/16704 (77.89%) | loss: 2.536455 | lrm: 0.44 | dt: 643.86ms | tok/sec: 814,285 | mfu: 50.89 | epoch: 2 | total time: 139.76m | eta: 39.7m +step 13011/16704 (77.89%) | loss: 2.523736 | lrm: 0.44 | dt: 643.51ms | tok/sec: 814,734 | mfu: 50.92 | epoch: 2 | total time: 139.77m | eta: 39.7m +step 13012/16704 (77.90%) | loss: 2.519107 | lrm: 0.44 | dt: 645.23ms | tok/sec: 812,558 | mfu: 50.79 | epoch: 2 | total time: 139.78m | eta: 39.7m +step 13013/16704 (77.90%) | loss: 2.516314 | lrm: 0.44 | dt: 643.00ms | tok/sec: 815,383 | mfu: 50.96 | epoch: 2 | total time: 139.79m | eta: 39.7m +step 13014/16704 (77.91%) | loss: 2.507864 | lrm: 0.44 | dt: 643.28ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 139.80m | eta: 39.7m +step 13015/16704 (77.92%) | loss: 2.507375 | lrm: 0.44 | dt: 644.85ms | tok/sec: 813,042 | mfu: 50.82 | epoch: 2 | total time: 139.81m | eta: 39.7m +step 13016/16704 (77.92%) | loss: 2.504477 | lrm: 0.44 | dt: 643.61ms | tok/sec: 814,605 | mfu: 50.91 | epoch: 2 | total time: 139.82m | eta: 
39.6m +step 13017/16704 (77.93%) | loss: 2.499040 | lrm: 0.44 | dt: 643.88ms | tok/sec: 814,260 | mfu: 50.89 | epoch: 2 | total time: 139.83m | eta: 39.6m +step 13018/16704 (77.93%) | loss: 2.502319 | lrm: 0.44 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 2 | total time: 139.84m | eta: 39.6m +step 13019/16704 (77.94%) | loss: 2.515007 | lrm: 0.44 | dt: 646.25ms | tok/sec: 811,282 | mfu: 50.71 | epoch: 2 | total time: 139.85m | eta: 39.6m +step 13020/16704 (77.95%) | loss: 2.513753 | lrm: 0.44 | dt: 643.35ms | tok/sec: 814,937 | mfu: 50.93 | epoch: 2 | total time: 139.86m | eta: 39.6m +step 13021/16704 (77.95%) | loss: 2.520949 | lrm: 0.44 | dt: 644.76ms | tok/sec: 813,148 | mfu: 50.82 | epoch: 2 | total time: 139.87m | eta: 39.6m +step 13022/16704 (77.96%) | loss: 2.538212 | lrm: 0.44 | dt: 643.64ms | tok/sec: 814,571 | mfu: 50.91 | epoch: 2 | total time: 139.89m | eta: 39.6m +step 13023/16704 (77.96%) | loss: 2.537603 | lrm: 0.44 | dt: 644.33ms | tok/sec: 813,690 | mfu: 50.86 | epoch: 2 | total time: 139.90m | eta: 39.6m +step 13024/16704 (77.97%) | loss: 2.546419 | lrm: 0.44 | dt: 643.67ms | tok/sec: 814,528 | mfu: 50.91 | epoch: 2 | total time: 139.91m | eta: 39.6m +step 13025/16704 (77.98%) | loss: 2.535177 | lrm: 0.44 | dt: 644.82ms | tok/sec: 813,077 | mfu: 50.82 | epoch: 2 | total time: 139.92m | eta: 39.6m +step 13026/16704 (77.98%) | loss: 2.545177 | lrm: 0.44 | dt: 645.91ms | tok/sec: 811,703 | mfu: 50.73 | epoch: 2 | total time: 139.93m | eta: 39.5m +step 13027/16704 (77.99%) | loss: 2.533894 | lrm: 0.44 | dt: 642.05ms | tok/sec: 816,590 | mfu: 51.04 | epoch: 2 | total time: 139.94m | eta: 39.5m +step 13028/16704 (77.99%) | loss: 2.539933 | lrm: 0.44 | dt: 647.06ms | tok/sec: 810,263 | mfu: 50.64 | epoch: 2 | total time: 139.95m | eta: 39.5m +step 13029/16704 (78.00%) | loss: 2.536729 | lrm: 0.44 | dt: 644.20ms | tok/sec: 813,854 | mfu: 50.87 | epoch: 2 | total time: 139.96m | eta: 39.5m +step 13030/16704 (78.01%) | loss: 2.527636 | lrm: 0.44 
| dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 2 | total time: 139.97m | eta: 39.5m +step 13031/16704 (78.01%) | loss: 2.527773 | lrm: 0.44 | dt: 645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 2 | total time: 139.98m | eta: 39.5m +step 13032/16704 (78.02%) | loss: 2.538003 | lrm: 0.44 | dt: 643.05ms | tok/sec: 815,308 | mfu: 50.96 | epoch: 2 | total time: 139.99m | eta: 39.5m +step 13033/16704 (78.02%) | loss: 2.543417 | lrm: 0.44 | dt: 645.01ms | tok/sec: 812,842 | mfu: 50.80 | epoch: 2 | total time: 140.00m | eta: 39.5m +step 13034/16704 (78.03%) | loss: 2.545570 | lrm: 0.44 | dt: 643.43ms | tok/sec: 814,832 | mfu: 50.93 | epoch: 2 | total time: 140.01m | eta: 39.5m +step 13035/16704 (78.04%) | loss: 2.544611 | lrm: 0.44 | dt: 647.47ms | tok/sec: 809,743 | mfu: 50.61 | epoch: 2 | total time: 140.03m | eta: 39.4m +step 13036/16704 (78.04%) | loss: 2.548797 | lrm: 0.44 | dt: 645.76ms | tok/sec: 811,889 | mfu: 50.74 | epoch: 2 | total time: 140.04m | eta: 39.4m +step 13037/16704 (78.05%) | loss: 2.550658 | lrm: 0.44 | dt: 645.37ms | tok/sec: 812,387 | mfu: 50.78 | epoch: 2 | total time: 140.05m | eta: 39.4m +step 13038/16704 (78.05%) | loss: 2.550216 | lrm: 0.44 | dt: 647.52ms | tok/sec: 809,689 | mfu: 50.61 | epoch: 2 | total time: 140.06m | eta: 39.4m +step 13039/16704 (78.06%) | loss: 2.549724 | lrm: 0.44 | dt: 644.37ms | tok/sec: 813,649 | mfu: 50.85 | epoch: 2 | total time: 140.07m | eta: 39.4m +step 13040/16704 (78.07%) | loss: 2.550427 | lrm: 0.44 | dt: 646.65ms | tok/sec: 810,770 | mfu: 50.67 | epoch: 2 | total time: 140.08m | eta: 39.4m +step 13041/16704 (78.07%) | loss: 2.547326 | lrm: 0.44 | dt: 648.45ms | tok/sec: 808,522 | mfu: 50.53 | epoch: 2 | total time: 140.09m | eta: 39.4m +step 13042/16704 (78.08%) | loss: 2.545229 | lrm: 0.44 | dt: 644.81ms | tok/sec: 813,093 | mfu: 50.82 | epoch: 2 | total time: 140.10m | eta: 39.4m +step 13043/16704 (78.08%) | loss: 2.545979 | lrm: 0.44 | dt: 644.09ms | tok/sec: 813,993 | mfu: 50.88 | epoch: 2 | 
total time: 140.11m | eta: 39.4m +step 13044/16704 (78.09%) | loss: 2.542095 | lrm: 0.44 | dt: 647.79ms | tok/sec: 809,346 | mfu: 50.59 | epoch: 2 | total time: 140.12m | eta: 39.3m +step 13045/16704 (78.10%) | loss: 2.534684 | lrm: 0.44 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 2 | total time: 140.13m | eta: 39.3m +step 13046/16704 (78.10%) | loss: 2.530582 | lrm: 0.44 | dt: 648.98ms | tok/sec: 807,863 | mfu: 50.49 | epoch: 2 | total time: 140.14m | eta: 39.3m +step 13047/16704 (78.11%) | loss: 2.532429 | lrm: 0.44 | dt: 644.43ms | tok/sec: 813,568 | mfu: 50.85 | epoch: 2 | total time: 140.15m | eta: 39.3m +step 13048/16704 (78.11%) | loss: 2.523939 | lrm: 0.44 | dt: 647.84ms | tok/sec: 809,288 | mfu: 50.58 | epoch: 2 | total time: 140.17m | eta: 39.3m +step 13049/16704 (78.12%) | loss: 2.529662 | lrm: 0.44 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 2 | total time: 140.18m | eta: 39.3m +step 13050/16704 (78.12%) | loss: 2.530979 | lrm: 0.44 | dt: 646.90ms | tok/sec: 810,467 | mfu: 50.66 | epoch: 2 | total time: 140.19m | eta: 39.3m +step 13051/16704 (78.13%) | loss: 2.528412 | lrm: 0.44 | dt: 647.08ms | tok/sec: 810,241 | mfu: 50.64 | epoch: 2 | total time: 140.20m | eta: 39.3m +step 13052/16704 (78.14%) | loss: 2.531226 | lrm: 0.44 | dt: 644.83ms | tok/sec: 813,067 | mfu: 50.82 | epoch: 2 | total time: 140.21m | eta: 39.3m +step 13053/16704 (78.14%) | loss: 2.537143 | lrm: 0.44 | dt: 648.53ms | tok/sec: 808,425 | mfu: 50.53 | epoch: 2 | total time: 140.22m | eta: 39.3m +step 13054/16704 (78.15%) | loss: 2.537888 | lrm: 0.44 | dt: 643.73ms | tok/sec: 814,453 | mfu: 50.90 | epoch: 2 | total time: 140.23m | eta: 39.2m +step 13055/16704 (78.15%) | loss: 2.546937 | lrm: 0.44 | dt: 647.20ms | tok/sec: 810,090 | mfu: 50.63 | epoch: 2 | total time: 140.24m | eta: 39.2m +step 13056/16704 (78.16%) | loss: 2.552770 | lrm: 0.44 | dt: 649.54ms | tok/sec: 807,172 | mfu: 50.45 | epoch: 2 | total time: 140.25m | eta: 39.2m +step 13057/16704 (78.17%) | 
loss: 2.563739 | lrm: 0.44 | dt: 643.56ms | tok/sec: 814,674 | mfu: 50.92 | epoch: 2 | total time: 140.26m | eta: 39.2m +step 13058/16704 (78.17%) | loss: 2.558553 | lrm: 0.44 | dt: 648.03ms | tok/sec: 809,048 | mfu: 50.57 | epoch: 2 | total time: 140.27m | eta: 39.2m +step 13059/16704 (78.18%) | loss: 2.555834 | lrm: 0.44 | dt: 645.60ms | tok/sec: 812,100 | mfu: 50.76 | epoch: 2 | total time: 140.28m | eta: 39.2m +step 13060/16704 (78.18%) | loss: 2.547270 | lrm: 0.44 | dt: 645.45ms | tok/sec: 812,286 | mfu: 50.77 | epoch: 2 | total time: 140.29m | eta: 39.2m +step 13061/16704 (78.19%) | loss: 2.537675 | lrm: 0.44 | dt: 645.69ms | tok/sec: 811,979 | mfu: 50.75 | epoch: 2 | total time: 140.31m | eta: 39.2m +step 13062/16704 (78.20%) | loss: 2.533673 | lrm: 0.44 | dt: 646.97ms | tok/sec: 810,368 | mfu: 50.65 | epoch: 2 | total time: 140.32m | eta: 39.2m +step 13063/16704 (78.20%) | loss: 2.531872 | lrm: 0.44 | dt: 647.54ms | tok/sec: 809,655 | mfu: 50.60 | epoch: 2 | total time: 140.33m | eta: 39.1m +step 13064/16704 (78.21%) | loss: 2.521329 | lrm: 0.44 | dt: 645.15ms | tok/sec: 812,666 | mfu: 50.79 | epoch: 2 | total time: 140.34m | eta: 39.1m +step 13065/16704 (78.21%) | loss: 2.532118 | lrm: 0.44 | dt: 646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 2 | total time: 140.35m | eta: 39.1m +step 13066/16704 (78.22%) | loss: 2.550543 | lrm: 0.44 | dt: 648.37ms | tok/sec: 808,630 | mfu: 50.54 | epoch: 2 | total time: 140.36m | eta: 39.1m +step 13067/16704 (78.23%) | loss: 2.547589 | lrm: 0.44 | dt: 646.25ms | tok/sec: 811,283 | mfu: 50.71 | epoch: 2 | total time: 140.37m | eta: 39.1m +step 13068/16704 (78.23%) | loss: 2.539517 | lrm: 0.44 | dt: 649.15ms | tok/sec: 807,649 | mfu: 50.48 | epoch: 2 | total time: 140.38m | eta: 39.1m +step 13069/16704 (78.24%) | loss: 2.547019 | lrm: 0.44 | dt: 645.39ms | tok/sec: 812,357 | mfu: 50.77 | epoch: 2 | total time: 140.39m | eta: 39.1m +step 13070/16704 (78.24%) | loss: 2.549893 | lrm: 0.44 | dt: 645.16ms | tok/sec: 812,643 | 
mfu: 50.79 | epoch: 2 | total time: 140.40m | eta: 39.1m +step 13071/16704 (78.25%) | loss: 2.547316 | lrm: 0.43 | dt: 651.06ms | tok/sec: 805,281 | mfu: 50.33 | epoch: 2 | total time: 140.41m | eta: 39.1m +step 13072/16704 (78.26%) | loss: 2.533118 | lrm: 0.43 | dt: 644.24ms | tok/sec: 813,813 | mfu: 50.86 | epoch: 2 | total time: 140.42m | eta: 39.0m +step 13073/16704 (78.26%) | loss: 2.531996 | lrm: 0.43 | dt: 646.84ms | tok/sec: 810,539 | mfu: 50.66 | epoch: 2 | total time: 140.43m | eta: 39.0m +step 13074/16704 (78.27%) | loss: 2.527235 | lrm: 0.43 | dt: 646.20ms | tok/sec: 811,336 | mfu: 50.71 | epoch: 2 | total time: 140.45m | eta: 39.0m +step 13075/16704 (78.27%) | loss: 2.531926 | lrm: 0.43 | dt: 644.81ms | tok/sec: 813,092 | mfu: 50.82 | epoch: 2 | total time: 140.46m | eta: 39.0m +step 13076/16704 (78.28%) | loss: 2.536707 | lrm: 0.43 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 2 | total time: 140.47m | eta: 39.0m +step 13077/16704 (78.29%) | loss: 2.542811 | lrm: 0.43 | dt: 646.51ms | tok/sec: 810,946 | mfu: 50.69 | epoch: 2 | total time: 140.48m | eta: 39.0m +step 13078/16704 (78.29%) | loss: 2.544437 | lrm: 0.43 | dt: 646.16ms | tok/sec: 811,386 | mfu: 50.71 | epoch: 2 | total time: 140.49m | eta: 39.0m +step 13079/16704 (78.30%) | loss: 2.535050 | lrm: 0.43 | dt: 646.37ms | tok/sec: 811,122 | mfu: 50.70 | epoch: 2 | total time: 140.50m | eta: 39.0m +step 13080/16704 (78.30%) | loss: 2.553940 | lrm: 0.43 | dt: 650.18ms | tok/sec: 806,374 | mfu: 50.40 | epoch: 2 | total time: 140.51m | eta: 39.0m +step 13081/16704 (78.31%) | loss: 2.562695 | lrm: 0.43 | dt: 647.11ms | tok/sec: 810,201 | mfu: 50.64 | epoch: 2 | total time: 140.52m | eta: 38.9m +step 13082/16704 (78.32%) | loss: 2.567443 | lrm: 0.43 | dt: 648.51ms | tok/sec: 808,454 | mfu: 50.53 | epoch: 2 | total time: 140.53m | eta: 38.9m +step 13083/16704 (78.32%) | loss: 2.558416 | lrm: 0.43 | dt: 648.49ms | tok/sec: 808,473 | mfu: 50.53 | epoch: 2 | total time: 140.54m | eta: 38.9m +step 
13084/16704 (78.33%) | loss: 2.545438 | lrm: 0.43 | dt: 645.33ms | tok/sec: 812,430 | mfu: 50.78 | epoch: 2 | total time: 140.55m | eta: 38.9m +step 13085/16704 (78.33%) | loss: 2.537297 | lrm: 0.43 | dt: 649.93ms | tok/sec: 806,685 | mfu: 50.42 | epoch: 2 | total time: 140.56m | eta: 38.9m +step 13086/16704 (78.34%) | loss: 2.536141 | lrm: 0.43 | dt: 648.38ms | tok/sec: 808,616 | mfu: 50.54 | epoch: 2 | total time: 140.57m | eta: 38.9m +step 13087/16704 (78.35%) | loss: 2.522791 | lrm: 0.43 | dt: 645.16ms | tok/sec: 812,654 | mfu: 50.79 | epoch: 2 | total time: 140.59m | eta: 38.9m +step 13088/16704 (78.35%) | loss: 2.521946 | lrm: 0.43 | dt: 649.29ms | tok/sec: 807,485 | mfu: 50.47 | epoch: 2 | total time: 140.60m | eta: 38.9m +step 13089/16704 (78.36%) | loss: 2.520368 | lrm: 0.43 | dt: 644.51ms | tok/sec: 813,469 | mfu: 50.84 | epoch: 2 | total time: 140.61m | eta: 38.9m +step 13090/16704 (78.36%) | loss: 2.510560 | lrm: 0.43 | dt: 645.76ms | tok/sec: 811,892 | mfu: 50.74 | epoch: 2 | total time: 140.62m | eta: 38.9m +step 13091/16704 (78.37%) | loss: 2.510222 | lrm: 0.43 | dt: 647.87ms | tok/sec: 809,243 | mfu: 50.58 | epoch: 2 | total time: 140.63m | eta: 38.8m +step 13092/16704 (78.38%) | loss: 2.517303 | lrm: 0.43 | dt: 646.45ms | tok/sec: 811,026 | mfu: 50.69 | epoch: 2 | total time: 140.64m | eta: 38.8m +step 13093/16704 (78.38%) | loss: 2.508059 | lrm: 0.43 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 2 | total time: 140.65m | eta: 38.8m +step 13094/16704 (78.39%) | loss: 2.503315 | lrm: 0.43 | dt: 649.07ms | tok/sec: 807,754 | mfu: 50.49 | epoch: 2 | total time: 140.66m | eta: 38.8m +step 13095/16704 (78.39%) | loss: 2.507681 | lrm: 0.43 | dt: 645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 2 | total time: 140.67m | eta: 38.8m +step 13096/16704 (78.40%) | loss: 2.505973 | lrm: 0.43 | dt: 650.72ms | tok/sec: 805,704 | mfu: 50.36 | epoch: 2 | total time: 140.68m | eta: 38.8m +step 13097/16704 (78.41%) | loss: 2.511940 | lrm: 0.43 | dt: 
645.60ms | tok/sec: 812,088 | mfu: 50.76 | epoch: 2 | total time: 140.69m | eta: 38.8m +step 13098/16704 (78.41%) | loss: 2.507137 | lrm: 0.43 | dt: 643.55ms | tok/sec: 814,678 | mfu: 50.92 | epoch: 2 | total time: 140.70m | eta: 38.8m +step 13099/16704 (78.42%) | loss: 2.495483 | lrm: 0.43 | dt: 649.94ms | tok/sec: 806,667 | mfu: 50.42 | epoch: 2 | total time: 140.72m | eta: 38.8m +step 13100/16704 (78.42%) | loss: 2.496076 | lrm: 0.43 | dt: 646.64ms | tok/sec: 810,787 | mfu: 50.68 | epoch: 2 | total time: 140.73m | eta: 38.7m +step 13101/16704 (78.43%) | loss: 2.493131 | lrm: 0.43 | dt: 646.99ms | tok/sec: 810,348 | mfu: 50.65 | epoch: 2 | total time: 140.74m | eta: 38.7m +step 13102/16704 (78.44%) | loss: 2.491452 | lrm: 0.43 | dt: 647.27ms | tok/sec: 809,993 | mfu: 50.63 | epoch: 2 | total time: 140.75m | eta: 38.7m +step 13103/16704 (78.44%) | loss: 2.500863 | lrm: 0.43 | dt: 645.42ms | tok/sec: 812,322 | mfu: 50.77 | epoch: 2 | total time: 140.76m | eta: 38.7m +step 13104/16704 (78.45%) | loss: 2.502647 | lrm: 0.43 | dt: 645.59ms | tok/sec: 812,112 | mfu: 50.76 | epoch: 2 | total time: 140.77m | eta: 38.7m +step 13105/16704 (78.45%) | loss: 2.505092 | lrm: 0.43 | dt: 646.27ms | tok/sec: 811,258 | mfu: 50.70 | epoch: 2 | total time: 140.78m | eta: 38.7m +step 13106/16704 (78.46%) | loss: 2.518692 | lrm: 0.43 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 2 | total time: 140.79m | eta: 38.7m +step 13107/16704 (78.47%) | loss: 2.521021 | lrm: 0.43 | dt: 646.08ms | tok/sec: 811,485 | mfu: 50.72 | epoch: 2 | total time: 140.80m | eta: 38.7m +step 13108/16704 (78.47%) | loss: 2.516284 | lrm: 0.43 | dt: 646.00ms | tok/sec: 811,591 | mfu: 50.73 | epoch: 2 | total time: 140.81m | eta: 38.7m +step 13109/16704 (78.48%) | loss: 2.513074 | lrm: 0.43 | dt: 645.90ms | tok/sec: 811,717 | mfu: 50.73 | epoch: 2 | total time: 140.82m | eta: 38.6m +step 13110/16704 (78.48%) | loss: 2.517500 | lrm: 0.43 | dt: 645.35ms | tok/sec: 812,409 | mfu: 50.78 | epoch: 2 | total 
time: 140.83m | eta: 38.6m +step 13111/16704 (78.49%) | loss: 2.524075 | lrm: 0.43 | dt: 643.74ms | tok/sec: 814,439 | mfu: 50.90 | epoch: 2 | total time: 140.84m | eta: 38.6m +step 13112/16704 (78.50%) | loss: 2.524644 | lrm: 0.43 | dt: 649.41ms | tok/sec: 807,323 | mfu: 50.46 | epoch: 2 | total time: 140.86m | eta: 38.6m +step 13113/16704 (78.50%) | loss: 2.526340 | lrm: 0.43 | dt: 642.29ms | tok/sec: 816,283 | mfu: 51.02 | epoch: 2 | total time: 140.87m | eta: 38.6m +step 13114/16704 (78.51%) | loss: 2.523361 | lrm: 0.43 | dt: 646.00ms | tok/sec: 811,586 | mfu: 50.73 | epoch: 2 | total time: 140.88m | eta: 38.6m +step 13115/16704 (78.51%) | loss: 2.524504 | lrm: 0.43 | dt: 643.52ms | tok/sec: 814,722 | mfu: 50.92 | epoch: 2 | total time: 140.89m | eta: 38.6m +step 13116/16704 (78.52%) | loss: 2.523353 | lrm: 0.43 | dt: 644.72ms | tok/sec: 813,201 | mfu: 50.83 | epoch: 2 | total time: 140.90m | eta: 38.6m +step 13117/16704 (78.53%) | loss: 2.520640 | lrm: 0.43 | dt: 644.35ms | tok/sec: 813,668 | mfu: 50.86 | epoch: 2 | total time: 140.91m | eta: 38.6m +step 13118/16704 (78.53%) | loss: 2.524491 | lrm: 0.43 | dt: 646.78ms | tok/sec: 810,614 | mfu: 50.66 | epoch: 2 | total time: 140.92m | eta: 38.6m +step 13119/16704 (78.54%) | loss: 2.533803 | lrm: 0.43 | dt: 644.57ms | tok/sec: 813,392 | mfu: 50.84 | epoch: 2 | total time: 140.93m | eta: 38.5m +step 13120/16704 (78.54%) | loss: 2.551988 | lrm: 0.43 | dt: 646.44ms | tok/sec: 811,037 | mfu: 50.69 | epoch: 2 | total time: 140.94m | eta: 38.5m +step 13121/16704 (78.55%) | loss: 2.548847 | lrm: 0.43 | dt: 643.97ms | tok/sec: 814,146 | mfu: 50.89 | epoch: 2 | total time: 140.95m | eta: 38.5m +step 13122/16704 (78.56%) | loss: 2.539252 | lrm: 0.43 | dt: 647.05ms | tok/sec: 810,272 | mfu: 50.64 | epoch: 2 | total time: 140.96m | eta: 38.5m +step 13123/16704 (78.56%) | loss: 2.542307 | lrm: 0.43 | dt: 643.89ms | tok/sec: 814,254 | mfu: 50.89 | epoch: 2 | total time: 140.97m | eta: 38.5m +step 13124/16704 (78.57%) | loss: 
2.548221 | lrm: 0.43 | dt: 645.96ms | tok/sec: 811,638 | mfu: 50.73 | epoch: 2 | total time: 140.98m | eta: 38.5m +step 13125/16704 (78.57%) | loss: 2.536969 | lrm: 0.43 | dt: 643.53ms | tok/sec: 814,709 | mfu: 50.92 | epoch: 2 | total time: 140.99m | eta: 38.5m +step 13126/16704 (78.58%) | loss: 2.533423 | lrm: 0.43 | dt: 643.95ms | tok/sec: 814,179 | mfu: 50.89 | epoch: 2 | total time: 141.01m | eta: 38.5m +step 13127/16704 (78.59%) | loss: 2.538049 | lrm: 0.43 | dt: 645.99ms | tok/sec: 811,608 | mfu: 50.73 | epoch: 2 | total time: 141.02m | eta: 38.5m +step 13128/16704 (78.59%) | loss: 2.540699 | lrm: 0.43 | dt: 643.80ms | tok/sec: 814,370 | mfu: 50.90 | epoch: 2 | total time: 141.03m | eta: 38.4m +step 13129/16704 (78.60%) | loss: 2.543717 | lrm: 0.43 | dt: 643.30ms | tok/sec: 814,995 | mfu: 50.94 | epoch: 2 | total time: 141.04m | eta: 38.4m +step 13130/16704 (78.60%) | loss: 2.537738 | lrm: 0.43 | dt: 644.36ms | tok/sec: 813,658 | mfu: 50.85 | epoch: 2 | total time: 141.05m | eta: 38.4m +step 13131/16704 (78.61%) | loss: 2.528600 | lrm: 0.43 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 2 | total time: 141.06m | eta: 38.4m +step 13132/16704 (78.62%) | loss: 2.535594 | lrm: 0.43 | dt: 646.24ms | tok/sec: 811,286 | mfu: 50.71 | epoch: 2 | total time: 141.07m | eta: 38.4m +step 13133/16704 (78.62%) | loss: 2.534721 | lrm: 0.43 | dt: 642.34ms | tok/sec: 816,218 | mfu: 51.01 | epoch: 2 | total time: 141.08m | eta: 38.4m +step 13134/16704 (78.63%) | loss: 2.523126 | lrm: 0.43 | dt: 645.17ms | tok/sec: 812,635 | mfu: 50.79 | epoch: 2 | total time: 141.09m | eta: 38.4m +step 13135/16704 (78.63%) | loss: 2.515224 | lrm: 0.43 | dt: 644.46ms | tok/sec: 813,532 | mfu: 50.85 | epoch: 2 | total time: 141.10m | eta: 38.4m +step 13136/16704 (78.64%) | loss: 2.504867 | lrm: 0.43 | dt: 643.80ms | tok/sec: 814,362 | mfu: 50.90 | epoch: 2 | total time: 141.11m | eta: 38.4m +step 13137/16704 (78.65%) | loss: 2.501041 | lrm: 0.43 | dt: 645.56ms | tok/sec: 812,141 | mfu: 
50.76 | epoch: 2 | total time: 141.12m | eta: 38.3m +step 13138/16704 (78.65%) | loss: 2.511776 | lrm: 0.43 | dt: 642.36ms | tok/sec: 816,188 | mfu: 51.01 | epoch: 2 | total time: 141.13m | eta: 38.3m +step 13139/16704 (78.66%) | loss: 2.505346 | lrm: 0.43 | dt: 644.51ms | tok/sec: 813,472 | mfu: 50.84 | epoch: 2 | total time: 141.15m | eta: 38.3m +step 13140/16704 (78.66%) | loss: 2.508744 | lrm: 0.43 | dt: 644.54ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 141.16m | eta: 38.3m +step 13141/16704 (78.67%) | loss: 2.518585 | lrm: 0.43 | dt: 643.40ms | tok/sec: 814,869 | mfu: 50.93 | epoch: 2 | total time: 141.17m | eta: 38.3m +step 13142/16704 (78.68%) | loss: 2.530421 | lrm: 0.43 | dt: 642.93ms | tok/sec: 815,460 | mfu: 50.97 | epoch: 2 | total time: 141.18m | eta: 38.3m +step 13143/16704 (78.68%) | loss: 2.526967 | lrm: 0.43 | dt: 641.50ms | tok/sec: 817,285 | mfu: 51.08 | epoch: 2 | total time: 141.19m | eta: 38.3m +step 13144/16704 (78.69%) | loss: 2.538398 | lrm: 0.43 | dt: 644.06ms | tok/sec: 814,041 | mfu: 50.88 | epoch: 2 | total time: 141.20m | eta: 38.3m +step 13145/16704 (78.69%) | loss: 2.543829 | lrm: 0.43 | dt: 645.58ms | tok/sec: 812,114 | mfu: 50.76 | epoch: 2 | total time: 141.21m | eta: 38.3m +step 13146/16704 (78.70%) | loss: 2.548276 | lrm: 0.43 | dt: 642.79ms | tok/sec: 815,640 | mfu: 50.98 | epoch: 2 | total time: 141.22m | eta: 38.3m +step 13147/16704 (78.71%) | loss: 2.535477 | lrm: 0.43 | dt: 644.08ms | tok/sec: 814,005 | mfu: 50.88 | epoch: 2 | total time: 141.23m | eta: 38.2m +step 13148/16704 (78.71%) | loss: 2.532965 | lrm: 0.43 | dt: 643.31ms | tok/sec: 814,984 | mfu: 50.94 | epoch: 2 | total time: 141.24m | eta: 38.2m +step 13149/16704 (78.72%) | loss: 2.523465 | lrm: 0.43 | dt: 646.20ms | tok/sec: 811,337 | mfu: 50.71 | epoch: 2 | total time: 141.25m | eta: 38.2m +step 13150/16704 (78.72%) | loss: 2.529095 | lrm: 0.43 | dt: 644.96ms | tok/sec: 812,897 | mfu: 50.81 | epoch: 2 | total time: 141.26m | eta: 38.2m +step 
13151/16704 (78.73%) | loss: 2.520451 | lrm: 0.43 | dt: 642.72ms | tok/sec: 815,727 | mfu: 50.98 | epoch: 2 | total time: 141.27m | eta: 38.2m +step 13152/16704 (78.74%) | loss: 2.503480 | lrm: 0.43 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 2 | total time: 141.28m | eta: 38.2m +step 13153/16704 (78.74%) | loss: 2.503442 | lrm: 0.43 | dt: 643.95ms | tok/sec: 814,175 | mfu: 50.89 | epoch: 2 | total time: 141.30m | eta: 38.2m +step 13154/16704 (78.75%) | loss: 2.511193 | lrm: 0.43 | dt: 643.65ms | tok/sec: 814,552 | mfu: 50.91 | epoch: 2 | total time: 141.31m | eta: 38.2m +step 13155/16704 (78.75%) | loss: 2.512759 | lrm: 0.42 | dt: 642.63ms | tok/sec: 815,849 | mfu: 50.99 | epoch: 2 | total time: 141.32m | eta: 38.2m +step 13156/16704 (78.76%) | loss: 2.520382 | lrm: 0.42 | dt: 643.94ms | tok/sec: 814,182 | mfu: 50.89 | epoch: 2 | total time: 141.33m | eta: 38.1m +step 13157/16704 (78.77%) | loss: 2.509614 | lrm: 0.42 | dt: 642.47ms | tok/sec: 816,056 | mfu: 51.00 | epoch: 2 | total time: 141.34m | eta: 38.1m +step 13158/16704 (78.77%) | loss: 2.501543 | lrm: 0.42 | dt: 642.13ms | tok/sec: 816,483 | mfu: 51.03 | epoch: 2 | total time: 141.35m | eta: 38.1m +step 13159/16704 (78.78%) | loss: 2.515614 | lrm: 0.42 | dt: 644.29ms | tok/sec: 813,743 | mfu: 50.86 | epoch: 2 | total time: 141.36m | eta: 38.1m +step 13160/16704 (78.78%) | loss: 2.530214 | lrm: 0.42 | dt: 642.63ms | tok/sec: 815,850 | mfu: 50.99 | epoch: 2 | total time: 141.37m | eta: 38.1m +step 13161/16704 (78.79%) | loss: 2.526113 | lrm: 0.42 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 2 | total time: 141.38m | eta: 38.1m +step 13162/16704 (78.80%) | loss: 2.534738 | lrm: 0.42 | dt: 644.79ms | tok/sec: 813,108 | mfu: 50.82 | epoch: 2 | total time: 141.39m | eta: 38.1m +step 13163/16704 (78.80%) | loss: 2.542946 | lrm: 0.42 | dt: 643.73ms | tok/sec: 814,452 | mfu: 50.90 | epoch: 2 | total time: 141.40m | eta: 38.1m +step 13164/16704 (78.81%) | loss: 2.536852 | lrm: 0.42 | dt: 
642.93ms | tok/sec: 815,465 | mfu: 50.97 | epoch: 2 | total time: 141.41m | eta: 38.1m +step 13165/16704 (78.81%) | loss: 2.537771 | lrm: 0.42 | dt: 645.09ms | tok/sec: 812,735 | mfu: 50.80 | epoch: 2 | total time: 141.42m | eta: 38.0m +step 13166/16704 (78.82%) | loss: 2.533001 | lrm: 0.42 | dt: 645.22ms | tok/sec: 812,568 | mfu: 50.79 | epoch: 2 | total time: 141.43m | eta: 38.0m +step 13167/16704 (78.83%) | loss: 2.522752 | lrm: 0.42 | dt: 644.10ms | tok/sec: 813,986 | mfu: 50.88 | epoch: 2 | total time: 141.45m | eta: 38.0m +step 13168/16704 (78.83%) | loss: 2.528439 | lrm: 0.42 | dt: 643.36ms | tok/sec: 814,919 | mfu: 50.93 | epoch: 2 | total time: 141.46m | eta: 38.0m +step 13169/16704 (78.84%) | loss: 2.520633 | lrm: 0.42 | dt: 642.39ms | tok/sec: 816,150 | mfu: 51.01 | epoch: 2 | total time: 141.47m | eta: 38.0m +step 13170/16704 (78.84%) | loss: 2.517532 | lrm: 0.42 | dt: 641.31ms | tok/sec: 817,529 | mfu: 51.10 | epoch: 2 | total time: 141.48m | eta: 38.0m +step 13171/16704 (78.85%) | loss: 2.513086 | lrm: 0.42 | dt: 643.92ms | tok/sec: 814,212 | mfu: 50.89 | epoch: 2 | total time: 141.49m | eta: 38.0m +step 13172/16704 (78.86%) | loss: 2.525830 | lrm: 0.42 | dt: 643.61ms | tok/sec: 814,610 | mfu: 50.91 | epoch: 2 | total time: 141.50m | eta: 38.0m +step 13173/16704 (78.86%) | loss: 2.526132 | lrm: 0.42 | dt: 646.17ms | tok/sec: 811,372 | mfu: 50.71 | epoch: 2 | total time: 141.51m | eta: 38.0m +step 13174/16704 (78.87%) | loss: 2.523099 | lrm: 0.42 | dt: 642.54ms | tok/sec: 815,963 | mfu: 51.00 | epoch: 2 | total time: 141.52m | eta: 37.9m +step 13175/16704 (78.87%) | loss: 2.528937 | lrm: 0.42 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 2 | total time: 141.53m | eta: 37.9m +step 13176/16704 (78.88%) | loss: 2.523302 | lrm: 0.42 | dt: 645.88ms | tok/sec: 811,738 | mfu: 50.73 | epoch: 2 | total time: 141.54m | eta: 37.9m +step 13177/16704 (78.89%) | loss: 2.531656 | lrm: 0.42 | dt: 644.84ms | tok/sec: 813,052 | mfu: 50.82 | epoch: 2 | total 
time: 141.55m | eta: 37.9m +step 13178/16704 (78.89%) | loss: 2.540288 | lrm: 0.42 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 2 | total time: 141.56m | eta: 37.9m +step 13179/16704 (78.90%) | loss: 2.542495 | lrm: 0.42 | dt: 643.52ms | tok/sec: 814,723 | mfu: 50.92 | epoch: 2 | total time: 141.57m | eta: 37.9m +step 13180/16704 (78.90%) | loss: 2.540962 | lrm: 0.42 | dt: 643.12ms | tok/sec: 815,222 | mfu: 50.95 | epoch: 2 | total time: 141.59m | eta: 37.9m +step 13181/16704 (78.91%) | loss: 2.540951 | lrm: 0.42 | dt: 645.04ms | tok/sec: 812,800 | mfu: 50.80 | epoch: 2 | total time: 141.60m | eta: 37.9m +step 13182/16704 (78.92%) | loss: 2.546054 | lrm: 0.42 | dt: 643.64ms | tok/sec: 814,564 | mfu: 50.91 | epoch: 2 | total time: 141.61m | eta: 37.9m +step 13183/16704 (78.92%) | loss: 2.532719 | lrm: 0.42 | dt: 644.59ms | tok/sec: 813,370 | mfu: 50.84 | epoch: 2 | total time: 141.62m | eta: 37.9m +step 13184/16704 (78.93%) | loss: 2.530074 | lrm: 0.42 | dt: 644.57ms | tok/sec: 813,396 | mfu: 50.84 | epoch: 2 | total time: 141.63m | eta: 37.8m +step 13185/16704 (78.93%) | loss: 2.517200 | lrm: 0.42 | dt: 643.24ms | tok/sec: 815,072 | mfu: 50.94 | epoch: 2 | total time: 141.64m | eta: 37.8m +step 13186/16704 (78.94%) | loss: 2.534098 | lrm: 0.42 | dt: 644.51ms | tok/sec: 813,462 | mfu: 50.84 | epoch: 2 | total time: 141.65m | eta: 37.8m +step 13187/16704 (78.95%) | loss: 2.525365 | lrm: 0.42 | dt: 644.92ms | tok/sec: 812,953 | mfu: 50.81 | epoch: 2 | total time: 141.66m | eta: 37.8m +step 13188/16704 (78.95%) | loss: 2.517278 | lrm: 0.42 | dt: 644.89ms | tok/sec: 812,993 | mfu: 50.81 | epoch: 2 | total time: 141.67m | eta: 37.8m +step 13189/16704 (78.96%) | loss: 2.517903 | lrm: 0.42 | dt: 643.38ms | tok/sec: 814,893 | mfu: 50.93 | epoch: 2 | total time: 141.68m | eta: 37.8m +step 13190/16704 (78.96%) | loss: 2.522587 | lrm: 0.42 | dt: 643.27ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 141.69m | eta: 37.8m +step 13191/16704 (78.97%) | loss: 
2.528874 | lrm: 0.42 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 2 | total time: 141.70m | eta: 37.8m +step 13192/16704 (78.98%) | loss: 2.541558 | lrm: 0.42 | dt: 640.93ms | tok/sec: 818,008 | mfu: 51.13 | epoch: 2 | total time: 141.71m | eta: 37.8m +step 13193/16704 (78.98%) | loss: 2.534493 | lrm: 0.42 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 2 | total time: 141.72m | eta: 37.7m +step 13194/16704 (78.99%) | loss: 2.537250 | lrm: 0.42 | dt: 640.81ms | tok/sec: 818,162 | mfu: 51.14 | epoch: 2 | total time: 141.74m | eta: 37.7m +step 13195/16704 (78.99%) | loss: 2.524174 | lrm: 0.42 | dt: 643.32ms | tok/sec: 814,975 | mfu: 50.94 | epoch: 2 | total time: 141.75m | eta: 37.7m +step 13196/16704 (79.00%) | loss: 2.535488 | lrm: 0.42 | dt: 645.22ms | tok/sec: 812,567 | mfu: 50.79 | epoch: 2 | total time: 141.76m | eta: 37.7m +step 13197/16704 (79.01%) | loss: 2.532314 | lrm: 0.42 | dt: 643.42ms | tok/sec: 814,845 | mfu: 50.93 | epoch: 2 | total time: 141.77m | eta: 37.7m +step 13198/16704 (79.01%) | loss: 2.532100 | lrm: 0.42 | dt: 648.87ms | tok/sec: 808,002 | mfu: 50.50 | epoch: 2 | total time: 141.78m | eta: 37.7m +step 13199/16704 (79.02%) | loss: 2.520845 | lrm: 0.42 | dt: 644.32ms | tok/sec: 813,704 | mfu: 50.86 | epoch: 2 | total time: 141.79m | eta: 37.7m +step 13200/16704 (79.02%) | loss: 2.519512 | lrm: 0.42 | dt: 645.57ms | tok/sec: 812,136 | mfu: 50.76 | epoch: 2 | total time: 141.80m | eta: 37.7m +step 13201/16704 (79.03%) | loss: 2.536473 | lrm: 0.42 | dt: 646.15ms | tok/sec: 811,408 | mfu: 50.71 | epoch: 2 | total time: 141.81m | eta: 37.7m +step 13202/16704 (79.03%) | loss: 2.526136 | lrm: 0.42 | dt: 644.11ms | tok/sec: 813,970 | mfu: 50.87 | epoch: 2 | total time: 141.82m | eta: 37.6m +step 13203/16704 (79.04%) | loss: 2.530281 | lrm: 0.42 | dt: 643.92ms | tok/sec: 814,213 | mfu: 50.89 | epoch: 2 | total time: 141.83m | eta: 37.6m +step 13204/16704 (79.05%) | loss: 2.529050 | lrm: 0.42 | dt: 646.51ms | tok/sec: 810,953 | mfu: 
50.69 | epoch: 2 | total time: 141.84m | eta: 37.6m +step 13205/16704 (79.05%) | loss: 2.506468 | lrm: 0.42 | dt: 643.96ms | tok/sec: 814,159 | mfu: 50.89 | epoch: 2 | total time: 141.85m | eta: 37.6m +step 13206/16704 (79.06%) | loss: 2.524165 | lrm: 0.42 | dt: 645.17ms | tok/sec: 812,640 | mfu: 50.79 | epoch: 2 | total time: 141.86m | eta: 37.6m +step 13207/16704 (79.06%) | loss: 2.521130 | lrm: 0.42 | dt: 643.39ms | tok/sec: 814,877 | mfu: 50.93 | epoch: 2 | total time: 141.88m | eta: 37.6m +step 13208/16704 (79.07%) | loss: 2.526548 | lrm: 0.42 | dt: 643.83ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 2 | total time: 141.89m | eta: 37.6m +step 13209/16704 (79.08%) | loss: 2.526298 | lrm: 0.42 | dt: 645.14ms | tok/sec: 812,670 | mfu: 50.79 | epoch: 2 | total time: 141.90m | eta: 37.6m +step 13210/16704 (79.08%) | loss: 2.538070 | lrm: 0.42 | dt: 647.23ms | tok/sec: 810,052 | mfu: 50.63 | epoch: 2 | total time: 141.91m | eta: 37.6m +step 13211/16704 (79.09%) | loss: 2.528746 | lrm: 0.42 | dt: 642.51ms | tok/sec: 816,000 | mfu: 51.00 | epoch: 2 | total time: 141.92m | eta: 37.6m +step 13212/16704 (79.09%) | loss: 2.524086 | lrm: 0.42 | dt: 644.92ms | tok/sec: 812,956 | mfu: 50.81 | epoch: 2 | total time: 141.93m | eta: 37.5m +step 13213/16704 (79.10%) | loss: 2.528852 | lrm: 0.42 | dt: 648.06ms | tok/sec: 809,007 | mfu: 50.56 | epoch: 2 | total time: 141.94m | eta: 37.5m +step 13214/16704 (79.11%) | loss: 2.542510 | lrm: 0.42 | dt: 639.56ms | tok/sec: 819,767 | mfu: 51.24 | epoch: 2 | total time: 141.95m | eta: 37.5m +step 13215/16704 (79.11%) | loss: 2.546530 | lrm: 0.42 | dt: 644.59ms | tok/sec: 813,370 | mfu: 50.84 | epoch: 2 | total time: 141.96m | eta: 37.5m +step 13216/16704 (79.12%) | loss: 2.553555 | lrm: 0.42 | dt: 644.52ms | tok/sec: 813,454 | mfu: 50.84 | epoch: 2 | total time: 141.97m | eta: 37.5m +step 13217/16704 (79.12%) | loss: 2.546832 | lrm: 0.42 | dt: 643.38ms | tok/sec: 814,899 | mfu: 50.93 | epoch: 2 | total time: 141.98m | eta: 37.5m +step 
13218/16704 (79.13%) | loss: 2.548108 | lrm: 0.42 | dt: 646.25ms | tok/sec: 811,281 | mfu: 50.71 | epoch: 2 | total time: 141.99m | eta: 37.5m +step 13219/16704 (79.14%) | loss: 2.559367 | lrm: 0.42 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 2 | total time: 142.00m | eta: 37.5m +step 13220/16704 (79.14%) | loss: 2.575824 | lrm: 0.42 | dt: 642.96ms | tok/sec: 815,434 | mfu: 50.97 | epoch: 2 | total time: 142.01m | eta: 37.5m +step 13221/16704 (79.15%) | loss: 2.566206 | lrm: 0.42 | dt: 645.22ms | tok/sec: 812,572 | mfu: 50.79 | epoch: 2 | total time: 142.03m | eta: 37.4m +step 13222/16704 (79.15%) | loss: 2.569469 | lrm: 0.42 | dt: 645.32ms | tok/sec: 812,451 | mfu: 50.78 | epoch: 2 | total time: 142.04m | eta: 37.4m +step 13223/16704 (79.16%) | loss: 2.576653 | lrm: 0.42 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 2 | total time: 142.05m | eta: 37.4m +step 13224/16704 (79.17%) | loss: 2.576680 | lrm: 0.42 | dt: 642.79ms | tok/sec: 815,649 | mfu: 50.98 | epoch: 2 | total time: 142.06m | eta: 37.4m +step 13225/16704 (79.17%) | loss: 2.570368 | lrm: 0.42 | dt: 643.77ms | tok/sec: 814,406 | mfu: 50.90 | epoch: 2 | total time: 142.07m | eta: 37.4m +step 13226/16704 (79.18%) | loss: 2.573849 | lrm: 0.42 | dt: 644.58ms | tok/sec: 813,380 | mfu: 50.84 | epoch: 2 | total time: 142.08m | eta: 37.4m +step 13227/16704 (79.18%) | loss: 2.565697 | lrm: 0.42 | dt: 644.93ms | tok/sec: 812,936 | mfu: 50.81 | epoch: 2 | total time: 142.09m | eta: 37.4m +step 13228/16704 (79.19%) | loss: 2.552187 | lrm: 0.42 | dt: 643.36ms | tok/sec: 814,919 | mfu: 50.93 | epoch: 2 | total time: 142.10m | eta: 37.4m +step 13229/16704 (79.20%) | loss: 2.550535 | lrm: 0.42 | dt: 643.49ms | tok/sec: 814,754 | mfu: 50.92 | epoch: 2 | total time: 142.11m | eta: 37.4m +step 13230/16704 (79.20%) | loss: 2.549803 | lrm: 0.42 | dt: 643.47ms | tok/sec: 814,786 | mfu: 50.93 | epoch: 2 | total time: 142.12m | eta: 37.3m +step 13231/16704 (79.21%) | loss: 2.546382 | lrm: 0.42 | dt: 
644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 2 | total time: 142.13m | eta: 37.3m +step 13232/16704 (79.21%) | loss: 2.538244 | lrm: 0.42 | dt: 643.54ms | tok/sec: 814,689 | mfu: 50.92 | epoch: 2 | total time: 142.14m | eta: 37.3m +step 13233/16704 (79.22%) | loss: 2.533975 | lrm: 0.42 | dt: 644.36ms | tok/sec: 813,650 | mfu: 50.85 | epoch: 2 | total time: 142.15m | eta: 37.3m +step 13234/16704 (79.23%) | loss: 2.533419 | lrm: 0.42 | dt: 641.02ms | tok/sec: 817,893 | mfu: 51.12 | epoch: 2 | total time: 142.16m | eta: 37.3m +step 13235/16704 (79.23%) | loss: 2.543612 | lrm: 0.42 | dt: 645.58ms | tok/sec: 812,117 | mfu: 50.76 | epoch: 2 | total time: 142.18m | eta: 37.3m +step 13236/16704 (79.24%) | loss: 2.543879 | lrm: 0.42 | dt: 645.47ms | tok/sec: 812,252 | mfu: 50.77 | epoch: 2 | total time: 142.19m | eta: 37.3m +step 13237/16704 (79.24%) | loss: 2.547788 | lrm: 0.42 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 142.20m | eta: 37.3m +step 13238/16704 (79.25%) | loss: 2.544486 | lrm: 0.41 | dt: 645.91ms | tok/sec: 811,703 | mfu: 50.73 | epoch: 2 | total time: 142.21m | eta: 37.3m +step 13239/16704 (79.26%) | loss: 2.560085 | lrm: 0.41 | dt: 642.87ms | tok/sec: 815,538 | mfu: 50.97 | epoch: 2 | total time: 142.22m | eta: 37.3m +step 13240/16704 (79.26%) | loss: 2.553308 | lrm: 0.41 | dt: 645.66ms | tok/sec: 812,019 | mfu: 50.75 | epoch: 2 | total time: 142.23m | eta: 37.2m +step 13241/16704 (79.27%) | loss: 2.551252 | lrm: 0.41 | dt: 643.09ms | tok/sec: 815,260 | mfu: 50.95 | epoch: 2 | total time: 142.24m | eta: 37.2m +step 13242/16704 (79.27%) | loss: 2.543174 | lrm: 0.41 | dt: 643.08ms | tok/sec: 815,277 | mfu: 50.96 | epoch: 2 | total time: 142.25m | eta: 37.2m +step 13243/16704 (79.28%) | loss: 2.542268 | lrm: 0.41 | dt: 644.16ms | tok/sec: 813,915 | mfu: 50.87 | epoch: 2 | total time: 142.26m | eta: 37.2m +step 13244/16704 (79.29%) | loss: 2.540943 | lrm: 0.41 | dt: 642.34ms | tok/sec: 816,216 | mfu: 51.01 | epoch: 2 | total 
time: 142.27m | eta: 37.2m +step 13245/16704 (79.29%) | loss: 2.546506 | lrm: 0.41 | dt: 643.31ms | tok/sec: 814,989 | mfu: 50.94 | epoch: 2 | total time: 142.28m | eta: 37.2m +step 13246/16704 (79.30%) | loss: 2.551854 | lrm: 0.41 | dt: 644.74ms | tok/sec: 813,182 | mfu: 50.83 | epoch: 2 | total time: 142.29m | eta: 37.2m +step 13247/16704 (79.30%) | loss: 2.549048 | lrm: 0.41 | dt: 644.31ms | tok/sec: 813,717 | mfu: 50.86 | epoch: 2 | total time: 142.30m | eta: 37.2m +step 13248/16704 (79.31%) | loss: 2.549947 | lrm: 0.41 | dt: 643.32ms | tok/sec: 814,970 | mfu: 50.94 | epoch: 2 | total time: 142.32m | eta: 37.2m +step 13249/16704 (79.32%) | loss: 2.545223 | lrm: 0.41 | dt: 644.36ms | tok/sec: 813,651 | mfu: 50.85 | epoch: 2 | total time: 142.33m | eta: 37.1m +Step 13250 | Validation bpb: 0.775811 +step 13250/16704 (79.32%) | loss: 2.546697 | lrm: 0.41 | dt: 643.53ms | tok/sec: 814,710 | mfu: 50.92 | epoch: 2 | total time: 142.34m | eta: 37.1m +step 13251/16704 (79.33%) | loss: 2.540926 | lrm: 0.41 | dt: 648.33ms | tok/sec: 808,668 | mfu: 50.54 | epoch: 2 | total time: 142.35m | eta: 37.1m +step 13252/16704 (79.33%) | loss: 2.536475 | lrm: 0.41 | dt: 644.52ms | tok/sec: 813,457 | mfu: 50.84 | epoch: 2 | total time: 142.36m | eta: 37.1m +step 13253/16704 (79.34%) | loss: 2.531361 | lrm: 0.41 | dt: 639.62ms | tok/sec: 819,691 | mfu: 51.23 | epoch: 2 | total time: 142.37m | eta: 37.1m +step 13254/16704 (79.35%) | loss: 2.541739 | lrm: 0.41 | dt: 646.73ms | tok/sec: 810,675 | mfu: 50.67 | epoch: 2 | total time: 142.38m | eta: 37.1m +step 13255/16704 (79.35%) | loss: 2.541000 | lrm: 0.41 | dt: 640.33ms | tok/sec: 818,779 | mfu: 51.17 | epoch: 2 | total time: 142.39m | eta: 37.1m +step 13256/16704 (79.36%) | loss: 2.562099 | lrm: 0.41 | dt: 641.71ms | tok/sec: 817,019 | mfu: 51.06 | epoch: 2 | total time: 142.40m | eta: 37.1m +step 13257/16704 (79.36%) | loss: 2.557866 | lrm: 0.41 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 2 | total time: 142.41m | eta: 
37.1m +step 13258/16704 (79.37%) | loss: 2.555285 | lrm: 0.41 | dt: 642.80ms | tok/sec: 815,626 | mfu: 50.98 | epoch: 2 | total time: 142.42m | eta: 37.0m +step 13259/16704 (79.38%) | loss: 2.560096 | lrm: 0.41 | dt: 643.78ms | tok/sec: 814,384 | mfu: 50.90 | epoch: 2 | total time: 142.43m | eta: 37.0m +step 13260/16704 (79.38%) | loss: 2.566239 | lrm: 0.41 | dt: 646.33ms | tok/sec: 811,175 | mfu: 50.70 | epoch: 2 | total time: 142.44m | eta: 37.0m +step 13261/16704 (79.39%) | loss: 2.577082 | lrm: 0.41 | dt: 643.75ms | tok/sec: 814,423 | mfu: 50.90 | epoch: 2 | total time: 142.45m | eta: 37.0m +step 13262/16704 (79.39%) | loss: 2.598448 | lrm: 0.41 | dt: 643.37ms | tok/sec: 814,911 | mfu: 50.93 | epoch: 2 | total time: 142.47m | eta: 37.0m +step 13263/16704 (79.40%) | loss: 2.586855 | lrm: 0.41 | dt: 642.15ms | tok/sec: 816,453 | mfu: 51.03 | epoch: 2 | total time: 142.48m | eta: 37.0m +step 13264/16704 (79.41%) | loss: 2.568976 | lrm: 0.41 | dt: 644.69ms | tok/sec: 813,243 | mfu: 50.83 | epoch: 2 | total time: 142.49m | eta: 37.0m +step 13265/16704 (79.41%) | loss: 2.568210 | lrm: 0.41 | dt: 641.69ms | tok/sec: 817,047 | mfu: 51.07 | epoch: 2 | total time: 142.50m | eta: 37.0m +step 13266/16704 (79.42%) | loss: 2.573309 | lrm: 0.41 | dt: 642.50ms | tok/sec: 816,007 | mfu: 51.00 | epoch: 2 | total time: 142.51m | eta: 37.0m +step 13267/16704 (79.42%) | loss: 2.574502 | lrm: 0.41 | dt: 642.53ms | tok/sec: 815,977 | mfu: 51.00 | epoch: 2 | total time: 142.52m | eta: 36.9m +step 13268/16704 (79.43%) | loss: 2.563529 | lrm: 0.41 | dt: 642.80ms | tok/sec: 815,630 | mfu: 50.98 | epoch: 2 | total time: 142.53m | eta: 36.9m +step 13269/16704 (79.44%) | loss: 2.558993 | lrm: 0.41 | dt: 642.68ms | tok/sec: 815,782 | mfu: 50.99 | epoch: 2 | total time: 142.54m | eta: 36.9m +step 13270/16704 (79.44%) | loss: 2.556849 | lrm: 0.41 | dt: 641.75ms | tok/sec: 816,963 | mfu: 51.06 | epoch: 2 | total time: 142.55m | eta: 36.9m +step 13271/16704 (79.45%) | loss: 2.556438 | lrm: 0.41 
| dt: 644.21ms | tok/sec: 813,841 | mfu: 50.87 | epoch: 2 | total time: 142.56m | eta: 36.9m +step 13272/16704 (79.45%) | loss: 2.550473 | lrm: 0.41 | dt: 643.57ms | tok/sec: 814,661 | mfu: 50.92 | epoch: 2 | total time: 142.57m | eta: 36.9m +step 13273/16704 (79.46%) | loss: 2.544616 | lrm: 0.41 | dt: 643.85ms | tok/sec: 814,304 | mfu: 50.90 | epoch: 2 | total time: 142.58m | eta: 36.9m +step 13274/16704 (79.47%) | loss: 2.547407 | lrm: 0.41 | dt: 642.08ms | tok/sec: 816,544 | mfu: 51.04 | epoch: 2 | total time: 142.59m | eta: 36.9m +step 13275/16704 (79.47%) | loss: 2.547721 | lrm: 0.41 | dt: 643.50ms | tok/sec: 814,749 | mfu: 50.92 | epoch: 2 | total time: 142.60m | eta: 36.9m +step 13276/16704 (79.48%) | loss: 2.551048 | lrm: 0.41 | dt: 644.86ms | tok/sec: 813,021 | mfu: 50.81 | epoch: 2 | total time: 142.62m | eta: 36.9m +step 13277/16704 (79.48%) | loss: 2.547112 | lrm: 0.41 | dt: 643.01ms | tok/sec: 815,371 | mfu: 50.96 | epoch: 2 | total time: 142.63m | eta: 36.8m +step 13278/16704 (79.49%) | loss: 2.532827 | lrm: 0.41 | dt: 643.68ms | tok/sec: 814,511 | mfu: 50.91 | epoch: 2 | total time: 142.64m | eta: 36.8m +step 13279/16704 (79.50%) | loss: 2.532988 | lrm: 0.41 | dt: 645.55ms | tok/sec: 812,157 | mfu: 50.76 | epoch: 2 | total time: 142.65m | eta: 36.8m +step 13280/16704 (79.50%) | loss: 2.534255 | lrm: 0.41 | dt: 642.75ms | tok/sec: 815,694 | mfu: 50.98 | epoch: 2 | total time: 142.66m | eta: 36.8m +step 13281/16704 (79.51%) | loss: 2.526377 | lrm: 0.41 | dt: 644.45ms | tok/sec: 813,542 | mfu: 50.85 | epoch: 2 | total time: 142.67m | eta: 36.8m +step 13282/16704 (79.51%) | loss: 2.524017 | lrm: 0.41 | dt: 642.93ms | tok/sec: 815,469 | mfu: 50.97 | epoch: 2 | total time: 142.68m | eta: 36.8m +step 13283/16704 (79.52%) | loss: 2.521925 | lrm: 0.41 | dt: 643.76ms | tok/sec: 814,418 | mfu: 50.90 | epoch: 2 | total time: 142.69m | eta: 36.8m +step 13284/16704 (79.53%) | loss: 2.525810 | lrm: 0.41 | dt: 643.98ms | tok/sec: 814,142 | mfu: 50.89 | epoch: 2 | 
total time: 142.70m | eta: 36.8m +step 13285/16704 (79.53%) | loss: 2.525622 | lrm: 0.41 | dt: 647.88ms | tok/sec: 809,237 | mfu: 50.58 | epoch: 2 | total time: 142.71m | eta: 36.8m +step 13286/16704 (79.54%) | loss: 2.527241 | lrm: 0.41 | dt: 642.60ms | tok/sec: 815,882 | mfu: 50.99 | epoch: 2 | total time: 142.72m | eta: 36.7m +step 13287/16704 (79.54%) | loss: 2.517105 | lrm: 0.41 | dt: 645.33ms | tok/sec: 812,428 | mfu: 50.78 | epoch: 2 | total time: 142.73m | eta: 36.7m +step 13288/16704 (79.55%) | loss: 2.516331 | lrm: 0.41 | dt: 645.40ms | tok/sec: 812,352 | mfu: 50.77 | epoch: 2 | total time: 142.74m | eta: 36.7m +step 13289/16704 (79.56%) | loss: 2.515390 | lrm: 0.41 | dt: 641.08ms | tok/sec: 817,822 | mfu: 51.12 | epoch: 2 | total time: 142.76m | eta: 36.7m +step 13290/16704 (79.56%) | loss: 2.518572 | lrm: 0.41 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 2 | total time: 142.77m | eta: 36.7m +step 13291/16704 (79.57%) | loss: 2.517038 | lrm: 0.41 | dt: 645.22ms | tok/sec: 812,575 | mfu: 50.79 | epoch: 2 | total time: 142.78m | eta: 36.7m +step 13292/16704 (79.57%) | loss: 2.520082 | lrm: 0.41 | dt: 644.10ms | tok/sec: 813,980 | mfu: 50.87 | epoch: 2 | total time: 142.79m | eta: 36.7m +step 13293/16704 (79.58%) | loss: 2.526124 | lrm: 0.41 | dt: 643.95ms | tok/sec: 814,171 | mfu: 50.89 | epoch: 2 | total time: 142.80m | eta: 36.7m +step 13294/16704 (79.59%) | loss: 2.521806 | lrm: 0.41 | dt: 644.37ms | tok/sec: 813,647 | mfu: 50.85 | epoch: 2 | total time: 142.81m | eta: 36.7m +step 13295/16704 (79.59%) | loss: 2.511720 | lrm: 0.41 | dt: 642.62ms | tok/sec: 815,858 | mfu: 50.99 | epoch: 2 | total time: 142.82m | eta: 36.6m +step 13296/16704 (79.60%) | loss: 2.516581 | lrm: 0.41 | dt: 643.79ms | tok/sec: 814,374 | mfu: 50.90 | epoch: 2 | total time: 142.83m | eta: 36.6m +step 13297/16704 (79.60%) | loss: 2.514105 | lrm: 0.41 | dt: 643.77ms | tok/sec: 814,403 | mfu: 50.90 | epoch: 2 | total time: 142.84m | eta: 36.6m +step 13298/16704 (79.61%) | 
loss: 2.503693 | lrm: 0.41 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 2 | total time: 142.85m | eta: 36.6m +step 13299/16704 (79.62%) | loss: 2.503800 | lrm: 0.41 | dt: 646.43ms | tok/sec: 811,056 | mfu: 50.69 | epoch: 2 | total time: 142.86m | eta: 36.6m +step 13300/16704 (79.62%) | loss: 2.523639 | lrm: 0.41 | dt: 644.82ms | tok/sec: 813,081 | mfu: 50.82 | epoch: 2 | total time: 142.87m | eta: 36.6m +step 13301/16704 (79.63%) | loss: 2.522942 | lrm: 0.41 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 2 | total time: 142.88m | eta: 36.6m +step 13302/16704 (79.63%) | loss: 2.516340 | lrm: 0.41 | dt: 646.15ms | tok/sec: 811,402 | mfu: 50.71 | epoch: 2 | total time: 142.89m | eta: 36.6m +step 13303/16704 (79.64%) | loss: 2.514103 | lrm: 0.41 | dt: 641.84ms | tok/sec: 816,857 | mfu: 51.05 | epoch: 2 | total time: 142.91m | eta: 36.6m +step 13304/16704 (79.65%) | loss: 2.530843 | lrm: 0.41 | dt: 643.67ms | tok/sec: 814,531 | mfu: 50.91 | epoch: 2 | total time: 142.92m | eta: 36.6m +step 13305/16704 (79.65%) | loss: 2.528732 | lrm: 0.41 | dt: 642.23ms | tok/sec: 816,350 | mfu: 51.02 | epoch: 2 | total time: 142.93m | eta: 36.5m +step 13306/16704 (79.66%) | loss: 2.525544 | lrm: 0.41 | dt: 641.78ms | tok/sec: 816,930 | mfu: 51.06 | epoch: 2 | total time: 142.94m | eta: 36.5m +step 13307/16704 (79.66%) | loss: 2.516516 | lrm: 0.41 | dt: 644.41ms | tok/sec: 813,595 | mfu: 50.85 | epoch: 2 | total time: 142.95m | eta: 36.5m +step 13308/16704 (79.67%) | loss: 2.513287 | lrm: 0.41 | dt: 642.71ms | tok/sec: 815,744 | mfu: 50.99 | epoch: 2 | total time: 142.96m | eta: 36.5m +step 13309/16704 (79.68%) | loss: 2.508365 | lrm: 0.41 | dt: 645.05ms | tok/sec: 812,786 | mfu: 50.80 | epoch: 2 | total time: 142.97m | eta: 36.5m +step 13310/16704 (79.68%) | loss: 2.504143 | lrm: 0.41 | dt: 647.19ms | tok/sec: 810,094 | mfu: 50.63 | epoch: 2 | total time: 142.98m | eta: 36.5m +step 13311/16704 (79.69%) | loss: 2.509507 | lrm: 0.41 | dt: 641.03ms | tok/sec: 817,880 | 
mfu: 51.12 | epoch: 2 | total time: 142.99m | eta: 36.5m +step 13312/16704 (79.69%) | loss: 2.507144 | lrm: 0.41 | dt: 644.08ms | tok/sec: 814,012 | mfu: 50.88 | epoch: 2 | total time: 143.00m | eta: 36.5m +step 13313/16704 (79.70%) | loss: 2.506022 | lrm: 0.41 | dt: 644.39ms | tok/sec: 813,618 | mfu: 50.85 | epoch: 2 | total time: 143.01m | eta: 36.5m +step 13314/16704 (79.71%) | loss: 2.498397 | lrm: 0.41 | dt: 642.05ms | tok/sec: 816,581 | mfu: 51.04 | epoch: 2 | total time: 143.02m | eta: 36.4m +step 13315/16704 (79.71%) | loss: 2.499245 | lrm: 0.41 | dt: 647.39ms | tok/sec: 809,853 | mfu: 50.62 | epoch: 2 | total time: 143.03m | eta: 36.4m +step 13316/16704 (79.72%) | loss: 2.496493 | lrm: 0.41 | dt: 642.57ms | tok/sec: 815,922 | mfu: 51.00 | epoch: 2 | total time: 143.04m | eta: 36.4m +step 13317/16704 (79.72%) | loss: 2.506445 | lrm: 0.41 | dt: 642.69ms | tok/sec: 815,768 | mfu: 50.99 | epoch: 2 | total time: 143.06m | eta: 36.4m +step 13318/16704 (79.73%) | loss: 2.509346 | lrm: 0.41 | dt: 645.80ms | tok/sec: 811,844 | mfu: 50.74 | epoch: 2 | total time: 143.07m | eta: 36.4m +step 13319/16704 (79.74%) | loss: 2.522439 | lrm: 0.41 | dt: 641.91ms | tok/sec: 816,756 | mfu: 51.05 | epoch: 2 | total time: 143.08m | eta: 36.4m +step 13320/16704 (79.74%) | loss: 2.519892 | lrm: 0.41 | dt: 645.41ms | tok/sec: 812,331 | mfu: 50.77 | epoch: 2 | total time: 143.09m | eta: 36.4m +step 13321/16704 (79.75%) | loss: 2.503503 | lrm: 0.41 | dt: 642.55ms | tok/sec: 815,949 | mfu: 51.00 | epoch: 2 | total time: 143.10m | eta: 36.4m +step 13322/16704 (79.75%) | loss: 2.514110 | lrm: 0.40 | dt: 642.32ms | tok/sec: 816,243 | mfu: 51.02 | epoch: 2 | total time: 143.11m | eta: 36.4m +step 13323/16704 (79.76%) | loss: 2.508966 | lrm: 0.40 | dt: 645.18ms | tok/sec: 812,617 | mfu: 50.79 | epoch: 2 | total time: 143.12m | eta: 36.3m +step 13324/16704 (79.77%) | loss: 2.520698 | lrm: 0.40 | dt: 642.70ms | tok/sec: 815,754 | mfu: 50.99 | epoch: 2 | total time: 143.13m | eta: 36.3m +step 
13325/16704 (79.77%) | loss: 2.511062 | lrm: 0.40 | dt: 646.96ms | tok/sec: 810,391 | mfu: 50.65 | epoch: 2 | total time: 143.14m | eta: 36.3m +step 13326/16704 (79.78%) | loss: 2.519292 | lrm: 0.40 | dt: 645.52ms | tok/sec: 812,197 | mfu: 50.76 | epoch: 2 | total time: 143.15m | eta: 36.3m +step 13327/16704 (79.78%) | loss: 2.521727 | lrm: 0.40 | dt: 645.23ms | tok/sec: 812,556 | mfu: 50.79 | epoch: 2 | total time: 143.16m | eta: 36.3m +step 13328/16704 (79.79%) | loss: 2.505355 | lrm: 0.40 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 143.17m | eta: 36.3m +step 13329/16704 (79.80%) | loss: 2.512528 | lrm: 0.40 | dt: 643.69ms | tok/sec: 814,501 | mfu: 50.91 | epoch: 2 | total time: 143.18m | eta: 36.3m +step 13330/16704 (79.80%) | loss: 2.514191 | lrm: 0.40 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 2 | total time: 143.20m | eta: 36.3m +step 13331/16704 (79.81%) | loss: 2.521389 | lrm: 0.40 | dt: 643.25ms | tok/sec: 815,058 | mfu: 50.94 | epoch: 2 | total time: 143.21m | eta: 36.3m +step 13332/16704 (79.81%) | loss: 2.530268 | lrm: 0.40 | dt: 645.78ms | tok/sec: 811,871 | mfu: 50.74 | epoch: 2 | total time: 143.22m | eta: 36.3m +step 13333/16704 (79.82%) | loss: 2.532424 | lrm: 0.40 | dt: 643.46ms | tok/sec: 814,794 | mfu: 50.93 | epoch: 2 | total time: 143.23m | eta: 36.2m +step 13334/16704 (79.83%) | loss: 2.536014 | lrm: 0.40 | dt: 642.64ms | tok/sec: 815,829 | mfu: 50.99 | epoch: 2 | total time: 143.24m | eta: 36.2m +step 13335/16704 (79.83%) | loss: 2.529015 | lrm: 0.40 | dt: 644.52ms | tok/sec: 813,456 | mfu: 50.84 | epoch: 2 | total time: 143.25m | eta: 36.2m +step 13336/16704 (79.84%) | loss: 2.521633 | lrm: 0.40 | dt: 642.90ms | tok/sec: 815,498 | mfu: 50.97 | epoch: 2 | total time: 143.26m | eta: 36.2m +step 13337/16704 (79.84%) | loss: 2.532138 | lrm: 0.40 | dt: 643.40ms | tok/sec: 814,876 | mfu: 50.93 | epoch: 2 | total time: 143.27m | eta: 36.2m +step 13338/16704 (79.85%) | loss: 2.525417 | lrm: 0.40 | dt: 
643.47ms | tok/sec: 814,784 | mfu: 50.93 | epoch: 2 | total time: 143.28m | eta: 36.2m +step 13339/16704 (79.86%) | loss: 2.527287 | lrm: 0.40 | dt: 642.63ms | tok/sec: 815,844 | mfu: 50.99 | epoch: 2 | total time: 143.29m | eta: 36.2m +step 13340/16704 (79.86%) | loss: 2.515767 | lrm: 0.40 | dt: 645.29ms | tok/sec: 812,490 | mfu: 50.78 | epoch: 2 | total time: 143.30m | eta: 36.2m +step 13341/16704 (79.87%) | loss: 2.508545 | lrm: 0.40 | dt: 644.74ms | tok/sec: 813,179 | mfu: 50.82 | epoch: 2 | total time: 143.31m | eta: 36.2m +step 13342/16704 (79.87%) | loss: 2.506613 | lrm: 0.40 | dt: 644.93ms | tok/sec: 812,932 | mfu: 50.81 | epoch: 2 | total time: 143.32m | eta: 36.1m +step 13343/16704 (79.88%) | loss: 2.519870 | lrm: 0.40 | dt: 642.70ms | tok/sec: 815,758 | mfu: 50.99 | epoch: 2 | total time: 143.33m | eta: 36.1m +step 13344/16704 (79.89%) | loss: 2.527318 | lrm: 0.40 | dt: 642.88ms | tok/sec: 815,535 | mfu: 50.97 | epoch: 2 | total time: 143.35m | eta: 36.1m +step 13345/16704 (79.89%) | loss: 2.531315 | lrm: 0.40 | dt: 645.13ms | tok/sec: 812,687 | mfu: 50.79 | epoch: 2 | total time: 143.36m | eta: 36.1m +step 13346/16704 (79.90%) | loss: 2.508038 | lrm: 0.40 | dt: 644.75ms | tok/sec: 813,158 | mfu: 50.82 | epoch: 2 | total time: 143.37m | eta: 36.1m +step 13347/16704 (79.90%) | loss: 2.501406 | lrm: 0.40 | dt: 644.06ms | tok/sec: 814,037 | mfu: 50.88 | epoch: 2 | total time: 143.38m | eta: 36.1m +step 13348/16704 (79.91%) | loss: 2.501904 | lrm: 0.40 | dt: 643.25ms | tok/sec: 815,065 | mfu: 50.94 | epoch: 2 | total time: 143.39m | eta: 36.1m +step 13349/16704 (79.91%) | loss: 2.502752 | lrm: 0.40 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 2 | total time: 143.40m | eta: 36.1m +step 13350/16704 (79.92%) | loss: 2.515263 | lrm: 0.40 | dt: 643.91ms | tok/sec: 814,224 | mfu: 50.89 | epoch: 2 | total time: 143.41m | eta: 36.1m +step 13351/16704 (79.93%) | loss: 2.508972 | lrm: 0.40 | dt: 644.22ms | tok/sec: 813,827 | mfu: 50.87 | epoch: 2 | total 
time: 143.42m | eta: 36.0m +step 13352/16704 (79.93%) | loss: 2.513166 | lrm: 0.40 | dt: 645.48ms | tok/sec: 812,245 | mfu: 50.77 | epoch: 2 | total time: 143.43m | eta: 36.0m +step 13353/16704 (79.94%) | loss: 2.515455 | lrm: 0.40 | dt: 643.18ms | tok/sec: 815,143 | mfu: 50.95 | epoch: 2 | total time: 143.44m | eta: 36.0m +step 13354/16704 (79.94%) | loss: 2.505217 | lrm: 0.40 | dt: 645.90ms | tok/sec: 811,712 | mfu: 50.73 | epoch: 2 | total time: 143.45m | eta: 36.0m +step 13355/16704 (79.95%) | loss: 2.500043 | lrm: 0.40 | dt: 645.33ms | tok/sec: 812,437 | mfu: 50.78 | epoch: 2 | total time: 143.46m | eta: 36.0m +step 13356/16704 (79.96%) | loss: 2.505834 | lrm: 0.40 | dt: 645.68ms | tok/sec: 811,998 | mfu: 50.75 | epoch: 2 | total time: 143.47m | eta: 36.0m +step 13357/16704 (79.96%) | loss: 2.512572 | lrm: 0.40 | dt: 642.04ms | tok/sec: 816,597 | mfu: 51.04 | epoch: 2 | total time: 143.48m | eta: 36.0m +step 13358/16704 (79.97%) | loss: 2.506058 | lrm: 0.40 | dt: 643.99ms | tok/sec: 814,123 | mfu: 50.88 | epoch: 2 | total time: 143.50m | eta: 36.0m +step 13359/16704 (79.97%) | loss: 2.500737 | lrm: 0.40 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 2 | total time: 143.51m | eta: 36.0m +step 13360/16704 (79.98%) | loss: 2.501223 | lrm: 0.40 | dt: 644.81ms | tok/sec: 813,095 | mfu: 50.82 | epoch: 2 | total time: 143.52m | eta: 35.9m +step 13361/16704 (79.99%) | loss: 2.521550 | lrm: 0.40 | dt: 644.65ms | tok/sec: 813,294 | mfu: 50.83 | epoch: 2 | total time: 143.53m | eta: 35.9m +step 13362/16704 (79.99%) | loss: 2.519500 | lrm: 0.40 | dt: 641.58ms | tok/sec: 817,176 | mfu: 51.07 | epoch: 2 | total time: 143.54m | eta: 35.9m +step 13363/16704 (80.00%) | loss: 2.534291 | lrm: 0.40 | dt: 645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 2 | total time: 143.55m | eta: 35.9m +step 13364/16704 (80.00%) | loss: 2.521481 | lrm: 0.40 | dt: 644.13ms | tok/sec: 813,942 | mfu: 50.87 | epoch: 2 | total time: 143.56m | eta: 35.9m +step 13365/16704 (80.01%) | loss: 
2.515772 | lrm: 0.40 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 2 | total time: 143.57m | eta: 35.9m +step 13366/16704 (80.02%) | loss: 2.521182 | lrm: 0.40 | dt: 644.07ms | tok/sec: 814,021 | mfu: 50.88 | epoch: 2 | total time: 143.58m | eta: 35.9m +step 13367/16704 (80.02%) | loss: 2.504796 | lrm: 0.40 | dt: 642.75ms | tok/sec: 815,698 | mfu: 50.98 | epoch: 2 | total time: 143.59m | eta: 35.9m +step 13368/16704 (80.03%) | loss: 2.515632 | lrm: 0.40 | dt: 642.52ms | tok/sec: 815,988 | mfu: 51.00 | epoch: 2 | total time: 143.60m | eta: 35.9m +step 13369/16704 (80.03%) | loss: 2.520447 | lrm: 0.40 | dt: 644.52ms | tok/sec: 813,454 | mfu: 50.84 | epoch: 2 | total time: 143.61m | eta: 35.9m +step 13370/16704 (80.04%) | loss: 2.519200 | lrm: 0.40 | dt: 645.32ms | tok/sec: 812,446 | mfu: 50.78 | epoch: 2 | total time: 143.62m | eta: 35.8m +step 13371/16704 (80.05%) | loss: 2.514220 | lrm: 0.40 | dt: 643.68ms | tok/sec: 814,517 | mfu: 50.91 | epoch: 2 | total time: 143.64m | eta: 35.8m +step 13372/16704 (80.05%) | loss: 2.528154 | lrm: 0.40 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 2 | total time: 143.65m | eta: 35.8m +step 13373/16704 (80.06%) | loss: 2.527267 | lrm: 0.40 | dt: 645.38ms | tok/sec: 812,367 | mfu: 50.77 | epoch: 2 | total time: 143.66m | eta: 35.8m +step 13374/16704 (80.06%) | loss: 2.539444 | lrm: 0.40 | dt: 643.72ms | tok/sec: 814,469 | mfu: 50.91 | epoch: 2 | total time: 143.67m | eta: 35.8m +step 13375/16704 (80.07%) | loss: 2.538301 | lrm: 0.40 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 2 | total time: 143.68m | eta: 35.8m +step 13376/16704 (80.08%) | loss: 2.525105 | lrm: 0.40 | dt: 645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 2 | total time: 143.69m | eta: 35.8m +step 13377/16704 (80.08%) | loss: 2.525266 | lrm: 0.40 | dt: 642.74ms | tok/sec: 815,705 | mfu: 50.98 | epoch: 2 | total time: 143.70m | eta: 35.8m +step 13378/16704 (80.09%) | loss: 2.522791 | lrm: 0.40 | dt: 644.20ms | tok/sec: 813,857 | mfu: 
50.87 | epoch: 2 | total time: 143.71m | eta: 35.8m +step 13379/16704 (80.09%) | loss: 2.522370 | lrm: 0.40 | dt: 644.69ms | tok/sec: 813,241 | mfu: 50.83 | epoch: 2 | total time: 143.72m | eta: 35.7m +step 13380/16704 (80.10%) | loss: 2.522976 | lrm: 0.40 | dt: 644.70ms | tok/sec: 813,225 | mfu: 50.83 | epoch: 2 | total time: 143.73m | eta: 35.7m +step 13381/16704 (80.11%) | loss: 2.510364 | lrm: 0.40 | dt: 645.54ms | tok/sec: 812,174 | mfu: 50.76 | epoch: 2 | total time: 143.74m | eta: 35.7m +step 13382/16704 (80.11%) | loss: 2.521734 | lrm: 0.40 | dt: 643.85ms | tok/sec: 814,302 | mfu: 50.90 | epoch: 2 | total time: 143.75m | eta: 35.7m +step 13383/16704 (80.12%) | loss: 2.525419 | lrm: 0.40 | dt: 645.98ms | tok/sec: 811,621 | mfu: 50.73 | epoch: 2 | total time: 143.76m | eta: 35.7m +step 13384/16704 (80.12%) | loss: 2.533912 | lrm: 0.40 | dt: 646.02ms | tok/sec: 811,566 | mfu: 50.72 | epoch: 2 | total time: 143.77m | eta: 35.7m +step 13385/16704 (80.13%) | loss: 2.543747 | lrm: 0.40 | dt: 641.56ms | tok/sec: 817,208 | mfu: 51.08 | epoch: 2 | total time: 143.79m | eta: 35.7m +step 13386/16704 (80.14%) | loss: 2.524939 | lrm: 0.40 | dt: 645.51ms | tok/sec: 812,212 | mfu: 50.76 | epoch: 2 | total time: 143.80m | eta: 35.7m +step 13387/16704 (80.14%) | loss: 2.530828 | lrm: 0.40 | dt: 645.11ms | tok/sec: 812,716 | mfu: 50.80 | epoch: 2 | total time: 143.81m | eta: 35.7m +step 13388/16704 (80.15%) | loss: 2.531018 | lrm: 0.40 | dt: 643.60ms | tok/sec: 814,612 | mfu: 50.91 | epoch: 2 | total time: 143.82m | eta: 35.6m +step 13389/16704 (80.15%) | loss: 2.528020 | lrm: 0.40 | dt: 645.50ms | tok/sec: 812,216 | mfu: 50.76 | epoch: 2 | total time: 143.83m | eta: 35.6m +step 13390/16704 (80.16%) | loss: 2.532224 | lrm: 0.40 | dt: 644.09ms | tok/sec: 813,993 | mfu: 50.88 | epoch: 2 | total time: 143.84m | eta: 35.6m +step 13391/16704 (80.17%) | loss: 2.531255 | lrm: 0.40 | dt: 644.57ms | tok/sec: 813,387 | mfu: 50.84 | epoch: 2 | total time: 143.85m | eta: 35.6m +step 
13392/16704 (80.17%) | loss: 2.531527 | lrm: 0.40 | dt: 646.99ms | tok/sec: 810,350 | mfu: 50.65 | epoch: 2 | total time: 143.86m | eta: 35.6m +step 13393/16704 (80.18%) | loss: 2.542395 | lrm: 0.40 | dt: 643.21ms | tok/sec: 815,112 | mfu: 50.95 | epoch: 2 | total time: 143.87m | eta: 35.6m +step 13394/16704 (80.18%) | loss: 2.537312 | lrm: 0.40 | dt: 646.22ms | tok/sec: 811,309 | mfu: 50.71 | epoch: 2 | total time: 143.88m | eta: 35.6m +step 13395/16704 (80.19%) | loss: 2.529351 | lrm: 0.40 | dt: 646.83ms | tok/sec: 810,548 | mfu: 50.66 | epoch: 2 | total time: 143.89m | eta: 35.6m +step 13396/16704 (80.20%) | loss: 2.523541 | lrm: 0.40 | dt: 649.02ms | tok/sec: 807,816 | mfu: 50.49 | epoch: 2 | total time: 143.90m | eta: 35.6m +step 13397/16704 (80.20%) | loss: 2.524264 | lrm: 0.40 | dt: 645.00ms | tok/sec: 812,845 | mfu: 50.80 | epoch: 2 | total time: 143.91m | eta: 35.6m +step 13398/16704 (80.21%) | loss: 2.518457 | lrm: 0.40 | dt: 648.22ms | tok/sec: 808,813 | mfu: 50.55 | epoch: 2 | total time: 143.93m | eta: 35.5m +step 13399/16704 (80.21%) | loss: 2.535782 | lrm: 0.40 | dt: 647.81ms | tok/sec: 809,329 | mfu: 50.58 | epoch: 2 | total time: 143.94m | eta: 35.5m +step 13400/16704 (80.22%) | loss: 2.535996 | lrm: 0.40 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 2 | total time: 143.95m | eta: 35.5m +step 13401/16704 (80.23%) | loss: 2.526509 | lrm: 0.40 | dt: 647.64ms | tok/sec: 809,538 | mfu: 50.60 | epoch: 2 | total time: 143.96m | eta: 35.5m +step 13402/16704 (80.23%) | loss: 2.529575 | lrm: 0.40 | dt: 643.22ms | tok/sec: 815,095 | mfu: 50.94 | epoch: 2 | total time: 143.97m | eta: 35.5m +step 13403/16704 (80.24%) | loss: 2.524632 | lrm: 0.40 | dt: 647.80ms | tok/sec: 809,330 | mfu: 50.58 | epoch: 2 | total time: 143.98m | eta: 35.5m +step 13404/16704 (80.24%) | loss: 2.531273 | lrm: 0.40 | dt: 645.99ms | tok/sec: 811,605 | mfu: 50.73 | epoch: 2 | total time: 143.99m | eta: 35.5m +step 13405/16704 (80.25%) | loss: 2.525887 | lrm: 0.39 | dt: 
644.97ms | tok/sec: 812,882 | mfu: 50.81 | epoch: 2 | total time: 144.00m | eta: 35.5m +step 13406/16704 (80.26%) | loss: 2.512975 | lrm: 0.39 | dt: 646.44ms | tok/sec: 811,034 | mfu: 50.69 | epoch: 2 | total time: 144.01m | eta: 35.5m +step 13407/16704 (80.26%) | loss: 2.508173 | lrm: 0.39 | dt: 646.37ms | tok/sec: 811,124 | mfu: 50.70 | epoch: 2 | total time: 144.02m | eta: 35.4m +step 13408/16704 (80.27%) | loss: 2.509734 | lrm: 0.39 | dt: 643.85ms | tok/sec: 814,300 | mfu: 50.89 | epoch: 2 | total time: 144.03m | eta: 35.4m +step 13409/16704 (80.27%) | loss: 2.511022 | lrm: 0.39 | dt: 648.34ms | tok/sec: 808,657 | mfu: 50.54 | epoch: 2 | total time: 144.04m | eta: 35.4m +step 13410/16704 (80.28%) | loss: 2.509934 | lrm: 0.39 | dt: 644.86ms | tok/sec: 813,024 | mfu: 50.82 | epoch: 2 | total time: 144.05m | eta: 35.4m +step 13411/16704 (80.29%) | loss: 2.515686 | lrm: 0.39 | dt: 646.60ms | tok/sec: 810,838 | mfu: 50.68 | epoch: 2 | total time: 144.07m | eta: 35.4m +step 13412/16704 (80.29%) | loss: 2.512007 | lrm: 0.39 | dt: 648.04ms | tok/sec: 809,031 | mfu: 50.57 | epoch: 2 | total time: 144.08m | eta: 35.4m +step 13413/16704 (80.30%) | loss: 2.501853 | lrm: 0.39 | dt: 645.37ms | tok/sec: 812,381 | mfu: 50.77 | epoch: 2 | total time: 144.09m | eta: 35.4m +step 13414/16704 (80.30%) | loss: 2.491508 | lrm: 0.39 | dt: 649.12ms | tok/sec: 807,695 | mfu: 50.48 | epoch: 2 | total time: 144.10m | eta: 35.4m +step 13415/16704 (80.31%) | loss: 2.489906 | lrm: 0.39 | dt: 645.64ms | tok/sec: 812,044 | mfu: 50.75 | epoch: 2 | total time: 144.11m | eta: 35.4m +step 13416/16704 (80.32%) | loss: 2.502278 | lrm: 0.39 | dt: 646.01ms | tok/sec: 811,575 | mfu: 50.72 | epoch: 2 | total time: 144.12m | eta: 35.3m +step 13417/16704 (80.32%) | loss: 2.507461 | lrm: 0.39 | dt: 647.61ms | tok/sec: 809,568 | mfu: 50.60 | epoch: 2 | total time: 144.13m | eta: 35.3m +step 13418/16704 (80.33%) | loss: 2.508901 | lrm: 0.39 | dt: 645.69ms | tok/sec: 811,978 | mfu: 50.75 | epoch: 2 | total 
time: 144.14m | eta: 35.3m +step 13419/16704 (80.33%) | loss: 2.518232 | lrm: 0.39 | dt: 648.64ms | tok/sec: 808,284 | mfu: 50.52 | epoch: 2 | total time: 144.15m | eta: 35.3m +step 13420/16704 (80.34%) | loss: 2.524382 | lrm: 0.39 | dt: 649.77ms | tok/sec: 806,882 | mfu: 50.43 | epoch: 2 | total time: 144.16m | eta: 35.3m +step 13421/16704 (80.35%) | loss: 2.521705 | lrm: 0.39 | dt: 645.53ms | tok/sec: 812,183 | mfu: 50.76 | epoch: 2 | total time: 144.17m | eta: 35.3m +step 13422/16704 (80.35%) | loss: 2.527537 | lrm: 0.39 | dt: 648.38ms | tok/sec: 808,615 | mfu: 50.54 | epoch: 2 | total time: 144.18m | eta: 35.3m +step 13423/16704 (80.36%) | loss: 2.524870 | lrm: 0.39 | dt: 646.59ms | tok/sec: 810,849 | mfu: 50.68 | epoch: 2 | total time: 144.19m | eta: 35.3m +step 13424/16704 (80.36%) | loss: 2.524859 | lrm: 0.39 | dt: 645.65ms | tok/sec: 812,037 | mfu: 50.75 | epoch: 2 | total time: 144.21m | eta: 35.3m +step 13425/16704 (80.37%) | loss: 2.522035 | lrm: 0.39 | dt: 646.44ms | tok/sec: 811,041 | mfu: 50.69 | epoch: 2 | total time: 144.22m | eta: 35.3m +step 13426/16704 (80.38%) | loss: 2.515072 | lrm: 0.39 | dt: 649.21ms | tok/sec: 807,573 | mfu: 50.47 | epoch: 2 | total time: 144.23m | eta: 35.2m +step 13427/16704 (80.38%) | loss: 2.507780 | lrm: 0.39 | dt: 644.96ms | tok/sec: 812,895 | mfu: 50.81 | epoch: 2 | total time: 144.24m | eta: 35.2m +step 13428/16704 (80.39%) | loss: 2.494918 | lrm: 0.39 | dt: 646.18ms | tok/sec: 811,367 | mfu: 50.71 | epoch: 2 | total time: 144.25m | eta: 35.2m +step 13429/16704 (80.39%) | loss: 2.506217 | lrm: 0.39 | dt: 647.44ms | tok/sec: 809,791 | mfu: 50.61 | epoch: 2 | total time: 144.26m | eta: 35.2m +step 13430/16704 (80.40%) | loss: 2.520297 | lrm: 0.39 | dt: 646.22ms | tok/sec: 811,318 | mfu: 50.71 | epoch: 2 | total time: 144.27m | eta: 35.2m +step 13431/16704 (80.41%) | loss: 2.515964 | lrm: 0.39 | dt: 646.56ms | tok/sec: 810,886 | mfu: 50.68 | epoch: 2 | total time: 144.28m | eta: 35.2m +step 13432/16704 (80.41%) | loss: 
2.517165 | lrm: 0.39 | dt: 647.57ms | tok/sec: 809,626 | mfu: 50.60 | epoch: 2 | total time: 144.29m | eta: 35.2m +step 13433/16704 (80.42%) | loss: 2.518007 | lrm: 0.39 | dt: 648.12ms | tok/sec: 808,930 | mfu: 50.56 | epoch: 2 | total time: 144.30m | eta: 35.2m +step 13434/16704 (80.42%) | loss: 2.521346 | lrm: 0.39 | dt: 646.22ms | tok/sec: 811,318 | mfu: 50.71 | epoch: 2 | total time: 144.31m | eta: 35.2m +step 13435/16704 (80.43%) | loss: 2.524875 | lrm: 0.39 | dt: 646.52ms | tok/sec: 810,939 | mfu: 50.68 | epoch: 2 | total time: 144.32m | eta: 35.1m +step 13436/16704 (80.44%) | loss: 2.540235 | lrm: 0.39 | dt: 648.40ms | tok/sec: 808,581 | mfu: 50.54 | epoch: 2 | total time: 144.34m | eta: 35.1m +step 13437/16704 (80.44%) | loss: 2.543625 | lrm: 0.39 | dt: 649.49ms | tok/sec: 807,231 | mfu: 50.45 | epoch: 2 | total time: 144.35m | eta: 35.1m +step 13438/16704 (80.45%) | loss: 2.541998 | lrm: 0.39 | dt: 645.20ms | tok/sec: 812,599 | mfu: 50.79 | epoch: 2 | total time: 144.36m | eta: 35.1m +step 13439/16704 (80.45%) | loss: 2.542806 | lrm: 0.39 | dt: 648.36ms | tok/sec: 808,631 | mfu: 50.54 | epoch: 2 | total time: 144.37m | eta: 35.1m +step 13440/16704 (80.46%) | loss: 2.554015 | lrm: 0.39 | dt: 643.96ms | tok/sec: 814,156 | mfu: 50.89 | epoch: 2 | total time: 144.38m | eta: 35.1m +step 13441/16704 (80.47%) | loss: 2.551448 | lrm: 0.39 | dt: 646.00ms | tok/sec: 811,585 | mfu: 50.73 | epoch: 2 | total time: 144.39m | eta: 35.1m +step 13442/16704 (80.47%) | loss: 2.557559 | lrm: 0.39 | dt: 648.28ms | tok/sec: 808,743 | mfu: 50.55 | epoch: 2 | total time: 144.40m | eta: 35.1m +step 13443/16704 (80.48%) | loss: 2.565371 | lrm: 0.39 | dt: 645.00ms | tok/sec: 812,845 | mfu: 50.80 | epoch: 2 | total time: 144.41m | eta: 35.1m +step 13444/16704 (80.48%) | loss: 2.555934 | lrm: 0.39 | dt: 647.98ms | tok/sec: 809,116 | mfu: 50.57 | epoch: 2 | total time: 144.42m | eta: 35.0m +step 13445/16704 (80.49%) | loss: 2.557814 | lrm: 0.39 | dt: 645.01ms | tok/sec: 812,839 | mfu: 
50.80 | epoch: 2 | total time: 144.43m | eta: 35.0m +step 13446/16704 (80.50%) | loss: 2.546395 | lrm: 0.39 | dt: 646.90ms | tok/sec: 810,465 | mfu: 50.66 | epoch: 2 | total time: 144.44m | eta: 35.0m +step 13447/16704 (80.50%) | loss: 2.547078 | lrm: 0.39 | dt: 644.42ms | tok/sec: 813,582 | mfu: 50.85 | epoch: 2 | total time: 144.45m | eta: 35.0m +step 13448/16704 (80.51%) | loss: 2.544078 | lrm: 0.39 | dt: 645.82ms | tok/sec: 811,823 | mfu: 50.74 | epoch: 2 | total time: 144.46m | eta: 35.0m +step 13449/16704 (80.51%) | loss: 2.554057 | lrm: 0.39 | dt: 647.78ms | tok/sec: 809,355 | mfu: 50.59 | epoch: 2 | total time: 144.48m | eta: 35.0m +step 13450/16704 (80.52%) | loss: 2.551397 | lrm: 0.39 | dt: 646.10ms | tok/sec: 811,468 | mfu: 50.72 | epoch: 2 | total time: 144.49m | eta: 35.0m +step 13451/16704 (80.53%) | loss: 2.558838 | lrm: 0.39 | dt: 645.28ms | tok/sec: 812,495 | mfu: 50.78 | epoch: 2 | total time: 144.50m | eta: 35.0m +step 13452/16704 (80.53%) | loss: 2.532995 | lrm: 0.39 | dt: 646.62ms | tok/sec: 810,814 | mfu: 50.68 | epoch: 2 | total time: 144.51m | eta: 35.0m +step 13453/16704 (80.54%) | loss: 2.512762 | lrm: 0.39 | dt: 645.66ms | tok/sec: 812,021 | mfu: 50.75 | epoch: 2 | total time: 144.52m | eta: 34.9m +step 13454/16704 (80.54%) | loss: 2.516889 | lrm: 0.39 | dt: 647.48ms | tok/sec: 809,737 | mfu: 50.61 | epoch: 2 | total time: 144.53m | eta: 34.9m +step 13455/16704 (80.55%) | loss: 2.515584 | lrm: 0.39 | dt: 647.89ms | tok/sec: 809,218 | mfu: 50.58 | epoch: 2 | total time: 144.54m | eta: 34.9m +step 13456/16704 (80.56%) | loss: 2.528935 | lrm: 0.39 | dt: 645.51ms | tok/sec: 812,207 | mfu: 50.76 | epoch: 2 | total time: 144.55m | eta: 34.9m +step 13457/16704 (80.56%) | loss: 2.532071 | lrm: 0.39 | dt: 644.44ms | tok/sec: 813,553 | mfu: 50.85 | epoch: 2 | total time: 144.56m | eta: 34.9m +step 13458/16704 (80.57%) | loss: 2.530070 | lrm: 0.39 | dt: 646.05ms | tok/sec: 811,530 | mfu: 50.72 | epoch: 2 | total time: 144.57m | eta: 34.9m +step 
13459/16704 (80.57%) | loss: 2.534977 | lrm: 0.39 | dt: 645.76ms | tok/sec: 811,894 | mfu: 50.74 | epoch: 2 | total time: 144.58m | eta: 34.9m +step 13460/16704 (80.58%) | loss: 2.541056 | lrm: 0.39 | dt: 646.62ms | tok/sec: 810,810 | mfu: 50.68 | epoch: 2 | total time: 144.59m | eta: 34.9m +step 13461/16704 (80.59%) | loss: 2.542921 | lrm: 0.39 | dt: 645.01ms | tok/sec: 812,831 | mfu: 50.80 | epoch: 2 | total time: 144.60m | eta: 34.9m +step 13462/16704 (80.59%) | loss: 2.547925 | lrm: 0.39 | dt: 646.28ms | tok/sec: 811,239 | mfu: 50.70 | epoch: 2 | total time: 144.62m | eta: 34.9m +step 13463/16704 (80.60%) | loss: 2.543972 | lrm: 0.39 | dt: 645.23ms | tok/sec: 812,562 | mfu: 50.79 | epoch: 2 | total time: 144.63m | eta: 34.8m +step 13464/16704 (80.60%) | loss: 2.538862 | lrm: 0.39 | dt: 646.51ms | tok/sec: 810,946 | mfu: 50.69 | epoch: 2 | total time: 144.64m | eta: 34.8m +step 13465/16704 (80.61%) | loss: 2.531395 | lrm: 0.39 | dt: 644.89ms | tok/sec: 812,985 | mfu: 50.81 | epoch: 2 | total time: 144.65m | eta: 34.8m +step 13466/16704 (80.62%) | loss: 2.519368 | lrm: 0.39 | dt: 643.56ms | tok/sec: 814,665 | mfu: 50.92 | epoch: 2 | total time: 144.66m | eta: 34.8m +step 13467/16704 (80.62%) | loss: 2.505339 | lrm: 0.39 | dt: 645.10ms | tok/sec: 812,729 | mfu: 50.80 | epoch: 2 | total time: 144.67m | eta: 34.8m +step 13468/16704 (80.63%) | loss: 2.504116 | lrm: 0.39 | dt: 643.73ms | tok/sec: 814,447 | mfu: 50.90 | epoch: 2 | total time: 144.68m | eta: 34.8m +step 13469/16704 (80.63%) | loss: 2.493715 | lrm: 0.39 | dt: 642.65ms | tok/sec: 815,821 | mfu: 50.99 | epoch: 2 | total time: 144.69m | eta: 34.8m +step 13470/16704 (80.64%) | loss: 2.491901 | lrm: 0.39 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 2 | total time: 144.70m | eta: 34.8m +step 13471/16704 (80.65%) | loss: 2.493848 | lrm: 0.39 | dt: 641.56ms | tok/sec: 817,202 | mfu: 51.08 | epoch: 2 | total time: 144.71m | eta: 34.8m +step 13472/16704 (80.65%) | loss: 2.492066 | lrm: 0.39 | dt: 
644.76ms | tok/sec: 813,153 | mfu: 50.82 | epoch: 2 | total time: 144.72m | eta: 34.7m +step 13473/16704 (80.66%) | loss: 2.504658 | lrm: 0.39 | dt: 643.70ms | tok/sec: 814,495 | mfu: 50.91 | epoch: 2 | total time: 144.73m | eta: 34.7m +step 13474/16704 (80.66%) | loss: 2.520282 | lrm: 0.39 | dt: 642.37ms | tok/sec: 816,175 | mfu: 51.01 | epoch: 2 | total time: 144.74m | eta: 34.7m +step 13475/16704 (80.67%) | loss: 2.521035 | lrm: 0.39 | dt: 644.92ms | tok/sec: 812,946 | mfu: 50.81 | epoch: 2 | total time: 144.75m | eta: 34.7m +step 13476/16704 (80.68%) | loss: 2.537365 | lrm: 0.39 | dt: 643.87ms | tok/sec: 814,279 | mfu: 50.89 | epoch: 2 | total time: 144.77m | eta: 34.7m +step 13477/16704 (80.68%) | loss: 2.527366 | lrm: 0.39 | dt: 643.28ms | tok/sec: 815,016 | mfu: 50.94 | epoch: 2 | total time: 144.78m | eta: 34.7m +step 13478/16704 (80.69%) | loss: 2.517035 | lrm: 0.39 | dt: 646.32ms | tok/sec: 811,189 | mfu: 50.70 | epoch: 2 | total time: 144.79m | eta: 34.7m +step 13479/16704 (80.69%) | loss: 2.509508 | lrm: 0.39 | dt: 645.23ms | tok/sec: 812,555 | mfu: 50.79 | epoch: 2 | total time: 144.80m | eta: 34.7m +step 13480/16704 (80.70%) | loss: 2.514887 | lrm: 0.39 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 2 | total time: 144.81m | eta: 34.7m +step 13481/16704 (80.71%) | loss: 2.514049 | lrm: 0.39 | dt: 644.92ms | tok/sec: 812,946 | mfu: 50.81 | epoch: 2 | total time: 144.82m | eta: 34.6m +step 13482/16704 (80.71%) | loss: 2.519975 | lrm: 0.39 | dt: 645.05ms | tok/sec: 812,792 | mfu: 50.80 | epoch: 2 | total time: 144.83m | eta: 34.6m +step 13483/16704 (80.72%) | loss: 2.519727 | lrm: 0.39 | dt: 648.17ms | tok/sec: 808,876 | mfu: 50.56 | epoch: 2 | total time: 144.84m | eta: 34.6m +step 13484/16704 (80.72%) | loss: 2.523817 | lrm: 0.39 | dt: 643.91ms | tok/sec: 814,228 | mfu: 50.89 | epoch: 2 | total time: 144.85m | eta: 34.6m +step 13485/16704 (80.73%) | loss: 2.506865 | lrm: 0.39 | dt: 644.23ms | tok/sec: 813,822 | mfu: 50.87 | epoch: 2 | total 
time: 144.86m | eta: 34.6m +step 13486/16704 (80.74%) | loss: 2.517312 | lrm: 0.39 | dt: 645.68ms | tok/sec: 811,990 | mfu: 50.75 | epoch: 2 | total time: 144.87m | eta: 34.6m +step 13487/16704 (80.74%) | loss: 2.515252 | lrm: 0.39 | dt: 641.35ms | tok/sec: 817,477 | mfu: 51.09 | epoch: 2 | total time: 144.88m | eta: 34.6m +step 13488/16704 (80.75%) | loss: 2.505107 | lrm: 0.39 | dt: 645.40ms | tok/sec: 812,345 | mfu: 50.77 | epoch: 2 | total time: 144.89m | eta: 34.6m +step 13489/16704 (80.75%) | loss: 2.502562 | lrm: 0.38 | dt: 643.93ms | tok/sec: 814,197 | mfu: 50.89 | epoch: 2 | total time: 144.91m | eta: 34.6m +step 13490/16704 (80.76%) | loss: 2.497062 | lrm: 0.38 | dt: 645.35ms | tok/sec: 812,407 | mfu: 50.78 | epoch: 2 | total time: 144.92m | eta: 34.6m +step 13491/16704 (80.77%) | loss: 2.486535 | lrm: 0.38 | dt: 642.62ms | tok/sec: 815,864 | mfu: 50.99 | epoch: 2 | total time: 144.93m | eta: 34.5m +step 13492/16704 (80.77%) | loss: 2.479680 | lrm: 0.38 | dt: 646.27ms | tok/sec: 811,252 | mfu: 50.70 | epoch: 2 | total time: 144.94m | eta: 34.5m +step 13493/16704 (80.78%) | loss: 2.487341 | lrm: 0.38 | dt: 644.60ms | tok/sec: 813,350 | mfu: 50.84 | epoch: 2 | total time: 144.95m | eta: 34.5m +step 13494/16704 (80.78%) | loss: 2.479055 | lrm: 0.38 | dt: 643.73ms | tok/sec: 814,453 | mfu: 50.90 | epoch: 2 | total time: 144.96m | eta: 34.5m +step 13495/16704 (80.79%) | loss: 2.494665 | lrm: 0.38 | dt: 646.31ms | tok/sec: 811,201 | mfu: 50.70 | epoch: 2 | total time: 144.97m | eta: 34.5m +step 13496/16704 (80.80%) | loss: 2.494106 | lrm: 0.38 | dt: 645.76ms | tok/sec: 811,894 | mfu: 50.74 | epoch: 2 | total time: 144.98m | eta: 34.5m +step 13497/16704 (80.80%) | loss: 2.486528 | lrm: 0.38 | dt: 643.59ms | tok/sec: 814,633 | mfu: 50.92 | epoch: 2 | total time: 144.99m | eta: 34.5m +step 13498/16704 (80.81%) | loss: 2.492176 | lrm: 0.38 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 2 | total time: 145.00m | eta: 34.5m +step 13499/16704 (80.81%) | loss: 
2.474692 | lrm: 0.38 | dt: 644.43ms | tok/sec: 813,568 | mfu: 50.85 | epoch: 2 | total time: 145.01m | eta: 34.5m +Step 13500 | Validation bpb: 0.773576 +step 13500/16704 (80.82%) | loss: 2.481615 | lrm: 0.38 | dt: 645.76ms | tok/sec: 811,897 | mfu: 50.74 | epoch: 2 | total time: 145.02m | eta: 34.4m +step 13501/16704 (80.82%) | loss: 2.491224 | lrm: 0.38 | dt: 648.08ms | tok/sec: 808,981 | mfu: 50.56 | epoch: 2 | total time: 145.03m | eta: 34.4m +step 13502/16704 (80.83%) | loss: 2.487469 | lrm: 0.38 | dt: 644.09ms | tok/sec: 813,996 | mfu: 50.88 | epoch: 2 | total time: 145.04m | eta: 34.4m +step 13503/16704 (80.84%) | loss: 2.499703 | lrm: 0.38 | dt: 642.94ms | tok/sec: 815,452 | mfu: 50.97 | epoch: 2 | total time: 145.06m | eta: 34.4m +step 13504/16704 (80.84%) | loss: 2.508251 | lrm: 0.38 | dt: 645.52ms | tok/sec: 812,188 | mfu: 50.76 | epoch: 2 | total time: 145.07m | eta: 34.4m +step 13505/16704 (80.85%) | loss: 2.522388 | lrm: 0.38 | dt: 644.81ms | tok/sec: 813,092 | mfu: 50.82 | epoch: 2 | total time: 145.08m | eta: 34.4m +step 13506/16704 (80.85%) | loss: 2.511328 | lrm: 0.38 | dt: 646.41ms | tok/sec: 811,081 | mfu: 50.69 | epoch: 2 | total time: 145.09m | eta: 34.4m +step 13507/16704 (80.86%) | loss: 2.510630 | lrm: 0.38 | dt: 643.66ms | tok/sec: 814,543 | mfu: 50.91 | epoch: 2 | total time: 145.10m | eta: 34.4m +step 13508/16704 (80.87%) | loss: 2.502410 | lrm: 0.38 | dt: 644.38ms | tok/sec: 813,626 | mfu: 50.85 | epoch: 2 | total time: 145.11m | eta: 34.4m +step 13509/16704 (80.87%) | loss: 2.497313 | lrm: 0.38 | dt: 645.41ms | tok/sec: 812,335 | mfu: 50.77 | epoch: 2 | total time: 145.12m | eta: 34.3m +step 13510/16704 (80.88%) | loss: 2.506046 | lrm: 0.38 | dt: 642.08ms | tok/sec: 816,542 | mfu: 51.04 | epoch: 2 | total time: 145.13m | eta: 34.3m +step 13511/16704 (80.88%) | loss: 2.506818 | lrm: 0.38 | dt: 644.29ms | tok/sec: 813,749 | mfu: 50.86 | epoch: 2 | total time: 145.14m | eta: 34.3m +step 13512/16704 (80.89%) | loss: 2.513676 | lrm: 0.38 | 
dt: 645.22ms | tok/sec: 812,572 | mfu: 50.79 | epoch: 2 | total time: 145.15m | eta: 34.3m +step 13513/16704 (80.90%) | loss: 2.515308 | lrm: 0.38 | dt: 642.10ms | tok/sec: 816,527 | mfu: 51.03 | epoch: 2 | total time: 145.16m | eta: 34.3m +step 13514/16704 (80.90%) | loss: 2.514371 | lrm: 0.38 | dt: 645.70ms | tok/sec: 811,966 | mfu: 50.75 | epoch: 2 | total time: 145.17m | eta: 34.3m +step 13515/16704 (80.91%) | loss: 2.512639 | lrm: 0.38 | dt: 642.32ms | tok/sec: 816,243 | mfu: 51.02 | epoch: 2 | total time: 145.18m | eta: 34.3m +step 13516/16704 (80.91%) | loss: 2.502982 | lrm: 0.38 | dt: 644.65ms | tok/sec: 813,294 | mfu: 50.83 | epoch: 2 | total time: 145.20m | eta: 34.3m +step 13517/16704 (80.92%) | loss: 2.495837 | lrm: 0.38 | dt: 645.89ms | tok/sec: 811,729 | mfu: 50.73 | epoch: 2 | total time: 145.21m | eta: 34.3m +step 13518/16704 (80.93%) | loss: 2.505378 | lrm: 0.38 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 2 | total time: 145.22m | eta: 34.3m +step 13519/16704 (80.93%) | loss: 2.500205 | lrm: 0.38 | dt: 645.01ms | tok/sec: 812,842 | mfu: 50.80 | epoch: 2 | total time: 145.23m | eta: 34.2m +step 13520/16704 (80.94%) | loss: 2.505109 | lrm: 0.38 | dt: 642.79ms | tok/sec: 815,639 | mfu: 50.98 | epoch: 2 | total time: 145.24m | eta: 34.2m +step 13521/16704 (80.94%) | loss: 2.498665 | lrm: 0.38 | dt: 644.83ms | tok/sec: 813,064 | mfu: 50.82 | epoch: 2 | total time: 145.25m | eta: 34.2m +step 13522/16704 (80.95%) | loss: 2.517365 | lrm: 0.38 | dt: 645.05ms | tok/sec: 812,791 | mfu: 50.80 | epoch: 2 | total time: 145.26m | eta: 34.2m +step 13523/16704 (80.96%) | loss: 2.513150 | lrm: 0.38 | dt: 643.59ms | tok/sec: 814,626 | mfu: 50.92 | epoch: 2 | total time: 145.27m | eta: 34.2m +step 13524/16704 (80.96%) | loss: 2.529940 | lrm: 0.38 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 2 | total time: 145.28m | eta: 34.2m +step 13525/16704 (80.97%) | loss: 2.528786 | lrm: 0.38 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 2 | 
total time: 145.29m | eta: 34.2m +step 13526/16704 (80.97%) | loss: 2.527182 | lrm: 0.38 | dt: 645.69ms | tok/sec: 811,977 | mfu: 50.75 | epoch: 2 | total time: 145.30m | eta: 34.2m +step 13527/16704 (80.98%) | loss: 2.518739 | lrm: 0.38 | dt: 643.30ms | tok/sec: 814,992 | mfu: 50.94 | epoch: 2 | total time: 145.31m | eta: 34.2m +step 13528/16704 (80.99%) | loss: 2.501266 | lrm: 0.38 | dt: 644.16ms | tok/sec: 813,906 | mfu: 50.87 | epoch: 2 | total time: 145.32m | eta: 34.1m +step 13529/16704 (80.99%) | loss: 2.481002 | lrm: 0.38 | dt: 644.22ms | tok/sec: 813,830 | mfu: 50.87 | epoch: 2 | total time: 145.33m | eta: 34.1m +step 13530/16704 (81.00%) | loss: 2.475580 | lrm: 0.38 | dt: 644.86ms | tok/sec: 813,026 | mfu: 50.82 | epoch: 2 | total time: 145.35m | eta: 34.1m +step 13531/16704 (81.00%) | loss: 2.469321 | lrm: 0.38 | dt: 646.62ms | tok/sec: 810,809 | mfu: 50.68 | epoch: 2 | total time: 145.36m | eta: 34.1m +step 13532/16704 (81.01%) | loss: 2.469576 | lrm: 0.38 | dt: 644.99ms | tok/sec: 812,866 | mfu: 50.81 | epoch: 2 | total time: 145.37m | eta: 34.1m +step 13533/16704 (81.02%) | loss: 2.473652 | lrm: 0.38 | dt: 642.13ms | tok/sec: 816,478 | mfu: 51.03 | epoch: 2 | total time: 145.38m | eta: 34.1m +step 13534/16704 (81.02%) | loss: 2.475779 | lrm: 0.38 | dt: 645.96ms | tok/sec: 811,644 | mfu: 50.73 | epoch: 2 | total time: 145.39m | eta: 34.1m +step 13535/16704 (81.03%) | loss: 2.474001 | lrm: 0.38 | dt: 645.18ms | tok/sec: 812,623 | mfu: 50.79 | epoch: 2 | total time: 145.40m | eta: 34.1m +step 13536/16704 (81.03%) | loss: 2.477207 | lrm: 0.38 | dt: 644.27ms | tok/sec: 813,771 | mfu: 50.86 | epoch: 2 | total time: 145.41m | eta: 34.1m +step 13537/16704 (81.04%) | loss: 2.491706 | lrm: 0.38 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 2 | total time: 145.42m | eta: 34.0m +step 13538/16704 (81.05%) | loss: 2.501681 | lrm: 0.38 | dt: 643.85ms | tok/sec: 814,298 | mfu: 50.89 | epoch: 2 | total time: 145.43m | eta: 34.0m +step 13539/16704 (81.05%) | 
loss: 2.504867 | lrm: 0.38 | dt: 644.56ms | tok/sec: 813,399 | mfu: 50.84 | epoch: 2 | total time: 145.44m | eta: 34.0m +step 13540/16704 (81.06%) | loss: 2.499467 | lrm: 0.38 | dt: 644.06ms | tok/sec: 814,038 | mfu: 50.88 | epoch: 2 | total time: 145.45m | eta: 34.0m +step 13541/16704 (81.06%) | loss: 2.494021 | lrm: 0.38 | dt: 645.50ms | tok/sec: 812,216 | mfu: 50.76 | epoch: 2 | total time: 145.46m | eta: 34.0m +step 13542/16704 (81.07%) | loss: 2.509076 | lrm: 0.38 | dt: 646.31ms | tok/sec: 811,204 | mfu: 50.70 | epoch: 2 | total time: 145.47m | eta: 34.0m +step 13543/16704 (81.08%) | loss: 2.500870 | lrm: 0.38 | dt: 643.63ms | tok/sec: 814,581 | mfu: 50.91 | epoch: 2 | total time: 145.49m | eta: 34.0m +step 13544/16704 (81.08%) | loss: 2.507851 | lrm: 0.38 | dt: 645.35ms | tok/sec: 812,402 | mfu: 50.78 | epoch: 2 | total time: 145.50m | eta: 34.0m +step 13545/16704 (81.09%) | loss: 2.515504 | lrm: 0.38 | dt: 644.53ms | tok/sec: 813,446 | mfu: 50.84 | epoch: 2 | total time: 145.51m | eta: 34.0m +step 13546/16704 (81.09%) | loss: 2.509607 | lrm: 0.38 | dt: 643.16ms | tok/sec: 815,177 | mfu: 50.95 | epoch: 2 | total time: 145.52m | eta: 33.9m +step 13547/16704 (81.10%) | loss: 2.516855 | lrm: 0.38 | dt: 646.87ms | tok/sec: 810,495 | mfu: 50.66 | epoch: 2 | total time: 145.53m | eta: 33.9m +step 13548/16704 (81.11%) | loss: 2.530679 | lrm: 0.38 | dt: 643.78ms | tok/sec: 814,386 | mfu: 50.90 | epoch: 2 | total time: 145.54m | eta: 33.9m +step 13549/16704 (81.11%) | loss: 2.522895 | lrm: 0.38 | dt: 647.61ms | tok/sec: 809,576 | mfu: 50.60 | epoch: 2 | total time: 145.55m | eta: 33.9m +step 13550/16704 (81.12%) | loss: 2.523193 | lrm: 0.38 | dt: 641.84ms | tok/sec: 816,847 | mfu: 51.05 | epoch: 2 | total time: 145.56m | eta: 33.9m +step 13551/16704 (81.12%) | loss: 2.525324 | lrm: 0.38 | dt: 643.34ms | tok/sec: 814,946 | mfu: 50.94 | epoch: 2 | total time: 145.57m | eta: 33.9m +step 13552/16704 (81.13%) | loss: 2.531821 | lrm: 0.38 | dt: 645.36ms | tok/sec: 812,401 | 
mfu: 50.78 | epoch: 2 | total time: 145.58m | eta: 33.9m +step 13553/16704 (81.14%) | loss: 2.520919 | lrm: 0.38 | dt: 641.88ms | tok/sec: 816,806 | mfu: 51.05 | epoch: 2 | total time: 145.59m | eta: 33.9m +step 13554/16704 (81.14%) | loss: 2.531038 | lrm: 0.38 | dt: 645.88ms | tok/sec: 811,740 | mfu: 50.73 | epoch: 2 | total time: 145.60m | eta: 33.9m +step 13555/16704 (81.15%) | loss: 2.539757 | lrm: 0.38 | dt: 643.68ms | tok/sec: 814,522 | mfu: 50.91 | epoch: 2 | total time: 145.61m | eta: 33.9m +step 13556/16704 (81.15%) | loss: 2.536657 | lrm: 0.38 | dt: 643.73ms | tok/sec: 814,455 | mfu: 50.90 | epoch: 2 | total time: 145.62m | eta: 33.8m +step 13557/16704 (81.16%) | loss: 2.526732 | lrm: 0.38 | dt: 645.48ms | tok/sec: 812,243 | mfu: 50.77 | epoch: 2 | total time: 145.64m | eta: 33.8m +step 13558/16704 (81.17%) | loss: 2.526521 | lrm: 0.38 | dt: 641.90ms | tok/sec: 816,779 | mfu: 51.05 | epoch: 2 | total time: 145.65m | eta: 33.8m +step 13559/16704 (81.17%) | loss: 2.521760 | lrm: 0.38 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 2 | total time: 145.66m | eta: 33.8m +step 13560/16704 (81.18%) | loss: 2.520509 | lrm: 0.38 | dt: 642.66ms | tok/sec: 815,813 | mfu: 50.99 | epoch: 2 | total time: 145.67m | eta: 33.8m +step 13561/16704 (81.18%) | loss: 2.524339 | lrm: 0.38 | dt: 643.56ms | tok/sec: 814,662 | mfu: 50.92 | epoch: 2 | total time: 145.68m | eta: 33.8m +step 13562/16704 (81.19%) | loss: 2.539489 | lrm: 0.38 | dt: 643.36ms | tok/sec: 814,916 | mfu: 50.93 | epoch: 2 | total time: 145.69m | eta: 33.8m +step 13563/16704 (81.20%) | loss: 2.531705 | lrm: 0.38 | dt: 644.96ms | tok/sec: 812,899 | mfu: 50.81 | epoch: 2 | total time: 145.70m | eta: 33.8m +step 13564/16704 (81.20%) | loss: 2.533099 | lrm: 0.38 | dt: 645.50ms | tok/sec: 812,225 | mfu: 50.77 | epoch: 2 | total time: 145.71m | eta: 33.8m +step 13565/16704 (81.21%) | loss: 2.529078 | lrm: 0.38 | dt: 645.21ms | tok/sec: 812,582 | mfu: 50.79 | epoch: 2 | total time: 145.72m | eta: 33.7m +step 
13566/16704 (81.21%) | loss: 2.532752 | lrm: 0.38 | dt: 643.13ms | tok/sec: 815,208 | mfu: 50.95 | epoch: 2 | total time: 145.73m | eta: 33.7m +step 13567/16704 (81.22%) | loss: 2.526474 | lrm: 0.38 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 2 | total time: 145.74m | eta: 33.7m +step 13568/16704 (81.23%) | loss: 2.529305 | lrm: 0.38 | dt: 647.96ms | tok/sec: 809,133 | mfu: 50.57 | epoch: 2 | total time: 145.75m | eta: 33.7m +step 13569/16704 (81.23%) | loss: 2.537349 | lrm: 0.38 | dt: 643.28ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 145.76m | eta: 33.7m +step 13570/16704 (81.24%) | loss: 2.541239 | lrm: 0.38 | dt: 644.99ms | tok/sec: 812,864 | mfu: 50.81 | epoch: 2 | total time: 145.78m | eta: 33.7m +step 13571/16704 (81.24%) | loss: 2.541710 | lrm: 0.38 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 2 | total time: 145.79m | eta: 33.7m +step 13572/16704 (81.25%) | loss: 2.531729 | lrm: 0.38 | dt: 643.83ms | tok/sec: 814,330 | mfu: 50.90 | epoch: 2 | total time: 145.80m | eta: 33.7m +step 13573/16704 (81.26%) | loss: 2.533539 | lrm: 0.37 | dt: 643.38ms | tok/sec: 814,899 | mfu: 50.93 | epoch: 2 | total time: 145.81m | eta: 33.7m +step 13574/16704 (81.26%) | loss: 2.532783 | lrm: 0.37 | dt: 645.37ms | tok/sec: 812,379 | mfu: 50.77 | epoch: 2 | total time: 145.82m | eta: 33.6m +step 13575/16704 (81.27%) | loss: 2.542857 | lrm: 0.37 | dt: 642.34ms | tok/sec: 816,211 | mfu: 51.01 | epoch: 2 | total time: 145.83m | eta: 33.6m +step 13576/16704 (81.27%) | loss: 2.527148 | lrm: 0.37 | dt: 645.51ms | tok/sec: 812,210 | mfu: 50.76 | epoch: 2 | total time: 145.84m | eta: 33.6m +step 13577/16704 (81.28%) | loss: 2.524302 | lrm: 0.37 | dt: 643.05ms | tok/sec: 815,312 | mfu: 50.96 | epoch: 2 | total time: 145.85m | eta: 33.6m +step 13578/16704 (81.29%) | loss: 2.513102 | lrm: 0.37 | dt: 644.84ms | tok/sec: 813,055 | mfu: 50.82 | epoch: 2 | total time: 145.86m | eta: 33.6m +step 13579/16704 (81.29%) | loss: 2.513185 | lrm: 0.37 | dt: 
644.54ms | tok/sec: 813,426 | mfu: 50.84 | epoch: 2 | total time: 145.87m | eta: 33.6m +step 13580/16704 (81.30%) | loss: 2.518789 | lrm: 0.37 | dt: 642.80ms | tok/sec: 815,635 | mfu: 50.98 | epoch: 2 | total time: 145.88m | eta: 33.6m +step 13581/16704 (81.30%) | loss: 2.525796 | lrm: 0.37 | dt: 644.78ms | tok/sec: 813,127 | mfu: 50.82 | epoch: 2 | total time: 145.89m | eta: 33.6m +step 13582/16704 (81.31%) | loss: 2.533161 | lrm: 0.37 | dt: 642.48ms | tok/sec: 816,037 | mfu: 51.00 | epoch: 2 | total time: 145.90m | eta: 33.6m +step 13583/16704 (81.32%) | loss: 2.545570 | lrm: 0.37 | dt: 645.57ms | tok/sec: 812,128 | mfu: 50.76 | epoch: 2 | total time: 145.91m | eta: 33.6m +step 13584/16704 (81.32%) | loss: 2.532859 | lrm: 0.37 | dt: 643.28ms | tok/sec: 815,025 | mfu: 50.94 | epoch: 2 | total time: 145.93m | eta: 33.5m +step 13585/16704 (81.33%) | loss: 2.518182 | lrm: 0.37 | dt: 646.40ms | tok/sec: 811,085 | mfu: 50.69 | epoch: 2 | total time: 145.94m | eta: 33.5m +step 13586/16704 (81.33%) | loss: 2.514242 | lrm: 0.37 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 2 | total time: 145.95m | eta: 33.5m +step 13587/16704 (81.34%) | loss: 2.516195 | lrm: 0.37 | dt: 644.12ms | tok/sec: 813,959 | mfu: 50.87 | epoch: 2 | total time: 145.96m | eta: 33.5m +step 13588/16704 (81.35%) | loss: 2.514649 | lrm: 0.37 | dt: 643.06ms | tok/sec: 815,300 | mfu: 50.96 | epoch: 2 | total time: 145.97m | eta: 33.5m +step 13589/16704 (81.35%) | loss: 2.527232 | lrm: 0.37 | dt: 646.30ms | tok/sec: 811,218 | mfu: 50.70 | epoch: 2 | total time: 145.98m | eta: 33.5m +step 13590/16704 (81.36%) | loss: 2.520499 | lrm: 0.37 | dt: 641.28ms | tok/sec: 817,564 | mfu: 51.10 | epoch: 2 | total time: 145.99m | eta: 33.5m +step 13591/16704 (81.36%) | loss: 2.521004 | lrm: 0.37 | dt: 644.24ms | tok/sec: 813,805 | mfu: 50.86 | epoch: 2 | total time: 146.00m | eta: 33.5m +step 13592/16704 (81.37%) | loss: 2.525745 | lrm: 0.37 | dt: 643.15ms | tok/sec: 815,191 | mfu: 50.95 | epoch: 2 | total 
time: 146.01m | eta: 33.5m +step 13593/16704 (81.38%) | loss: 2.514993 | lrm: 0.37 | dt: 644.54ms | tok/sec: 813,434 | mfu: 50.84 | epoch: 2 | total time: 146.02m | eta: 33.4m +step 13594/16704 (81.38%) | loss: 2.511681 | lrm: 0.37 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 2 | total time: 146.03m | eta: 33.4m +step 13595/16704 (81.39%) | loss: 2.512726 | lrm: 0.37 | dt: 643.47ms | tok/sec: 814,782 | mfu: 50.93 | epoch: 2 | total time: 146.04m | eta: 33.4m +step 13596/16704 (81.39%) | loss: 2.504023 | lrm: 0.37 | dt: 643.61ms | tok/sec: 814,605 | mfu: 50.91 | epoch: 2 | total time: 146.05m | eta: 33.4m +step 13597/16704 (81.40%) | loss: 2.504009 | lrm: 0.37 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 2 | total time: 146.06m | eta: 33.4m +step 13598/16704 (81.41%) | loss: 2.517646 | lrm: 0.37 | dt: 642.60ms | tok/sec: 815,879 | mfu: 50.99 | epoch: 2 | total time: 146.08m | eta: 33.4m +step 13599/16704 (81.41%) | loss: 2.537141 | lrm: 0.37 | dt: 645.18ms | tok/sec: 812,627 | mfu: 50.79 | epoch: 2 | total time: 146.09m | eta: 33.4m +step 13600/16704 (81.42%) | loss: 2.530862 | lrm: 0.37 | dt: 644.76ms | tok/sec: 813,152 | mfu: 50.82 | epoch: 2 | total time: 146.10m | eta: 33.4m +step 13601/16704 (81.42%) | loss: 2.520212 | lrm: 0.37 | dt: 642.29ms | tok/sec: 816,276 | mfu: 51.02 | epoch: 2 | total time: 146.11m | eta: 33.4m +step 13602/16704 (81.43%) | loss: 2.520884 | lrm: 0.37 | dt: 647.05ms | tok/sec: 810,272 | mfu: 50.64 | epoch: 2 | total time: 146.12m | eta: 33.3m +step 13603/16704 (81.44%) | loss: 2.519078 | lrm: 0.37 | dt: 642.93ms | tok/sec: 815,464 | mfu: 50.97 | epoch: 2 | total time: 146.13m | eta: 33.3m +step 13604/16704 (81.44%) | loss: 2.515000 | lrm: 0.37 | dt: 643.47ms | tok/sec: 814,787 | mfu: 50.93 | epoch: 2 | total time: 146.14m | eta: 33.3m +step 13605/16704 (81.45%) | loss: 2.529620 | lrm: 0.37 | dt: 645.41ms | tok/sec: 812,336 | mfu: 50.77 | epoch: 2 | total time: 146.15m | eta: 33.3m +step 13606/16704 (81.45%) | loss: 
2.516343 | lrm: 0.37 | dt: 644.57ms | tok/sec: 813,396 | mfu: 50.84 | epoch: 2 | total time: 146.16m | eta: 33.3m +step 13607/16704 (81.46%) | loss: 2.524282 | lrm: 0.37 | dt: 641.78ms | tok/sec: 816,929 | mfu: 51.06 | epoch: 2 | total time: 146.17m | eta: 33.3m +step 13608/16704 (81.47%) | loss: 2.527580 | lrm: 0.37 | dt: 643.51ms | tok/sec: 814,727 | mfu: 50.92 | epoch: 2 | total time: 146.18m | eta: 33.3m +step 13609/16704 (81.47%) | loss: 2.525329 | lrm: 0.37 | dt: 643.34ms | tok/sec: 814,945 | mfu: 50.94 | epoch: 2 | total time: 146.19m | eta: 33.3m +step 13610/16704 (81.48%) | loss: 2.533097 | lrm: 0.37 | dt: 643.34ms | tok/sec: 814,942 | mfu: 50.94 | epoch: 2 | total time: 146.20m | eta: 33.3m +step 13611/16704 (81.48%) | loss: 2.527514 | lrm: 0.37 | dt: 643.63ms | tok/sec: 814,580 | mfu: 50.91 | epoch: 2 | total time: 146.22m | eta: 33.3m +step 13612/16704 (81.49%) | loss: 2.525856 | lrm: 0.37 | dt: 642.96ms | tok/sec: 815,429 | mfu: 50.97 | epoch: 2 | total time: 146.23m | eta: 33.2m +step 13613/16704 (81.50%) | loss: 2.541147 | lrm: 0.37 | dt: 643.13ms | tok/sec: 815,213 | mfu: 50.95 | epoch: 2 | total time: 146.24m | eta: 33.2m +step 13614/16704 (81.50%) | loss: 2.552106 | lrm: 0.37 | dt: 644.87ms | tok/sec: 813,019 | mfu: 50.81 | epoch: 2 | total time: 146.25m | eta: 33.2m +step 13615/16704 (81.51%) | loss: 2.550807 | lrm: 0.37 | dt: 646.03ms | tok/sec: 811,551 | mfu: 50.72 | epoch: 2 | total time: 146.26m | eta: 33.2m +step 13616/16704 (81.51%) | loss: 2.547123 | lrm: 0.37 | dt: 643.98ms | tok/sec: 814,134 | mfu: 50.88 | epoch: 2 | total time: 146.27m | eta: 33.2m +step 13617/16704 (81.52%) | loss: 2.535590 | lrm: 0.37 | dt: 644.15ms | tok/sec: 813,922 | mfu: 50.87 | epoch: 2 | total time: 146.28m | eta: 33.2m +step 13618/16704 (81.53%) | loss: 2.537724 | lrm: 0.37 | dt: 642.24ms | tok/sec: 816,348 | mfu: 51.02 | epoch: 2 | total time: 146.29m | eta: 33.2m +step 13619/16704 (81.53%) | loss: 2.537156 | lrm: 0.37 | dt: 645.64ms | tok/sec: 812,045 | mfu: 
50.75 | epoch: 2 | total time: 146.30m | eta: 33.2m +step 13620/16704 (81.54%) | loss: 2.541272 | lrm: 0.37 | dt: 644.67ms | tok/sec: 813,269 | mfu: 50.83 | epoch: 2 | total time: 146.31m | eta: 33.2m +step 13621/16704 (81.54%) | loss: 2.544198 | lrm: 0.37 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 2 | total time: 146.32m | eta: 33.1m +step 13622/16704 (81.55%) | loss: 2.527171 | lrm: 0.37 | dt: 644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 2 | total time: 146.33m | eta: 33.1m +step 13623/16704 (81.56%) | loss: 2.527155 | lrm: 0.37 | dt: 643.83ms | tok/sec: 814,325 | mfu: 50.90 | epoch: 2 | total time: 146.34m | eta: 33.1m +step 13624/16704 (81.56%) | loss: 2.532609 | lrm: 0.37 | dt: 642.78ms | tok/sec: 815,657 | mfu: 50.98 | epoch: 2 | total time: 146.35m | eta: 33.1m +step 13625/16704 (81.57%) | loss: 2.533435 | lrm: 0.37 | dt: 646.37ms | tok/sec: 811,128 | mfu: 50.70 | epoch: 2 | total time: 146.37m | eta: 33.1m +step 13626/16704 (81.57%) | loss: 2.530289 | lrm: 0.37 | dt: 642.20ms | tok/sec: 816,391 | mfu: 51.03 | epoch: 2 | total time: 146.38m | eta: 33.1m +step 13627/16704 (81.58%) | loss: 2.518187 | lrm: 0.37 | dt: 643.62ms | tok/sec: 814,595 | mfu: 50.91 | epoch: 2 | total time: 146.39m | eta: 33.1m +step 13628/16704 (81.59%) | loss: 2.517499 | lrm: 0.37 | dt: 645.18ms | tok/sec: 812,618 | mfu: 50.79 | epoch: 2 | total time: 146.40m | eta: 33.1m +step 13629/16704 (81.59%) | loss: 2.520913 | lrm: 0.37 | dt: 642.82ms | tok/sec: 815,609 | mfu: 50.98 | epoch: 2 | total time: 146.41m | eta: 33.1m +step 13630/16704 (81.60%) | loss: 2.519157 | lrm: 0.37 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 2 | total time: 146.42m | eta: 33.0m +step 13631/16704 (81.60%) | loss: 2.513768 | lrm: 0.37 | dt: 642.74ms | tok/sec: 815,709 | mfu: 50.98 | epoch: 2 | total time: 146.43m | eta: 33.0m +step 13632/16704 (81.61%) | loss: 2.506902 | lrm: 0.37 | dt: 642.52ms | tok/sec: 815,988 | mfu: 51.00 | epoch: 2 | total time: 146.44m | eta: 33.0m +step 
13633/16704 (81.62%) | loss: 2.503971 | lrm: 0.37 | dt: 646.06ms | tok/sec: 811,520 | mfu: 50.72 | epoch: 2 | total time: 146.45m | eta: 33.0m +step 13634/16704 (81.62%) | loss: 2.507371 | lrm: 0.37 | dt: 643.66ms | tok/sec: 814,546 | mfu: 50.91 | epoch: 2 | total time: 146.46m | eta: 33.0m +step 13635/16704 (81.63%) | loss: 2.507918 | lrm: 0.37 | dt: 641.00ms | tok/sec: 817,917 | mfu: 51.12 | epoch: 2 | total time: 146.47m | eta: 33.0m +step 13636/16704 (81.63%) | loss: 2.504226 | lrm: 0.37 | dt: 644.50ms | tok/sec: 813,482 | mfu: 50.84 | epoch: 2 | total time: 146.48m | eta: 33.0m +step 13637/16704 (81.64%) | loss: 2.496482 | lrm: 0.37 | dt: 643.22ms | tok/sec: 815,093 | mfu: 50.94 | epoch: 2 | total time: 146.49m | eta: 33.0m +step 13638/16704 (81.65%) | loss: 2.499911 | lrm: 0.37 | dt: 642.69ms | tok/sec: 815,770 | mfu: 50.99 | epoch: 2 | total time: 146.50m | eta: 33.0m +step 13639/16704 (81.65%) | loss: 2.509346 | lrm: 0.37 | dt: 645.13ms | tok/sec: 812,687 | mfu: 50.79 | epoch: 2 | total time: 146.52m | eta: 32.9m +step 13640/16704 (81.66%) | loss: 2.515675 | lrm: 0.37 | dt: 642.12ms | tok/sec: 816,497 | mfu: 51.03 | epoch: 2 | total time: 146.53m | eta: 32.9m +step 13641/16704 (81.66%) | loss: 2.524364 | lrm: 0.37 | dt: 644.31ms | tok/sec: 813,715 | mfu: 50.86 | epoch: 2 | total time: 146.54m | eta: 32.9m +step 13642/16704 (81.67%) | loss: 2.518304 | lrm: 0.37 | dt: 643.73ms | tok/sec: 814,452 | mfu: 50.90 | epoch: 2 | total time: 146.55m | eta: 32.9m +step 13643/16704 (81.68%) | loss: 2.517987 | lrm: 0.37 | dt: 643.91ms | tok/sec: 814,226 | mfu: 50.89 | epoch: 2 | total time: 146.56m | eta: 32.9m +step 13644/16704 (81.68%) | loss: 2.505408 | lrm: 0.37 | dt: 646.66ms | tok/sec: 810,758 | mfu: 50.67 | epoch: 2 | total time: 146.57m | eta: 32.9m +step 13645/16704 (81.69%) | loss: 2.514333 | lrm: 0.37 | dt: 641.46ms | tok/sec: 817,332 | mfu: 51.08 | epoch: 2 | total time: 146.58m | eta: 32.9m +step 13646/16704 (81.69%) | loss: 2.523735 | lrm: 0.37 | dt: 
643.83ms | tok/sec: 814,326 | mfu: 50.90 | epoch: 2 | total time: 146.59m | eta: 32.9m +step 13647/16704 (81.70%) | loss: 2.539026 | lrm: 0.37 | dt: 643.45ms | tok/sec: 814,805 | mfu: 50.93 | epoch: 2 | total time: 146.60m | eta: 32.9m +step 13648/16704 (81.70%) | loss: 2.530272 | lrm: 0.37 | dt: 643.25ms | tok/sec: 815,054 | mfu: 50.94 | epoch: 2 | total time: 146.61m | eta: 32.9m +step 13649/16704 (81.71%) | loss: 2.537798 | lrm: 0.37 | dt: 644.63ms | tok/sec: 813,321 | mfu: 50.83 | epoch: 2 | total time: 146.62m | eta: 32.8m +step 13650/16704 (81.72%) | loss: 2.537187 | lrm: 0.37 | dt: 644.54ms | tok/sec: 813,434 | mfu: 50.84 | epoch: 2 | total time: 146.63m | eta: 32.8m +step 13651/16704 (81.72%) | loss: 2.525324 | lrm: 0.37 | dt: 642.76ms | tok/sec: 815,681 | mfu: 50.98 | epoch: 2 | total time: 146.64m | eta: 32.8m +step 13652/16704 (81.73%) | loss: 2.530181 | lrm: 0.37 | dt: 644.43ms | tok/sec: 813,568 | mfu: 50.85 | epoch: 2 | total time: 146.66m | eta: 32.8m +step 13653/16704 (81.73%) | loss: 2.528251 | lrm: 0.37 | dt: 642.43ms | tok/sec: 816,107 | mfu: 51.01 | epoch: 2 | total time: 146.67m | eta: 32.8m +step 13654/16704 (81.74%) | loss: 2.517586 | lrm: 0.37 | dt: 646.91ms | tok/sec: 810,450 | mfu: 50.65 | epoch: 2 | total time: 146.68m | eta: 32.8m +step 13655/16704 (81.75%) | loss: 2.519840 | lrm: 0.37 | dt: 642.68ms | tok/sec: 815,789 | mfu: 50.99 | epoch: 2 | total time: 146.69m | eta: 32.8m +step 13656/16704 (81.75%) | loss: 2.521187 | lrm: 0.36 | dt: 642.64ms | tok/sec: 815,830 | mfu: 50.99 | epoch: 2 | total time: 146.70m | eta: 32.8m +step 13657/16704 (81.76%) | loss: 2.518815 | lrm: 0.36 | dt: 643.95ms | tok/sec: 814,173 | mfu: 50.89 | epoch: 2 | total time: 146.71m | eta: 32.8m +step 13658/16704 (81.76%) | loss: 2.518283 | lrm: 0.36 | dt: 644.00ms | tok/sec: 814,110 | mfu: 50.88 | epoch: 2 | total time: 146.72m | eta: 32.7m +step 13659/16704 (81.77%) | loss: 2.499975 | lrm: 0.36 | dt: 645.23ms | tok/sec: 812,557 | mfu: 50.79 | epoch: 2 | total 
time: 146.73m | eta: 32.7m +step 13660/16704 (81.78%) | loss: 2.496159 | lrm: 0.36 | dt: 643.91ms | tok/sec: 814,229 | mfu: 50.89 | epoch: 2 | total time: 146.74m | eta: 32.7m +step 13661/16704 (81.78%) | loss: 2.493196 | lrm: 0.36 | dt: 641.95ms | tok/sec: 816,708 | mfu: 51.05 | epoch: 2 | total time: 146.75m | eta: 32.7m +step 13662/16704 (81.79%) | loss: 2.494280 | lrm: 0.36 | dt: 643.58ms | tok/sec: 814,644 | mfu: 50.92 | epoch: 2 | total time: 146.76m | eta: 32.7m +step 13663/16704 (81.79%) | loss: 2.511584 | lrm: 0.36 | dt: 641.79ms | tok/sec: 816,918 | mfu: 51.06 | epoch: 2 | total time: 146.77m | eta: 32.7m +step 13664/16704 (81.80%) | loss: 2.510490 | lrm: 0.36 | dt: 642.84ms | tok/sec: 815,575 | mfu: 50.97 | epoch: 2 | total time: 146.78m | eta: 32.7m +step 13665/16704 (81.81%) | loss: 2.514773 | lrm: 0.36 | dt: 643.01ms | tok/sec: 815,362 | mfu: 50.96 | epoch: 2 | total time: 146.79m | eta: 32.7m +step 13666/16704 (81.81%) | loss: 2.523832 | lrm: 0.36 | dt: 644.06ms | tok/sec: 814,031 | mfu: 50.88 | epoch: 2 | total time: 146.81m | eta: 32.7m +step 13667/16704 (81.82%) | loss: 2.529301 | lrm: 0.36 | dt: 645.21ms | tok/sec: 812,587 | mfu: 50.79 | epoch: 2 | total time: 146.82m | eta: 32.6m +step 13668/16704 (81.82%) | loss: 2.529248 | lrm: 0.36 | dt: 643.79ms | tok/sec: 814,372 | mfu: 50.90 | epoch: 2 | total time: 146.83m | eta: 32.6m +step 13669/16704 (81.83%) | loss: 2.537927 | lrm: 0.36 | dt: 644.92ms | tok/sec: 812,949 | mfu: 50.81 | epoch: 2 | total time: 146.84m | eta: 32.6m +step 13670/16704 (81.84%) | loss: 2.549596 | lrm: 0.36 | dt: 642.36ms | tok/sec: 816,187 | mfu: 51.01 | epoch: 2 | total time: 146.85m | eta: 32.6m +step 13671/16704 (81.84%) | loss: 2.543321 | lrm: 0.36 | dt: 645.55ms | tok/sec: 812,162 | mfu: 50.76 | epoch: 2 | total time: 146.86m | eta: 32.6m +step 13672/16704 (81.85%) | loss: 2.544690 | lrm: 0.36 | dt: 644.75ms | tok/sec: 813,164 | mfu: 50.82 | epoch: 2 | total time: 146.87m | eta: 32.6m +step 13673/16704 (81.85%) | loss: 
2.550555 | lrm: 0.36 | dt: 642.27ms | tok/sec: 816,309 | mfu: 51.02 | epoch: 2 | total time: 146.88m | eta: 32.6m +step 13674/16704 (81.86%) | loss: 2.535228 | lrm: 0.36 | dt: 646.90ms | tok/sec: 810,463 | mfu: 50.66 | epoch: 2 | total time: 146.89m | eta: 32.6m +step 13675/16704 (81.87%) | loss: 2.531353 | lrm: 0.36 | dt: 646.63ms | tok/sec: 810,799 | mfu: 50.68 | epoch: 2 | total time: 146.90m | eta: 32.6m +step 13676/16704 (81.87%) | loss: 2.525524 | lrm: 0.36 | dt: 642.54ms | tok/sec: 815,958 | mfu: 51.00 | epoch: 2 | total time: 146.91m | eta: 32.6m +step 13677/16704 (81.88%) | loss: 2.528990 | lrm: 0.36 | dt: 646.96ms | tok/sec: 810,386 | mfu: 50.65 | epoch: 2 | total time: 146.92m | eta: 32.5m +step 13678/16704 (81.88%) | loss: 2.523767 | lrm: 0.36 | dt: 644.03ms | tok/sec: 814,075 | mfu: 50.88 | epoch: 2 | total time: 146.93m | eta: 32.5m +step 13679/16704 (81.89%) | loss: 2.534280 | lrm: 0.36 | dt: 644.16ms | tok/sec: 813,911 | mfu: 50.87 | epoch: 2 | total time: 146.95m | eta: 32.5m +step 13680/16704 (81.90%) | loss: 2.534574 | lrm: 0.36 | dt: 644.72ms | tok/sec: 813,204 | mfu: 50.83 | epoch: 2 | total time: 146.96m | eta: 32.5m +step 13681/16704 (81.90%) | loss: 2.529310 | lrm: 0.36 | dt: 641.98ms | tok/sec: 816,672 | mfu: 51.04 | epoch: 2 | total time: 146.97m | eta: 32.5m +step 13682/16704 (81.91%) | loss: 2.537150 | lrm: 0.36 | dt: 642.39ms | tok/sec: 816,154 | mfu: 51.01 | epoch: 2 | total time: 146.98m | eta: 32.5m +step 13683/16704 (81.91%) | loss: 2.537702 | lrm: 0.36 | dt: 644.11ms | tok/sec: 813,967 | mfu: 50.87 | epoch: 2 | total time: 146.99m | eta: 32.5m +step 13684/16704 (81.92%) | loss: 2.538640 | lrm: 0.36 | dt: 642.90ms | tok/sec: 815,505 | mfu: 50.97 | epoch: 2 | total time: 147.00m | eta: 32.5m +step 13685/16704 (81.93%) | loss: 2.534024 | lrm: 0.36 | dt: 643.96ms | tok/sec: 814,161 | mfu: 50.89 | epoch: 2 | total time: 147.01m | eta: 32.5m +step 13686/16704 (81.93%) | loss: 2.535548 | lrm: 0.36 | dt: 644.47ms | tok/sec: 813,513 | mfu: 
50.85 | epoch: 2 | total time: 147.02m | eta: 32.4m +step 13687/16704 (81.94%) | loss: 2.523315 | lrm: 0.36 | dt: 643.68ms | tok/sec: 814,518 | mfu: 50.91 | epoch: 2 | total time: 147.03m | eta: 32.4m +step 13688/16704 (81.94%) | loss: 2.530017 | lrm: 0.36 | dt: 644.69ms | tok/sec: 813,243 | mfu: 50.83 | epoch: 2 | total time: 147.04m | eta: 32.4m +step 13689/16704 (81.95%) | loss: 2.534393 | lrm: 0.36 | dt: 643.11ms | tok/sec: 815,239 | mfu: 50.95 | epoch: 2 | total time: 147.05m | eta: 32.4m +step 13690/16704 (81.96%) | loss: 2.532903 | lrm: 0.36 | dt: 644.77ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 2 | total time: 147.06m | eta: 32.4m +step 13691/16704 (81.96%) | loss: 2.514473 | lrm: 0.36 | dt: 641.17ms | tok/sec: 817,706 | mfu: 51.11 | epoch: 2 | total time: 147.07m | eta: 32.4m +step 13692/16704 (81.97%) | loss: 2.520036 | lrm: 0.36 | dt: 644.31ms | tok/sec: 813,725 | mfu: 50.86 | epoch: 2 | total time: 147.08m | eta: 32.4m +step 13693/16704 (81.97%) | loss: 2.520039 | lrm: 0.36 | dt: 644.86ms | tok/sec: 813,028 | mfu: 50.82 | epoch: 2 | total time: 147.10m | eta: 32.4m +step 13694/16704 (81.98%) | loss: 2.517977 | lrm: 0.36 | dt: 644.97ms | tok/sec: 812,891 | mfu: 50.81 | epoch: 2 | total time: 147.11m | eta: 32.4m +step 13695/16704 (81.99%) | loss: 2.510762 | lrm: 0.36 | dt: 642.89ms | tok/sec: 815,518 | mfu: 50.97 | epoch: 2 | total time: 147.12m | eta: 32.3m +step 13696/16704 (81.99%) | loss: 2.492269 | lrm: 0.36 | dt: 644.57ms | tok/sec: 813,386 | mfu: 50.84 | epoch: 2 | total time: 147.13m | eta: 32.3m +step 13697/16704 (82.00%) | loss: 2.502025 | lrm: 0.36 | dt: 642.98ms | tok/sec: 815,397 | mfu: 50.96 | epoch: 2 | total time: 147.14m | eta: 32.3m +step 13698/16704 (82.00%) | loss: 2.495582 | lrm: 0.36 | dt: 644.27ms | tok/sec: 813,764 | mfu: 50.86 | epoch: 2 | total time: 147.15m | eta: 32.3m +step 13699/16704 (82.01%) | loss: 2.503670 | lrm: 0.36 | dt: 645.43ms | tok/sec: 812,308 | mfu: 50.77 | epoch: 2 | total time: 147.16m | eta: 32.3m +step 
13700/16704 (82.02%) | loss: 2.498906 | lrm: 0.36 | dt: 643.48ms | tok/sec: 814,770 | mfu: 50.92 | epoch: 2 | total time: 147.17m | eta: 32.3m +step 13701/16704 (82.02%) | loss: 2.493263 | lrm: 0.36 | dt: 645.41ms | tok/sec: 812,329 | mfu: 50.77 | epoch: 2 | total time: 147.18m | eta: 32.3m +step 13702/16704 (82.03%) | loss: 2.490294 | lrm: 0.36 | dt: 642.39ms | tok/sec: 816,155 | mfu: 51.01 | epoch: 2 | total time: 147.19m | eta: 32.3m +step 13703/16704 (82.03%) | loss: 2.494941 | lrm: 0.36 | dt: 647.06ms | tok/sec: 810,256 | mfu: 50.64 | epoch: 2 | total time: 147.20m | eta: 32.3m +step 13704/16704 (82.04%) | loss: 2.511844 | lrm: 0.36 | dt: 644.40ms | tok/sec: 813,610 | mfu: 50.85 | epoch: 2 | total time: 147.21m | eta: 32.3m +step 13705/16704 (82.05%) | loss: 2.521522 | lrm: 0.36 | dt: 644.57ms | tok/sec: 813,387 | mfu: 50.84 | epoch: 2 | total time: 147.22m | eta: 32.2m +step 13706/16704 (82.05%) | loss: 2.524727 | lrm: 0.36 | dt: 644.73ms | tok/sec: 813,189 | mfu: 50.83 | epoch: 2 | total time: 147.23m | eta: 32.2m +step 13707/16704 (82.06%) | loss: 2.508808 | lrm: 0.36 | dt: 644.32ms | tok/sec: 813,708 | mfu: 50.86 | epoch: 2 | total time: 147.25m | eta: 32.2m +step 13708/16704 (82.06%) | loss: 2.511445 | lrm: 0.36 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 2 | total time: 147.26m | eta: 32.2m +step 13709/16704 (82.07%) | loss: 2.505952 | lrm: 0.36 | dt: 641.08ms | tok/sec: 817,813 | mfu: 51.11 | epoch: 2 | total time: 147.27m | eta: 32.2m +step 13710/16704 (82.08%) | loss: 2.501758 | lrm: 0.36 | dt: 644.28ms | tok/sec: 813,763 | mfu: 50.86 | epoch: 2 | total time: 147.28m | eta: 32.2m +step 13711/16704 (82.08%) | loss: 2.502159 | lrm: 0.36 | dt: 646.16ms | tok/sec: 811,392 | mfu: 50.71 | epoch: 2 | total time: 147.29m | eta: 32.2m +step 13712/16704 (82.09%) | loss: 2.497405 | lrm: 0.36 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 2 | total time: 147.30m | eta: 32.2m +step 13713/16704 (82.09%) | loss: 2.489979 | lrm: 0.36 | dt: 
646.13ms | tok/sec: 811,423 | mfu: 50.72 | epoch: 2 | total time: 147.31m | eta: 32.2m +step 13714/16704 (82.10%) | loss: 2.491299 | lrm: 0.36 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 2 | total time: 147.32m | eta: 32.1m +step 13715/16704 (82.11%) | loss: 2.499165 | lrm: 0.36 | dt: 643.52ms | tok/sec: 814,721 | mfu: 50.92 | epoch: 2 | total time: 147.33m | eta: 32.1m +step 13716/16704 (82.11%) | loss: 2.502460 | lrm: 0.36 | dt: 645.63ms | tok/sec: 812,058 | mfu: 50.75 | epoch: 2 | total time: 147.34m | eta: 32.1m +step 13717/16704 (82.12%) | loss: 2.503402 | lrm: 0.36 | dt: 645.80ms | tok/sec: 811,848 | mfu: 50.74 | epoch: 2 | total time: 147.35m | eta: 32.1m +step 13718/16704 (82.12%) | loss: 2.500937 | lrm: 0.36 | dt: 641.07ms | tok/sec: 817,826 | mfu: 51.12 | epoch: 2 | total time: 147.36m | eta: 32.1m +step 13719/16704 (82.13%) | loss: 2.511401 | lrm: 0.36 | dt: 643.66ms | tok/sec: 814,541 | mfu: 50.91 | epoch: 2 | total time: 147.37m | eta: 32.1m +step 13720/16704 (82.14%) | loss: 2.518819 | lrm: 0.36 | dt: 645.02ms | tok/sec: 812,828 | mfu: 50.80 | epoch: 2 | total time: 147.39m | eta: 32.1m +step 13721/16704 (82.14%) | loss: 2.524032 | lrm: 0.36 | dt: 643.66ms | tok/sec: 814,547 | mfu: 50.91 | epoch: 2 | total time: 147.40m | eta: 32.1m +step 13722/16704 (82.15%) | loss: 2.529374 | lrm: 0.36 | dt: 645.23ms | tok/sec: 812,555 | mfu: 50.79 | epoch: 2 | total time: 147.41m | eta: 32.1m +step 13723/16704 (82.15%) | loss: 2.531349 | lrm: 0.36 | dt: 645.36ms | tok/sec: 812,390 | mfu: 50.78 | epoch: 2 | total time: 147.42m | eta: 32.0m +step 13724/16704 (82.16%) | loss: 2.530036 | lrm: 0.36 | dt: 644.15ms | tok/sec: 813,923 | mfu: 50.87 | epoch: 2 | total time: 147.43m | eta: 32.0m +step 13725/16704 (82.17%) | loss: 2.521408 | lrm: 0.36 | dt: 643.14ms | tok/sec: 815,203 | mfu: 50.95 | epoch: 2 | total time: 147.44m | eta: 32.0m +step 13726/16704 (82.17%) | loss: 2.516953 | lrm: 0.36 | dt: 643.70ms | tok/sec: 814,491 | mfu: 50.91 | epoch: 2 | total 
time: 147.45m | eta: 32.0m +step 13727/16704 (82.18%) | loss: 2.528123 | lrm: 0.36 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 2 | total time: 147.46m | eta: 32.0m +step 13728/16704 (82.18%) | loss: 2.524775 | lrm: 0.36 | dt: 644.48ms | tok/sec: 813,499 | mfu: 50.84 | epoch: 2 | total time: 147.47m | eta: 32.0m +step 13729/16704 (82.19%) | loss: 2.516282 | lrm: 0.36 | dt: 643.55ms | tok/sec: 814,679 | mfu: 50.92 | epoch: 2 | total time: 147.48m | eta: 32.0m +step 13730/16704 (82.20%) | loss: 2.515098 | lrm: 0.36 | dt: 643.95ms | tok/sec: 814,173 | mfu: 50.89 | epoch: 2 | total time: 147.49m | eta: 32.0m +step 13731/16704 (82.20%) | loss: 2.524470 | lrm: 0.36 | dt: 645.27ms | tok/sec: 812,511 | mfu: 50.78 | epoch: 2 | total time: 147.50m | eta: 32.0m +step 13732/16704 (82.21%) | loss: 2.517135 | lrm: 0.36 | dt: 642.94ms | tok/sec: 815,456 | mfu: 50.97 | epoch: 2 | total time: 147.51m | eta: 31.9m +step 13733/16704 (82.21%) | loss: 2.519804 | lrm: 0.36 | dt: 642.39ms | tok/sec: 816,147 | mfu: 51.01 | epoch: 2 | total time: 147.52m | eta: 31.9m +step 13734/16704 (82.22%) | loss: 2.517051 | lrm: 0.36 | dt: 645.37ms | tok/sec: 812,389 | mfu: 50.78 | epoch: 2 | total time: 147.54m | eta: 31.9m +step 13735/16704 (82.23%) | loss: 2.523606 | lrm: 0.36 | dt: 643.33ms | tok/sec: 814,959 | mfu: 50.94 | epoch: 2 | total time: 147.55m | eta: 31.9m +step 13736/16704 (82.23%) | loss: 2.521302 | lrm: 0.36 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 2 | total time: 147.56m | eta: 31.9m +step 13737/16704 (82.24%) | loss: 2.506545 | lrm: 0.36 | dt: 645.77ms | tok/sec: 811,875 | mfu: 50.74 | epoch: 2 | total time: 147.57m | eta: 31.9m +step 13738/16704 (82.24%) | loss: 2.509164 | lrm: 0.36 | dt: 644.29ms | tok/sec: 813,742 | mfu: 50.86 | epoch: 2 | total time: 147.58m | eta: 31.9m +step 13739/16704 (82.25%) | loss: 2.515096 | lrm: 0.36 | dt: 644.89ms | tok/sec: 812,982 | mfu: 50.81 | epoch: 2 | total time: 147.59m | eta: 31.9m +step 13740/16704 (82.26%) | loss: 
2.520775 | lrm: 0.35 | dt: 645.24ms | tok/sec: 812,548 | mfu: 50.79 | epoch: 2 | total time: 147.60m | eta: 31.9m +step 13741/16704 (82.26%) | loss: 2.511850 | lrm: 0.35 | dt: 642.14ms | tok/sec: 816,464 | mfu: 51.03 | epoch: 2 | total time: 147.61m | eta: 31.9m +step 13742/16704 (82.27%) | loss: 2.509660 | lrm: 0.35 | dt: 643.36ms | tok/sec: 814,924 | mfu: 50.93 | epoch: 2 | total time: 147.62m | eta: 31.8m +step 13743/16704 (82.27%) | loss: 2.515918 | lrm: 0.35 | dt: 643.42ms | tok/sec: 814,849 | mfu: 50.93 | epoch: 2 | total time: 147.63m | eta: 31.8m +step 13744/16704 (82.28%) | loss: 2.501117 | lrm: 0.35 | dt: 643.03ms | tok/sec: 815,336 | mfu: 50.96 | epoch: 2 | total time: 147.64m | eta: 31.8m +step 13745/16704 (82.29%) | loss: 2.494065 | lrm: 0.35 | dt: 645.49ms | tok/sec: 812,235 | mfu: 50.77 | epoch: 2 | total time: 147.65m | eta: 31.8m +step 13746/16704 (82.29%) | loss: 2.493973 | lrm: 0.35 | dt: 646.67ms | tok/sec: 810,744 | mfu: 50.67 | epoch: 2 | total time: 147.66m | eta: 31.8m +step 13747/16704 (82.30%) | loss: 2.507867 | lrm: 0.35 | dt: 644.45ms | tok/sec: 813,548 | mfu: 50.85 | epoch: 2 | total time: 147.68m | eta: 31.8m +step 13748/16704 (82.30%) | loss: 2.502190 | lrm: 0.35 | dt: 643.07ms | tok/sec: 815,290 | mfu: 50.96 | epoch: 2 | total time: 147.69m | eta: 31.8m +step 13749/16704 (82.31%) | loss: 2.512806 | lrm: 0.35 | dt: 643.64ms | tok/sec: 814,570 | mfu: 50.91 | epoch: 2 | total time: 147.70m | eta: 31.8m +Step 13750 | Validation bpb: 0.771594 +step 13750/16704 (82.32%) | loss: 2.504496 | lrm: 0.35 | dt: 650.97ms | tok/sec: 805,399 | mfu: 50.34 | epoch: 2 | total time: 147.71m | eta: 31.8m +step 13751/16704 (82.32%) | loss: 2.516704 | lrm: 0.35 | dt: 647.12ms | tok/sec: 810,183 | mfu: 50.64 | epoch: 2 | total time: 147.72m | eta: 31.7m +step 13752/16704 (82.33%) | loss: 2.506081 | lrm: 0.35 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 2 | total time: 147.73m | eta: 31.7m +step 13753/16704 (82.33%) | loss: 2.508373 | lrm: 0.35 | 
dt: 643.78ms | tok/sec: 814,389 | mfu: 50.90 | epoch: 2 | total time: 147.74m | eta: 31.7m +step 13754/16704 (82.34%) | loss: 2.507717 | lrm: 0.35 | dt: 646.42ms | tok/sec: 811,067 | mfu: 50.69 | epoch: 2 | total time: 147.75m | eta: 31.7m +step 13755/16704 (82.35%) | loss: 2.495342 | lrm: 0.35 | dt: 645.42ms | tok/sec: 812,314 | mfu: 50.77 | epoch: 2 | total time: 147.76m | eta: 31.7m +step 13756/16704 (82.35%) | loss: 2.501206 | lrm: 0.35 | dt: 641.72ms | tok/sec: 817,002 | mfu: 51.06 | epoch: 2 | total time: 147.77m | eta: 31.7m +step 13757/16704 (82.36%) | loss: 2.509734 | lrm: 0.35 | dt: 645.13ms | tok/sec: 812,689 | mfu: 50.79 | epoch: 2 | total time: 147.78m | eta: 31.7m +step 13758/16704 (82.36%) | loss: 2.512327 | lrm: 0.35 | dt: 645.04ms | tok/sec: 812,805 | mfu: 50.80 | epoch: 2 | total time: 147.79m | eta: 31.7m +step 13759/16704 (82.37%) | loss: 2.505682 | lrm: 0.35 | dt: 642.73ms | tok/sec: 815,715 | mfu: 50.98 | epoch: 2 | total time: 147.80m | eta: 31.7m +step 13760/16704 (82.38%) | loss: 2.507042 | lrm: 0.35 | dt: 645.57ms | tok/sec: 812,136 | mfu: 50.76 | epoch: 2 | total time: 147.81m | eta: 31.6m +step 13761/16704 (82.38%) | loss: 2.502434 | lrm: 0.35 | dt: 645.19ms | tok/sec: 812,605 | mfu: 50.79 | epoch: 2 | total time: 147.83m | eta: 31.6m +step 13762/16704 (82.39%) | loss: 2.499802 | lrm: 0.35 | dt: 642.97ms | tok/sec: 815,418 | mfu: 50.96 | epoch: 2 | total time: 147.84m | eta: 31.6m +step 13763/16704 (82.39%) | loss: 2.501752 | lrm: 0.35 | dt: 646.53ms | tok/sec: 810,923 | mfu: 50.68 | epoch: 2 | total time: 147.85m | eta: 31.6m +step 13764/16704 (82.40%) | loss: 2.503308 | lrm: 0.35 | dt: 641.31ms | tok/sec: 817,529 | mfu: 51.10 | epoch: 2 | total time: 147.86m | eta: 31.6m +step 13765/16704 (82.41%) | loss: 2.501891 | lrm: 0.35 | dt: 645.41ms | tok/sec: 812,339 | mfu: 50.77 | epoch: 2 | total time: 147.87m | eta: 31.6m +step 13766/16704 (82.41%) | loss: 2.502753 | lrm: 0.35 | dt: 646.37ms | tok/sec: 811,129 | mfu: 50.70 | epoch: 2 | 
total time: 147.88m | eta: 31.6m +step 13767/16704 (82.42%) | loss: 2.507138 | lrm: 0.35 | dt: 645.24ms | tok/sec: 812,545 | mfu: 50.79 | epoch: 2 | total time: 147.89m | eta: 31.6m +step 13768/16704 (82.42%) | loss: 2.502502 | lrm: 0.35 | dt: 643.57ms | tok/sec: 814,650 | mfu: 50.92 | epoch: 2 | total time: 147.90m | eta: 31.6m +step 13769/16704 (82.43%) | loss: 2.494733 | lrm: 0.35 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 2 | total time: 147.91m | eta: 31.6m +step 13770/16704 (82.44%) | loss: 2.500372 | lrm: 0.35 | dt: 643.46ms | tok/sec: 814,793 | mfu: 50.93 | epoch: 2 | total time: 147.92m | eta: 31.5m +step 13771/16704 (82.44%) | loss: 2.509579 | lrm: 0.35 | dt: 645.06ms | tok/sec: 812,770 | mfu: 50.80 | epoch: 2 | total time: 147.93m | eta: 31.5m +step 13772/16704 (82.45%) | loss: 2.506348 | lrm: 0.35 | dt: 639.58ms | tok/sec: 819,740 | mfu: 51.23 | epoch: 2 | total time: 147.94m | eta: 31.5m +step 13773/16704 (82.45%) | loss: 2.521191 | lrm: 0.35 | dt: 646.69ms | tok/sec: 810,721 | mfu: 50.67 | epoch: 2 | total time: 147.95m | eta: 31.5m +step 13774/16704 (82.46%) | loss: 2.523111 | lrm: 0.35 | dt: 645.55ms | tok/sec: 812,163 | mfu: 50.76 | epoch: 2 | total time: 147.97m | eta: 31.5m +step 13775/16704 (82.47%) | loss: 2.526961 | lrm: 0.35 | dt: 645.97ms | tok/sec: 811,632 | mfu: 50.73 | epoch: 2 | total time: 147.98m | eta: 31.5m +step 13776/16704 (82.47%) | loss: 2.533712 | lrm: 0.35 | dt: 644.04ms | tok/sec: 814,057 | mfu: 50.88 | epoch: 2 | total time: 147.99m | eta: 31.5m +step 13777/16704 (82.48%) | loss: 2.524131 | lrm: 0.35 | dt: 645.87ms | tok/sec: 811,750 | mfu: 50.74 | epoch: 2 | total time: 148.00m | eta: 31.5m +step 13778/16704 (82.48%) | loss: 2.518293 | lrm: 0.35 | dt: 645.10ms | tok/sec: 812,724 | mfu: 50.80 | epoch: 2 | total time: 148.01m | eta: 31.5m +step 13779/16704 (82.49%) | loss: 2.522275 | lrm: 0.35 | dt: 643.94ms | tok/sec: 814,191 | mfu: 50.89 | epoch: 2 | total time: 148.02m | eta: 31.4m +step 13780/16704 (82.50%) | 
loss: 2.526002 | lrm: 0.35 | dt: 642.93ms | tok/sec: 815,465 | mfu: 50.97 | epoch: 2 | total time: 148.03m | eta: 31.4m +step 13781/16704 (82.50%) | loss: 2.527068 | lrm: 0.35 | dt: 643.94ms | tok/sec: 814,186 | mfu: 50.89 | epoch: 2 | total time: 148.04m | eta: 31.4m +step 13782/16704 (82.51%) | loss: 2.533398 | lrm: 0.35 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 2 | total time: 148.05m | eta: 31.4m +step 13783/16704 (82.51%) | loss: 2.529227 | lrm: 0.35 | dt: 644.10ms | tok/sec: 813,984 | mfu: 50.88 | epoch: 2 | total time: 148.06m | eta: 31.4m +step 13784/16704 (82.52%) | loss: 2.524623 | lrm: 0.35 | dt: 643.89ms | tok/sec: 814,251 | mfu: 50.89 | epoch: 2 | total time: 148.07m | eta: 31.4m +step 13785/16704 (82.53%) | loss: 2.526433 | lrm: 0.35 | dt: 644.34ms | tok/sec: 813,686 | mfu: 50.86 | epoch: 2 | total time: 148.08m | eta: 31.4m +step 13786/16704 (82.53%) | loss: 2.520453 | lrm: 0.35 | dt: 643.96ms | tok/sec: 814,167 | mfu: 50.89 | epoch: 2 | total time: 148.09m | eta: 31.4m +step 13787/16704 (82.54%) | loss: 2.524963 | lrm: 0.35 | dt: 643.28ms | tok/sec: 815,026 | mfu: 50.94 | epoch: 2 | total time: 148.10m | eta: 31.4m +step 13788/16704 (82.54%) | loss: 2.528033 | lrm: 0.35 | dt: 644.01ms | tok/sec: 814,096 | mfu: 50.88 | epoch: 2 | total time: 148.12m | eta: 31.3m +step 13789/16704 (82.55%) | loss: 2.515940 | lrm: 0.35 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 2 | total time: 148.13m | eta: 31.3m +step 13790/16704 (82.56%) | loss: 2.514983 | lrm: 0.35 | dt: 646.86ms | tok/sec: 810,513 | mfu: 50.66 | epoch: 2 | total time: 148.14m | eta: 31.3m +step 13791/16704 (82.56%) | loss: 2.516306 | lrm: 0.35 | dt: 644.69ms | tok/sec: 813,244 | mfu: 50.83 | epoch: 2 | total time: 148.15m | eta: 31.3m +step 13792/16704 (82.57%) | loss: 2.519599 | lrm: 0.35 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 2 | total time: 148.16m | eta: 31.3m +step 13793/16704 (82.57%) | loss: 2.523201 | lrm: 0.35 | dt: 644.48ms | tok/sec: 813,500 | 
mfu: 50.84 | epoch: 2 | total time: 148.17m | eta: 31.3m +step 13794/16704 (82.58%) | loss: 2.522958 | lrm: 0.35 | dt: 641.36ms | tok/sec: 817,466 | mfu: 51.09 | epoch: 2 | total time: 148.18m | eta: 31.3m +step 13795/16704 (82.59%) | loss: 2.532465 | lrm: 0.35 | dt: 645.03ms | tok/sec: 812,816 | mfu: 50.80 | epoch: 2 | total time: 148.19m | eta: 31.3m +step 13796/16704 (82.59%) | loss: 2.526740 | lrm: 0.35 | dt: 644.08ms | tok/sec: 814,011 | mfu: 50.88 | epoch: 2 | total time: 148.20m | eta: 31.3m +step 13797/16704 (82.60%) | loss: 2.519888 | lrm: 0.35 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 2 | total time: 148.21m | eta: 31.3m +step 13798/16704 (82.60%) | loss: 2.512421 | lrm: 0.35 | dt: 643.65ms | tok/sec: 814,560 | mfu: 50.91 | epoch: 2 | total time: 148.22m | eta: 31.2m +step 13799/16704 (82.61%) | loss: 2.514114 | lrm: 0.35 | dt: 643.03ms | tok/sec: 815,336 | mfu: 50.96 | epoch: 2 | total time: 148.23m | eta: 31.2m +step 13800/16704 (82.61%) | loss: 2.514997 | lrm: 0.35 | dt: 644.51ms | tok/sec: 813,468 | mfu: 50.84 | epoch: 2 | total time: 148.24m | eta: 31.2m +step 13801/16704 (82.62%) | loss: 2.529677 | lrm: 0.35 | dt: 642.94ms | tok/sec: 815,451 | mfu: 50.97 | epoch: 2 | total time: 148.26m | eta: 31.2m +step 13802/16704 (82.63%) | loss: 2.525538 | lrm: 0.35 | dt: 643.42ms | tok/sec: 814,842 | mfu: 50.93 | epoch: 2 | total time: 148.27m | eta: 31.2m +step 13803/16704 (82.63%) | loss: 2.525375 | lrm: 0.35 | dt: 647.59ms | tok/sec: 809,593 | mfu: 50.60 | epoch: 2 | total time: 148.28m | eta: 31.2m +step 13804/16704 (82.64%) | loss: 2.528930 | lrm: 0.35 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 2 | total time: 148.29m | eta: 31.2m +step 13805/16704 (82.64%) | loss: 2.520884 | lrm: 0.35 | dt: 646.64ms | tok/sec: 810,792 | mfu: 50.68 | epoch: 2 | total time: 148.30m | eta: 31.2m +step 13806/16704 (82.65%) | loss: 2.510936 | lrm: 0.35 | dt: 645.27ms | tok/sec: 812,511 | mfu: 50.78 | epoch: 2 | total time: 148.31m | eta: 31.2m +step 
13807/16704 (82.66%) | loss: 2.492891 | lrm: 0.35 | dt: 641.66ms | tok/sec: 817,077 | mfu: 51.07 | epoch: 2 | total time: 148.32m | eta: 31.1m +step 13808/16704 (82.66%) | loss: 2.492390 | lrm: 0.35 | dt: 645.00ms | tok/sec: 812,847 | mfu: 50.80 | epoch: 2 | total time: 148.33m | eta: 31.1m +step 13809/16704 (82.67%) | loss: 2.493272 | lrm: 0.35 | dt: 642.99ms | tok/sec: 815,392 | mfu: 50.96 | epoch: 2 | total time: 148.34m | eta: 31.1m +step 13810/16704 (82.67%) | loss: 2.497454 | lrm: 0.35 | dt: 646.41ms | tok/sec: 811,078 | mfu: 50.69 | epoch: 2 | total time: 148.35m | eta: 31.1m +step 13811/16704 (82.68%) | loss: 2.503352 | lrm: 0.35 | dt: 645.31ms | tok/sec: 812,454 | mfu: 50.78 | epoch: 2 | total time: 148.36m | eta: 31.1m +step 13812/16704 (82.69%) | loss: 2.503450 | lrm: 0.35 | dt: 643.12ms | tok/sec: 815,226 | mfu: 50.95 | epoch: 2 | total time: 148.37m | eta: 31.1m +step 13813/16704 (82.69%) | loss: 2.505986 | lrm: 0.35 | dt: 643.68ms | tok/sec: 814,510 | mfu: 50.91 | epoch: 2 | total time: 148.38m | eta: 31.1m +step 13814/16704 (82.70%) | loss: 2.523656 | lrm: 0.35 | dt: 641.50ms | tok/sec: 817,278 | mfu: 51.08 | epoch: 2 | total time: 148.39m | eta: 31.1m +step 13815/16704 (82.70%) | loss: 2.532716 | lrm: 0.35 | dt: 644.57ms | tok/sec: 813,394 | mfu: 50.84 | epoch: 2 | total time: 148.41m | eta: 31.1m +step 13816/16704 (82.71%) | loss: 2.526123 | lrm: 0.35 | dt: 645.53ms | tok/sec: 812,178 | mfu: 50.76 | epoch: 2 | total time: 148.42m | eta: 31.0m +step 13817/16704 (82.72%) | loss: 2.534127 | lrm: 0.35 | dt: 643.34ms | tok/sec: 814,944 | mfu: 50.94 | epoch: 2 | total time: 148.43m | eta: 31.0m +step 13818/16704 (82.72%) | loss: 2.530783 | lrm: 0.35 | dt: 643.65ms | tok/sec: 814,554 | mfu: 50.91 | epoch: 2 | total time: 148.44m | eta: 31.0m +step 13819/16704 (82.73%) | loss: 2.520040 | lrm: 0.35 | dt: 642.10ms | tok/sec: 816,521 | mfu: 51.03 | epoch: 2 | total time: 148.45m | eta: 31.0m +step 13820/16704 (82.73%) | loss: 2.494290 | lrm: 0.35 | dt: 
644.18ms | tok/sec: 813,878 | mfu: 50.87 | epoch: 2 | total time: 148.46m | eta: 31.0m +step 13821/16704 (82.74%) | loss: 2.503164 | lrm: 0.35 | dt: 643.80ms | tok/sec: 814,368 | mfu: 50.90 | epoch: 2 | total time: 148.47m | eta: 31.0m +step 13822/16704 (82.75%) | loss: 2.508538 | lrm: 0.35 | dt: 643.65ms | tok/sec: 814,549 | mfu: 50.91 | epoch: 2 | total time: 148.48m | eta: 31.0m +step 13823/16704 (82.75%) | loss: 2.510301 | lrm: 0.34 | dt: 644.64ms | tok/sec: 813,306 | mfu: 50.83 | epoch: 2 | total time: 148.49m | eta: 31.0m +step 13824/16704 (82.76%) | loss: 2.508846 | lrm: 0.34 | dt: 646.12ms | tok/sec: 811,444 | mfu: 50.72 | epoch: 2 | total time: 148.50m | eta: 31.0m +step 13825/16704 (82.76%) | loss: 2.505336 | lrm: 0.34 | dt: 642.02ms | tok/sec: 816,625 | mfu: 51.04 | epoch: 2 | total time: 148.51m | eta: 30.9m +step 13826/16704 (82.77%) | loss: 2.502695 | lrm: 0.34 | dt: 644.66ms | tok/sec: 813,280 | mfu: 50.83 | epoch: 2 | total time: 148.52m | eta: 30.9m +step 13827/16704 (82.78%) | loss: 2.500774 | lrm: 0.34 | dt: 644.89ms | tok/sec: 812,993 | mfu: 50.81 | epoch: 2 | total time: 148.53m | eta: 30.9m +step 13828/16704 (82.78%) | loss: 2.491561 | lrm: 0.34 | dt: 642.45ms | tok/sec: 816,070 | mfu: 51.01 | epoch: 2 | total time: 148.54m | eta: 30.9m +step 13829/16704 (82.79%) | loss: 2.497278 | lrm: 0.34 | dt: 644.72ms | tok/sec: 813,201 | mfu: 50.83 | epoch: 2 | total time: 148.56m | eta: 30.9m +step 13830/16704 (82.79%) | loss: 2.485812 | lrm: 0.34 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 2 | total time: 148.57m | eta: 30.9m +step 13831/16704 (82.80%) | loss: 2.505987 | lrm: 0.34 | dt: 643.00ms | tok/sec: 815,380 | mfu: 50.96 | epoch: 2 | total time: 148.58m | eta: 30.9m +step 13832/16704 (82.81%) | loss: 2.505437 | lrm: 0.34 | dt: 647.09ms | tok/sec: 810,225 | mfu: 50.64 | epoch: 2 | total time: 148.59m | eta: 30.9m +step 13833/16704 (82.81%) | loss: 2.494838 | lrm: 0.34 | dt: 643.42ms | tok/sec: 814,851 | mfu: 50.93 | epoch: 2 | total 
time: 148.60m | eta: 30.9m +step 13834/16704 (82.82%) | loss: 2.496670 | lrm: 0.34 | dt: 644.98ms | tok/sec: 812,880 | mfu: 50.81 | epoch: 2 | total time: 148.61m | eta: 30.9m +step 13835/16704 (82.82%) | loss: 2.486232 | lrm: 0.34 | dt: 642.70ms | tok/sec: 815,756 | mfu: 50.99 | epoch: 2 | total time: 148.62m | eta: 30.8m +step 13836/16704 (82.83%) | loss: 2.485068 | lrm: 0.34 | dt: 644.26ms | tok/sec: 813,785 | mfu: 50.86 | epoch: 2 | total time: 148.63m | eta: 30.8m +step 13837/16704 (82.84%) | loss: 2.500583 | lrm: 0.34 | dt: 642.53ms | tok/sec: 815,975 | mfu: 51.00 | epoch: 2 | total time: 148.64m | eta: 30.8m +step 13838/16704 (82.84%) | loss: 2.502860 | lrm: 0.34 | dt: 643.19ms | tok/sec: 815,133 | mfu: 50.95 | epoch: 2 | total time: 148.65m | eta: 30.8m +step 13839/16704 (82.85%) | loss: 2.507696 | lrm: 0.34 | dt: 643.06ms | tok/sec: 815,305 | mfu: 50.96 | epoch: 2 | total time: 148.66m | eta: 30.8m +step 13840/16704 (82.85%) | loss: 2.521770 | lrm: 0.34 | dt: 644.62ms | tok/sec: 813,324 | mfu: 50.83 | epoch: 2 | total time: 148.67m | eta: 30.8m +step 13841/16704 (82.86%) | loss: 2.529376 | lrm: 0.34 | dt: 644.43ms | tok/sec: 813,564 | mfu: 50.85 | epoch: 2 | total time: 148.68m | eta: 30.8m +step 13842/16704 (82.87%) | loss: 2.524605 | lrm: 0.34 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 2 | total time: 148.70m | eta: 30.8m +step 13843/16704 (82.87%) | loss: 2.516387 | lrm: 0.34 | dt: 645.23ms | tok/sec: 812,566 | mfu: 50.79 | epoch: 2 | total time: 148.71m | eta: 30.8m +step 13844/16704 (82.88%) | loss: 2.538004 | lrm: 0.34 | dt: 641.28ms | tok/sec: 817,561 | mfu: 51.10 | epoch: 2 | total time: 148.72m | eta: 30.7m +step 13845/16704 (82.88%) | loss: 2.534910 | lrm: 0.34 | dt: 646.32ms | tok/sec: 811,194 | mfu: 50.70 | epoch: 2 | total time: 148.73m | eta: 30.7m +step 13846/16704 (82.89%) | loss: 2.526606 | lrm: 0.34 | dt: 643.75ms | tok/sec: 814,433 | mfu: 50.90 | epoch: 2 | total time: 148.74m | eta: 30.7m +step 13847/16704 (82.90%) | loss: 
2.523860 | lrm: 0.34 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 2 | total time: 148.75m | eta: 30.7m +step 13848/16704 (82.90%) | loss: 2.523934 | lrm: 0.34 | dt: 644.90ms | tok/sec: 812,974 | mfu: 50.81 | epoch: 2 | total time: 148.76m | eta: 30.7m +step 13849/16704 (82.91%) | loss: 2.526712 | lrm: 0.34 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 2 | total time: 148.77m | eta: 30.7m +step 13850/16704 (82.91%) | loss: 2.532018 | lrm: 0.34 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 2 | total time: 148.78m | eta: 30.7m +step 13851/16704 (82.92%) | loss: 2.515200 | lrm: 0.34 | dt: 647.16ms | tok/sec: 810,135 | mfu: 50.63 | epoch: 2 | total time: 148.79m | eta: 30.7m +step 13852/16704 (82.93%) | loss: 2.519921 | lrm: 0.34 | dt: 644.26ms | tok/sec: 813,777 | mfu: 50.86 | epoch: 2 | total time: 148.80m | eta: 30.7m +step 13853/16704 (82.93%) | loss: 2.515566 | lrm: 0.34 | dt: 642.96ms | tok/sec: 815,422 | mfu: 50.97 | epoch: 2 | total time: 148.81m | eta: 30.6m +step 13854/16704 (82.94%) | loss: 2.520053 | lrm: 0.34 | dt: 645.76ms | tok/sec: 811,891 | mfu: 50.74 | epoch: 2 | total time: 148.82m | eta: 30.6m +step 13855/16704 (82.94%) | loss: 2.521625 | lrm: 0.34 | dt: 643.37ms | tok/sec: 814,909 | mfu: 50.93 | epoch: 2 | total time: 148.83m | eta: 30.6m +step 13856/16704 (82.95%) | loss: 2.513925 | lrm: 0.34 | dt: 646.80ms | tok/sec: 810,582 | mfu: 50.66 | epoch: 2 | total time: 148.85m | eta: 30.6m +step 13857/16704 (82.96%) | loss: 2.515267 | lrm: 0.34 | dt: 643.02ms | tok/sec: 815,348 | mfu: 50.96 | epoch: 2 | total time: 148.86m | eta: 30.6m +step 13858/16704 (82.96%) | loss: 2.515571 | lrm: 0.34 | dt: 645.96ms | tok/sec: 811,640 | mfu: 50.73 | epoch: 2 | total time: 148.87m | eta: 30.6m +step 13859/16704 (82.97%) | loss: 2.507071 | lrm: 0.34 | dt: 644.66ms | tok/sec: 813,283 | mfu: 50.83 | epoch: 2 | total time: 148.88m | eta: 30.6m +step 13860/16704 (82.97%) | loss: 2.498120 | lrm: 0.34 | dt: 644.51ms | tok/sec: 813,463 | mfu: 
50.84 | epoch: 2 | total time: 148.89m | eta: 30.6m +step 13861/16704 (82.98%) | loss: 2.501385 | lrm: 0.34 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 2 | total time: 148.90m | eta: 30.6m +step 13862/16704 (82.99%) | loss: 2.507325 | lrm: 0.34 | dt: 644.03ms | tok/sec: 814,078 | mfu: 50.88 | epoch: 2 | total time: 148.91m | eta: 30.6m +step 13863/16704 (82.99%) | loss: 2.504828 | lrm: 0.34 | dt: 643.91ms | tok/sec: 814,222 | mfu: 50.89 | epoch: 2 | total time: 148.92m | eta: 30.5m +step 13864/16704 (83.00%) | loss: 2.512439 | lrm: 0.34 | dt: 646.52ms | tok/sec: 810,933 | mfu: 50.68 | epoch: 2 | total time: 148.93m | eta: 30.5m +step 13865/16704 (83.00%) | loss: 2.517497 | lrm: 0.34 | dt: 642.01ms | tok/sec: 816,637 | mfu: 51.04 | epoch: 2 | total time: 148.94m | eta: 30.5m +step 13866/16704 (83.01%) | loss: 2.512303 | lrm: 0.34 | dt: 646.16ms | tok/sec: 811,384 | mfu: 50.71 | epoch: 2 | total time: 148.95m | eta: 30.5m +step 13867/16704 (83.02%) | loss: 2.525065 | lrm: 0.34 | dt: 643.99ms | tok/sec: 814,124 | mfu: 50.88 | epoch: 2 | total time: 148.96m | eta: 30.5m +step 13868/16704 (83.02%) | loss: 2.524921 | lrm: 0.34 | dt: 643.14ms | tok/sec: 815,203 | mfu: 50.95 | epoch: 2 | total time: 148.97m | eta: 30.5m +step 13869/16704 (83.03%) | loss: 2.517854 | lrm: 0.34 | dt: 646.22ms | tok/sec: 811,319 | mfu: 50.71 | epoch: 2 | total time: 148.99m | eta: 30.5m +step 13870/16704 (83.03%) | loss: 2.538662 | lrm: 0.34 | dt: 642.53ms | tok/sec: 815,975 | mfu: 51.00 | epoch: 2 | total time: 149.00m | eta: 30.5m +step 13871/16704 (83.04%) | loss: 2.544247 | lrm: 0.34 | dt: 645.33ms | tok/sec: 812,436 | mfu: 50.78 | epoch: 2 | total time: 149.01m | eta: 30.5m +step 13872/16704 (83.05%) | loss: 2.551853 | lrm: 0.34 | dt: 646.03ms | tok/sec: 811,549 | mfu: 50.72 | epoch: 2 | total time: 149.02m | eta: 30.4m +step 13873/16704 (83.05%) | loss: 2.545285 | lrm: 0.34 | dt: 641.24ms | tok/sec: 817,614 | mfu: 51.10 | epoch: 2 | total time: 149.03m | eta: 30.4m +step 
13874/16704 (83.06%) | loss: 2.538508 | lrm: 0.34 | dt: 646.57ms | tok/sec: 810,876 | mfu: 50.68 | epoch: 2 | total time: 149.04m | eta: 30.4m +step 13875/16704 (83.06%) | loss: 2.533285 | lrm: 0.34 | dt: 644.35ms | tok/sec: 813,663 | mfu: 50.86 | epoch: 2 | total time: 149.05m | eta: 30.4m +step 13876/16704 (83.07%) | loss: 2.527158 | lrm: 0.34 | dt: 642.85ms | tok/sec: 815,569 | mfu: 50.97 | epoch: 2 | total time: 149.06m | eta: 30.4m +step 13877/16704 (83.08%) | loss: 2.519324 | lrm: 0.34 | dt: 643.57ms | tok/sec: 814,657 | mfu: 50.92 | epoch: 2 | total time: 149.07m | eta: 30.4m +step 13878/16704 (83.08%) | loss: 2.510068 | lrm: 0.34 | dt: 642.29ms | tok/sec: 816,274 | mfu: 51.02 | epoch: 2 | total time: 149.08m | eta: 30.4m +step 13879/16704 (83.09%) | loss: 2.510040 | lrm: 0.34 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 2 | total time: 149.09m | eta: 30.4m +step 13880/16704 (83.09%) | loss: 2.508900 | lrm: 0.34 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total time: 149.10m | eta: 30.4m +step 13881/16704 (83.10%) | loss: 2.513899 | lrm: 0.34 | dt: 646.93ms | tok/sec: 810,423 | mfu: 50.65 | epoch: 2 | total time: 149.11m | eta: 30.3m +step 13882/16704 (83.11%) | loss: 2.521124 | lrm: 0.34 | dt: 641.94ms | tok/sec: 816,719 | mfu: 51.05 | epoch: 2 | total time: 149.12m | eta: 30.3m +step 13883/16704 (83.11%) | loss: 2.534717 | lrm: 0.34 | dt: 647.00ms | tok/sec: 810,332 | mfu: 50.65 | epoch: 2 | total time: 149.14m | eta: 30.3m +step 13884/16704 (83.12%) | loss: 2.522123 | lrm: 0.34 | dt: 642.52ms | tok/sec: 815,993 | mfu: 51.00 | epoch: 2 | total time: 149.15m | eta: 30.3m +step 13885/16704 (83.12%) | loss: 2.513212 | lrm: 0.34 | dt: 644.05ms | tok/sec: 814,049 | mfu: 50.88 | epoch: 2 | total time: 149.16m | eta: 30.3m +step 13886/16704 (83.13%) | loss: 2.516984 | lrm: 0.34 | dt: 648.34ms | tok/sec: 808,663 | mfu: 50.54 | epoch: 2 | total time: 149.17m | eta: 30.3m +step 13887/16704 (83.14%) | loss: 2.526429 | lrm: 0.34 | dt: 
647.73ms | tok/sec: 809,423 | mfu: 50.59 | epoch: 2 | total time: 149.18m | eta: 30.3m +step 13888/16704 (83.14%) | loss: 2.523052 | lrm: 0.34 | dt: 642.47ms | tok/sec: 816,052 | mfu: 51.00 | epoch: 2 | total time: 149.19m | eta: 30.3m +step 13889/16704 (83.15%) | loss: 2.507686 | lrm: 0.34 | dt: 648.18ms | tok/sec: 808,857 | mfu: 50.55 | epoch: 2 | total time: 149.20m | eta: 30.3m +step 13890/16704 (83.15%) | loss: 2.505419 | lrm: 0.34 | dt: 641.40ms | tok/sec: 817,411 | mfu: 51.09 | epoch: 2 | total time: 149.21m | eta: 30.3m +step 13891/16704 (83.16%) | loss: 2.508241 | lrm: 0.34 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 2 | total time: 149.22m | eta: 30.2m +step 13892/16704 (83.17%) | loss: 2.505359 | lrm: 0.34 | dt: 645.96ms | tok/sec: 811,644 | mfu: 50.73 | epoch: 2 | total time: 149.23m | eta: 30.2m +step 13893/16704 (83.17%) | loss: 2.491914 | lrm: 0.34 | dt: 644.07ms | tok/sec: 814,019 | mfu: 50.88 | epoch: 2 | total time: 149.24m | eta: 30.2m +step 13894/16704 (83.18%) | loss: 2.493042 | lrm: 0.34 | dt: 645.82ms | tok/sec: 811,818 | mfu: 50.74 | epoch: 2 | total time: 149.25m | eta: 30.2m +step 13895/16704 (83.18%) | loss: 2.492934 | lrm: 0.34 | dt: 643.44ms | tok/sec: 814,819 | mfu: 50.93 | epoch: 2 | total time: 149.26m | eta: 30.2m +step 13896/16704 (83.19%) | loss: 2.508146 | lrm: 0.34 | dt: 647.00ms | tok/sec: 810,341 | mfu: 50.65 | epoch: 2 | total time: 149.28m | eta: 30.2m +step 13897/16704 (83.20%) | loss: 2.507287 | lrm: 0.34 | dt: 645.11ms | tok/sec: 812,710 | mfu: 50.80 | epoch: 2 | total time: 149.29m | eta: 30.2m +step 13898/16704 (83.20%) | loss: 2.510249 | lrm: 0.34 | dt: 645.91ms | tok/sec: 811,706 | mfu: 50.73 | epoch: 2 | total time: 149.30m | eta: 30.2m +step 13899/16704 (83.21%) | loss: 2.519780 | lrm: 0.34 | dt: 644.09ms | tok/sec: 813,994 | mfu: 50.88 | epoch: 2 | total time: 149.31m | eta: 30.2m +step 13900/16704 (83.21%) | loss: 2.506470 | lrm: 0.34 | dt: 646.21ms | tok/sec: 811,321 | mfu: 50.71 | epoch: 2 | total 
time: 149.32m | eta: 30.1m +step 13901/16704 (83.22%) | loss: 2.504873 | lrm: 0.34 | dt: 643.34ms | tok/sec: 814,943 | mfu: 50.94 | epoch: 2 | total time: 149.33m | eta: 30.1m +step 13902/16704 (83.23%) | loss: 2.499674 | lrm: 0.34 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 2 | total time: 149.34m | eta: 30.1m +step 13903/16704 (83.23%) | loss: 2.502620 | lrm: 0.34 | dt: 643.57ms | tok/sec: 814,655 | mfu: 50.92 | epoch: 2 | total time: 149.35m | eta: 30.1m +step 13904/16704 (83.24%) | loss: 2.501428 | lrm: 0.34 | dt: 644.64ms | tok/sec: 813,301 | mfu: 50.83 | epoch: 2 | total time: 149.36m | eta: 30.1m +step 13905/16704 (83.24%) | loss: 2.497593 | lrm: 0.34 | dt: 644.28ms | tok/sec: 813,757 | mfu: 50.86 | epoch: 2 | total time: 149.37m | eta: 30.1m +step 13906/16704 (83.25%) | loss: 2.506047 | lrm: 0.34 | dt: 645.32ms | tok/sec: 812,451 | mfu: 50.78 | epoch: 2 | total time: 149.38m | eta: 30.1m +step 13907/16704 (83.26%) | loss: 2.505026 | lrm: 0.33 | dt: 643.61ms | tok/sec: 814,600 | mfu: 50.91 | epoch: 2 | total time: 149.39m | eta: 30.1m +step 13908/16704 (83.26%) | loss: 2.513753 | lrm: 0.33 | dt: 643.52ms | tok/sec: 814,721 | mfu: 50.92 | epoch: 2 | total time: 149.40m | eta: 30.1m +step 13909/16704 (83.27%) | loss: 2.520244 | lrm: 0.33 | dt: 645.86ms | tok/sec: 811,766 | mfu: 50.74 | epoch: 2 | total time: 149.41m | eta: 30.0m +step 13910/16704 (83.27%) | loss: 2.514736 | lrm: 0.33 | dt: 647.04ms | tok/sec: 810,292 | mfu: 50.64 | epoch: 2 | total time: 149.43m | eta: 30.0m +step 13911/16704 (83.28%) | loss: 2.526240 | lrm: 0.33 | dt: 643.40ms | tok/sec: 814,873 | mfu: 50.93 | epoch: 2 | total time: 149.44m | eta: 30.0m +step 13912/16704 (83.29%) | loss: 2.524999 | lrm: 0.33 | dt: 646.24ms | tok/sec: 811,292 | mfu: 50.71 | epoch: 2 | total time: 149.45m | eta: 30.0m +step 13913/16704 (83.29%) | loss: 2.530943 | lrm: 0.33 | dt: 643.01ms | tok/sec: 815,366 | mfu: 50.96 | epoch: 2 | total time: 149.46m | eta: 30.0m +step 13914/16704 (83.30%) | loss: 
2.529089 | lrm: 0.33 | dt: 643.71ms | tok/sec: 814,475 | mfu: 50.91 | epoch: 2 | total time: 149.47m | eta: 30.0m +step 13915/16704 (83.30%) | loss: 2.536835 | lrm: 0.33 | dt: 643.97ms | tok/sec: 814,148 | mfu: 50.89 | epoch: 2 | total time: 149.48m | eta: 30.0m +step 13916/16704 (83.31%) | loss: 2.533737 | lrm: 0.33 | dt: 642.40ms | tok/sec: 816,139 | mfu: 51.01 | epoch: 2 | total time: 149.49m | eta: 30.0m +step 13917/16704 (83.32%) | loss: 2.525853 | lrm: 0.33 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 2 | total time: 149.50m | eta: 30.0m +step 13918/16704 (83.32%) | loss: 2.518641 | lrm: 0.33 | dt: 642.32ms | tok/sec: 816,246 | mfu: 51.02 | epoch: 2 | total time: 149.51m | eta: 29.9m +step 13919/16704 (83.33%) | loss: 2.517959 | lrm: 0.33 | dt: 643.89ms | tok/sec: 814,254 | mfu: 50.89 | epoch: 2 | total time: 149.52m | eta: 29.9m +step 13920/16704 (83.33%) | loss: 2.514379 | lrm: 0.33 | dt: 644.41ms | tok/sec: 813,592 | mfu: 50.85 | epoch: 2 | total time: 149.53m | eta: 29.9m +step 13921/16704 (83.34%) | loss: 2.526385 | lrm: 0.33 | dt: 642.74ms | tok/sec: 815,702 | mfu: 50.98 | epoch: 2 | total time: 149.54m | eta: 29.9m +step 13922/16704 (83.35%) | loss: 2.518865 | lrm: 0.33 | dt: 643.50ms | tok/sec: 814,745 | mfu: 50.92 | epoch: 2 | total time: 149.55m | eta: 29.9m +step 13923/16704 (83.35%) | loss: 2.514983 | lrm: 0.33 | dt: 643.09ms | tok/sec: 815,266 | mfu: 50.96 | epoch: 2 | total time: 149.57m | eta: 29.9m +step 13924/16704 (83.36%) | loss: 2.507503 | lrm: 0.33 | dt: 642.26ms | tok/sec: 816,317 | mfu: 51.02 | epoch: 2 | total time: 149.58m | eta: 29.9m +step 13925/16704 (83.36%) | loss: 2.503302 | lrm: 0.33 | dt: 645.30ms | tok/sec: 812,466 | mfu: 50.78 | epoch: 2 | total time: 149.59m | eta: 29.9m +step 13926/16704 (83.37%) | loss: 2.496933 | lrm: 0.33 | dt: 642.86ms | tok/sec: 815,551 | mfu: 50.97 | epoch: 2 | total time: 149.60m | eta: 29.9m +step 13927/16704 (83.38%) | loss: 2.501859 | lrm: 0.33 | dt: 643.11ms | tok/sec: 815,236 | mfu: 
50.95 | epoch: 2 | total time: 149.61m | eta: 29.9m +step 13928/16704 (83.38%) | loss: 2.499844 | lrm: 0.33 | dt: 643.28ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 149.62m | eta: 29.8m +step 13929/16704 (83.39%) | loss: 2.503725 | lrm: 0.33 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 2 | total time: 149.63m | eta: 29.8m +step 13930/16704 (83.39%) | loss: 2.502874 | lrm: 0.33 | dt: 642.48ms | tok/sec: 816,042 | mfu: 51.00 | epoch: 2 | total time: 149.64m | eta: 29.8m +step 13931/16704 (83.40%) | loss: 2.493734 | lrm: 0.33 | dt: 644.53ms | tok/sec: 813,444 | mfu: 50.84 | epoch: 2 | total time: 149.65m | eta: 29.8m +step 13932/16704 (83.41%) | loss: 2.483958 | lrm: 0.33 | dt: 642.27ms | tok/sec: 816,305 | mfu: 51.02 | epoch: 2 | total time: 149.66m | eta: 29.8m +step 13933/16704 (83.41%) | loss: 2.487345 | lrm: 0.33 | dt: 643.90ms | tok/sec: 814,239 | mfu: 50.89 | epoch: 2 | total time: 149.67m | eta: 29.8m +step 13934/16704 (83.42%) | loss: 2.472336 | lrm: 0.33 | dt: 643.12ms | tok/sec: 815,230 | mfu: 50.95 | epoch: 2 | total time: 149.68m | eta: 29.8m +step 13935/16704 (83.42%) | loss: 2.476903 | lrm: 0.33 | dt: 645.36ms | tok/sec: 812,391 | mfu: 50.78 | epoch: 2 | total time: 149.69m | eta: 29.8m +step 13936/16704 (83.43%) | loss: 2.479781 | lrm: 0.33 | dt: 643.13ms | tok/sec: 815,217 | mfu: 50.95 | epoch: 2 | total time: 149.70m | eta: 29.8m +step 13937/16704 (83.44%) | loss: 2.491108 | lrm: 0.33 | dt: 643.50ms | tok/sec: 814,749 | mfu: 50.92 | epoch: 2 | total time: 149.72m | eta: 29.7m +step 13938/16704 (83.44%) | loss: 2.486167 | lrm: 0.33 | dt: 644.86ms | tok/sec: 813,022 | mfu: 50.82 | epoch: 2 | total time: 149.73m | eta: 29.7m +step 13939/16704 (83.45%) | loss: 2.492910 | lrm: 0.33 | dt: 643.99ms | tok/sec: 814,118 | mfu: 50.88 | epoch: 2 | total time: 149.74m | eta: 29.7m +step 13940/16704 (83.45%) | loss: 2.496682 | lrm: 0.33 | dt: 641.30ms | tok/sec: 817,534 | mfu: 51.10 | epoch: 2 | total time: 149.75m | eta: 29.7m +step 
13941/16704 (83.46%) | loss: 2.492865 | lrm: 0.33 | dt: 645.32ms | tok/sec: 812,443 | mfu: 50.78 | epoch: 2 | total time: 149.76m | eta: 29.7m +step 13942/16704 (83.47%) | loss: 2.498301 | lrm: 0.33 | dt: 642.96ms | tok/sec: 815,426 | mfu: 50.97 | epoch: 2 | total time: 149.77m | eta: 29.7m +step 13943/16704 (83.47%) | loss: 2.498676 | lrm: 0.33 | dt: 645.10ms | tok/sec: 812,729 | mfu: 50.80 | epoch: 2 | total time: 149.78m | eta: 29.7m +step 13944/16704 (83.48%) | loss: 2.496117 | lrm: 0.33 | dt: 642.92ms | tok/sec: 815,480 | mfu: 50.97 | epoch: 2 | total time: 149.79m | eta: 29.7m +step 13945/16704 (83.48%) | loss: 2.504613 | lrm: 0.33 | dt: 644.08ms | tok/sec: 814,011 | mfu: 50.88 | epoch: 2 | total time: 149.80m | eta: 29.7m +step 13946/16704 (83.49%) | loss: 2.509581 | lrm: 0.33 | dt: 644.48ms | tok/sec: 813,509 | mfu: 50.85 | epoch: 2 | total time: 149.81m | eta: 29.6m +step 13947/16704 (83.49%) | loss: 2.501056 | lrm: 0.33 | dt: 641.08ms | tok/sec: 817,817 | mfu: 51.11 | epoch: 2 | total time: 149.82m | eta: 29.6m +step 13948/16704 (83.50%) | loss: 2.515633 | lrm: 0.33 | dt: 642.93ms | tok/sec: 815,468 | mfu: 50.97 | epoch: 2 | total time: 149.83m | eta: 29.6m +step 13949/16704 (83.51%) | loss: 2.521267 | lrm: 0.33 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 2 | total time: 149.84m | eta: 29.6m +step 13950/16704 (83.51%) | loss: 2.531973 | lrm: 0.33 | dt: 642.77ms | tok/sec: 815,668 | mfu: 50.98 | epoch: 2 | total time: 149.85m | eta: 29.6m +step 13951/16704 (83.52%) | loss: 2.534215 | lrm: 0.33 | dt: 644.75ms | tok/sec: 813,161 | mfu: 50.82 | epoch: 2 | total time: 149.87m | eta: 29.6m +step 13952/16704 (83.52%) | loss: 2.544005 | lrm: 0.33 | dt: 640.32ms | tok/sec: 818,791 | mfu: 51.18 | epoch: 2 | total time: 149.88m | eta: 29.6m +step 13953/16704 (83.53%) | loss: 2.548341 | lrm: 0.33 | dt: 644.78ms | tok/sec: 813,121 | mfu: 50.82 | epoch: 2 | total time: 149.89m | eta: 29.6m +step 13954/16704 (83.54%) | loss: 2.548274 | lrm: 0.33 | dt: 
645.28ms | tok/sec: 812,499 | mfu: 50.78 | epoch: 2 | total time: 149.90m | eta: 29.6m +step 13955/16704 (83.54%) | loss: 2.548354 | lrm: 0.33 | dt: 643.27ms | tok/sec: 815,029 | mfu: 50.94 | epoch: 2 | total time: 149.91m | eta: 29.6m +step 13956/16704 (83.55%) | loss: 2.553996 | lrm: 0.33 | dt: 642.18ms | tok/sec: 816,417 | mfu: 51.03 | epoch: 2 | total time: 149.92m | eta: 29.5m +step 13957/16704 (83.55%) | loss: 2.560467 | lrm: 0.33 | dt: 644.12ms | tok/sec: 813,965 | mfu: 50.87 | epoch: 2 | total time: 149.93m | eta: 29.5m +step 13958/16704 (83.56%) | loss: 2.552801 | lrm: 0.33 | dt: 644.25ms | tok/sec: 813,800 | mfu: 50.86 | epoch: 2 | total time: 149.94m | eta: 29.5m +step 13959/16704 (83.57%) | loss: 2.539376 | lrm: 0.33 | dt: 644.78ms | tok/sec: 813,130 | mfu: 50.82 | epoch: 2 | total time: 149.95m | eta: 29.5m +step 13960/16704 (83.57%) | loss: 2.540443 | lrm: 0.33 | dt: 643.45ms | tok/sec: 814,808 | mfu: 50.93 | epoch: 2 | total time: 149.96m | eta: 29.5m +step 13961/16704 (83.58%) | loss: 2.534662 | lrm: 0.33 | dt: 643.31ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 2 | total time: 149.97m | eta: 29.5m +step 13962/16704 (83.58%) | loss: 2.532254 | lrm: 0.33 | dt: 644.12ms | tok/sec: 813,966 | mfu: 50.87 | epoch: 2 | total time: 149.98m | eta: 29.5m +step 13963/16704 (83.59%) | loss: 2.531671 | lrm: 0.33 | dt: 643.39ms | tok/sec: 814,884 | mfu: 50.93 | epoch: 2 | total time: 149.99m | eta: 29.5m +step 13964/16704 (83.60%) | loss: 2.539297 | lrm: 0.33 | dt: 642.21ms | tok/sec: 816,375 | mfu: 51.02 | epoch: 2 | total time: 150.00m | eta: 29.5m +step 13965/16704 (83.60%) | loss: 2.540043 | lrm: 0.33 | dt: 645.60ms | tok/sec: 812,090 | mfu: 50.76 | epoch: 2 | total time: 150.02m | eta: 29.4m +step 13966/16704 (83.61%) | loss: 2.534901 | lrm: 0.33 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 150.03m | eta: 29.4m +step 13967/16704 (83.61%) | loss: 2.535357 | lrm: 0.33 | dt: 643.42ms | tok/sec: 814,850 | mfu: 50.93 | epoch: 2 | total 
time: 150.04m | eta: 29.4m +step 13968/16704 (83.62%) | loss: 2.536978 | lrm: 0.33 | dt: 643.57ms | tok/sec: 814,654 | mfu: 50.92 | epoch: 2 | total time: 150.05m | eta: 29.4m +step 13969/16704 (83.63%) | loss: 2.526187 | lrm: 0.33 | dt: 641.46ms | tok/sec: 817,331 | mfu: 51.08 | epoch: 2 | total time: 150.06m | eta: 29.4m +step 13970/16704 (83.63%) | loss: 2.516347 | lrm: 0.33 | dt: 644.72ms | tok/sec: 813,196 | mfu: 50.83 | epoch: 2 | total time: 150.07m | eta: 29.4m +step 13971/16704 (83.64%) | loss: 2.510350 | lrm: 0.33 | dt: 643.07ms | tok/sec: 815,289 | mfu: 50.96 | epoch: 2 | total time: 150.08m | eta: 29.4m +step 13972/16704 (83.64%) | loss: 2.513875 | lrm: 0.33 | dt: 646.51ms | tok/sec: 810,946 | mfu: 50.69 | epoch: 2 | total time: 150.09m | eta: 29.4m +step 13973/16704 (83.65%) | loss: 2.518000 | lrm: 0.33 | dt: 644.48ms | tok/sec: 813,508 | mfu: 50.85 | epoch: 2 | total time: 150.10m | eta: 29.4m +step 13974/16704 (83.66%) | loss: 2.523639 | lrm: 0.33 | dt: 644.06ms | tok/sec: 814,031 | mfu: 50.88 | epoch: 2 | total time: 150.11m | eta: 29.3m +step 13975/16704 (83.66%) | loss: 2.513534 | lrm: 0.33 | dt: 643.74ms | tok/sec: 814,440 | mfu: 50.90 | epoch: 2 | total time: 150.12m | eta: 29.3m +step 13976/16704 (83.67%) | loss: 2.507852 | lrm: 0.33 | dt: 644.56ms | tok/sec: 813,403 | mfu: 50.84 | epoch: 2 | total time: 150.13m | eta: 29.3m +step 13977/16704 (83.67%) | loss: 2.510524 | lrm: 0.33 | dt: 643.17ms | tok/sec: 815,156 | mfu: 50.95 | epoch: 2 | total time: 150.14m | eta: 29.3m +step 13978/16704 (83.68%) | loss: 2.511124 | lrm: 0.33 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 2 | total time: 150.16m | eta: 29.3m +step 13979/16704 (83.69%) | loss: 2.538268 | lrm: 0.33 | dt: 644.39ms | tok/sec: 813,619 | mfu: 50.85 | epoch: 2 | total time: 150.17m | eta: 29.3m +step 13980/16704 (83.69%) | loss: 2.538328 | lrm: 0.33 | dt: 643.82ms | tok/sec: 814,336 | mfu: 50.90 | epoch: 2 | total time: 150.18m | eta: 29.3m +step 13981/16704 (83.70%) | loss: 
2.522287 | lrm: 0.33 | dt: 646.25ms | tok/sec: 811,276 | mfu: 50.71 | epoch: 2 | total time: 150.19m | eta: 29.3m +step 13982/16704 (83.70%) | loss: 2.514491 | lrm: 0.33 | dt: 642.64ms | tok/sec: 815,838 | mfu: 50.99 | epoch: 2 | total time: 150.20m | eta: 29.3m +step 13983/16704 (83.71%) | loss: 2.506921 | lrm: 0.33 | dt: 643.21ms | tok/sec: 815,116 | mfu: 50.95 | epoch: 2 | total time: 150.21m | eta: 29.3m +step 13984/16704 (83.72%) | loss: 2.517284 | lrm: 0.33 | dt: 645.08ms | tok/sec: 812,752 | mfu: 50.80 | epoch: 2 | total time: 150.22m | eta: 29.2m +step 13985/16704 (83.72%) | loss: 2.522597 | lrm: 0.33 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 2 | total time: 150.23m | eta: 29.2m +step 13986/16704 (83.73%) | loss: 2.518538 | lrm: 0.33 | dt: 643.08ms | tok/sec: 815,275 | mfu: 50.96 | epoch: 2 | total time: 150.24m | eta: 29.2m +step 13987/16704 (83.73%) | loss: 2.519068 | lrm: 0.33 | dt: 643.96ms | tok/sec: 814,159 | mfu: 50.89 | epoch: 2 | total time: 150.25m | eta: 29.2m +step 13988/16704 (83.74%) | loss: 2.514707 | lrm: 0.33 | dt: 643.69ms | tok/sec: 814,499 | mfu: 50.91 | epoch: 2 | total time: 150.26m | eta: 29.2m +step 13989/16704 (83.75%) | loss: 2.509194 | lrm: 0.33 | dt: 646.58ms | tok/sec: 810,868 | mfu: 50.68 | epoch: 2 | total time: 150.27m | eta: 29.2m +step 13990/16704 (83.75%) | loss: 2.505161 | lrm: 0.32 | dt: 643.63ms | tok/sec: 814,573 | mfu: 50.91 | epoch: 2 | total time: 150.28m | eta: 29.2m +step 13991/16704 (83.76%) | loss: 2.494419 | lrm: 0.32 | dt: 645.20ms | tok/sec: 812,595 | mfu: 50.79 | epoch: 2 | total time: 150.29m | eta: 29.2m +step 13992/16704 (83.76%) | loss: 2.493773 | lrm: 0.32 | dt: 646.14ms | tok/sec: 811,410 | mfu: 50.71 | epoch: 2 | total time: 150.31m | eta: 29.2m +step 13993/16704 (83.77%) | loss: 2.497621 | lrm: 0.32 | dt: 642.62ms | tok/sec: 815,859 | mfu: 50.99 | epoch: 2 | total time: 150.32m | eta: 29.1m +step 13994/16704 (83.78%) | loss: 2.489466 | lrm: 0.32 | dt: 645.00ms | tok/sec: 812,851 | mfu: 
50.80 | epoch: 2 | total time: 150.33m | eta: 29.1m +step 13995/16704 (83.78%) | loss: 2.491218 | lrm: 0.32 | dt: 643.36ms | tok/sec: 814,922 | mfu: 50.93 | epoch: 2 | total time: 150.34m | eta: 29.1m +step 13996/16704 (83.79%) | loss: 2.487169 | lrm: 0.32 | dt: 642.06ms | tok/sec: 816,574 | mfu: 51.04 | epoch: 2 | total time: 150.35m | eta: 29.1m +step 13997/16704 (83.79%) | loss: 2.489585 | lrm: 0.32 | dt: 646.64ms | tok/sec: 810,788 | mfu: 50.68 | epoch: 2 | total time: 150.36m | eta: 29.1m +step 13998/16704 (83.80%) | loss: 2.486773 | lrm: 0.32 | dt: 640.49ms | tok/sec: 818,567 | mfu: 51.16 | epoch: 2 | total time: 150.37m | eta: 29.1m +step 13999/16704 (83.81%) | loss: 2.491862 | lrm: 0.32 | dt: 644.66ms | tok/sec: 813,280 | mfu: 50.83 | epoch: 2 | total time: 150.38m | eta: 29.1m +[GC rank5] gen2: 321.7ms collected 91008 objects +[GC rank0] gen2: 325.2ms collected 91112 objects +[GC rank6] gen2: 329.4ms collected 90984 objects +[GC rank1] gen2: 329.6ms collected 91088 objects +[GC rank2] gen2: 332.5ms collected 91056 objects +[GC rank3] gen2: 426.9ms collected 91048 objects +[GC rank4] gen2: 467.1ms collected 91024 objects +[GC rank7] gen2: 478.9ms collected 90976 objects +Step 14000 | Validation bpb: 0.769665 +step 14000/16704 (83.81%) | loss: 2.498053 | lrm: 0.32 | dt: 632.12ms | tok/sec: 829,415 | mfu: 51.84 | epoch: 2 | total time: 150.39m | eta: 29.1m +step 14001/16704 (83.82%) | loss: 2.503363 | lrm: 0.32 | dt: 653.64ms | tok/sec: 802,100 | mfu: 50.13 | epoch: 2 | total time: 150.40m | eta: 29.1m +step 14002/16704 (83.82%) | loss: 2.506138 | lrm: 0.32 | dt: 640.59ms | tok/sec: 818,443 | mfu: 51.15 | epoch: 2 | total time: 150.41m | eta: 29.0m +step 14003/16704 (83.83%) | loss: 2.513221 | lrm: 0.32 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 2 | total time: 150.42m | eta: 29.0m +step 14004/16704 (83.84%) | loss: 2.508227 | lrm: 0.32 | dt: 652.35ms | tok/sec: 803,688 | mfu: 50.23 | epoch: 2 | total time: 150.43m | eta: 29.0m +step 14005/16704 
(83.84%) | loss: 2.520303 | lrm: 0.32 | dt: 637.67ms | tok/sec: 822,199 | mfu: 51.39 | epoch: 2 | total time: 150.45m | eta: 29.0m +step 14006/16704 (83.85%) | loss: 2.527279 | lrm: 0.32 | dt: 646.49ms | tok/sec: 810,974 | mfu: 50.69 | epoch: 2 | total time: 150.46m | eta: 29.0m +step 14007/16704 (83.85%) | loss: 2.512939 | lrm: 0.32 | dt: 646.48ms | tok/sec: 810,982 | mfu: 50.69 | epoch: 2 | total time: 150.47m | eta: 29.0m +step 14008/16704 (83.86%) | loss: 2.518658 | lrm: 0.32 | dt: 642.02ms | tok/sec: 816,628 | mfu: 51.04 | epoch: 2 | total time: 150.48m | eta: 29.0m +step 14009/16704 (83.87%) | loss: 2.527598 | lrm: 0.32 | dt: 643.97ms | tok/sec: 814,150 | mfu: 50.89 | epoch: 2 | total time: 150.49m | eta: 29.0m +step 14010/16704 (83.87%) | loss: 2.520489 | lrm: 0.32 | dt: 644.28ms | tok/sec: 813,761 | mfu: 50.86 | epoch: 2 | total time: 150.50m | eta: 29.0m +step 14011/16704 (83.88%) | loss: 2.516235 | lrm: 0.32 | dt: 643.50ms | tok/sec: 814,749 | mfu: 50.92 | epoch: 2 | total time: 150.51m | eta: 28.9m +step 14012/16704 (83.88%) | loss: 2.515427 | lrm: 0.32 | dt: 644.30ms | tok/sec: 813,738 | mfu: 50.86 | epoch: 2 | total time: 150.52m | eta: 28.9m +step 14013/16704 (83.89%) | loss: 2.514600 | lrm: 0.32 | dt: 642.97ms | tok/sec: 815,419 | mfu: 50.96 | epoch: 2 | total time: 150.53m | eta: 28.9m +step 14014/16704 (83.90%) | loss: 2.503319 | lrm: 0.32 | dt: 643.57ms | tok/sec: 814,661 | mfu: 50.92 | epoch: 2 | total time: 150.54m | eta: 28.9m +step 14015/16704 (83.90%) | loss: 2.511854 | lrm: 0.32 | dt: 643.95ms | tok/sec: 814,174 | mfu: 50.89 | epoch: 2 | total time: 150.55m | eta: 28.9m +step 14016/16704 (83.91%) | loss: 2.510389 | lrm: 0.32 | dt: 643.50ms | tok/sec: 814,745 | mfu: 50.92 | epoch: 2 | total time: 150.56m | eta: 28.9m +step 14017/16704 (83.91%) | loss: 2.509536 | lrm: 0.32 | dt: 645.18ms | tok/sec: 812,626 | mfu: 50.79 | epoch: 2 | total time: 150.57m | eta: 28.9m +step 14018/16704 (83.92%) | loss: 2.508909 | lrm: 0.32 | dt: 644.44ms | 
tok/sec: 813,561 | mfu: 50.85 | epoch: 2 | total time: 150.58m | eta: 28.9m +step 14019/16704 (83.93%) | loss: 2.515142 | lrm: 0.32 | dt: 643.84ms | tok/sec: 814,310 | mfu: 50.90 | epoch: 2 | total time: 150.60m | eta: 28.9m +step 14020/16704 (83.93%) | loss: 2.512790 | lrm: 0.32 | dt: 645.62ms | tok/sec: 812,065 | mfu: 50.76 | epoch: 2 | total time: 150.61m | eta: 28.9m +step 14021/16704 (83.94%) | loss: 2.512281 | lrm: 0.32 | dt: 645.20ms | tok/sec: 812,598 | mfu: 50.79 | epoch: 2 | total time: 150.62m | eta: 28.8m +step 14022/16704 (83.94%) | loss: 2.506797 | lrm: 0.32 | dt: 646.97ms | tok/sec: 810,380 | mfu: 50.65 | epoch: 2 | total time: 150.63m | eta: 28.8m +step 14023/16704 (83.95%) | loss: 2.507995 | lrm: 0.32 | dt: 643.73ms | tok/sec: 814,457 | mfu: 50.90 | epoch: 2 | total time: 150.64m | eta: 28.8m +step 14024/16704 (83.96%) | loss: 2.497707 | lrm: 0.32 | dt: 643.45ms | tok/sec: 814,805 | mfu: 50.93 | epoch: 2 | total time: 150.65m | eta: 28.8m +step 14025/16704 (83.96%) | loss: 2.507963 | lrm: 0.32 | dt: 648.70ms | tok/sec: 808,214 | mfu: 50.51 | epoch: 2 | total time: 150.66m | eta: 28.8m +step 14026/16704 (83.97%) | loss: 2.504165 | lrm: 0.32 | dt: 644.53ms | tok/sec: 813,442 | mfu: 50.84 | epoch: 2 | total time: 150.67m | eta: 28.8m +step 14027/16704 (83.97%) | loss: 2.504247 | lrm: 0.32 | dt: 643.66ms | tok/sec: 814,541 | mfu: 50.91 | epoch: 2 | total time: 150.68m | eta: 28.8m +step 14028/16704 (83.98%) | loss: 2.513529 | lrm: 0.32 | dt: 643.01ms | tok/sec: 815,370 | mfu: 50.96 | epoch: 2 | total time: 150.69m | eta: 28.8m +step 14029/16704 (83.99%) | loss: 2.531837 | lrm: 0.32 | dt: 647.03ms | tok/sec: 810,295 | mfu: 50.64 | epoch: 2 | total time: 150.70m | eta: 28.8m +step 14030/16704 (83.99%) | loss: 2.534154 | lrm: 0.32 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.85 | epoch: 2 | total time: 150.71m | eta: 28.7m +step 14031/16704 (84.00%) | loss: 2.538973 | lrm: 0.32 | dt: 645.67ms | tok/sec: 812,011 | mfu: 50.75 | epoch: 2 | total time: 150.72m 
| eta: 28.7m +step 14032/16704 (84.00%) | loss: 2.518276 | lrm: 0.32 | dt: 647.87ms | tok/sec: 809,249 | mfu: 50.58 | epoch: 2 | total time: 150.74m | eta: 28.7m +step 14033/16704 (84.01%) | loss: 2.522864 | lrm: 0.32 | dt: 643.28ms | tok/sec: 815,025 | mfu: 50.94 | epoch: 2 | total time: 150.75m | eta: 28.7m +step 14034/16704 (84.02%) | loss: 2.522218 | lrm: 0.32 | dt: 646.87ms | tok/sec: 810,499 | mfu: 50.66 | epoch: 2 | total time: 150.76m | eta: 28.7m +step 14035/16704 (84.02%) | loss: 2.527183 | lrm: 0.32 | dt: 646.86ms | tok/sec: 810,508 | mfu: 50.66 | epoch: 2 | total time: 150.77m | eta: 28.7m +step 14036/16704 (84.03%) | loss: 2.539051 | lrm: 0.32 | dt: 644.92ms | tok/sec: 812,953 | mfu: 50.81 | epoch: 2 | total time: 150.78m | eta: 28.7m +step 14037/16704 (84.03%) | loss: 2.527230 | lrm: 0.32 | dt: 645.87ms | tok/sec: 811,755 | mfu: 50.74 | epoch: 2 | total time: 150.79m | eta: 28.7m +step 14038/16704 (84.04%) | loss: 2.531968 | lrm: 0.32 | dt: 643.04ms | tok/sec: 815,331 | mfu: 50.96 | epoch: 2 | total time: 150.80m | eta: 28.7m +step 14039/16704 (84.05%) | loss: 2.532841 | lrm: 0.32 | dt: 646.95ms | tok/sec: 810,405 | mfu: 50.65 | epoch: 2 | total time: 150.81m | eta: 28.6m +step 14040/16704 (84.05%) | loss: 2.522306 | lrm: 0.32 | dt: 646.09ms | tok/sec: 811,477 | mfu: 50.72 | epoch: 2 | total time: 150.82m | eta: 28.6m +step 14041/16704 (84.06%) | loss: 2.525064 | lrm: 0.32 | dt: 646.39ms | tok/sec: 811,104 | mfu: 50.70 | epoch: 2 | total time: 150.83m | eta: 28.6m +step 14042/16704 (84.06%) | loss: 2.512288 | lrm: 0.32 | dt: 642.31ms | tok/sec: 816,250 | mfu: 51.02 | epoch: 2 | total time: 150.84m | eta: 28.6m +step 14043/16704 (84.07%) | loss: 2.506853 | lrm: 0.32 | dt: 645.12ms | tok/sec: 812,700 | mfu: 50.79 | epoch: 2 | total time: 150.85m | eta: 28.6m +step 14044/16704 (84.08%) | loss: 2.491840 | lrm: 0.32 | dt: 644.38ms | tok/sec: 813,628 | mfu: 50.85 | epoch: 2 | total time: 150.86m | eta: 28.6m +step 14045/16704 (84.08%) | loss: 2.498109 | 
lrm: 0.32 | dt: 645.48ms | tok/sec: 812,241 | mfu: 50.77 | epoch: 2 | total time: 150.87m | eta: 28.6m +step 14046/16704 (84.09%) | loss: 2.505752 | lrm: 0.32 | dt: 645.41ms | tok/sec: 812,339 | mfu: 50.77 | epoch: 2 | total time: 150.89m | eta: 28.6m +step 14047/16704 (84.09%) | loss: 2.519251 | lrm: 0.32 | dt: 648.16ms | tok/sec: 808,880 | mfu: 50.56 | epoch: 2 | total time: 150.90m | eta: 28.6m +step 14048/16704 (84.10%) | loss: 2.520579 | lrm: 0.32 | dt: 645.66ms | tok/sec: 812,013 | mfu: 50.75 | epoch: 2 | total time: 150.91m | eta: 28.6m +step 14049/16704 (84.11%) | loss: 2.511400 | lrm: 0.32 | dt: 644.85ms | tok/sec: 813,035 | mfu: 50.82 | epoch: 2 | total time: 150.92m | eta: 28.5m +step 14050/16704 (84.11%) | loss: 2.505286 | lrm: 0.32 | dt: 646.56ms | tok/sec: 810,886 | mfu: 50.68 | epoch: 2 | total time: 150.93m | eta: 28.5m +step 14051/16704 (84.12%) | loss: 2.502714 | lrm: 0.32 | dt: 643.22ms | tok/sec: 815,099 | mfu: 50.94 | epoch: 2 | total time: 150.94m | eta: 28.5m +step 14052/16704 (84.12%) | loss: 2.501142 | lrm: 0.32 | dt: 645.96ms | tok/sec: 811,641 | mfu: 50.73 | epoch: 2 | total time: 150.95m | eta: 28.5m +step 14053/16704 (84.13%) | loss: 2.509259 | lrm: 0.32 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 2 | total time: 150.96m | eta: 28.5m +step 14054/16704 (84.14%) | loss: 2.503874 | lrm: 0.32 | dt: 644.51ms | tok/sec: 813,468 | mfu: 50.84 | epoch: 2 | total time: 150.97m | eta: 28.5m +step 14055/16704 (84.14%) | loss: 2.511267 | lrm: 0.32 | dt: 644.57ms | tok/sec: 813,390 | mfu: 50.84 | epoch: 2 | total time: 150.98m | eta: 28.5m +step 14056/16704 (84.15%) | loss: 2.502246 | lrm: 0.32 | dt: 644.73ms | tok/sec: 813,184 | mfu: 50.83 | epoch: 2 | total time: 150.99m | eta: 28.5m +step 14057/16704 (84.15%) | loss: 2.501167 | lrm: 0.32 | dt: 644.25ms | tok/sec: 813,795 | mfu: 50.86 | epoch: 2 | total time: 151.00m | eta: 28.5m +step 14058/16704 (84.16%) | loss: 2.499082 | lrm: 0.32 | dt: 646.57ms | tok/sec: 810,881 | mfu: 50.68 | 
epoch: 2 | total time: 151.01m | eta: 28.4m +step 14059/16704 (84.17%) | loss: 2.495973 | lrm: 0.32 | dt: 644.99ms | tok/sec: 812,862 | mfu: 50.81 | epoch: 2 | total time: 151.03m | eta: 28.4m +step 14060/16704 (84.17%) | loss: 2.498575 | lrm: 0.32 | dt: 644.88ms | tok/sec: 812,999 | mfu: 50.81 | epoch: 2 | total time: 151.04m | eta: 28.4m +step 14061/16704 (84.18%) | loss: 2.491328 | lrm: 0.32 | dt: 644.34ms | tok/sec: 813,680 | mfu: 50.86 | epoch: 2 | total time: 151.05m | eta: 28.4m +step 14062/16704 (84.18%) | loss: 2.485568 | lrm: 0.32 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 2 | total time: 151.06m | eta: 28.4m +step 14063/16704 (84.19%) | loss: 2.482910 | lrm: 0.32 | dt: 647.86ms | tok/sec: 809,260 | mfu: 50.58 | epoch: 2 | total time: 151.07m | eta: 28.4m +step 14064/16704 (84.20%) | loss: 2.492659 | lrm: 0.32 | dt: 645.82ms | tok/sec: 811,817 | mfu: 50.74 | epoch: 2 | total time: 151.08m | eta: 28.4m +step 14065/16704 (84.20%) | loss: 2.503854 | lrm: 0.32 | dt: 643.87ms | tok/sec: 814,280 | mfu: 50.89 | epoch: 2 | total time: 151.09m | eta: 28.4m +step 14066/16704 (84.21%) | loss: 2.509362 | lrm: 0.32 | dt: 645.00ms | tok/sec: 812,843 | mfu: 50.80 | epoch: 2 | total time: 151.10m | eta: 28.4m +step 14067/16704 (84.21%) | loss: 2.511213 | lrm: 0.32 | dt: 646.61ms | tok/sec: 810,827 | mfu: 50.68 | epoch: 2 | total time: 151.11m | eta: 28.3m +step 14068/16704 (84.22%) | loss: 2.519606 | lrm: 0.32 | dt: 645.16ms | tok/sec: 812,641 | mfu: 50.79 | epoch: 2 | total time: 151.12m | eta: 28.3m +step 14069/16704 (84.23%) | loss: 2.504993 | lrm: 0.32 | dt: 646.40ms | tok/sec: 811,084 | mfu: 50.69 | epoch: 2 | total time: 151.13m | eta: 28.3m +step 14070/16704 (84.23%) | loss: 2.516731 | lrm: 0.32 | dt: 645.13ms | tok/sec: 812,690 | mfu: 50.79 | epoch: 2 | total time: 151.14m | eta: 28.3m +step 14071/16704 (84.24%) | loss: 2.514689 | lrm: 0.32 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 2 | total time: 151.15m | eta: 28.3m +step 14072/16704 
(84.24%) | loss: 2.509265 | lrm: 0.32 | dt: 644.41ms | tok/sec: 813,593 | mfu: 50.85 | epoch: 2 | total time: 151.17m | eta: 28.3m +step 14073/16704 (84.25%) | loss: 2.514282 | lrm: 0.32 | dt: 644.90ms | tok/sec: 812,980 | mfu: 50.81 | epoch: 2 | total time: 151.18m | eta: 28.3m +step 14074/16704 (84.26%) | loss: 2.517914 | lrm: 0.31 | dt: 643.55ms | tok/sec: 814,679 | mfu: 50.92 | epoch: 2 | total time: 151.19m | eta: 28.3m +step 14075/16704 (84.26%) | loss: 2.515085 | lrm: 0.31 | dt: 644.55ms | tok/sec: 813,412 | mfu: 50.84 | epoch: 2 | total time: 151.20m | eta: 28.3m +step 14076/16704 (84.27%) | loss: 2.521488 | lrm: 0.31 | dt: 645.15ms | tok/sec: 812,657 | mfu: 50.79 | epoch: 2 | total time: 151.21m | eta: 28.3m +step 14077/16704 (84.27%) | loss: 2.515841 | lrm: 0.31 | dt: 644.60ms | tok/sec: 813,349 | mfu: 50.84 | epoch: 2 | total time: 151.22m | eta: 28.2m +step 14078/16704 (84.28%) | loss: 2.504867 | lrm: 0.31 | dt: 647.26ms | tok/sec: 810,005 | mfu: 50.63 | epoch: 2 | total time: 151.23m | eta: 28.2m +step 14079/16704 (84.29%) | loss: 2.497275 | lrm: 0.31 | dt: 642.59ms | tok/sec: 815,896 | mfu: 50.99 | epoch: 2 | total time: 151.24m | eta: 28.2m +step 14080/16704 (84.29%) | loss: 2.500590 | lrm: 0.31 | dt: 645.62ms | tok/sec: 812,065 | mfu: 50.76 | epoch: 2 | total time: 151.25m | eta: 28.2m +step 14081/16704 (84.30%) | loss: 2.488375 | lrm: 0.31 | dt: 648.32ms | tok/sec: 808,690 | mfu: 50.54 | epoch: 2 | total time: 151.26m | eta: 28.2m +step 14082/16704 (84.30%) | loss: 2.485548 | lrm: 0.31 | dt: 642.58ms | tok/sec: 815,907 | mfu: 51.00 | epoch: 2 | total time: 151.27m | eta: 28.2m +step 14083/16704 (84.31%) | loss: 2.476298 | lrm: 0.31 | dt: 645.24ms | tok/sec: 812,549 | mfu: 50.79 | epoch: 2 | total time: 151.28m | eta: 28.2m +step 14084/16704 (84.32%) | loss: 2.476372 | lrm: 0.31 | dt: 643.89ms | tok/sec: 814,256 | mfu: 50.89 | epoch: 2 | total time: 151.29m | eta: 28.2m +step 14085/16704 (84.32%) | loss: 2.476583 | lrm: 0.31 | dt: 644.92ms | 
tok/sec: 812,954 | mfu: 50.81 | epoch: 2 | total time: 151.31m | eta: 28.2m +step 14086/16704 (84.33%) | loss: 2.470946 | lrm: 0.31 | dt: 647.98ms | tok/sec: 809,106 | mfu: 50.57 | epoch: 2 | total time: 151.32m | eta: 28.1m +step 14087/16704 (84.33%) | loss: 2.477104 | lrm: 0.31 | dt: 643.26ms | tok/sec: 815,054 | mfu: 50.94 | epoch: 2 | total time: 151.33m | eta: 28.1m +step 14088/16704 (84.34%) | loss: 2.478114 | lrm: 0.31 | dt: 646.49ms | tok/sec: 810,971 | mfu: 50.69 | epoch: 2 | total time: 151.34m | eta: 28.1m +step 14089/16704 (84.35%) | loss: 2.463718 | lrm: 0.31 | dt: 643.46ms | tok/sec: 814,793 | mfu: 50.93 | epoch: 2 | total time: 151.35m | eta: 28.1m +step 14090/16704 (84.35%) | loss: 2.461907 | lrm: 0.31 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 2 | total time: 151.36m | eta: 28.1m +step 14091/16704 (84.36%) | loss: 2.464962 | lrm: 0.31 | dt: 646.76ms | tok/sec: 810,638 | mfu: 50.67 | epoch: 2 | total time: 151.37m | eta: 28.1m +step 14092/16704 (84.36%) | loss: 2.479254 | lrm: 0.31 | dt: 646.44ms | tok/sec: 811,041 | mfu: 50.69 | epoch: 2 | total time: 151.38m | eta: 28.1m +step 14093/16704 (84.37%) | loss: 2.481869 | lrm: 0.31 | dt: 644.84ms | tok/sec: 813,046 | mfu: 50.82 | epoch: 2 | total time: 151.39m | eta: 28.1m +step 14094/16704 (84.38%) | loss: 2.495372 | lrm: 0.31 | dt: 646.88ms | tok/sec: 810,488 | mfu: 50.66 | epoch: 2 | total time: 151.40m | eta: 28.1m +step 14095/16704 (84.38%) | loss: 2.501790 | lrm: 0.31 | dt: 644.91ms | tok/sec: 812,959 | mfu: 50.81 | epoch: 2 | total time: 151.41m | eta: 28.0m +step 14096/16704 (84.39%) | loss: 2.511618 | lrm: 0.31 | dt: 645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 2 | total time: 151.42m | eta: 28.0m +step 14097/16704 (84.39%) | loss: 2.515704 | lrm: 0.31 | dt: 644.82ms | tok/sec: 813,074 | mfu: 50.82 | epoch: 2 | total time: 151.43m | eta: 28.0m +step 14098/16704 (84.40%) | loss: 2.517219 | lrm: 0.31 | dt: 645.61ms | tok/sec: 812,080 | mfu: 50.76 | epoch: 2 | total time: 151.44m 
| eta: 28.0m +step 14099/16704 (84.40%) | loss: 2.498172 | lrm: 0.31 | dt: 644.78ms | tok/sec: 813,123 | mfu: 50.82 | epoch: 2 | total time: 151.46m | eta: 28.0m +step 14100/16704 (84.41%) | loss: 2.512259 | lrm: 0.31 | dt: 645.16ms | tok/sec: 812,642 | mfu: 50.79 | epoch: 2 | total time: 151.47m | eta: 28.0m +step 14101/16704 (84.42%) | loss: 2.518301 | lrm: 0.31 | dt: 643.44ms | tok/sec: 814,818 | mfu: 50.93 | epoch: 2 | total time: 151.48m | eta: 28.0m +step 14102/16704 (84.42%) | loss: 2.529720 | lrm: 0.31 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 2 | total time: 151.49m | eta: 28.0m +step 14103/16704 (84.43%) | loss: 2.533090 | lrm: 0.31 | dt: 643.93ms | tok/sec: 814,196 | mfu: 50.89 | epoch: 2 | total time: 151.50m | eta: 28.0m +step 14104/16704 (84.43%) | loss: 2.532095 | lrm: 0.31 | dt: 648.37ms | tok/sec: 808,627 | mfu: 50.54 | epoch: 2 | total time: 151.51m | eta: 27.9m +step 14105/16704 (84.44%) | loss: 2.520637 | lrm: 0.31 | dt: 645.59ms | tok/sec: 812,106 | mfu: 50.76 | epoch: 2 | total time: 151.52m | eta: 27.9m +step 14106/16704 (84.45%) | loss: 2.516225 | lrm: 0.31 | dt: 643.54ms | tok/sec: 814,688 | mfu: 50.92 | epoch: 2 | total time: 151.53m | eta: 27.9m +step 14107/16704 (84.45%) | loss: 2.514466 | lrm: 0.31 | dt: 645.08ms | tok/sec: 812,751 | mfu: 50.80 | epoch: 2 | total time: 151.54m | eta: 27.9m +step 14108/16704 (84.46%) | loss: 2.515497 | lrm: 0.31 | dt: 644.71ms | tok/sec: 813,217 | mfu: 50.83 | epoch: 2 | total time: 151.55m | eta: 27.9m +step 14109/16704 (84.46%) | loss: 2.513771 | lrm: 0.31 | dt: 644.34ms | tok/sec: 813,678 | mfu: 50.86 | epoch: 2 | total time: 151.56m | eta: 27.9m +step 14110/16704 (84.47%) | loss: 2.520726 | lrm: 0.31 | dt: 645.35ms | tok/sec: 812,412 | mfu: 50.78 | epoch: 2 | total time: 151.57m | eta: 27.9m +step 14111/16704 (84.48%) | loss: 2.515105 | lrm: 0.31 | dt: 645.66ms | tok/sec: 812,019 | mfu: 50.75 | epoch: 2 | total time: 151.58m | eta: 27.9m +step 14112/16704 (84.48%) | loss: 2.509148 | 
lrm: 0.31 | dt: 647.70ms | tok/sec: 809,463 | mfu: 50.59 | epoch: 2 | total time: 151.60m | eta: 27.9m +step 14113/16704 (84.49%) | loss: 2.511257 | lrm: 0.31 | dt: 645.28ms | tok/sec: 812,497 | mfu: 50.78 | epoch: 2 | total time: 151.61m | eta: 27.9m +step 14114/16704 (84.49%) | loss: 2.505009 | lrm: 0.31 | dt: 644.88ms | tok/sec: 813,001 | mfu: 50.81 | epoch: 2 | total time: 151.62m | eta: 27.8m +step 14115/16704 (84.50%) | loss: 2.511701 | lrm: 0.31 | dt: 647.11ms | tok/sec: 810,194 | mfu: 50.64 | epoch: 2 | total time: 151.63m | eta: 27.8m +step 14116/16704 (84.51%) | loss: 2.503429 | lrm: 0.31 | dt: 643.40ms | tok/sec: 814,871 | mfu: 50.93 | epoch: 2 | total time: 151.64m | eta: 27.8m +step 14117/16704 (84.51%) | loss: 2.512352 | lrm: 0.31 | dt: 645.99ms | tok/sec: 811,608 | mfu: 50.73 | epoch: 2 | total time: 151.65m | eta: 27.8m +step 14118/16704 (84.52%) | loss: 2.508711 | lrm: 0.31 | dt: 643.38ms | tok/sec: 814,897 | mfu: 50.93 | epoch: 2 | total time: 151.66m | eta: 27.8m +step 14119/16704 (84.52%) | loss: 2.502324 | lrm: 0.31 | dt: 644.20ms | tok/sec: 813,861 | mfu: 50.87 | epoch: 2 | total time: 151.67m | eta: 27.8m +step 14120/16704 (84.53%) | loss: 2.503671 | lrm: 0.31 | dt: 644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 2 | total time: 151.68m | eta: 27.8m +step 14121/16704 (84.54%) | loss: 2.509594 | lrm: 0.31 | dt: 646.63ms | tok/sec: 810,802 | mfu: 50.68 | epoch: 2 | total time: 151.69m | eta: 27.8m +step 14122/16704 (84.54%) | loss: 2.512421 | lrm: 0.31 | dt: 644.09ms | tok/sec: 813,993 | mfu: 50.88 | epoch: 2 | total time: 151.70m | eta: 27.8m +step 14123/16704 (84.55%) | loss: 2.513849 | lrm: 0.31 | dt: 645.27ms | tok/sec: 812,512 | mfu: 50.78 | epoch: 2 | total time: 151.71m | eta: 27.7m +step 14124/16704 (84.55%) | loss: 2.517971 | lrm: 0.31 | dt: 645.81ms | tok/sec: 811,824 | mfu: 50.74 | epoch: 2 | total time: 151.72m | eta: 27.7m +step 14125/16704 (84.56%) | loss: 2.510175 | lrm: 0.31 | dt: 645.87ms | tok/sec: 811,752 | mfu: 50.74 | 
epoch: 2 | total time: 151.74m | eta: 27.7m +step 14126/16704 (84.57%) | loss: 2.501986 | lrm: 0.31 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 2 | total time: 151.75m | eta: 27.7m +step 14127/16704 (84.57%) | loss: 2.487039 | lrm: 0.31 | dt: 645.96ms | tok/sec: 811,642 | mfu: 50.73 | epoch: 2 | total time: 151.76m | eta: 27.7m +step 14128/16704 (84.58%) | loss: 2.482228 | lrm: 0.31 | dt: 645.80ms | tok/sec: 811,839 | mfu: 50.74 | epoch: 2 | total time: 151.77m | eta: 27.7m +step 14129/16704 (84.58%) | loss: 2.482520 | lrm: 0.31 | dt: 644.19ms | tok/sec: 813,867 | mfu: 50.87 | epoch: 2 | total time: 151.78m | eta: 27.7m +step 14130/16704 (84.59%) | loss: 2.490820 | lrm: 0.31 | dt: 645.03ms | tok/sec: 812,813 | mfu: 50.80 | epoch: 2 | total time: 151.79m | eta: 27.7m +step 14131/16704 (84.60%) | loss: 2.489151 | lrm: 0.31 | dt: 644.03ms | tok/sec: 814,078 | mfu: 50.88 | epoch: 2 | total time: 151.80m | eta: 27.7m +step 14132/16704 (84.60%) | loss: 2.492484 | lrm: 0.31 | dt: 644.80ms | tok/sec: 813,097 | mfu: 50.82 | epoch: 2 | total time: 151.81m | eta: 27.6m +step 14133/16704 (84.61%) | loss: 2.496478 | lrm: 0.31 | dt: 645.14ms | tok/sec: 812,676 | mfu: 50.79 | epoch: 2 | total time: 151.82m | eta: 27.6m +step 14134/16704 (84.61%) | loss: 2.494501 | lrm: 0.31 | dt: 646.52ms | tok/sec: 810,943 | mfu: 50.69 | epoch: 2 | total time: 151.83m | eta: 27.6m +step 14135/16704 (84.62%) | loss: 2.501860 | lrm: 0.31 | dt: 644.53ms | tok/sec: 813,442 | mfu: 50.84 | epoch: 2 | total time: 151.84m | eta: 27.6m +step 14136/16704 (84.63%) | loss: 2.497730 | lrm: 0.31 | dt: 644.17ms | tok/sec: 813,891 | mfu: 50.87 | epoch: 2 | total time: 151.85m | eta: 27.6m +step 14137/16704 (84.63%) | loss: 2.496058 | lrm: 0.31 | dt: 647.05ms | tok/sec: 810,276 | mfu: 50.64 | epoch: 2 | total time: 151.86m | eta: 27.6m +step 14138/16704 (84.64%) | loss: 2.493344 | lrm: 0.31 | dt: 645.22ms | tok/sec: 812,577 | mfu: 50.79 | epoch: 2 | total time: 151.88m | eta: 27.6m +step 14139/16704 
(84.64%) | loss: 2.503255 | lrm: 0.31 | dt: 642.45ms | tok/sec: 816,076 | mfu: 51.01 | epoch: 2 | total time: 151.89m | eta: 27.6m +step 14140/16704 (84.65%) | loss: 2.498323 | lrm: 0.31 | dt: 644.94ms | tok/sec: 812,930 | mfu: 50.81 | epoch: 2 | total time: 151.90m | eta: 27.6m +step 14141/16704 (84.66%) | loss: 2.499817 | lrm: 0.31 | dt: 644.96ms | tok/sec: 812,894 | mfu: 50.81 | epoch: 2 | total time: 151.91m | eta: 27.6m +step 14142/16704 (84.66%) | loss: 2.502665 | lrm: 0.31 | dt: 644.20ms | tok/sec: 813,855 | mfu: 50.87 | epoch: 2 | total time: 151.92m | eta: 27.5m +step 14143/16704 (84.67%) | loss: 2.499852 | lrm: 0.31 | dt: 646.50ms | tok/sec: 810,967 | mfu: 50.69 | epoch: 2 | total time: 151.93m | eta: 27.5m +step 14144/16704 (84.67%) | loss: 2.504974 | lrm: 0.31 | dt: 646.23ms | tok/sec: 811,306 | mfu: 50.71 | epoch: 2 | total time: 151.94m | eta: 27.5m +step 14145/16704 (84.68%) | loss: 2.498407 | lrm: 0.31 | dt: 643.66ms | tok/sec: 814,538 | mfu: 50.91 | epoch: 2 | total time: 151.95m | eta: 27.5m +step 14146/16704 (84.69%) | loss: 2.500357 | lrm: 0.31 | dt: 645.80ms | tok/sec: 811,845 | mfu: 50.74 | epoch: 2 | total time: 151.96m | eta: 27.5m +step 14147/16704 (84.69%) | loss: 2.501353 | lrm: 0.31 | dt: 645.51ms | tok/sec: 812,213 | mfu: 50.76 | epoch: 2 | total time: 151.97m | eta: 27.5m +step 14148/16704 (84.70%) | loss: 2.494019 | lrm: 0.31 | dt: 643.88ms | tok/sec: 814,267 | mfu: 50.89 | epoch: 2 | total time: 151.98m | eta: 27.5m +step 14149/16704 (84.70%) | loss: 2.493656 | lrm: 0.31 | dt: 647.31ms | tok/sec: 809,943 | mfu: 50.62 | epoch: 2 | total time: 151.99m | eta: 27.5m +step 14150/16704 (84.71%) | loss: 2.508529 | lrm: 0.31 | dt: 645.81ms | tok/sec: 811,827 | mfu: 50.74 | epoch: 2 | total time: 152.00m | eta: 27.5m +step 14151/16704 (84.72%) | loss: 2.503255 | lrm: 0.31 | dt: 646.79ms | tok/sec: 810,600 | mfu: 50.66 | epoch: 2 | total time: 152.01m | eta: 27.4m +step 14152/16704 (84.72%) | loss: 2.506586 | lrm: 0.31 | dt: 644.35ms | 
tok/sec: 813,669 | mfu: 50.86 | epoch: 2 | total time: 152.03m | eta: 27.4m +step 14153/16704 (84.73%) | loss: 2.500701 | lrm: 0.31 | dt: 645.93ms | tok/sec: 811,675 | mfu: 50.73 | epoch: 2 | total time: 152.04m | eta: 27.4m +step 14154/16704 (84.73%) | loss: 2.494070 | lrm: 0.31 | dt: 644.97ms | tok/sec: 812,888 | mfu: 50.81 | epoch: 2 | total time: 152.05m | eta: 27.4m +step 14155/16704 (84.74%) | loss: 2.483418 | lrm: 0.31 | dt: 644.73ms | tok/sec: 813,191 | mfu: 50.83 | epoch: 2 | total time: 152.06m | eta: 27.4m +step 14156/16704 (84.75%) | loss: 2.486000 | lrm: 0.31 | dt: 646.15ms | tok/sec: 811,397 | mfu: 50.71 | epoch: 2 | total time: 152.07m | eta: 27.4m +step 14157/16704 (84.75%) | loss: 2.487318 | lrm: 0.30 | dt: 643.83ms | tok/sec: 814,330 | mfu: 50.90 | epoch: 2 | total time: 152.08m | eta: 27.4m +step 14158/16704 (84.76%) | loss: 2.497936 | lrm: 0.30 | dt: 644.26ms | tok/sec: 813,783 | mfu: 50.86 | epoch: 2 | total time: 152.09m | eta: 27.4m +step 14159/16704 (84.76%) | loss: 2.495240 | lrm: 0.30 | dt: 645.05ms | tok/sec: 812,781 | mfu: 50.80 | epoch: 2 | total time: 152.10m | eta: 27.4m +step 14160/16704 (84.77%) | loss: 2.498425 | lrm: 0.30 | dt: 644.59ms | tok/sec: 813,363 | mfu: 50.84 | epoch: 2 | total time: 152.11m | eta: 27.3m +step 14161/16704 (84.78%) | loss: 2.485432 | lrm: 0.30 | dt: 645.89ms | tok/sec: 811,724 | mfu: 50.73 | epoch: 2 | total time: 152.12m | eta: 27.3m +step 14162/16704 (84.78%) | loss: 2.475679 | lrm: 0.30 | dt: 643.18ms | tok/sec: 815,144 | mfu: 50.95 | epoch: 2 | total time: 152.13m | eta: 27.3m +step 14163/16704 (84.79%) | loss: 2.476549 | lrm: 0.30 | dt: 645.43ms | tok/sec: 812,313 | mfu: 50.77 | epoch: 2 | total time: 152.14m | eta: 27.3m +step 14164/16704 (84.79%) | loss: 2.486726 | lrm: 0.30 | dt: 645.86ms | tok/sec: 811,762 | mfu: 50.74 | epoch: 2 | total time: 152.15m | eta: 27.3m +step 14165/16704 (84.80%) | loss: 2.483504 | lrm: 0.30 | dt: 643.15ms | tok/sec: 815,184 | mfu: 50.95 | epoch: 2 | total time: 152.17m 
| eta: 27.3m +step 14166/16704 (84.81%) | loss: 2.491724 | lrm: 0.30 | dt: 644.49ms | tok/sec: 813,495 | mfu: 50.84 | epoch: 2 | total time: 152.18m | eta: 27.3m +step 14167/16704 (84.81%) | loss: 2.480013 | lrm: 0.30 | dt: 647.00ms | tok/sec: 810,340 | mfu: 50.65 | epoch: 2 | total time: 152.19m | eta: 27.3m +step 14168/16704 (84.82%) | loss: 2.486404 | lrm: 0.30 | dt: 646.89ms | tok/sec: 810,478 | mfu: 50.66 | epoch: 2 | total time: 152.20m | eta: 27.3m +step 14169/16704 (84.82%) | loss: 2.482488 | lrm: 0.30 | dt: 645.66ms | tok/sec: 812,020 | mfu: 50.75 | epoch: 2 | total time: 152.21m | eta: 27.3m +step 14170/16704 (84.83%) | loss: 2.494373 | lrm: 0.30 | dt: 643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 2 | total time: 152.22m | eta: 27.2m +step 14171/16704 (84.84%) | loss: 2.489680 | lrm: 0.30 | dt: 646.39ms | tok/sec: 811,104 | mfu: 50.70 | epoch: 2 | total time: 152.23m | eta: 27.2m +step 14172/16704 (84.84%) | loss: 2.482908 | lrm: 0.30 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 2 | total time: 152.24m | eta: 27.2m +step 14173/16704 (84.85%) | loss: 2.497758 | lrm: 0.30 | dt: 646.01ms | tok/sec: 811,575 | mfu: 50.72 | epoch: 2 | total time: 152.25m | eta: 27.2m +step 14174/16704 (84.85%) | loss: 2.492819 | lrm: 0.30 | dt: 646.38ms | tok/sec: 811,108 | mfu: 50.70 | epoch: 2 | total time: 152.26m | eta: 27.2m +step 14175/16704 (84.86%) | loss: 2.504452 | lrm: 0.30 | dt: 643.83ms | tok/sec: 814,331 | mfu: 50.90 | epoch: 2 | total time: 152.27m | eta: 27.2m +step 14176/16704 (84.87%) | loss: 2.500263 | lrm: 0.30 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 152.28m | eta: 27.2m +step 14177/16704 (84.87%) | loss: 2.500133 | lrm: 0.30 | dt: 646.08ms | tok/sec: 811,495 | mfu: 50.72 | epoch: 2 | total time: 152.29m | eta: 27.2m +step 14178/16704 (84.88%) | loss: 2.504941 | lrm: 0.30 | dt: 646.09ms | tok/sec: 811,479 | mfu: 50.72 | epoch: 2 | total time: 152.31m | eta: 27.2m +step 14179/16704 (84.88%) | loss: 2.499802 | 
lrm: 0.30 | dt: 647.93ms | tok/sec: 809,179 | mfu: 50.57 | epoch: 2 | total time: 152.32m | eta: 27.1m +step 14180/16704 (84.89%) | loss: 2.504716 | lrm: 0.30 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 2 | total time: 152.33m | eta: 27.1m +step 14181/16704 (84.90%) | loss: 2.481192 | lrm: 0.30 | dt: 644.92ms | tok/sec: 812,951 | mfu: 50.81 | epoch: 2 | total time: 152.34m | eta: 27.1m +step 14182/16704 (84.90%) | loss: 2.481560 | lrm: 0.30 | dt: 645.82ms | tok/sec: 811,821 | mfu: 50.74 | epoch: 2 | total time: 152.35m | eta: 27.1m +step 14183/16704 (84.91%) | loss: 2.498504 | lrm: 0.30 | dt: 645.05ms | tok/sec: 812,783 | mfu: 50.80 | epoch: 2 | total time: 152.36m | eta: 27.1m +step 14184/16704 (84.91%) | loss: 2.502458 | lrm: 0.30 | dt: 643.55ms | tok/sec: 814,684 | mfu: 50.92 | epoch: 2 | total time: 152.37m | eta: 27.1m +step 14185/16704 (84.92%) | loss: 2.501467 | lrm: 0.30 | dt: 643.53ms | tok/sec: 814,712 | mfu: 50.92 | epoch: 2 | total time: 152.38m | eta: 27.1m +step 14186/16704 (84.93%) | loss: 2.512602 | lrm: 0.30 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 2 | total time: 152.39m | eta: 27.1m +step 14187/16704 (84.93%) | loss: 2.516159 | lrm: 0.30 | dt: 644.35ms | tok/sec: 813,672 | mfu: 50.86 | epoch: 2 | total time: 152.40m | eta: 27.1m +step 14188/16704 (84.94%) | loss: 2.501720 | lrm: 0.30 | dt: 647.86ms | tok/sec: 809,265 | mfu: 50.58 | epoch: 2 | total time: 152.41m | eta: 27.0m +step 14189/16704 (84.94%) | loss: 2.499689 | lrm: 0.30 | dt: 645.03ms | tok/sec: 812,812 | mfu: 50.80 | epoch: 2 | total time: 152.42m | eta: 27.0m +step 14190/16704 (84.95%) | loss: 2.506913 | lrm: 0.30 | dt: 644.27ms | tok/sec: 813,775 | mfu: 50.86 | epoch: 2 | total time: 152.43m | eta: 27.0m +step 14191/16704 (84.96%) | loss: 2.522637 | lrm: 0.30 | dt: 646.71ms | tok/sec: 810,706 | mfu: 50.67 | epoch: 2 | total time: 152.44m | eta: 27.0m +step 14192/16704 (84.96%) | loss: 2.508339 | lrm: 0.30 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | 
epoch: 2 | total time: 152.46m | eta: 27.0m +step 14193/16704 (84.97%) | loss: 2.483722 | lrm: 0.30 | dt: 645.52ms | tok/sec: 812,193 | mfu: 50.76 | epoch: 2 | total time: 152.47m | eta: 27.0m +step 14194/16704 (84.97%) | loss: 2.484890 | lrm: 0.30 | dt: 643.25ms | tok/sec: 815,067 | mfu: 50.94 | epoch: 2 | total time: 152.48m | eta: 27.0m +step 14195/16704 (84.98%) | loss: 2.484435 | lrm: 0.30 | dt: 644.90ms | tok/sec: 812,975 | mfu: 50.81 | epoch: 2 | total time: 152.49m | eta: 27.0m +step 14196/16704 (84.99%) | loss: 2.491188 | lrm: 0.30 | dt: 647.16ms | tok/sec: 810,142 | mfu: 50.64 | epoch: 2 | total time: 152.50m | eta: 27.0m +step 14197/16704 (84.99%) | loss: 2.497023 | lrm: 0.30 | dt: 644.66ms | tok/sec: 813,283 | mfu: 50.83 | epoch: 2 | total time: 152.51m | eta: 27.0m +step 14198/16704 (85.00%) | loss: 2.492368 | lrm: 0.30 | dt: 644.19ms | tok/sec: 813,866 | mfu: 50.87 | epoch: 2 | total time: 152.52m | eta: 26.9m +step 14199/16704 (85.00%) | loss: 2.489994 | lrm: 0.30 | dt: 645.49ms | tok/sec: 812,237 | mfu: 50.77 | epoch: 2 | total time: 152.53m | eta: 26.9m +step 14200/16704 (85.01%) | loss: 2.490480 | lrm: 0.30 | dt: 643.55ms | tok/sec: 814,681 | mfu: 50.92 | epoch: 2 | total time: 152.54m | eta: 26.9m +step 14201/16704 (85.02%) | loss: 2.496215 | lrm: 0.30 | dt: 647.42ms | tok/sec: 809,806 | mfu: 50.61 | epoch: 2 | total time: 152.55m | eta: 26.9m +step 14202/16704 (85.02%) | loss: 2.495344 | lrm: 0.30 | dt: 643.71ms | tok/sec: 814,484 | mfu: 50.91 | epoch: 2 | total time: 152.56m | eta: 26.9m +step 14203/16704 (85.03%) | loss: 2.494389 | lrm: 0.30 | dt: 645.76ms | tok/sec: 811,897 | mfu: 50.74 | epoch: 2 | total time: 152.57m | eta: 26.9m +step 14204/16704 (85.03%) | loss: 2.492937 | lrm: 0.30 | dt: 646.01ms | tok/sec: 811,579 | mfu: 50.72 | epoch: 2 | total time: 152.58m | eta: 26.9m +step 14205/16704 (85.04%) | loss: 2.490627 | lrm: 0.30 | dt: 645.25ms | tok/sec: 812,539 | mfu: 50.78 | epoch: 2 | total time: 152.60m | eta: 26.9m +step 14206/16704 
(85.05%) | loss: 2.497825 | lrm: 0.30 | dt: 644.53ms | tok/sec: 813,438 | mfu: 50.84 | epoch: 2 | total time: 152.61m | eta: 26.9m +step 14207/16704 (85.05%) | loss: 2.504800 | lrm: 0.30 | dt: 644.02ms | tok/sec: 814,090 | mfu: 50.88 | epoch: 2 | total time: 152.62m | eta: 26.8m +step 14208/16704 (85.06%) | loss: 2.504700 | lrm: 0.30 | dt: 644.40ms | tok/sec: 813,610 | mfu: 50.85 | epoch: 2 | total time: 152.63m | eta: 26.8m +step 14209/16704 (85.06%) | loss: 2.533394 | lrm: 0.30 | dt: 644.62ms | tok/sec: 813,332 | mfu: 50.83 | epoch: 2 | total time: 152.64m | eta: 26.8m +step 14210/16704 (85.07%) | loss: 2.540281 | lrm: 0.30 | dt: 644.78ms | tok/sec: 813,125 | mfu: 50.82 | epoch: 2 | total time: 152.65m | eta: 26.8m +step 14211/16704 (85.08%) | loss: 2.537449 | lrm: 0.30 | dt: 645.26ms | tok/sec: 812,524 | mfu: 50.78 | epoch: 2 | total time: 152.66m | eta: 26.8m +step 14212/16704 (85.08%) | loss: 2.539535 | lrm: 0.30 | dt: 645.13ms | tok/sec: 812,690 | mfu: 50.79 | epoch: 2 | total time: 152.67m | eta: 26.8m +step 14213/16704 (85.09%) | loss: 2.532859 | lrm: 0.30 | dt: 644.64ms | tok/sec: 813,307 | mfu: 50.83 | epoch: 2 | total time: 152.68m | eta: 26.8m +step 14214/16704 (85.09%) | loss: 2.518043 | lrm: 0.30 | dt: 645.70ms | tok/sec: 811,974 | mfu: 50.75 | epoch: 2 | total time: 152.69m | eta: 26.8m +step 14215/16704 (85.10%) | loss: 2.508618 | lrm: 0.30 | dt: 645.24ms | tok/sec: 812,542 | mfu: 50.79 | epoch: 2 | total time: 152.70m | eta: 26.8m +step 14216/16704 (85.11%) | loss: 2.508118 | lrm: 0.30 | dt: 646.60ms | tok/sec: 810,837 | mfu: 50.68 | epoch: 2 | total time: 152.71m | eta: 26.7m +step 14217/16704 (85.11%) | loss: 2.506510 | lrm: 0.30 | dt: 644.68ms | tok/sec: 813,250 | mfu: 50.83 | epoch: 2 | total time: 152.72m | eta: 26.7m +step 14218/16704 (85.12%) | loss: 2.489546 | lrm: 0.30 | dt: 644.21ms | tok/sec: 813,851 | mfu: 50.87 | epoch: 2 | total time: 152.74m | eta: 26.7m +step 14219/16704 (85.12%) | loss: 2.490009 | lrm: 0.30 | dt: 648.06ms | 
tok/sec: 809,006 | mfu: 50.56 | epoch: 2 | total time: 152.75m | eta: 26.7m +step 14220/16704 (85.13%) | loss: 2.492502 | lrm: 0.30 | dt: 645.08ms | tok/sec: 812,748 | mfu: 50.80 | epoch: 2 | total time: 152.76m | eta: 26.7m +step 14221/16704 (85.14%) | loss: 2.508634 | lrm: 0.30 | dt: 643.29ms | tok/sec: 815,011 | mfu: 50.94 | epoch: 2 | total time: 152.77m | eta: 26.7m +step 14222/16704 (85.14%) | loss: 2.494105 | lrm: 0.30 | dt: 645.61ms | tok/sec: 812,086 | mfu: 50.76 | epoch: 2 | total time: 152.78m | eta: 26.7m +step 14223/16704 (85.15%) | loss: 2.489728 | lrm: 0.30 | dt: 643.13ms | tok/sec: 815,218 | mfu: 50.95 | epoch: 2 | total time: 152.79m | eta: 26.7m +step 14224/16704 (85.15%) | loss: 2.490656 | lrm: 0.30 | dt: 645.83ms | tok/sec: 811,798 | mfu: 50.74 | epoch: 2 | total time: 152.80m | eta: 26.7m +step 14225/16704 (85.16%) | loss: 2.484667 | lrm: 0.30 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 2 | total time: 152.81m | eta: 26.6m +step 14226/16704 (85.17%) | loss: 2.484114 | lrm: 0.30 | dt: 648.20ms | tok/sec: 808,834 | mfu: 50.55 | epoch: 2 | total time: 152.82m | eta: 26.6m +step 14227/16704 (85.17%) | loss: 2.493576 | lrm: 0.30 | dt: 643.93ms | tok/sec: 814,200 | mfu: 50.89 | epoch: 2 | total time: 152.83m | eta: 26.6m +step 14228/16704 (85.18%) | loss: 2.504374 | lrm: 0.30 | dt: 646.36ms | tok/sec: 811,142 | mfu: 50.70 | epoch: 2 | total time: 152.84m | eta: 26.6m +step 14229/16704 (85.18%) | loss: 2.508145 | lrm: 0.30 | dt: 644.95ms | tok/sec: 812,910 | mfu: 50.81 | epoch: 2 | total time: 152.85m | eta: 26.6m +step 14230/16704 (85.19%) | loss: 2.493828 | lrm: 0.30 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 2 | total time: 152.86m | eta: 26.6m +step 14231/16704 (85.20%) | loss: 2.483944 | lrm: 0.30 | dt: 645.11ms | tok/sec: 812,704 | mfu: 50.80 | epoch: 2 | total time: 152.88m | eta: 26.6m +step 14232/16704 (85.20%) | loss: 2.486415 | lrm: 0.30 | dt: 645.19ms | tok/sec: 812,604 | mfu: 50.79 | epoch: 2 | total time: 152.89m 
| eta: 26.6m +step 14233/16704 (85.21%) | loss: 2.483678 | lrm: 0.30 | dt: 644.48ms | tok/sec: 813,510 | mfu: 50.85 | epoch: 2 | total time: 152.90m | eta: 26.6m +step 14234/16704 (85.21%) | loss: 2.476841 | lrm: 0.30 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 2 | total time: 152.91m | eta: 26.6m +step 14235/16704 (85.22%) | loss: 2.474435 | lrm: 0.30 | dt: 644.86ms | tok/sec: 813,023 | mfu: 50.82 | epoch: 2 | total time: 152.92m | eta: 26.5m +step 14236/16704 (85.23%) | loss: 2.470592 | lrm: 0.30 | dt: 645.37ms | tok/sec: 812,385 | mfu: 50.78 | epoch: 2 | total time: 152.93m | eta: 26.5m +step 14237/16704 (85.23%) | loss: 2.476786 | lrm: 0.30 | dt: 645.57ms | tok/sec: 812,134 | mfu: 50.76 | epoch: 2 | total time: 152.94m | eta: 26.5m +step 14238/16704 (85.24%) | loss: 2.475516 | lrm: 0.30 | dt: 644.56ms | tok/sec: 813,398 | mfu: 50.84 | epoch: 2 | total time: 152.95m | eta: 26.5m +step 14239/16704 (85.24%) | loss: 2.465293 | lrm: 0.30 | dt: 643.34ms | tok/sec: 814,952 | mfu: 50.94 | epoch: 2 | total time: 152.96m | eta: 26.5m +step 14240/16704 (85.25%) | loss: 2.483521 | lrm: 0.30 | dt: 644.25ms | tok/sec: 813,799 | mfu: 50.86 | epoch: 2 | total time: 152.97m | eta: 26.5m +step 14241/16704 (85.26%) | loss: 2.504852 | lrm: 0.29 | dt: 643.47ms | tok/sec: 814,787 | mfu: 50.93 | epoch: 2 | total time: 152.98m | eta: 26.5m +step 14242/16704 (85.26%) | loss: 2.513888 | lrm: 0.29 | dt: 644.58ms | tok/sec: 813,384 | mfu: 50.84 | epoch: 2 | total time: 152.99m | eta: 26.5m +step 14243/16704 (85.27%) | loss: 2.520378 | lrm: 0.29 | dt: 643.54ms | tok/sec: 814,697 | mfu: 50.92 | epoch: 2 | total time: 153.00m | eta: 26.5m +step 14244/16704 (85.27%) | loss: 2.513250 | lrm: 0.29 | dt: 645.75ms | tok/sec: 811,901 | mfu: 50.74 | epoch: 2 | total time: 153.01m | eta: 26.4m +step 14245/16704 (85.28%) | loss: 2.510995 | lrm: 0.29 | dt: 648.03ms | tok/sec: 809,048 | mfu: 50.57 | epoch: 2 | total time: 153.03m | eta: 26.4m +step 14246/16704 (85.28%) | loss: 2.509142 | 
lrm: 0.29 | dt: 644.31ms | tok/sec: 813,723 | mfu: 50.86 | epoch: 2 | total time: 153.04m | eta: 26.4m +step 14247/16704 (85.29%) | loss: 2.500628 | lrm: 0.29 | dt: 643.11ms | tok/sec: 815,244 | mfu: 50.95 | epoch: 2 | total time: 153.05m | eta: 26.4m +step 14248/16704 (85.30%) | loss: 2.493126 | lrm: 0.29 | dt: 645.45ms | tok/sec: 812,282 | mfu: 50.77 | epoch: 2 | total time: 153.06m | eta: 26.4m +step 14249/16704 (85.30%) | loss: 2.496400 | lrm: 0.29 | dt: 643.72ms | tok/sec: 814,469 | mfu: 50.91 | epoch: 2 | total time: 153.07m | eta: 26.4m +Step 14250 | Validation bpb: 0.767500 +step 14250/16704 (85.31%) | loss: 2.498235 | lrm: 0.29 | dt: 630.40ms | tok/sec: 831,678 | mfu: 51.98 | epoch: 2 | total time: 153.08m | eta: 26.4m +step 14251/16704 (85.31%) | loss: 2.509406 | lrm: 0.29 | dt: 656.68ms | tok/sec: 798,390 | mfu: 49.90 | epoch: 2 | total time: 153.09m | eta: 26.4m +step 14252/16704 (85.32%) | loss: 2.508542 | lrm: 0.29 | dt: 638.23ms | tok/sec: 821,467 | mfu: 51.34 | epoch: 2 | total time: 153.10m | eta: 26.4m +step 14253/16704 (85.33%) | loss: 2.502056 | lrm: 0.29 | dt: 649.01ms | tok/sec: 807,830 | mfu: 50.49 | epoch: 2 | total time: 153.11m | eta: 26.3m +step 14254/16704 (85.33%) | loss: 2.503908 | lrm: 0.29 | dt: 647.83ms | tok/sec: 809,299 | mfu: 50.58 | epoch: 2 | total time: 153.12m | eta: 26.3m +step 14255/16704 (85.34%) | loss: 2.509375 | lrm: 0.29 | dt: 640.00ms | tok/sec: 819,194 | mfu: 51.20 | epoch: 2 | total time: 153.13m | eta: 26.3m +step 14256/16704 (85.34%) | loss: 2.500175 | lrm: 0.29 | dt: 649.03ms | tok/sec: 807,804 | mfu: 50.49 | epoch: 2 | total time: 153.14m | eta: 26.3m +step 14257/16704 (85.35%) | loss: 2.498429 | lrm: 0.29 | dt: 644.45ms | tok/sec: 813,546 | mfu: 50.85 | epoch: 2 | total time: 153.15m | eta: 26.3m +step 14258/16704 (85.36%) | loss: 2.491579 | lrm: 0.29 | dt: 642.13ms | tok/sec: 816,482 | mfu: 51.03 | epoch: 2 | total time: 153.17m | eta: 26.3m +step 14259/16704 (85.36%) | loss: 2.494531 | lrm: 0.29 | dt: 
645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 2 | total time: 153.18m | eta: 26.3m +step 14260/16704 (85.37%) | loss: 2.497997 | lrm: 0.29 | dt: 644.35ms | tok/sec: 813,663 | mfu: 50.86 | epoch: 2 | total time: 153.19m | eta: 26.3m +step 14261/16704 (85.37%) | loss: 2.497706 | lrm: 0.29 | dt: 643.59ms | tok/sec: 814,634 | mfu: 50.92 | epoch: 2 | total time: 153.20m | eta: 26.3m +step 14262/16704 (85.38%) | loss: 2.491959 | lrm: 0.29 | dt: 646.15ms | tok/sec: 811,408 | mfu: 50.71 | epoch: 2 | total time: 153.21m | eta: 26.3m +step 14263/16704 (85.39%) | loss: 2.497817 | lrm: 0.29 | dt: 643.00ms | tok/sec: 815,380 | mfu: 50.96 | epoch: 2 | total time: 153.22m | eta: 26.2m +step 14264/16704 (85.39%) | loss: 2.507364 | lrm: 0.29 | dt: 646.29ms | tok/sec: 811,226 | mfu: 50.70 | epoch: 2 | total time: 153.23m | eta: 26.2m +step 14265/16704 (85.40%) | loss: 2.514848 | lrm: 0.29 | dt: 645.34ms | tok/sec: 812,420 | mfu: 50.78 | epoch: 2 | total time: 153.24m | eta: 26.2m +step 14266/16704 (85.40%) | loss: 2.495104 | lrm: 0.29 | dt: 644.80ms | tok/sec: 813,107 | mfu: 50.82 | epoch: 2 | total time: 153.25m | eta: 26.2m +step 14267/16704 (85.41%) | loss: 2.486008 | lrm: 0.29 | dt: 645.94ms | tok/sec: 811,672 | mfu: 50.73 | epoch: 2 | total time: 153.26m | eta: 26.2m +step 14268/16704 (85.42%) | loss: 2.482890 | lrm: 0.29 | dt: 643.77ms | tok/sec: 814,407 | mfu: 50.90 | epoch: 2 | total time: 153.27m | eta: 26.2m +step 14269/16704 (85.42%) | loss: 2.485353 | lrm: 0.29 | dt: 643.64ms | tok/sec: 814,565 | mfu: 50.91 | epoch: 2 | total time: 153.28m | eta: 26.2m +step 14270/16704 (85.43%) | loss: 2.487222 | lrm: 0.29 | dt: 643.60ms | tok/sec: 814,611 | mfu: 50.91 | epoch: 2 | total time: 153.29m | eta: 26.2m +step 14271/16704 (85.43%) | loss: 2.486168 | lrm: 0.29 | dt: 646.46ms | tok/sec: 811,013 | mfu: 50.69 | epoch: 2 | total time: 153.30m | eta: 26.2m +step 14272/16704 (85.44%) | loss: 2.484301 | lrm: 0.29 | dt: 641.30ms | tok/sec: 817,539 | mfu: 51.10 | epoch: 2 | total 
time: 153.32m | eta: 26.1m +step 14273/16704 (85.45%) | loss: 2.486519 | lrm: 0.29 | dt: 644.85ms | tok/sec: 813,041 | mfu: 50.82 | epoch: 2 | total time: 153.33m | eta: 26.1m +step 14274/16704 (85.45%) | loss: 2.508298 | lrm: 0.29 | dt: 644.39ms | tok/sec: 813,618 | mfu: 50.85 | epoch: 2 | total time: 153.34m | eta: 26.1m +step 14275/16704 (85.46%) | loss: 2.510829 | lrm: 0.29 | dt: 644.63ms | tok/sec: 813,318 | mfu: 50.83 | epoch: 2 | total time: 153.35m | eta: 26.1m +step 14276/16704 (85.46%) | loss: 2.504384 | lrm: 0.29 | dt: 643.68ms | tok/sec: 814,515 | mfu: 50.91 | epoch: 2 | total time: 153.36m | eta: 26.1m +step 14277/16704 (85.47%) | loss: 2.508111 | lrm: 0.29 | dt: 644.99ms | tok/sec: 812,865 | mfu: 50.81 | epoch: 2 | total time: 153.37m | eta: 26.1m +step 14278/16704 (85.48%) | loss: 2.502142 | lrm: 0.29 | dt: 645.71ms | tok/sec: 811,959 | mfu: 50.75 | epoch: 2 | total time: 153.38m | eta: 26.1m +step 14279/16704 (85.48%) | loss: 2.492474 | lrm: 0.29 | dt: 644.13ms | tok/sec: 813,942 | mfu: 50.87 | epoch: 2 | total time: 153.39m | eta: 26.1m +step 14280/16704 (85.49%) | loss: 2.476070 | lrm: 0.29 | dt: 646.90ms | tok/sec: 810,461 | mfu: 50.65 | epoch: 2 | total time: 153.40m | eta: 26.1m +step 14281/16704 (85.49%) | loss: 2.469352 | lrm: 0.29 | dt: 644.85ms | tok/sec: 813,036 | mfu: 50.82 | epoch: 2 | total time: 153.41m | eta: 26.0m +step 14282/16704 (85.50%) | loss: 2.466585 | lrm: 0.29 | dt: 645.32ms | tok/sec: 812,442 | mfu: 50.78 | epoch: 2 | total time: 153.42m | eta: 26.0m +step 14283/16704 (85.51%) | loss: 2.452627 | lrm: 0.29 | dt: 645.97ms | tok/sec: 811,628 | mfu: 50.73 | epoch: 2 | total time: 153.43m | eta: 26.0m +step 14284/16704 (85.51%) | loss: 2.465250 | lrm: 0.29 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 2 | total time: 153.44m | eta: 26.0m +step 14285/16704 (85.52%) | loss: 2.461506 | lrm: 0.29 | dt: 645.48ms | tok/sec: 812,250 | mfu: 50.77 | epoch: 2 | total time: 153.46m | eta: 26.0m +step 14286/16704 (85.52%) | loss: 
2.469926 | lrm: 0.29 | dt: 645.15ms | tok/sec: 812,662 | mfu: 50.79 | epoch: 2 | total time: 153.47m | eta: 26.0m +step 14287/16704 (85.53%) | loss: 2.478659 | lrm: 0.29 | dt: 646.95ms | tok/sec: 810,393 | mfu: 50.65 | epoch: 2 | total time: 153.48m | eta: 26.0m +step 14288/16704 (85.54%) | loss: 2.469943 | lrm: 0.29 | dt: 643.21ms | tok/sec: 815,111 | mfu: 50.95 | epoch: 2 | total time: 153.49m | eta: 26.0m +step 14289/16704 (85.54%) | loss: 2.476737 | lrm: 0.29 | dt: 644.81ms | tok/sec: 813,095 | mfu: 50.82 | epoch: 2 | total time: 153.50m | eta: 26.0m +step 14290/16704 (85.55%) | loss: 2.470560 | lrm: 0.29 | dt: 644.64ms | tok/sec: 813,308 | mfu: 50.83 | epoch: 2 | total time: 153.51m | eta: 26.0m +step 14291/16704 (85.55%) | loss: 2.473075 | lrm: 0.29 | dt: 646.37ms | tok/sec: 811,129 | mfu: 50.70 | epoch: 2 | total time: 153.52m | eta: 25.9m +step 14292/16704 (85.56%) | loss: 2.475483 | lrm: 0.29 | dt: 645.05ms | tok/sec: 812,790 | mfu: 50.80 | epoch: 2 | total time: 153.53m | eta: 25.9m +step 14293/16704 (85.57%) | loss: 2.465263 | lrm: 0.29 | dt: 643.52ms | tok/sec: 814,718 | mfu: 50.92 | epoch: 2 | total time: 153.54m | eta: 25.9m +step 14294/16704 (85.57%) | loss: 2.477812 | lrm: 0.29 | dt: 645.08ms | tok/sec: 812,746 | mfu: 50.80 | epoch: 2 | total time: 153.55m | eta: 25.9m +step 14295/16704 (85.58%) | loss: 2.485058 | lrm: 0.29 | dt: 644.59ms | tok/sec: 813,369 | mfu: 50.84 | epoch: 2 | total time: 153.56m | eta: 25.9m +step 14296/16704 (85.58%) | loss: 2.466963 | lrm: 0.29 | dt: 644.81ms | tok/sec: 813,087 | mfu: 50.82 | epoch: 2 | total time: 153.57m | eta: 25.9m +step 14297/16704 (85.59%) | loss: 2.465758 | lrm: 0.29 | dt: 643.54ms | tok/sec: 814,690 | mfu: 50.92 | epoch: 2 | total time: 153.58m | eta: 25.9m +step 14298/16704 (85.60%) | loss: 2.477594 | lrm: 0.29 | dt: 644.53ms | tok/sec: 813,442 | mfu: 50.84 | epoch: 2 | total time: 153.59m | eta: 25.9m +step 14299/16704 (85.60%) | loss: 2.486010 | lrm: 0.29 | dt: 643.00ms | tok/sec: 815,375 | mfu: 
50.96 | epoch: 2 | total time: 153.61m | eta: 25.9m +step 14300/16704 (85.61%) | loss: 2.498242 | lrm: 0.29 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 2 | total time: 153.62m | eta: 25.8m +step 14301/16704 (85.61%) | loss: 2.495484 | lrm: 0.29 | dt: 644.29ms | tok/sec: 813,747 | mfu: 50.86 | epoch: 2 | total time: 153.63m | eta: 25.8m +step 14302/16704 (85.62%) | loss: 2.506234 | lrm: 0.29 | dt: 645.01ms | tok/sec: 812,840 | mfu: 50.80 | epoch: 2 | total time: 153.64m | eta: 25.8m +step 14303/16704 (85.63%) | loss: 2.513984 | lrm: 0.29 | dt: 642.69ms | tok/sec: 815,777 | mfu: 50.99 | epoch: 2 | total time: 153.65m | eta: 25.8m +step 14304/16704 (85.63%) | loss: 2.501554 | lrm: 0.29 | dt: 649.25ms | tok/sec: 807,532 | mfu: 50.47 | epoch: 2 | total time: 153.66m | eta: 25.8m +step 14305/16704 (85.64%) | loss: 2.501822 | lrm: 0.29 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 2 | total time: 153.67m | eta: 25.8m +step 14306/16704 (85.64%) | loss: 2.505992 | lrm: 0.29 | dt: 642.68ms | tok/sec: 815,785 | mfu: 50.99 | epoch: 2 | total time: 153.68m | eta: 25.8m +step 14307/16704 (85.65%) | loss: 2.497527 | lrm: 0.29 | dt: 645.01ms | tok/sec: 812,834 | mfu: 50.80 | epoch: 2 | total time: 153.69m | eta: 25.8m +step 14308/16704 (85.66%) | loss: 2.492373 | lrm: 0.29 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 2 | total time: 153.70m | eta: 25.8m +step 14309/16704 (85.66%) | loss: 2.499494 | lrm: 0.29 | dt: 646.56ms | tok/sec: 810,894 | mfu: 50.68 | epoch: 2 | total time: 153.71m | eta: 25.7m +step 14310/16704 (85.67%) | loss: 2.509110 | lrm: 0.29 | dt: 643.90ms | tok/sec: 814,239 | mfu: 50.89 | epoch: 2 | total time: 153.72m | eta: 25.7m +step 14311/16704 (85.67%) | loss: 2.511967 | lrm: 0.29 | dt: 644.20ms | tok/sec: 813,864 | mfu: 50.87 | epoch: 2 | total time: 153.73m | eta: 25.7m +step 14312/16704 (85.68%) | loss: 2.510706 | lrm: 0.29 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 2 | total time: 153.75m | eta: 25.7m +step 
14313/16704 (85.69%) | loss: 2.511129 | lrm: 0.29 | dt: 645.20ms | tok/sec: 812,603 | mfu: 50.79 | epoch: 2 | total time: 153.76m | eta: 25.7m +step 14314/16704 (85.69%) | loss: 2.515095 | lrm: 0.29 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 2 | total time: 153.77m | eta: 25.7m +step 14315/16704 (85.70%) | loss: 2.507309 | lrm: 0.29 | dt: 644.93ms | tok/sec: 812,942 | mfu: 50.81 | epoch: 2 | total time: 153.78m | eta: 25.7m +step 14316/16704 (85.70%) | loss: 2.501270 | lrm: 0.29 | dt: 643.80ms | tok/sec: 814,363 | mfu: 50.90 | epoch: 2 | total time: 153.79m | eta: 25.7m +step 14317/16704 (85.71%) | loss: 2.505183 | lrm: 0.29 | dt: 644.55ms | tok/sec: 813,417 | mfu: 50.84 | epoch: 2 | total time: 153.80m | eta: 25.7m +step 14318/16704 (85.72%) | loss: 2.511489 | lrm: 0.29 | dt: 645.40ms | tok/sec: 812,347 | mfu: 50.77 | epoch: 2 | total time: 153.81m | eta: 25.6m +step 14319/16704 (85.72%) | loss: 2.499197 | lrm: 0.29 | dt: 643.48ms | tok/sec: 814,771 | mfu: 50.92 | epoch: 2 | total time: 153.82m | eta: 25.6m +step 14320/16704 (85.73%) | loss: 2.512419 | lrm: 0.29 | dt: 644.03ms | tok/sec: 814,070 | mfu: 50.88 | epoch: 2 | total time: 153.83m | eta: 25.6m +step 14321/16704 (85.73%) | loss: 2.509866 | lrm: 0.29 | dt: 645.62ms | tok/sec: 812,066 | mfu: 50.76 | epoch: 2 | total time: 153.84m | eta: 25.6m +step 14322/16704 (85.74%) | loss: 2.509896 | lrm: 0.29 | dt: 645.54ms | tok/sec: 812,164 | mfu: 50.76 | epoch: 2 | total time: 153.85m | eta: 25.6m +step 14323/16704 (85.75%) | loss: 2.513921 | lrm: 0.29 | dt: 643.05ms | tok/sec: 815,311 | mfu: 50.96 | epoch: 2 | total time: 153.86m | eta: 25.6m +step 14324/16704 (85.75%) | loss: 2.516795 | lrm: 0.28 | dt: 646.92ms | tok/sec: 810,441 | mfu: 50.65 | epoch: 2 | total time: 153.87m | eta: 25.6m +step 14325/16704 (85.76%) | loss: 2.509551 | lrm: 0.28 | dt: 642.67ms | tok/sec: 815,798 | mfu: 50.99 | epoch: 2 | total time: 153.89m | eta: 25.6m +step 14326/16704 (85.76%) | loss: 2.514369 | lrm: 0.28 | dt: 
643.96ms | tok/sec: 814,162 | mfu: 50.89 | epoch: 2 | total time: 153.90m | eta: 25.6m +step 14327/16704 (85.77%) | loss: 2.505408 | lrm: 0.28 | dt: 647.13ms | tok/sec: 810,179 | mfu: 50.64 | epoch: 2 | total time: 153.91m | eta: 25.6m +step 14328/16704 (85.78%) | loss: 2.499487 | lrm: 0.28 | dt: 643.71ms | tok/sec: 814,476 | mfu: 50.91 | epoch: 2 | total time: 153.92m | eta: 25.5m +step 14329/16704 (85.78%) | loss: 2.500682 | lrm: 0.28 | dt: 646.39ms | tok/sec: 811,100 | mfu: 50.69 | epoch: 2 | total time: 153.93m | eta: 25.5m +step 14330/16704 (85.79%) | loss: 2.507353 | lrm: 0.28 | dt: 643.47ms | tok/sec: 814,777 | mfu: 50.92 | epoch: 2 | total time: 153.94m | eta: 25.5m +step 14331/16704 (85.79%) | loss: 2.509869 | lrm: 0.28 | dt: 645.24ms | tok/sec: 812,542 | mfu: 50.79 | epoch: 2 | total time: 153.95m | eta: 25.5m +step 14332/16704 (85.80%) | loss: 2.505156 | lrm: 0.28 | dt: 643.84ms | tok/sec: 814,312 | mfu: 50.90 | epoch: 2 | total time: 153.96m | eta: 25.5m +step 14333/16704 (85.81%) | loss: 2.511588 | lrm: 0.28 | dt: 643.97ms | tok/sec: 814,154 | mfu: 50.89 | epoch: 2 | total time: 153.97m | eta: 25.5m +step 14334/16704 (85.81%) | loss: 2.507204 | lrm: 0.28 | dt: 643.15ms | tok/sec: 815,191 | mfu: 50.95 | epoch: 2 | total time: 153.98m | eta: 25.5m +step 14335/16704 (85.82%) | loss: 2.515767 | lrm: 0.28 | dt: 646.62ms | tok/sec: 810,808 | mfu: 50.68 | epoch: 2 | total time: 153.99m | eta: 25.5m +step 14336/16704 (85.82%) | loss: 2.514182 | lrm: 0.28 | dt: 645.23ms | tok/sec: 812,556 | mfu: 50.79 | epoch: 2 | total time: 154.00m | eta: 25.5m +step 14337/16704 (85.83%) | loss: 2.515690 | lrm: 0.28 | dt: 641.32ms | tok/sec: 817,517 | mfu: 51.10 | epoch: 2 | total time: 154.01m | eta: 25.4m +step 14338/16704 (85.84%) | loss: 2.508875 | lrm: 0.28 | dt: 644.91ms | tok/sec: 812,962 | mfu: 50.81 | epoch: 2 | total time: 154.02m | eta: 25.4m +step 14339/16704 (85.84%) | loss: 2.506424 | lrm: 0.28 | dt: 645.07ms | tok/sec: 812,766 | mfu: 50.80 | epoch: 2 | total 
time: 154.04m | eta: 25.4m +step 14340/16704 (85.85%) | loss: 2.520561 | lrm: 0.28 | dt: 644.18ms | tok/sec: 813,883 | mfu: 50.87 | epoch: 2 | total time: 154.05m | eta: 25.4m +step 14341/16704 (85.85%) | loss: 2.520114 | lrm: 0.28 | dt: 646.70ms | tok/sec: 810,716 | mfu: 50.67 | epoch: 2 | total time: 154.06m | eta: 25.4m +step 14342/16704 (85.86%) | loss: 2.507194 | lrm: 0.28 | dt: 642.25ms | tok/sec: 816,335 | mfu: 51.02 | epoch: 2 | total time: 154.07m | eta: 25.4m +step 14343/16704 (85.87%) | loss: 2.499471 | lrm: 0.28 | dt: 644.51ms | tok/sec: 813,466 | mfu: 50.84 | epoch: 2 | total time: 154.08m | eta: 25.4m +step 14344/16704 (85.87%) | loss: 2.501770 | lrm: 0.28 | dt: 643.67ms | tok/sec: 814,528 | mfu: 50.91 | epoch: 2 | total time: 154.09m | eta: 25.4m +step 14345/16704 (85.88%) | loss: 2.505229 | lrm: 0.28 | dt: 645.34ms | tok/sec: 812,418 | mfu: 50.78 | epoch: 2 | total time: 154.10m | eta: 25.4m +step 14346/16704 (85.88%) | loss: 2.512327 | lrm: 0.28 | dt: 645.12ms | tok/sec: 812,695 | mfu: 50.79 | epoch: 2 | total time: 154.11m | eta: 25.3m +step 14347/16704 (85.89%) | loss: 2.496711 | lrm: 0.28 | dt: 643.39ms | tok/sec: 814,879 | mfu: 50.93 | epoch: 2 | total time: 154.12m | eta: 25.3m +step 14348/16704 (85.90%) | loss: 2.491275 | lrm: 0.28 | dt: 645.23ms | tok/sec: 812,560 | mfu: 50.79 | epoch: 2 | total time: 154.13m | eta: 25.3m +step 14349/16704 (85.90%) | loss: 2.495529 | lrm: 0.28 | dt: 645.30ms | tok/sec: 812,471 | mfu: 50.78 | epoch: 2 | total time: 154.14m | eta: 25.3m +step 14350/16704 (85.91%) | loss: 2.494376 | lrm: 0.28 | dt: 646.19ms | tok/sec: 811,353 | mfu: 50.71 | epoch: 2 | total time: 154.15m | eta: 25.3m +step 14351/16704 (85.91%) | loss: 2.499923 | lrm: 0.28 | dt: 646.02ms | tok/sec: 811,560 | mfu: 50.72 | epoch: 2 | total time: 154.16m | eta: 25.3m +step 14352/16704 (85.92%) | loss: 2.505855 | lrm: 0.28 | dt: 642.99ms | tok/sec: 815,396 | mfu: 50.96 | epoch: 2 | total time: 154.18m | eta: 25.3m +step 14353/16704 (85.93%) | loss: 
2.509186 | lrm: 0.28 | dt: 646.54ms | tok/sec: 810,912 | mfu: 50.68 | epoch: 2 | total time: 154.19m | eta: 25.3m +step 14354/16704 (85.93%) | loss: 2.513045 | lrm: 0.28 | dt: 643.28ms | tok/sec: 815,023 | mfu: 50.94 | epoch: 2 | total time: 154.20m | eta: 25.3m +step 14355/16704 (85.94%) | loss: 2.515219 | lrm: 0.28 | dt: 643.68ms | tok/sec: 814,512 | mfu: 50.91 | epoch: 2 | total time: 154.21m | eta: 25.3m +step 14356/16704 (85.94%) | loss: 2.517741 | lrm: 0.28 | dt: 643.34ms | tok/sec: 814,944 | mfu: 50.94 | epoch: 2 | total time: 154.22m | eta: 25.2m +step 14357/16704 (85.95%) | loss: 2.521059 | lrm: 0.28 | dt: 643.36ms | tok/sec: 814,920 | mfu: 50.93 | epoch: 2 | total time: 154.23m | eta: 25.2m +step 14358/16704 (85.96%) | loss: 2.521512 | lrm: 0.28 | dt: 647.88ms | tok/sec: 809,237 | mfu: 50.58 | epoch: 2 | total time: 154.24m | eta: 25.2m +step 14359/16704 (85.96%) | loss: 2.518703 | lrm: 0.28 | dt: 642.69ms | tok/sec: 815,771 | mfu: 50.99 | epoch: 2 | total time: 154.25m | eta: 25.2m +step 14360/16704 (85.97%) | loss: 2.513474 | lrm: 0.28 | dt: 644.58ms | tok/sec: 813,379 | mfu: 50.84 | epoch: 2 | total time: 154.26m | eta: 25.2m +step 14361/16704 (85.97%) | loss: 2.510952 | lrm: 0.28 | dt: 644.95ms | tok/sec: 812,907 | mfu: 50.81 | epoch: 2 | total time: 154.27m | eta: 25.2m +step 14362/16704 (85.98%) | loss: 2.494876 | lrm: 0.28 | dt: 645.36ms | tok/sec: 812,394 | mfu: 50.78 | epoch: 2 | total time: 154.28m | eta: 25.2m +step 14363/16704 (85.99%) | loss: 2.496531 | lrm: 0.28 | dt: 644.14ms | tok/sec: 813,936 | mfu: 50.87 | epoch: 2 | total time: 154.29m | eta: 25.2m +step 14364/16704 (85.99%) | loss: 2.498408 | lrm: 0.28 | dt: 643.34ms | tok/sec: 814,942 | mfu: 50.94 | epoch: 2 | total time: 154.30m | eta: 25.2m +step 14365/16704 (86.00%) | loss: 2.505317 | lrm: 0.28 | dt: 646.09ms | tok/sec: 811,477 | mfu: 50.72 | epoch: 2 | total time: 154.31m | eta: 25.1m +step 14366/16704 (86.00%) | loss: 2.516876 | lrm: 0.28 | dt: 644.92ms | tok/sec: 812,948 | mfu: 
50.81 | epoch: 2 | total time: 154.33m | eta: 25.1m +step 14367/16704 (86.01%) | loss: 2.514506 | lrm: 0.28 | dt: 643.34ms | tok/sec: 814,946 | mfu: 50.94 | epoch: 2 | total time: 154.34m | eta: 25.1m +step 14368/16704 (86.02%) | loss: 2.519173 | lrm: 0.28 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 2 | total time: 154.35m | eta: 25.1m +step 14369/16704 (86.02%) | loss: 2.517007 | lrm: 0.28 | dt: 645.14ms | tok/sec: 812,673 | mfu: 50.79 | epoch: 2 | total time: 154.36m | eta: 25.1m +step 14370/16704 (86.03%) | loss: 2.526984 | lrm: 0.28 | dt: 644.10ms | tok/sec: 813,980 | mfu: 50.87 | epoch: 2 | total time: 154.37m | eta: 25.1m +step 14371/16704 (86.03%) | loss: 2.520719 | lrm: 0.28 | dt: 645.06ms | tok/sec: 812,773 | mfu: 50.80 | epoch: 2 | total time: 154.38m | eta: 25.1m +step 14372/16704 (86.04%) | loss: 2.517379 | lrm: 0.28 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 2 | total time: 154.39m | eta: 25.1m +step 14373/16704 (86.05%) | loss: 2.513336 | lrm: 0.28 | dt: 646.97ms | tok/sec: 810,370 | mfu: 50.65 | epoch: 2 | total time: 154.40m | eta: 25.1m +step 14374/16704 (86.05%) | loss: 2.510705 | lrm: 0.28 | dt: 645.85ms | tok/sec: 811,783 | mfu: 50.74 | epoch: 2 | total time: 154.41m | eta: 25.0m +step 14375/16704 (86.06%) | loss: 2.509817 | lrm: 0.28 | dt: 643.09ms | tok/sec: 815,268 | mfu: 50.96 | epoch: 2 | total time: 154.42m | eta: 25.0m +step 14376/16704 (86.06%) | loss: 2.500422 | lrm: 0.28 | dt: 646.02ms | tok/sec: 811,567 | mfu: 50.72 | epoch: 2 | total time: 154.43m | eta: 25.0m +step 14377/16704 (86.07%) | loss: 2.511056 | lrm: 0.28 | dt: 645.69ms | tok/sec: 811,974 | mfu: 50.75 | epoch: 2 | total time: 154.44m | eta: 25.0m +step 14378/16704 (86.08%) | loss: 2.514709 | lrm: 0.28 | dt: 645.03ms | tok/sec: 812,806 | mfu: 50.80 | epoch: 2 | total time: 154.45m | eta: 25.0m +step 14379/16704 (86.08%) | loss: 2.515097 | lrm: 0.28 | dt: 645.91ms | tok/sec: 811,699 | mfu: 50.73 | epoch: 2 | total time: 154.47m | eta: 25.0m +step 
14380/16704 (86.09%) | loss: 2.515050 | lrm: 0.28 | dt: 643.44ms | tok/sec: 814,814 | mfu: 50.93 | epoch: 2 | total time: 154.48m | eta: 25.0m +step 14381/16704 (86.09%) | loss: 2.499820 | lrm: 0.28 | dt: 643.99ms | tok/sec: 814,122 | mfu: 50.88 | epoch: 2 | total time: 154.49m | eta: 25.0m +step 14382/16704 (86.10%) | loss: 2.507873 | lrm: 0.28 | dt: 645.34ms | tok/sec: 812,420 | mfu: 50.78 | epoch: 2 | total time: 154.50m | eta: 25.0m +step 14383/16704 (86.11%) | loss: 2.514945 | lrm: 0.28 | dt: 642.45ms | tok/sec: 816,071 | mfu: 51.01 | epoch: 2 | total time: 154.51m | eta: 25.0m +step 14384/16704 (86.11%) | loss: 2.498978 | lrm: 0.28 | dt: 643.82ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 2 | total time: 154.52m | eta: 24.9m +step 14385/16704 (86.12%) | loss: 2.490584 | lrm: 0.28 | dt: 644.43ms | tok/sec: 813,568 | mfu: 50.85 | epoch: 2 | total time: 154.53m | eta: 24.9m +step 14386/16704 (86.12%) | loss: 2.503943 | lrm: 0.28 | dt: 644.82ms | tok/sec: 813,081 | mfu: 50.82 | epoch: 2 | total time: 154.54m | eta: 24.9m +step 14387/16704 (86.13%) | loss: 2.509653 | lrm: 0.28 | dt: 645.84ms | tok/sec: 811,790 | mfu: 50.74 | epoch: 2 | total time: 154.55m | eta: 24.9m +step 14388/16704 (86.14%) | loss: 2.487224 | lrm: 0.28 | dt: 644.21ms | tok/sec: 813,842 | mfu: 50.87 | epoch: 2 | total time: 154.56m | eta: 24.9m +step 14389/16704 (86.14%) | loss: 2.499151 | lrm: 0.28 | dt: 644.37ms | tok/sec: 813,644 | mfu: 50.85 | epoch: 2 | total time: 154.57m | eta: 24.9m +step 14390/16704 (86.15%) | loss: 2.486735 | lrm: 0.28 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 2 | total time: 154.58m | eta: 24.9m +step 14391/16704 (86.15%) | loss: 2.488146 | lrm: 0.28 | dt: 646.22ms | tok/sec: 811,315 | mfu: 50.71 | epoch: 2 | total time: 154.59m | eta: 24.9m +step 14392/16704 (86.16%) | loss: 2.484409 | lrm: 0.28 | dt: 643.40ms | tok/sec: 814,872 | mfu: 50.93 | epoch: 2 | total time: 154.60m | eta: 24.9m +step 14393/16704 (86.16%) | loss: 2.496982 | lrm: 0.28 | dt: 
646.33ms | tok/sec: 811,179 | mfu: 50.70 | epoch: 2 | total time: 154.62m | eta: 24.8m +step 14394/16704 (86.17%) | loss: 2.504648 | lrm: 0.28 | dt: 644.95ms | tok/sec: 812,914 | mfu: 50.81 | epoch: 2 | total time: 154.63m | eta: 24.8m +step 14395/16704 (86.18%) | loss: 2.492852 | lrm: 0.28 | dt: 647.11ms | tok/sec: 810,200 | mfu: 50.64 | epoch: 2 | total time: 154.64m | eta: 24.8m +step 14396/16704 (86.18%) | loss: 2.488168 | lrm: 0.28 | dt: 644.67ms | tok/sec: 813,264 | mfu: 50.83 | epoch: 2 | total time: 154.65m | eta: 24.8m +step 14397/16704 (86.19%) | loss: 2.489537 | lrm: 0.28 | dt: 644.07ms | tok/sec: 814,027 | mfu: 50.88 | epoch: 2 | total time: 154.66m | eta: 24.8m +step 14398/16704 (86.19%) | loss: 2.484520 | lrm: 0.28 | dt: 645.75ms | tok/sec: 811,900 | mfu: 50.74 | epoch: 2 | total time: 154.67m | eta: 24.8m +step 14399/16704 (86.20%) | loss: 2.486429 | lrm: 0.28 | dt: 645.25ms | tok/sec: 812,536 | mfu: 50.78 | epoch: 2 | total time: 154.68m | eta: 24.8m +step 14400/16704 (86.21%) | loss: 2.488500 | lrm: 0.28 | dt: 643.45ms | tok/sec: 814,803 | mfu: 50.93 | epoch: 2 | total time: 154.69m | eta: 24.8m +step 14401/16704 (86.21%) | loss: 2.499417 | lrm: 0.28 | dt: 643.98ms | tok/sec: 814,133 | mfu: 50.88 | epoch: 2 | total time: 154.70m | eta: 24.8m +step 14402/16704 (86.22%) | loss: 2.488474 | lrm: 0.28 | dt: 644.16ms | tok/sec: 813,910 | mfu: 50.87 | epoch: 2 | total time: 154.71m | eta: 24.7m +step 14403/16704 (86.22%) | loss: 2.483517 | lrm: 0.28 | dt: 645.87ms | tok/sec: 811,756 | mfu: 50.74 | epoch: 2 | total time: 154.72m | eta: 24.7m +step 14404/16704 (86.23%) | loss: 2.488039 | lrm: 0.28 | dt: 646.13ms | tok/sec: 811,426 | mfu: 50.72 | epoch: 2 | total time: 154.73m | eta: 24.7m +step 14405/16704 (86.24%) | loss: 2.480631 | lrm: 0.28 | dt: 645.63ms | tok/sec: 812,059 | mfu: 50.75 | epoch: 2 | total time: 154.74m | eta: 24.7m +step 14406/16704 (86.24%) | loss: 2.501152 | lrm: 0.28 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 2 | total 
time: 154.76m | eta: 24.7m +step 14407/16704 (86.25%) | loss: 2.473349 | lrm: 0.28 | dt: 644.75ms | tok/sec: 813,160 | mfu: 50.82 | epoch: 2 | total time: 154.77m | eta: 24.7m +step 14408/16704 (86.25%) | loss: 2.488717 | lrm: 0.27 | dt: 645.37ms | tok/sec: 812,387 | mfu: 50.78 | epoch: 2 | total time: 154.78m | eta: 24.7m +step 14409/16704 (86.26%) | loss: 2.476205 | lrm: 0.27 | dt: 644.04ms | tok/sec: 814,060 | mfu: 50.88 | epoch: 2 | total time: 154.79m | eta: 24.7m +step 14410/16704 (86.27%) | loss: 2.468337 | lrm: 0.27 | dt: 644.90ms | tok/sec: 812,976 | mfu: 50.81 | epoch: 2 | total time: 154.80m | eta: 24.7m +step 14411/16704 (86.27%) | loss: 2.486696 | lrm: 0.27 | dt: 645.42ms | tok/sec: 812,323 | mfu: 50.77 | epoch: 2 | total time: 154.81m | eta: 24.6m +step 14412/16704 (86.28%) | loss: 2.485902 | lrm: 0.27 | dt: 645.24ms | tok/sec: 812,545 | mfu: 50.79 | epoch: 2 | total time: 154.82m | eta: 24.6m +step 14413/16704 (86.28%) | loss: 2.477750 | lrm: 0.27 | dt: 645.59ms | tok/sec: 812,109 | mfu: 50.76 | epoch: 2 | total time: 154.83m | eta: 24.6m +step 14414/16704 (86.29%) | loss: 2.480694 | lrm: 0.27 | dt: 643.78ms | tok/sec: 814,390 | mfu: 50.90 | epoch: 2 | total time: 154.84m | eta: 24.6m +step 14415/16704 (86.30%) | loss: 2.488349 | lrm: 0.27 | dt: 645.99ms | tok/sec: 811,602 | mfu: 50.73 | epoch: 2 | total time: 154.85m | eta: 24.6m +step 14416/16704 (86.30%) | loss: 2.486871 | lrm: 0.27 | dt: 643.56ms | tok/sec: 814,673 | mfu: 50.92 | epoch: 2 | total time: 154.86m | eta: 24.6m +step 14417/16704 (86.31%) | loss: 2.491542 | lrm: 0.27 | dt: 645.65ms | tok/sec: 812,028 | mfu: 50.75 | epoch: 2 | total time: 154.87m | eta: 24.6m +step 14418/16704 (86.31%) | loss: 2.503245 | lrm: 0.27 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 2 | total time: 154.88m | eta: 24.6m +step 14419/16704 (86.32%) | loss: 2.503694 | lrm: 0.27 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 2 | total time: 154.90m | eta: 24.6m +step 14420/16704 (86.33%) | loss: 
2.511890 | lrm: 0.27 | dt: 644.93ms | tok/sec: 812,942 | mfu: 50.81 | epoch: 2 | total time: 154.91m | eta: 24.6m +step 14421/16704 (86.33%) | loss: 2.499448 | lrm: 0.27 | dt: 644.19ms | tok/sec: 813,870 | mfu: 50.87 | epoch: 2 | total time: 154.92m | eta: 24.5m +step 14422/16704 (86.34%) | loss: 2.498123 | lrm: 0.27 | dt: 643.89ms | tok/sec: 814,249 | mfu: 50.89 | epoch: 2 | total time: 154.93m | eta: 24.5m +step 14423/16704 (86.34%) | loss: 2.509255 | lrm: 0.27 | dt: 647.85ms | tok/sec: 809,273 | mfu: 50.58 | epoch: 2 | total time: 154.94m | eta: 24.5m +step 14424/16704 (86.35%) | loss: 2.501107 | lrm: 0.27 | dt: 641.93ms | tok/sec: 816,735 | mfu: 51.05 | epoch: 2 | total time: 154.95m | eta: 24.5m +step 14425/16704 (86.36%) | loss: 2.499310 | lrm: 0.27 | dt: 644.74ms | tok/sec: 813,176 | mfu: 50.82 | epoch: 2 | total time: 154.96m | eta: 24.5m +step 14426/16704 (86.36%) | loss: 2.493327 | lrm: 0.27 | dt: 643.28ms | tok/sec: 815,024 | mfu: 50.94 | epoch: 2 | total time: 154.97m | eta: 24.5m +step 14427/16704 (86.37%) | loss: 2.503957 | lrm: 0.27 | dt: 644.73ms | tok/sec: 813,188 | mfu: 50.83 | epoch: 2 | total time: 154.98m | eta: 24.5m +step 14428/16704 (86.37%) | loss: 2.513501 | lrm: 0.27 | dt: 646.59ms | tok/sec: 810,851 | mfu: 50.68 | epoch: 2 | total time: 154.99m | eta: 24.5m +step 14429/16704 (86.38%) | loss: 2.502525 | lrm: 0.27 | dt: 646.14ms | tok/sec: 811,412 | mfu: 50.71 | epoch: 2 | total time: 155.00m | eta: 24.5m +step 14430/16704 (86.39%) | loss: 2.506789 | lrm: 0.27 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 2 | total time: 155.01m | eta: 24.4m +step 14431/16704 (86.39%) | loss: 2.513294 | lrm: 0.27 | dt: 645.03ms | tok/sec: 812,806 | mfu: 50.80 | epoch: 2 | total time: 155.02m | eta: 24.4m +step 14432/16704 (86.40%) | loss: 2.509591 | lrm: 0.27 | dt: 642.97ms | tok/sec: 815,412 | mfu: 50.96 | epoch: 2 | total time: 155.03m | eta: 24.4m +step 14433/16704 (86.40%) | loss: 2.509998 | lrm: 0.27 | dt: 643.29ms | tok/sec: 815,016 | mfu: 
50.94 | epoch: 2 | total time: 155.05m | eta: 24.4m +step 14434/16704 (86.41%) | loss: 2.505117 | lrm: 0.27 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 2 | total time: 155.06m | eta: 24.4m +step 14435/16704 (86.42%) | loss: 2.508347 | lrm: 0.27 | dt: 645.22ms | tok/sec: 812,566 | mfu: 50.79 | epoch: 2 | total time: 155.07m | eta: 24.4m +step 14436/16704 (86.42%) | loss: 2.509497 | lrm: 0.27 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 2 | total time: 155.08m | eta: 24.4m +step 14437/16704 (86.43%) | loss: 2.497098 | lrm: 0.27 | dt: 643.37ms | tok/sec: 814,903 | mfu: 50.93 | epoch: 2 | total time: 155.09m | eta: 24.4m +step 14438/16704 (86.43%) | loss: 2.486022 | lrm: 0.27 | dt: 645.39ms | tok/sec: 812,361 | mfu: 50.77 | epoch: 2 | total time: 155.10m | eta: 24.4m +step 14439/16704 (86.44%) | loss: 2.485281 | lrm: 0.27 | dt: 646.48ms | tok/sec: 810,987 | mfu: 50.69 | epoch: 2 | total time: 155.11m | eta: 24.3m +step 14440/16704 (86.45%) | loss: 2.489991 | lrm: 0.27 | dt: 645.14ms | tok/sec: 812,672 | mfu: 50.79 | epoch: 2 | total time: 155.12m | eta: 24.3m +step 14441/16704 (86.45%) | loss: 2.472480 | lrm: 0.27 | dt: 642.56ms | tok/sec: 815,938 | mfu: 51.00 | epoch: 2 | total time: 155.13m | eta: 24.3m +step 14442/16704 (86.46%) | loss: 2.472594 | lrm: 0.27 | dt: 645.37ms | tok/sec: 812,381 | mfu: 50.77 | epoch: 2 | total time: 155.14m | eta: 24.3m +step 14443/16704 (86.46%) | loss: 2.471553 | lrm: 0.27 | dt: 642.72ms | tok/sec: 815,739 | mfu: 50.98 | epoch: 2 | total time: 155.15m | eta: 24.3m +step 14444/16704 (86.47%) | loss: 2.464154 | lrm: 0.27 | dt: 642.97ms | tok/sec: 815,421 | mfu: 50.96 | epoch: 2 | total time: 155.16m | eta: 24.3m +step 14445/16704 (86.48%) | loss: 2.457424 | lrm: 0.27 | dt: 646.10ms | tok/sec: 811,464 | mfu: 50.72 | epoch: 2 | total time: 155.17m | eta: 24.3m +step 14446/16704 (86.48%) | loss: 2.464809 | lrm: 0.27 | dt: 641.88ms | tok/sec: 816,806 | mfu: 51.05 | epoch: 2 | total time: 155.19m | eta: 24.3m +step 
14447/16704 (86.49%) | loss: 2.459982 | lrm: 0.27 | dt: 644.32ms | tok/sec: 813,712 | mfu: 50.86 | epoch: 2 | total time: 155.20m | eta: 24.3m +step 14448/16704 (86.49%) | loss: 2.454160 | lrm: 0.27 | dt: 644.17ms | tok/sec: 813,891 | mfu: 50.87 | epoch: 2 | total time: 155.21m | eta: 24.3m +step 14449/16704 (86.50%) | loss: 2.456304 | lrm: 0.27 | dt: 643.37ms | tok/sec: 814,908 | mfu: 50.93 | epoch: 2 | total time: 155.22m | eta: 24.2m +step 14450/16704 (86.51%) | loss: 2.451128 | lrm: 0.27 | dt: 644.74ms | tok/sec: 813,180 | mfu: 50.82 | epoch: 2 | total time: 155.23m | eta: 24.2m +step 14451/16704 (86.51%) | loss: 2.455135 | lrm: 0.27 | dt: 645.82ms | tok/sec: 811,814 | mfu: 50.74 | epoch: 2 | total time: 155.24m | eta: 24.2m +step 14452/16704 (86.52%) | loss: 2.465180 | lrm: 0.27 | dt: 644.46ms | tok/sec: 813,536 | mfu: 50.85 | epoch: 2 | total time: 155.25m | eta: 24.2m +step 14453/16704 (86.52%) | loss: 2.473778 | lrm: 0.27 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 2 | total time: 155.26m | eta: 24.2m +step 14454/16704 (86.53%) | loss: 2.478504 | lrm: 0.27 | dt: 645.74ms | tok/sec: 811,921 | mfu: 50.75 | epoch: 2 | total time: 155.27m | eta: 24.2m +step 14455/16704 (86.54%) | loss: 2.495154 | lrm: 0.27 | dt: 646.16ms | tok/sec: 811,389 | mfu: 50.71 | epoch: 2 | total time: 155.28m | eta: 24.2m +step 14456/16704 (86.54%) | loss: 2.488030 | lrm: 0.27 | dt: 643.90ms | tok/sec: 814,240 | mfu: 50.89 | epoch: 2 | total time: 155.29m | eta: 24.2m +step 14457/16704 (86.55%) | loss: 2.476575 | lrm: 0.27 | dt: 645.34ms | tok/sec: 812,424 | mfu: 50.78 | epoch: 2 | total time: 155.30m | eta: 24.2m +step 14458/16704 (86.55%) | loss: 2.478047 | lrm: 0.27 | dt: 643.46ms | tok/sec: 814,800 | mfu: 50.93 | epoch: 2 | total time: 155.31m | eta: 24.1m +step 14459/16704 (86.56%) | loss: 2.479572 | lrm: 0.27 | dt: 645.29ms | tok/sec: 812,481 | mfu: 50.78 | epoch: 2 | total time: 155.32m | eta: 24.1m +step 14460/16704 (86.57%) | loss: 2.480983 | lrm: 0.27 | dt: 
647.98ms | tok/sec: 809,117 | mfu: 50.57 | epoch: 2 | total time: 155.34m | eta: 24.1m +step 14461/16704 (86.57%) | loss: 2.490190 | lrm: 0.27 | dt: 645.96ms | tok/sec: 811,643 | mfu: 50.73 | epoch: 2 | total time: 155.35m | eta: 24.1m +step 14462/16704 (86.58%) | loss: 2.497055 | lrm: 0.27 | dt: 644.91ms | tok/sec: 812,958 | mfu: 50.81 | epoch: 2 | total time: 155.36m | eta: 24.1m +step 14463/16704 (86.58%) | loss: 2.507061 | lrm: 0.27 | dt: 644.89ms | tok/sec: 812,990 | mfu: 50.81 | epoch: 2 | total time: 155.37m | eta: 24.1m +step 14464/16704 (86.59%) | loss: 2.509336 | lrm: 0.27 | dt: 644.71ms | tok/sec: 813,216 | mfu: 50.83 | epoch: 2 | total time: 155.38m | eta: 24.1m +step 14465/16704 (86.60%) | loss: 2.510138 | lrm: 0.27 | dt: 644.65ms | tok/sec: 813,290 | mfu: 50.83 | epoch: 2 | total time: 155.39m | eta: 24.1m +step 14466/16704 (86.60%) | loss: 2.502909 | lrm: 0.27 | dt: 646.95ms | tok/sec: 810,400 | mfu: 50.65 | epoch: 2 | total time: 155.40m | eta: 24.1m +step 14467/16704 (86.61%) | loss: 2.502885 | lrm: 0.27 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 2 | total time: 155.41m | eta: 24.0m +step 14468/16704 (86.61%) | loss: 2.497161 | lrm: 0.27 | dt: 645.96ms | tok/sec: 811,647 | mfu: 50.73 | epoch: 2 | total time: 155.42m | eta: 24.0m +step 14469/16704 (86.62%) | loss: 2.489996 | lrm: 0.27 | dt: 644.13ms | tok/sec: 813,951 | mfu: 50.87 | epoch: 2 | total time: 155.43m | eta: 24.0m +step 14470/16704 (86.63%) | loss: 2.493433 | lrm: 0.27 | dt: 643.58ms | tok/sec: 814,640 | mfu: 50.92 | epoch: 2 | total time: 155.44m | eta: 24.0m +step 14471/16704 (86.63%) | loss: 2.490946 | lrm: 0.27 | dt: 645.22ms | tok/sec: 812,571 | mfu: 50.79 | epoch: 2 | total time: 155.45m | eta: 24.0m +step 14472/16704 (86.64%) | loss: 2.502759 | lrm: 0.27 | dt: 644.20ms | tok/sec: 813,860 | mfu: 50.87 | epoch: 2 | total time: 155.46m | eta: 24.0m +step 14473/16704 (86.64%) | loss: 2.499522 | lrm: 0.27 | dt: 645.80ms | tok/sec: 811,846 | mfu: 50.74 | epoch: 2 | total 
time: 155.48m | eta: 24.0m +step 14474/16704 (86.65%) | loss: 2.489878 | lrm: 0.27 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 2 | total time: 155.49m | eta: 24.0m +step 14475/16704 (86.66%) | loss: 2.487773 | lrm: 0.27 | dt: 643.01ms | tok/sec: 815,360 | mfu: 50.96 | epoch: 2 | total time: 155.50m | eta: 24.0m +step 14476/16704 (86.66%) | loss: 2.489583 | lrm: 0.27 | dt: 644.95ms | tok/sec: 812,914 | mfu: 50.81 | epoch: 2 | total time: 155.51m | eta: 24.0m +step 14477/16704 (86.67%) | loss: 2.488014 | lrm: 0.27 | dt: 643.41ms | tok/sec: 814,853 | mfu: 50.93 | epoch: 2 | total time: 155.52m | eta: 23.9m +step 14478/16704 (86.67%) | loss: 2.498738 | lrm: 0.27 | dt: 647.77ms | tok/sec: 809,372 | mfu: 50.59 | epoch: 2 | total time: 155.53m | eta: 23.9m +step 14479/16704 (86.68%) | loss: 2.493057 | lrm: 0.27 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 2 | total time: 155.54m | eta: 23.9m +step 14480/16704 (86.69%) | loss: 2.491236 | lrm: 0.27 | dt: 646.27ms | tok/sec: 811,250 | mfu: 50.70 | epoch: 2 | total time: 155.55m | eta: 23.9m +step 14481/16704 (86.69%) | loss: 2.496037 | lrm: 0.27 | dt: 644.66ms | tok/sec: 813,272 | mfu: 50.83 | epoch: 2 | total time: 155.56m | eta: 23.9m +step 14482/16704 (86.70%) | loss: 2.491722 | lrm: 0.27 | dt: 645.61ms | tok/sec: 812,079 | mfu: 50.76 | epoch: 2 | total time: 155.57m | eta: 23.9m +step 14483/16704 (86.70%) | loss: 2.483918 | lrm: 0.27 | dt: 645.79ms | tok/sec: 811,850 | mfu: 50.74 | epoch: 2 | total time: 155.58m | eta: 23.9m +step 14484/16704 (86.71%) | loss: 2.482079 | lrm: 0.27 | dt: 647.20ms | tok/sec: 810,089 | mfu: 50.63 | epoch: 2 | total time: 155.59m | eta: 23.9m +step 14485/16704 (86.72%) | loss: 2.488252 | lrm: 0.27 | dt: 640.78ms | tok/sec: 818,204 | mfu: 51.14 | epoch: 2 | total time: 155.60m | eta: 23.9m +step 14486/16704 (86.72%) | loss: 2.487590 | lrm: 0.27 | dt: 647.78ms | tok/sec: 809,360 | mfu: 50.59 | epoch: 2 | total time: 155.62m | eta: 23.8m +step 14487/16704 (86.73%) | loss: 
2.491307 | lrm: 0.27 | dt: 641.18ms | tok/sec: 817,689 | mfu: 51.11 | epoch: 2 | total time: 155.63m | eta: 23.8m +step 14488/16704 (86.73%) | loss: 2.497316 | lrm: 0.27 | dt: 643.76ms | tok/sec: 814,414 | mfu: 50.90 | epoch: 2 | total time: 155.64m | eta: 23.8m +step 14489/16704 (86.74%) | loss: 2.491253 | lrm: 0.27 | dt: 645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 2 | total time: 155.65m | eta: 23.8m +step 14490/16704 (86.75%) | loss: 2.502702 | lrm: 0.27 | dt: 644.08ms | tok/sec: 814,011 | mfu: 50.88 | epoch: 2 | total time: 155.66m | eta: 23.8m +step 14491/16704 (86.75%) | loss: 2.505223 | lrm: 0.26 | dt: 645.21ms | tok/sec: 812,583 | mfu: 50.79 | epoch: 2 | total time: 155.67m | eta: 23.8m +step 14492/16704 (86.76%) | loss: 2.498789 | lrm: 0.26 | dt: 645.56ms | tok/sec: 812,140 | mfu: 50.76 | epoch: 2 | total time: 155.68m | eta: 23.8m +step 14493/16704 (86.76%) | loss: 2.490681 | lrm: 0.26 | dt: 641.84ms | tok/sec: 816,855 | mfu: 51.05 | epoch: 2 | total time: 155.69m | eta: 23.8m +step 14494/16704 (86.77%) | loss: 2.476171 | lrm: 0.26 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 2 | total time: 155.70m | eta: 23.8m +step 14495/16704 (86.78%) | loss: 2.489967 | lrm: 0.26 | dt: 643.67ms | tok/sec: 814,527 | mfu: 50.91 | epoch: 2 | total time: 155.71m | eta: 23.7m +step 14496/16704 (86.78%) | loss: 2.475072 | lrm: 0.26 | dt: 645.06ms | tok/sec: 812,779 | mfu: 50.80 | epoch: 2 | total time: 155.72m | eta: 23.7m +step 14497/16704 (86.79%) | loss: 2.488307 | lrm: 0.26 | dt: 644.48ms | tok/sec: 813,500 | mfu: 50.84 | epoch: 2 | total time: 155.73m | eta: 23.7m +step 14498/16704 (86.79%) | loss: 2.493186 | lrm: 0.26 | dt: 645.34ms | tok/sec: 812,416 | mfu: 50.78 | epoch: 2 | total time: 155.74m | eta: 23.7m +step 14499/16704 (86.80%) | loss: 2.498440 | lrm: 0.26 | dt: 643.32ms | tok/sec: 814,976 | mfu: 50.94 | epoch: 2 | total time: 155.75m | eta: 23.7m +Step 14500 | Validation bpb: 0.765702 +step 14500/16704 (86.81%) | loss: 2.506580 | lrm: 0.26 | 
dt: 633.51ms | tok/sec: 827,588 | mfu: 51.73 | epoch: 2 | total time: 155.77m | eta: 23.7m +step 14501/16704 (86.81%) | loss: 2.501030 | lrm: 0.26 | dt: 652.83ms | tok/sec: 803,101 | mfu: 50.20 | epoch: 2 | total time: 155.78m | eta: 23.7m +step 14502/16704 (86.82%) | loss: 2.492813 | lrm: 0.26 | dt: 644.57ms | tok/sec: 813,385 | mfu: 50.84 | epoch: 2 | total time: 155.79m | eta: 23.7m +step 14503/16704 (86.82%) | loss: 2.504852 | lrm: 0.26 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 2 | total time: 155.80m | eta: 23.7m +step 14504/16704 (86.83%) | loss: 2.510866 | lrm: 0.26 | dt: 647.97ms | tok/sec: 809,125 | mfu: 50.57 | epoch: 2 | total time: 155.81m | eta: 23.6m +step 14505/16704 (86.84%) | loss: 2.515680 | lrm: 0.26 | dt: 643.22ms | tok/sec: 815,097 | mfu: 50.94 | epoch: 2 | total time: 155.82m | eta: 23.6m +step 14506/16704 (86.84%) | loss: 2.521039 | lrm: 0.26 | dt: 646.80ms | tok/sec: 810,592 | mfu: 50.66 | epoch: 2 | total time: 155.83m | eta: 23.6m +step 14507/16704 (86.85%) | loss: 2.509768 | lrm: 0.26 | dt: 645.18ms | tok/sec: 812,624 | mfu: 50.79 | epoch: 2 | total time: 155.84m | eta: 23.6m +step 14508/16704 (86.85%) | loss: 2.507059 | lrm: 0.26 | dt: 645.99ms | tok/sec: 811,599 | mfu: 50.73 | epoch: 2 | total time: 155.85m | eta: 23.6m +step 14509/16704 (86.86%) | loss: 2.503846 | lrm: 0.26 | dt: 642.61ms | tok/sec: 815,867 | mfu: 50.99 | epoch: 2 | total time: 155.86m | eta: 23.6m +step 14510/16704 (86.87%) | loss: 2.513979 | lrm: 0.26 | dt: 649.35ms | tok/sec: 807,404 | mfu: 50.46 | epoch: 2 | total time: 155.87m | eta: 23.6m +step 14511/16704 (86.87%) | loss: 2.519625 | lrm: 0.26 | dt: 641.06ms | tok/sec: 817,846 | mfu: 51.12 | epoch: 2 | total time: 155.88m | eta: 23.6m +step 14512/16704 (86.88%) | loss: 2.516785 | lrm: 0.26 | dt: 647.40ms | tok/sec: 809,838 | mfu: 50.62 | epoch: 2 | total time: 155.89m | eta: 23.6m +step 14513/16704 (86.88%) | loss: 2.511522 | lrm: 0.26 | dt: 647.15ms | tok/sec: 810,149 | mfu: 50.64 | epoch: 2 | 
total time: 155.91m | eta: 23.6m +step 14514/16704 (86.89%) | loss: 2.496489 | lrm: 0.26 | dt: 646.66ms | tok/sec: 810,756 | mfu: 50.67 | epoch: 2 | total time: 155.92m | eta: 23.5m +step 14515/16704 (86.90%) | loss: 2.497245 | lrm: 0.26 | dt: 645.67ms | tok/sec: 812,001 | mfu: 50.75 | epoch: 2 | total time: 155.93m | eta: 23.5m +step 14516/16704 (86.90%) | loss: 2.495039 | lrm: 0.26 | dt: 646.02ms | tok/sec: 811,569 | mfu: 50.72 | epoch: 2 | total time: 155.94m | eta: 23.5m +step 14517/16704 (86.91%) | loss: 2.494140 | lrm: 0.26 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 2 | total time: 155.95m | eta: 23.5m +step 14518/16704 (86.91%) | loss: 2.490653 | lrm: 0.26 | dt: 646.32ms | tok/sec: 811,189 | mfu: 50.70 | epoch: 2 | total time: 155.96m | eta: 23.5m +step 14519/16704 (86.92%) | loss: 2.476319 | lrm: 0.26 | dt: 644.64ms | tok/sec: 813,306 | mfu: 50.83 | epoch: 2 | total time: 155.97m | eta: 23.5m +step 14520/16704 (86.93%) | loss: 2.469548 | lrm: 0.26 | dt: 644.51ms | tok/sec: 813,467 | mfu: 50.84 | epoch: 2 | total time: 155.98m | eta: 23.5m +step 14521/16704 (86.93%) | loss: 2.468296 | lrm: 0.26 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 2 | total time: 155.99m | eta: 23.5m +step 14522/16704 (86.94%) | loss: 2.463775 | lrm: 0.26 | dt: 645.86ms | tok/sec: 811,767 | mfu: 50.74 | epoch: 2 | total time: 156.00m | eta: 23.5m +step 14523/16704 (86.94%) | loss: 2.458716 | lrm: 0.26 | dt: 644.01ms | tok/sec: 814,100 | mfu: 50.88 | epoch: 2 | total time: 156.01m | eta: 23.4m +step 14524/16704 (86.95%) | loss: 2.458898 | lrm: 0.26 | dt: 644.83ms | tok/sec: 813,063 | mfu: 50.82 | epoch: 2 | total time: 156.02m | eta: 23.4m +step 14525/16704 (86.96%) | loss: 2.448025 | lrm: 0.26 | dt: 644.83ms | tok/sec: 813,060 | mfu: 50.82 | epoch: 2 | total time: 156.03m | eta: 23.4m +step 14526/16704 (86.96%) | loss: 2.472157 | lrm: 0.26 | dt: 647.37ms | tok/sec: 809,876 | mfu: 50.62 | epoch: 2 | total time: 156.05m | eta: 23.4m +step 14527/16704 (86.97%) | 
loss: 2.473573 | lrm: 0.26 | dt: 642.95ms | tok/sec: 815,445 | mfu: 50.97 | epoch: 2 | total time: 156.06m | eta: 23.4m +step 14528/16704 (86.97%) | loss: 2.468315 | lrm: 0.26 | dt: 645.85ms | tok/sec: 811,780 | mfu: 50.74 | epoch: 2 | total time: 156.07m | eta: 23.4m +step 14529/16704 (86.98%) | loss: 2.457098 | lrm: 0.26 | dt: 645.10ms | tok/sec: 812,722 | mfu: 50.80 | epoch: 2 | total time: 156.08m | eta: 23.4m +step 14530/16704 (86.99%) | loss: 2.470039 | lrm: 0.26 | dt: 644.77ms | tok/sec: 813,138 | mfu: 50.82 | epoch: 2 | total time: 156.09m | eta: 23.4m +step 14531/16704 (86.99%) | loss: 2.464269 | lrm: 0.26 | dt: 646.16ms | tok/sec: 811,389 | mfu: 50.71 | epoch: 2 | total time: 156.10m | eta: 23.4m +step 14532/16704 (87.00%) | loss: 2.485317 | lrm: 0.26 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 2 | total time: 156.11m | eta: 23.3m +step 14533/16704 (87.00%) | loss: 2.500039 | lrm: 0.26 | dt: 646.87ms | tok/sec: 810,499 | mfu: 50.66 | epoch: 2 | total time: 156.12m | eta: 23.3m +step 14534/16704 (87.01%) | loss: 2.494391 | lrm: 0.26 | dt: 643.14ms | tok/sec: 815,202 | mfu: 50.95 | epoch: 2 | total time: 156.13m | eta: 23.3m +step 14535/16704 (87.02%) | loss: 2.491940 | lrm: 0.26 | dt: 644.33ms | tok/sec: 813,698 | mfu: 50.86 | epoch: 2 | total time: 156.14m | eta: 23.3m +step 14536/16704 (87.02%) | loss: 2.484806 | lrm: 0.26 | dt: 644.44ms | tok/sec: 813,550 | mfu: 50.85 | epoch: 2 | total time: 156.15m | eta: 23.3m +step 14537/16704 (87.03%) | loss: 2.478251 | lrm: 0.26 | dt: 645.44ms | tok/sec: 812,301 | mfu: 50.77 | epoch: 2 | total time: 156.16m | eta: 23.3m +step 14538/16704 (87.03%) | loss: 2.475844 | lrm: 0.26 | dt: 644.25ms | tok/sec: 813,799 | mfu: 50.86 | epoch: 2 | total time: 156.17m | eta: 23.3m +step 14539/16704 (87.04%) | loss: 2.472957 | lrm: 0.26 | dt: 646.65ms | tok/sec: 810,770 | mfu: 50.67 | epoch: 2 | total time: 156.18m | eta: 23.3m +step 14540/16704 (87.05%) | loss: 2.476305 | lrm: 0.26 | dt: 644.34ms | tok/sec: 813,684 | 
mfu: 50.86 | epoch: 2 | total time: 156.20m | eta: 23.3m +step 14541/16704 (87.05%) | loss: 2.480724 | lrm: 0.26 | dt: 645.81ms | tok/sec: 811,829 | mfu: 50.74 | epoch: 2 | total time: 156.21m | eta: 23.3m +step 14542/16704 (87.06%) | loss: 2.493438 | lrm: 0.26 | dt: 646.90ms | tok/sec: 810,467 | mfu: 50.66 | epoch: 2 | total time: 156.22m | eta: 23.2m +step 14543/16704 (87.06%) | loss: 2.490718 | lrm: 0.26 | dt: 646.26ms | tok/sec: 811,270 | mfu: 50.71 | epoch: 2 | total time: 156.23m | eta: 23.2m +step 14544/16704 (87.07%) | loss: 2.485695 | lrm: 0.26 | dt: 644.97ms | tok/sec: 812,893 | mfu: 50.81 | epoch: 2 | total time: 156.24m | eta: 23.2m +step 14545/16704 (87.07%) | loss: 2.495066 | lrm: 0.26 | dt: 645.75ms | tok/sec: 811,911 | mfu: 50.75 | epoch: 2 | total time: 156.25m | eta: 23.2m +step 14546/16704 (87.08%) | loss: 2.496422 | lrm: 0.26 | dt: 645.18ms | tok/sec: 812,617 | mfu: 50.79 | epoch: 2 | total time: 156.26m | eta: 23.2m +step 14547/16704 (87.09%) | loss: 2.503004 | lrm: 0.26 | dt: 643.90ms | tok/sec: 814,234 | mfu: 50.89 | epoch: 2 | total time: 156.27m | eta: 23.2m +step 14548/16704 (87.09%) | loss: 2.502563 | lrm: 0.26 | dt: 645.99ms | tok/sec: 811,601 | mfu: 50.73 | epoch: 2 | total time: 156.28m | eta: 23.2m +step 14549/16704 (87.10%) | loss: 2.496554 | lrm: 0.26 | dt: 643.95ms | tok/sec: 814,176 | mfu: 50.89 | epoch: 2 | total time: 156.29m | eta: 23.2m +step 14550/16704 (87.10%) | loss: 2.490188 | lrm: 0.26 | dt: 644.19ms | tok/sec: 813,867 | mfu: 50.87 | epoch: 2 | total time: 156.30m | eta: 23.2m +step 14551/16704 (87.11%) | loss: 2.488733 | lrm: 0.26 | dt: 644.64ms | tok/sec: 813,300 | mfu: 50.83 | epoch: 2 | total time: 156.31m | eta: 23.1m +step 14552/16704 (87.12%) | loss: 2.477853 | lrm: 0.26 | dt: 643.31ms | tok/sec: 814,988 | mfu: 50.94 | epoch: 2 | total time: 156.32m | eta: 23.1m +step 14553/16704 (87.12%) | loss: 2.476383 | lrm: 0.26 | dt: 644.63ms | tok/sec: 813,317 | mfu: 50.83 | epoch: 2 | total time: 156.34m | eta: 23.1m +step 
14554/16704 (87.13%) | loss: 2.466180 | lrm: 0.26 | dt: 644.55ms | tok/sec: 813,418 | mfu: 50.84 | epoch: 2 | total time: 156.35m | eta: 23.1m +step 14555/16704 (87.13%) | loss: 2.464894 | lrm: 0.26 | dt: 644.81ms | tok/sec: 813,091 | mfu: 50.82 | epoch: 2 | total time: 156.36m | eta: 23.1m +step 14556/16704 (87.14%) | loss: 2.471776 | lrm: 0.26 | dt: 645.35ms | tok/sec: 812,409 | mfu: 50.78 | epoch: 2 | total time: 156.37m | eta: 23.1m +step 14557/16704 (87.15%) | loss: 2.477024 | lrm: 0.26 | dt: 645.22ms | tok/sec: 812,578 | mfu: 50.79 | epoch: 2 | total time: 156.38m | eta: 23.1m +step 14558/16704 (87.15%) | loss: 2.486740 | lrm: 0.26 | dt: 649.07ms | tok/sec: 807,747 | mfu: 50.49 | epoch: 2 | total time: 156.39m | eta: 23.1m +step 14559/16704 (87.16%) | loss: 2.477596 | lrm: 0.26 | dt: 645.39ms | tok/sec: 812,362 | mfu: 50.77 | epoch: 2 | total time: 156.40m | eta: 23.1m +step 14560/16704 (87.16%) | loss: 2.475110 | lrm: 0.26 | dt: 644.11ms | tok/sec: 813,971 | mfu: 50.87 | epoch: 2 | total time: 156.41m | eta: 23.0m +step 14561/16704 (87.17%) | loss: 2.472639 | lrm: 0.26 | dt: 644.40ms | tok/sec: 813,612 | mfu: 50.85 | epoch: 2 | total time: 156.42m | eta: 23.0m +step 14562/16704 (87.18%) | loss: 2.478623 | lrm: 0.26 | dt: 645.37ms | tok/sec: 812,381 | mfu: 50.78 | epoch: 2 | total time: 156.43m | eta: 23.0m +step 14563/16704 (87.18%) | loss: 2.478856 | lrm: 0.26 | dt: 643.94ms | tok/sec: 814,184 | mfu: 50.89 | epoch: 2 | total time: 156.44m | eta: 23.0m +step 14564/16704 (87.19%) | loss: 2.490495 | lrm: 0.26 | dt: 643.59ms | tok/sec: 814,630 | mfu: 50.92 | epoch: 2 | total time: 156.45m | eta: 23.0m +step 14565/16704 (87.19%) | loss: 2.492333 | lrm: 0.26 | dt: 643.27ms | tok/sec: 815,038 | mfu: 50.94 | epoch: 2 | total time: 156.46m | eta: 23.0m +step 14566/16704 (87.20%) | loss: 2.499323 | lrm: 0.26 | dt: 644.83ms | tok/sec: 813,058 | mfu: 50.82 | epoch: 2 | total time: 156.48m | eta: 23.0m +step 14567/16704 (87.21%) | loss: 2.490032 | lrm: 0.26 | dt: 
643.76ms | tok/sec: 814,411 | mfu: 50.90 | epoch: 2 | total time: 156.49m | eta: 23.0m +step 14568/16704 (87.21%) | loss: 2.485938 | lrm: 0.26 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 156.50m | eta: 23.0m +step 14569/16704 (87.22%) | loss: 2.483314 | lrm: 0.26 | dt: 644.77ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 2 | total time: 156.51m | eta: 23.0m +step 14570/16704 (87.22%) | loss: 2.486971 | lrm: 0.26 | dt: 645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 2 | total time: 156.52m | eta: 22.9m +step 14571/16704 (87.23%) | loss: 2.481626 | lrm: 0.26 | dt: 646.54ms | tok/sec: 810,910 | mfu: 50.68 | epoch: 2 | total time: 156.53m | eta: 22.9m +step 14572/16704 (87.24%) | loss: 2.465132 | lrm: 0.26 | dt: 643.74ms | tok/sec: 814,439 | mfu: 50.90 | epoch: 2 | total time: 156.54m | eta: 22.9m +step 14573/16704 (87.24%) | loss: 2.466203 | lrm: 0.26 | dt: 646.44ms | tok/sec: 811,035 | mfu: 50.69 | epoch: 2 | total time: 156.55m | eta: 22.9m +step 14574/16704 (87.25%) | loss: 2.456015 | lrm: 0.26 | dt: 644.84ms | tok/sec: 813,046 | mfu: 50.82 | epoch: 2 | total time: 156.56m | eta: 22.9m +step 14575/16704 (87.25%) | loss: 2.477313 | lrm: 0.25 | dt: 642.61ms | tok/sec: 815,868 | mfu: 50.99 | epoch: 2 | total time: 156.57m | eta: 22.9m +step 14576/16704 (87.26%) | loss: 2.478682 | lrm: 0.25 | dt: 646.05ms | tok/sec: 811,530 | mfu: 50.72 | epoch: 2 | total time: 156.58m | eta: 22.9m +step 14577/16704 (87.27%) | loss: 2.472379 | lrm: 0.25 | dt: 646.06ms | tok/sec: 811,513 | mfu: 50.72 | epoch: 2 | total time: 156.59m | eta: 22.9m +step 14578/16704 (87.27%) | loss: 2.476572 | lrm: 0.25 | dt: 644.69ms | tok/sec: 813,239 | mfu: 50.83 | epoch: 2 | total time: 156.60m | eta: 22.9m +step 14579/16704 (87.28%) | loss: 2.472361 | lrm: 0.25 | dt: 644.88ms | tok/sec: 812,996 | mfu: 50.81 | epoch: 2 | total time: 156.61m | eta: 22.8m +step 14580/16704 (87.28%) | loss: 2.467843 | lrm: 0.25 | dt: 642.82ms | tok/sec: 815,605 | mfu: 50.98 | epoch: 2 | total 
time: 156.63m | eta: 22.8m +step 14581/16704 (87.29%) | loss: 2.481192 | lrm: 0.25 | dt: 644.87ms | tok/sec: 813,014 | mfu: 50.81 | epoch: 2 | total time: 156.64m | eta: 22.8m +step 14582/16704 (87.30%) | loss: 2.479061 | lrm: 0.25 | dt: 645.85ms | tok/sec: 811,775 | mfu: 50.74 | epoch: 2 | total time: 156.65m | eta: 22.8m +step 14583/16704 (87.30%) | loss: 2.483138 | lrm: 0.25 | dt: 646.96ms | tok/sec: 810,381 | mfu: 50.65 | epoch: 2 | total time: 156.66m | eta: 22.8m +step 14584/16704 (87.31%) | loss: 2.490708 | lrm: 0.25 | dt: 643.44ms | tok/sec: 814,820 | mfu: 50.93 | epoch: 2 | total time: 156.67m | eta: 22.8m +step 14585/16704 (87.31%) | loss: 2.489037 | lrm: 0.25 | dt: 647.45ms | tok/sec: 809,774 | mfu: 50.61 | epoch: 2 | total time: 156.68m | eta: 22.8m +step 14586/16704 (87.32%) | loss: 2.490758 | lrm: 0.25 | dt: 644.56ms | tok/sec: 813,403 | mfu: 50.84 | epoch: 2 | total time: 156.69m | eta: 22.8m +step 14587/16704 (87.33%) | loss: 2.494232 | lrm: 0.25 | dt: 642.10ms | tok/sec: 816,518 | mfu: 51.03 | epoch: 2 | total time: 156.70m | eta: 22.8m +step 14588/16704 (87.33%) | loss: 2.486160 | lrm: 0.25 | dt: 645.72ms | tok/sec: 811,937 | mfu: 50.75 | epoch: 2 | total time: 156.71m | eta: 22.7m +step 14589/16704 (87.34%) | loss: 2.485119 | lrm: 0.25 | dt: 645.00ms | tok/sec: 812,852 | mfu: 50.80 | epoch: 2 | total time: 156.72m | eta: 22.7m +step 14590/16704 (87.34%) | loss: 2.484593 | lrm: 0.25 | dt: 645.54ms | tok/sec: 812,164 | mfu: 50.76 | epoch: 2 | total time: 156.73m | eta: 22.7m +step 14591/16704 (87.35%) | loss: 2.492073 | lrm: 0.25 | dt: 645.49ms | tok/sec: 812,236 | mfu: 50.77 | epoch: 2 | total time: 156.74m | eta: 22.7m +step 14592/16704 (87.36%) | loss: 2.501903 | lrm: 0.25 | dt: 645.34ms | tok/sec: 812,415 | mfu: 50.78 | epoch: 2 | total time: 156.75m | eta: 22.7m +step 14593/16704 (87.36%) | loss: 2.493462 | lrm: 0.25 | dt: 644.22ms | tok/sec: 813,832 | mfu: 50.87 | epoch: 2 | total time: 156.77m | eta: 22.7m +step 14594/16704 (87.37%) | loss: 
2.487518 | lrm: 0.25 | dt: 643.84ms | tok/sec: 814,313 | mfu: 50.90 | epoch: 2 | total time: 156.78m | eta: 22.7m +step 14595/16704 (87.37%) | loss: 2.509851 | lrm: 0.25 | dt: 645.30ms | tok/sec: 812,467 | mfu: 50.78 | epoch: 2 | total time: 156.79m | eta: 22.7m +step 14596/16704 (87.38%) | loss: 2.510178 | lrm: 0.25 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 2 | total time: 156.80m | eta: 22.7m +step 14597/16704 (87.39%) | loss: 2.506102 | lrm: 0.25 | dt: 647.77ms | tok/sec: 809,372 | mfu: 50.59 | epoch: 2 | total time: 156.81m | eta: 22.6m +step 14598/16704 (87.39%) | loss: 2.508230 | lrm: 0.25 | dt: 646.55ms | tok/sec: 810,898 | mfu: 50.68 | epoch: 2 | total time: 156.82m | eta: 22.6m +step 14599/16704 (87.40%) | loss: 2.502242 | lrm: 0.25 | dt: 643.23ms | tok/sec: 815,080 | mfu: 50.94 | epoch: 2 | total time: 156.83m | eta: 22.6m +step 14600/16704 (87.40%) | loss: 2.507677 | lrm: 0.25 | dt: 644.66ms | tok/sec: 813,277 | mfu: 50.83 | epoch: 2 | total time: 156.84m | eta: 22.6m +step 14601/16704 (87.41%) | loss: 2.513916 | lrm: 0.25 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 2 | total time: 156.85m | eta: 22.6m +step 14602/16704 (87.42%) | loss: 2.511962 | lrm: 0.25 | dt: 645.39ms | tok/sec: 812,363 | mfu: 50.77 | epoch: 2 | total time: 156.86m | eta: 22.6m +step 14603/16704 (87.42%) | loss: 2.508121 | lrm: 0.25 | dt: 643.56ms | tok/sec: 814,667 | mfu: 50.92 | epoch: 2 | total time: 156.87m | eta: 22.6m +step 14604/16704 (87.43%) | loss: 2.503343 | lrm: 0.25 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 2 | total time: 156.88m | eta: 22.6m +step 14605/16704 (87.43%) | loss: 2.499802 | lrm: 0.25 | dt: 644.31ms | tok/sec: 813,719 | mfu: 50.86 | epoch: 2 | total time: 156.89m | eta: 22.6m +step 14606/16704 (87.44%) | loss: 2.497446 | lrm: 0.25 | dt: 645.70ms | tok/sec: 811,968 | mfu: 50.75 | epoch: 2 | total time: 156.91m | eta: 22.6m +step 14607/16704 (87.45%) | loss: 2.496425 | lrm: 0.25 | dt: 644.82ms | tok/sec: 813,077 | mfu: 
50.82 | epoch: 2 | total time: 156.92m | eta: 22.5m +step 14608/16704 (87.45%) | loss: 2.509532 | lrm: 0.25 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total time: 156.93m | eta: 22.5m +step 14609/16704 (87.46%) | loss: 2.512677 | lrm: 0.25 | dt: 645.01ms | tok/sec: 812,841 | mfu: 50.80 | epoch: 2 | total time: 156.94m | eta: 22.5m +step 14610/16704 (87.46%) | loss: 2.510453 | lrm: 0.25 | dt: 645.32ms | tok/sec: 812,444 | mfu: 50.78 | epoch: 2 | total time: 156.95m | eta: 22.5m +step 14611/16704 (87.47%) | loss: 2.499001 | lrm: 0.25 | dt: 643.69ms | tok/sec: 814,508 | mfu: 50.91 | epoch: 2 | total time: 156.96m | eta: 22.5m +step 14612/16704 (87.48%) | loss: 2.499744 | lrm: 0.25 | dt: 642.47ms | tok/sec: 816,053 | mfu: 51.00 | epoch: 2 | total time: 156.97m | eta: 22.5m +step 14613/16704 (87.48%) | loss: 2.509523 | lrm: 0.25 | dt: 644.33ms | tok/sec: 813,694 | mfu: 50.86 | epoch: 2 | total time: 156.98m | eta: 22.5m +step 14614/16704 (87.49%) | loss: 2.501116 | lrm: 0.25 | dt: 646.17ms | tok/sec: 811,380 | mfu: 50.71 | epoch: 2 | total time: 156.99m | eta: 22.5m +step 14615/16704 (87.49%) | loss: 2.492390 | lrm: 0.25 | dt: 646.51ms | tok/sec: 810,957 | mfu: 50.69 | epoch: 2 | total time: 157.00m | eta: 22.5m +step 14616/16704 (87.50%) | loss: 2.497340 | lrm: 0.25 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 2 | total time: 157.01m | eta: 22.4m +step 14617/16704 (87.51%) | loss: 2.493929 | lrm: 0.25 | dt: 645.51ms | tok/sec: 812,204 | mfu: 50.76 | epoch: 2 | total time: 157.02m | eta: 22.4m +step 14618/16704 (87.51%) | loss: 2.486180 | lrm: 0.25 | dt: 647.65ms | tok/sec: 809,518 | mfu: 50.60 | epoch: 2 | total time: 157.03m | eta: 22.4m +step 14619/16704 (87.52%) | loss: 2.483286 | lrm: 0.25 | dt: 643.48ms | tok/sec: 814,774 | mfu: 50.92 | epoch: 2 | total time: 157.04m | eta: 22.4m +step 14620/16704 (87.52%) | loss: 2.484862 | lrm: 0.25 | dt: 643.62ms | tok/sec: 814,597 | mfu: 50.91 | epoch: 2 | total time: 157.06m | eta: 22.4m +step 
14621/16704 (87.53%) | loss: 2.484416 | lrm: 0.25 | dt: 645.13ms | tok/sec: 812,683 | mfu: 50.79 | epoch: 2 | total time: 157.07m | eta: 22.4m +step 14622/16704 (87.54%) | loss: 2.487638 | lrm: 0.25 | dt: 643.94ms | tok/sec: 814,192 | mfu: 50.89 | epoch: 2 | total time: 157.08m | eta: 22.4m +step 14623/16704 (87.54%) | loss: 2.477120 | lrm: 0.25 | dt: 645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 2 | total time: 157.09m | eta: 22.4m +step 14624/16704 (87.55%) | loss: 2.495087 | lrm: 0.25 | dt: 642.73ms | tok/sec: 815,723 | mfu: 50.98 | epoch: 2 | total time: 157.10m | eta: 22.4m +step 14625/16704 (87.55%) | loss: 2.489187 | lrm: 0.25 | dt: 642.90ms | tok/sec: 815,499 | mfu: 50.97 | epoch: 2 | total time: 157.11m | eta: 22.3m +step 14626/16704 (87.56%) | loss: 2.497143 | lrm: 0.25 | dt: 646.91ms | tok/sec: 810,449 | mfu: 50.65 | epoch: 2 | total time: 157.12m | eta: 22.3m +step 14627/16704 (87.57%) | loss: 2.485187 | lrm: 0.25 | dt: 643.57ms | tok/sec: 814,652 | mfu: 50.92 | epoch: 2 | total time: 157.13m | eta: 22.3m +step 14628/16704 (87.57%) | loss: 2.487855 | lrm: 0.25 | dt: 642.36ms | tok/sec: 816,191 | mfu: 51.01 | epoch: 2 | total time: 157.14m | eta: 22.3m +step 14629/16704 (87.58%) | loss: 2.484810 | lrm: 0.25 | dt: 643.98ms | tok/sec: 814,141 | mfu: 50.89 | epoch: 2 | total time: 157.15m | eta: 22.3m +step 14630/16704 (87.58%) | loss: 2.480099 | lrm: 0.25 | dt: 648.44ms | tok/sec: 808,541 | mfu: 50.53 | epoch: 2 | total time: 157.16m | eta: 22.3m +step 14631/16704 (87.59%) | loss: 2.479119 | lrm: 0.25 | dt: 645.36ms | tok/sec: 812,396 | mfu: 50.78 | epoch: 2 | total time: 157.17m | eta: 22.3m +step 14632/16704 (87.60%) | loss: 2.475887 | lrm: 0.25 | dt: 643.92ms | tok/sec: 814,207 | mfu: 50.89 | epoch: 2 | total time: 157.18m | eta: 22.3m +step 14633/16704 (87.60%) | loss: 2.473705 | lrm: 0.25 | dt: 645.86ms | tok/sec: 811,761 | mfu: 50.74 | epoch: 2 | total time: 157.20m | eta: 22.3m +step 14634/16704 (87.61%) | loss: 2.487472 | lrm: 0.25 | dt: 
645.50ms | tok/sec: 812,226 | mfu: 50.77 | epoch: 2 | total time: 157.21m | eta: 22.3m +step 14635/16704 (87.61%) | loss: 2.478988 | lrm: 0.25 | dt: 645.38ms | tok/sec: 812,370 | mfu: 50.77 | epoch: 2 | total time: 157.22m | eta: 22.2m +step 14636/16704 (87.62%) | loss: 2.478906 | lrm: 0.25 | dt: 644.93ms | tok/sec: 812,940 | mfu: 50.81 | epoch: 2 | total time: 157.23m | eta: 22.2m +step 14637/16704 (87.63%) | loss: 2.491893 | lrm: 0.25 | dt: 644.85ms | tok/sec: 813,042 | mfu: 50.82 | epoch: 2 | total time: 157.24m | eta: 22.2m +step 14638/16704 (87.63%) | loss: 2.494427 | lrm: 0.25 | dt: 647.88ms | tok/sec: 809,238 | mfu: 50.58 | epoch: 2 | total time: 157.25m | eta: 22.2m +step 14639/16704 (87.64%) | loss: 2.497196 | lrm: 0.25 | dt: 643.08ms | tok/sec: 815,272 | mfu: 50.96 | epoch: 2 | total time: 157.26m | eta: 22.2m +step 14640/16704 (87.64%) | loss: 2.496163 | lrm: 0.25 | dt: 646.73ms | tok/sec: 810,673 | mfu: 50.67 | epoch: 2 | total time: 157.27m | eta: 22.2m +step 14641/16704 (87.65%) | loss: 2.491906 | lrm: 0.25 | dt: 644.04ms | tok/sec: 814,063 | mfu: 50.88 | epoch: 2 | total time: 157.28m | eta: 22.2m +step 14642/16704 (87.66%) | loss: 2.491800 | lrm: 0.25 | dt: 642.60ms | tok/sec: 815,882 | mfu: 50.99 | epoch: 2 | total time: 157.29m | eta: 22.2m +step 14643/16704 (87.66%) | loss: 2.489807 | lrm: 0.25 | dt: 644.97ms | tok/sec: 812,886 | mfu: 50.81 | epoch: 2 | total time: 157.30m | eta: 22.2m +step 14644/16704 (87.67%) | loss: 2.490591 | lrm: 0.25 | dt: 645.71ms | tok/sec: 811,951 | mfu: 50.75 | epoch: 2 | total time: 157.31m | eta: 22.1m +step 14645/16704 (87.67%) | loss: 2.497777 | lrm: 0.25 | dt: 643.37ms | tok/sec: 814,911 | mfu: 50.93 | epoch: 2 | total time: 157.32m | eta: 22.1m +step 14646/16704 (87.68%) | loss: 2.488276 | lrm: 0.25 | dt: 645.83ms | tok/sec: 811,806 | mfu: 50.74 | epoch: 2 | total time: 157.33m | eta: 22.1m +step 14647/16704 (87.69%) | loss: 2.485013 | lrm: 0.25 | dt: 642.29ms | tok/sec: 816,274 | mfu: 51.02 | epoch: 2 | total 
time: 157.35m | eta: 22.1m +step 14648/16704 (87.69%) | loss: 2.479224 | lrm: 0.25 | dt: 647.24ms | tok/sec: 810,039 | mfu: 50.63 | epoch: 2 | total time: 157.36m | eta: 22.1m +step 14649/16704 (87.70%) | loss: 2.480121 | lrm: 0.25 | dt: 645.40ms | tok/sec: 812,351 | mfu: 50.77 | epoch: 2 | total time: 157.37m | eta: 22.1m +step 14650/16704 (87.70%) | loss: 2.474026 | lrm: 0.25 | dt: 642.05ms | tok/sec: 816,581 | mfu: 51.04 | epoch: 2 | total time: 157.38m | eta: 22.1m +step 14651/16704 (87.71%) | loss: 2.457778 | lrm: 0.25 | dt: 646.32ms | tok/sec: 811,192 | mfu: 50.70 | epoch: 2 | total time: 157.39m | eta: 22.1m +step 14652/16704 (87.72%) | loss: 2.466077 | lrm: 0.25 | dt: 642.50ms | tok/sec: 816,011 | mfu: 51.00 | epoch: 2 | total time: 157.40m | eta: 22.1m +step 14653/16704 (87.72%) | loss: 2.467202 | lrm: 0.25 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 157.41m | eta: 22.0m +step 14654/16704 (87.73%) | loss: 2.475216 | lrm: 0.25 | dt: 646.73ms | tok/sec: 810,670 | mfu: 50.67 | epoch: 2 | total time: 157.42m | eta: 22.0m +step 14655/16704 (87.73%) | loss: 2.457650 | lrm: 0.25 | dt: 643.33ms | tok/sec: 814,953 | mfu: 50.94 | epoch: 2 | total time: 157.43m | eta: 22.0m +step 14656/16704 (87.74%) | loss: 2.460881 | lrm: 0.25 | dt: 643.77ms | tok/sec: 814,408 | mfu: 50.90 | epoch: 2 | total time: 157.44m | eta: 22.0m +step 14657/16704 (87.75%) | loss: 2.450693 | lrm: 0.25 | dt: 645.65ms | tok/sec: 812,031 | mfu: 50.75 | epoch: 2 | total time: 157.45m | eta: 22.0m +step 14658/16704 (87.75%) | loss: 2.444432 | lrm: 0.24 | dt: 642.22ms | tok/sec: 816,367 | mfu: 51.02 | epoch: 2 | total time: 157.46m | eta: 22.0m +step 14659/16704 (87.76%) | loss: 2.450059 | lrm: 0.24 | dt: 645.39ms | tok/sec: 812,362 | mfu: 50.77 | epoch: 2 | total time: 157.47m | eta: 22.0m +step 14660/16704 (87.76%) | loss: 2.454744 | lrm: 0.24 | dt: 643.87ms | tok/sec: 814,270 | mfu: 50.89 | epoch: 2 | total time: 157.49m | eta: 22.0m +step 14661/16704 (87.77%) | loss: 
2.459090 | lrm: 0.24 | dt: 644.45ms | tok/sec: 813,538 | mfu: 50.85 | epoch: 2 | total time: 157.50m | eta: 22.0m +step 14662/16704 (87.78%) | loss: 2.462101 | lrm: 0.24 | dt: 645.49ms | tok/sec: 812,228 | mfu: 50.77 | epoch: 2 | total time: 157.51m | eta: 22.0m +step 14663/16704 (87.78%) | loss: 2.466692 | lrm: 0.24 | dt: 643.90ms | tok/sec: 814,235 | mfu: 50.89 | epoch: 2 | total time: 157.52m | eta: 21.9m +step 14664/16704 (87.79%) | loss: 2.472457 | lrm: 0.24 | dt: 645.52ms | tok/sec: 812,197 | mfu: 50.76 | epoch: 2 | total time: 157.53m | eta: 21.9m +step 14665/16704 (87.79%) | loss: 2.478335 | lrm: 0.24 | dt: 645.96ms | tok/sec: 811,641 | mfu: 50.73 | epoch: 2 | total time: 157.54m | eta: 21.9m +step 14666/16704 (87.80%) | loss: 2.488872 | lrm: 0.24 | dt: 643.89ms | tok/sec: 814,244 | mfu: 50.89 | epoch: 2 | total time: 157.55m | eta: 21.9m +step 14667/16704 (87.81%) | loss: 2.485834 | lrm: 0.24 | dt: 645.36ms | tok/sec: 812,398 | mfu: 50.78 | epoch: 2 | total time: 157.56m | eta: 21.9m +step 14668/16704 (87.81%) | loss: 2.488516 | lrm: 0.24 | dt: 643.57ms | tok/sec: 814,650 | mfu: 50.92 | epoch: 2 | total time: 157.57m | eta: 21.9m +step 14669/16704 (87.82%) | loss: 2.486773 | lrm: 0.24 | dt: 643.02ms | tok/sec: 815,352 | mfu: 50.96 | epoch: 2 | total time: 157.58m | eta: 21.9m +step 14670/16704 (87.82%) | loss: 2.491993 | lrm: 0.24 | dt: 644.79ms | tok/sec: 813,114 | mfu: 50.82 | epoch: 2 | total time: 157.59m | eta: 21.9m +step 14671/16704 (87.83%) | loss: 2.499677 | lrm: 0.24 | dt: 646.00ms | tok/sec: 811,595 | mfu: 50.73 | epoch: 2 | total time: 157.60m | eta: 21.9m +step 14672/16704 (87.84%) | loss: 2.481239 | lrm: 0.24 | dt: 644.40ms | tok/sec: 813,603 | mfu: 50.85 | epoch: 2 | total time: 157.61m | eta: 21.8m +step 14673/16704 (87.84%) | loss: 2.479426 | lrm: 0.24 | dt: 643.78ms | tok/sec: 814,384 | mfu: 50.90 | epoch: 2 | total time: 157.62m | eta: 21.8m +step 14674/16704 (87.85%) | loss: 2.485271 | lrm: 0.24 | dt: 643.96ms | tok/sec: 814,163 | mfu: 
50.89 | epoch: 2 | total time: 157.64m | eta: 21.8m +step 14675/16704 (87.85%) | loss: 2.481868 | lrm: 0.24 | dt: 644.39ms | tok/sec: 813,624 | mfu: 50.85 | epoch: 2 | total time: 157.65m | eta: 21.8m +step 14676/16704 (87.86%) | loss: 2.482187 | lrm: 0.24 | dt: 647.09ms | tok/sec: 810,229 | mfu: 50.64 | epoch: 2 | total time: 157.66m | eta: 21.8m +step 14677/16704 (87.87%) | loss: 2.483472 | lrm: 0.24 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 2 | total time: 157.67m | eta: 21.8m +step 14678/16704 (87.87%) | loss: 2.491607 | lrm: 0.24 | dt: 647.06ms | tok/sec: 810,262 | mfu: 50.64 | epoch: 2 | total time: 157.68m | eta: 21.8m +step 14679/16704 (87.88%) | loss: 2.487327 | lrm: 0.24 | dt: 647.17ms | tok/sec: 810,128 | mfu: 50.63 | epoch: 2 | total time: 157.69m | eta: 21.8m +step 14680/16704 (87.88%) | loss: 2.502143 | lrm: 0.24 | dt: 645.90ms | tok/sec: 811,723 | mfu: 50.73 | epoch: 2 | total time: 157.70m | eta: 21.8m +step 14681/16704 (87.89%) | loss: 2.497961 | lrm: 0.24 | dt: 645.71ms | tok/sec: 811,955 | mfu: 50.75 | epoch: 2 | total time: 157.71m | eta: 21.7m +step 14682/16704 (87.90%) | loss: 2.500117 | lrm: 0.24 | dt: 645.89ms | tok/sec: 811,731 | mfu: 50.73 | epoch: 2 | total time: 157.72m | eta: 21.7m +step 14683/16704 (87.90%) | loss: 2.491510 | lrm: 0.24 | dt: 644.33ms | tok/sec: 813,696 | mfu: 50.86 | epoch: 2 | total time: 157.73m | eta: 21.7m +step 14684/16704 (87.91%) | loss: 2.495533 | lrm: 0.24 | dt: 645.46ms | tok/sec: 812,267 | mfu: 50.77 | epoch: 2 | total time: 157.74m | eta: 21.7m +step 14685/16704 (87.91%) | loss: 2.496833 | lrm: 0.24 | dt: 642.43ms | tok/sec: 816,097 | mfu: 51.01 | epoch: 2 | total time: 157.75m | eta: 21.7m +step 14686/16704 (87.92%) | loss: 2.490128 | lrm: 0.24 | dt: 646.26ms | tok/sec: 811,271 | mfu: 50.71 | epoch: 2 | total time: 157.76m | eta: 21.7m +step 14687/16704 (87.93%) | loss: 2.498670 | lrm: 0.24 | dt: 644.14ms | tok/sec: 813,937 | mfu: 50.87 | epoch: 2 | total time: 157.78m | eta: 21.7m +step 
14688/16704 (87.93%) | loss: 2.499845 | lrm: 0.24 | dt: 646.31ms | tok/sec: 811,207 | mfu: 50.70 | epoch: 2 | total time: 157.79m | eta: 21.7m +step 14689/16704 (87.94%) | loss: 2.501384 | lrm: 0.24 | dt: 646.04ms | tok/sec: 811,538 | mfu: 50.72 | epoch: 2 | total time: 157.80m | eta: 21.7m +step 14690/16704 (87.94%) | loss: 2.507003 | lrm: 0.24 | dt: 642.95ms | tok/sec: 815,438 | mfu: 50.97 | epoch: 2 | total time: 157.81m | eta: 21.7m +step 14691/16704 (87.95%) | loss: 2.506271 | lrm: 0.24 | dt: 645.50ms | tok/sec: 812,214 | mfu: 50.76 | epoch: 2 | total time: 157.82m | eta: 21.6m +step 14692/16704 (87.95%) | loss: 2.500695 | lrm: 0.24 | dt: 644.92ms | tok/sec: 812,956 | mfu: 50.81 | epoch: 2 | total time: 157.83m | eta: 21.6m +step 14693/16704 (87.96%) | loss: 2.513252 | lrm: 0.24 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 2 | total time: 157.84m | eta: 21.6m +step 14694/16704 (87.97%) | loss: 2.500219 | lrm: 0.24 | dt: 647.58ms | tok/sec: 809,609 | mfu: 50.60 | epoch: 2 | total time: 157.85m | eta: 21.6m +step 14695/16704 (87.97%) | loss: 2.508484 | lrm: 0.24 | dt: 647.45ms | tok/sec: 809,777 | mfu: 50.61 | epoch: 2 | total time: 157.86m | eta: 21.6m +step 14696/16704 (87.98%) | loss: 2.507577 | lrm: 0.24 | dt: 643.15ms | tok/sec: 815,189 | mfu: 50.95 | epoch: 2 | total time: 157.87m | eta: 21.6m +step 14697/16704 (87.98%) | loss: 2.494540 | lrm: 0.24 | dt: 644.00ms | tok/sec: 814,112 | mfu: 50.88 | epoch: 2 | total time: 157.88m | eta: 21.6m +step 14698/16704 (87.99%) | loss: 2.496064 | lrm: 0.24 | dt: 645.32ms | tok/sec: 812,444 | mfu: 50.78 | epoch: 2 | total time: 157.89m | eta: 21.6m +step 14699/16704 (88.00%) | loss: 2.502092 | lrm: 0.24 | dt: 645.47ms | tok/sec: 812,263 | mfu: 50.77 | epoch: 2 | total time: 157.90m | eta: 21.6m +step 14700/16704 (88.00%) | loss: 2.494937 | lrm: 0.24 | dt: 644.74ms | tok/sec: 813,172 | mfu: 50.82 | epoch: 2 | total time: 157.92m | eta: 21.5m +step 14701/16704 (88.01%) | loss: 2.515958 | lrm: 0.24 | dt: 
643.66ms | tok/sec: 814,535 | mfu: 50.91 | epoch: 2 | total time: 157.93m | eta: 21.5m +step 14702/16704 (88.01%) | loss: 2.515289 | lrm: 0.24 | dt: 644.46ms | tok/sec: 813,527 | mfu: 50.85 | epoch: 2 | total time: 157.94m | eta: 21.5m +step 14703/16704 (88.02%) | loss: 2.514659 | lrm: 0.24 | dt: 646.85ms | tok/sec: 810,519 | mfu: 50.66 | epoch: 2 | total time: 157.95m | eta: 21.5m +step 14704/16704 (88.03%) | loss: 2.496225 | lrm: 0.24 | dt: 642.72ms | tok/sec: 815,726 | mfu: 50.98 | epoch: 2 | total time: 157.96m | eta: 21.5m +step 14705/16704 (88.03%) | loss: 2.496983 | lrm: 0.24 | dt: 644.98ms | tok/sec: 812,872 | mfu: 50.81 | epoch: 2 | total time: 157.97m | eta: 21.5m +step 14706/16704 (88.04%) | loss: 2.501481 | lrm: 0.24 | dt: 644.72ms | tok/sec: 813,197 | mfu: 50.83 | epoch: 2 | total time: 157.98m | eta: 21.5m +step 14707/16704 (88.04%) | loss: 2.503061 | lrm: 0.24 | dt: 644.65ms | tok/sec: 813,295 | mfu: 50.83 | epoch: 2 | total time: 157.99m | eta: 21.5m +step 14708/16704 (88.05%) | loss: 2.497897 | lrm: 0.24 | dt: 645.61ms | tok/sec: 812,075 | mfu: 50.76 | epoch: 2 | total time: 158.00m | eta: 21.5m +step 14709/16704 (88.06%) | loss: 2.485365 | lrm: 0.24 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 2 | total time: 158.01m | eta: 21.4m +step 14710/16704 (88.06%) | loss: 2.479698 | lrm: 0.24 | dt: 647.93ms | tok/sec: 809,177 | mfu: 50.57 | epoch: 2 | total time: 158.02m | eta: 21.4m +step 14711/16704 (88.07%) | loss: 2.473318 | lrm: 0.24 | dt: 645.68ms | tok/sec: 811,988 | mfu: 50.75 | epoch: 2 | total time: 158.03m | eta: 21.4m +step 14712/16704 (88.07%) | loss: 2.475283 | lrm: 0.24 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 2 | total time: 158.04m | eta: 21.4m +step 14713/16704 (88.08%) | loss: 2.476660 | lrm: 0.24 | dt: 645.32ms | tok/sec: 812,441 | mfu: 50.78 | epoch: 2 | total time: 158.06m | eta: 21.4m +step 14714/16704 (88.09%) | loss: 2.466976 | lrm: 0.24 | dt: 645.20ms | tok/sec: 812,597 | mfu: 50.79 | epoch: 2 | total 
time: 158.07m | eta: 21.4m +step 14715/16704 (88.09%) | loss: 2.462532 | lrm: 0.24 | dt: 641.73ms | tok/sec: 816,986 | mfu: 51.06 | epoch: 2 | total time: 158.08m | eta: 21.4m +step 14716/16704 (88.10%) | loss: 2.462327 | lrm: 0.24 | dt: 648.64ms | tok/sec: 808,292 | mfu: 50.52 | epoch: 2 | total time: 158.09m | eta: 21.4m +step 14717/16704 (88.10%) | loss: 2.449452 | lrm: 0.24 | dt: 644.49ms | tok/sec: 813,488 | mfu: 50.84 | epoch: 2 | total time: 158.10m | eta: 21.4m +step 14718/16704 (88.11%) | loss: 2.460779 | lrm: 0.24 | dt: 644.01ms | tok/sec: 814,094 | mfu: 50.88 | epoch: 2 | total time: 158.11m | eta: 21.3m +step 14719/16704 (88.12%) | loss: 2.457053 | lrm: 0.24 | dt: 649.89ms | tok/sec: 806,735 | mfu: 50.42 | epoch: 2 | total time: 158.12m | eta: 21.3m +step 14720/16704 (88.12%) | loss: 2.450730 | lrm: 0.24 | dt: 642.67ms | tok/sec: 815,796 | mfu: 50.99 | epoch: 2 | total time: 158.13m | eta: 21.3m +step 14721/16704 (88.13%) | loss: 2.450756 | lrm: 0.24 | dt: 649.03ms | tok/sec: 807,805 | mfu: 50.49 | epoch: 2 | total time: 158.14m | eta: 21.3m +step 14722/16704 (88.13%) | loss: 2.463815 | lrm: 0.24 | dt: 645.47ms | tok/sec: 812,257 | mfu: 50.77 | epoch: 2 | total time: 158.15m | eta: 21.3m +step 14723/16704 (88.14%) | loss: 2.477580 | lrm: 0.24 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 2 | total time: 158.16m | eta: 21.3m +step 14724/16704 (88.15%) | loss: 2.476624 | lrm: 0.24 | dt: 647.11ms | tok/sec: 810,196 | mfu: 50.64 | epoch: 2 | total time: 158.17m | eta: 21.3m +step 14725/16704 (88.15%) | loss: 2.485323 | lrm: 0.24 | dt: 644.70ms | tok/sec: 813,231 | mfu: 50.83 | epoch: 2 | total time: 158.18m | eta: 21.3m +step 14726/16704 (88.16%) | loss: 2.483813 | lrm: 0.24 | dt: 645.29ms | tok/sec: 812,485 | mfu: 50.78 | epoch: 2 | total time: 158.19m | eta: 21.3m +step 14727/16704 (88.16%) | loss: 2.484479 | lrm: 0.24 | dt: 646.63ms | tok/sec: 810,803 | mfu: 50.68 | epoch: 2 | total time: 158.21m | eta: 21.3m +step 14728/16704 (88.17%) | loss: 
2.486492 | lrm: 0.24 | dt: 645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 2 | total time: 158.22m | eta: 21.2m +step 14729/16704 (88.18%) | loss: 2.472446 | lrm: 0.24 | dt: 648.37ms | tok/sec: 808,618 | mfu: 50.54 | epoch: 2 | total time: 158.23m | eta: 21.2m +step 14730/16704 (88.18%) | loss: 2.467026 | lrm: 0.24 | dt: 642.80ms | tok/sec: 815,629 | mfu: 50.98 | epoch: 2 | total time: 158.24m | eta: 21.2m +step 14731/16704 (88.19%) | loss: 2.467037 | lrm: 0.24 | dt: 645.26ms | tok/sec: 812,516 | mfu: 50.78 | epoch: 2 | total time: 158.25m | eta: 21.2m +step 14732/16704 (88.19%) | loss: 2.457650 | lrm: 0.24 | dt: 645.90ms | tok/sec: 811,711 | mfu: 50.73 | epoch: 2 | total time: 158.26m | eta: 21.2m +step 14733/16704 (88.20%) | loss: 2.463338 | lrm: 0.24 | dt: 645.93ms | tok/sec: 811,680 | mfu: 50.73 | epoch: 2 | total time: 158.27m | eta: 21.2m +step 14734/16704 (88.21%) | loss: 2.460630 | lrm: 0.24 | dt: 646.43ms | tok/sec: 811,055 | mfu: 50.69 | epoch: 2 | total time: 158.28m | eta: 21.2m +step 14735/16704 (88.21%) | loss: 2.459863 | lrm: 0.24 | dt: 647.21ms | tok/sec: 810,069 | mfu: 50.63 | epoch: 2 | total time: 158.29m | eta: 21.2m +step 14736/16704 (88.22%) | loss: 2.453020 | lrm: 0.24 | dt: 645.23ms | tok/sec: 812,560 | mfu: 50.79 | epoch: 2 | total time: 158.30m | eta: 21.2m +step 14737/16704 (88.22%) | loss: 2.458520 | lrm: 0.24 | dt: 646.48ms | tok/sec: 810,988 | mfu: 50.69 | epoch: 2 | total time: 158.31m | eta: 21.1m +step 14738/16704 (88.23%) | loss: 2.452142 | lrm: 0.24 | dt: 646.75ms | tok/sec: 810,649 | mfu: 50.67 | epoch: 2 | total time: 158.32m | eta: 21.1m +step 14739/16704 (88.24%) | loss: 2.461546 | lrm: 0.24 | dt: 645.14ms | tok/sec: 812,670 | mfu: 50.79 | epoch: 2 | total time: 158.33m | eta: 21.1m +step 14740/16704 (88.24%) | loss: 2.456460 | lrm: 0.24 | dt: 644.83ms | tok/sec: 813,057 | mfu: 50.82 | epoch: 2 | total time: 158.35m | eta: 21.1m +step 14741/16704 (88.25%) | loss: 2.447415 | lrm: 0.24 | dt: 645.40ms | tok/sec: 812,349 | mfu: 
50.77 | epoch: 2 | total time: 158.36m | eta: 21.1m +step 14742/16704 (88.25%) | loss: 2.447064 | lrm: 0.23 | dt: 645.41ms | tok/sec: 812,330 | mfu: 50.77 | epoch: 2 | total time: 158.37m | eta: 21.1m +step 14743/16704 (88.26%) | loss: 2.452259 | lrm: 0.23 | dt: 646.09ms | tok/sec: 811,479 | mfu: 50.72 | epoch: 2 | total time: 158.38m | eta: 21.1m +step 14744/16704 (88.27%) | loss: 2.443425 | lrm: 0.23 | dt: 644.53ms | tok/sec: 813,436 | mfu: 50.84 | epoch: 2 | total time: 158.39m | eta: 21.1m +step 14745/16704 (88.27%) | loss: 2.437366 | lrm: 0.23 | dt: 645.27ms | tok/sec: 812,515 | mfu: 50.78 | epoch: 2 | total time: 158.40m | eta: 21.1m +step 14746/16704 (88.28%) | loss: 2.433771 | lrm: 0.23 | dt: 644.71ms | tok/sec: 813,213 | mfu: 50.83 | epoch: 2 | total time: 158.41m | eta: 21.0m +step 14747/16704 (88.28%) | loss: 2.443106 | lrm: 0.23 | dt: 645.78ms | tok/sec: 811,863 | mfu: 50.74 | epoch: 2 | total time: 158.42m | eta: 21.0m +step 14748/16704 (88.29%) | loss: 2.455916 | lrm: 0.23 | dt: 645.40ms | tok/sec: 812,350 | mfu: 50.77 | epoch: 2 | total time: 158.43m | eta: 21.0m +step 14749/16704 (88.30%) | loss: 2.439446 | lrm: 0.23 | dt: 646.53ms | tok/sec: 810,922 | mfu: 50.68 | epoch: 2 | total time: 158.44m | eta: 21.0m +Step 14750 | Validation bpb: 0.763719 +step 14750/16704 (88.30%) | loss: 2.436089 | lrm: 0.23 | dt: 629.06ms | tok/sec: 833,447 | mfu: 52.09 | epoch: 2 | total time: 158.45m | eta: 21.0m +step 14751/16704 (88.31%) | loss: 2.426925 | lrm: 0.23 | dt: 651.91ms | tok/sec: 804,237 | mfu: 50.27 | epoch: 2 | total time: 158.46m | eta: 21.0m +step 14752/16704 (88.31%) | loss: 2.428520 | lrm: 0.23 | dt: 639.19ms | tok/sec: 820,235 | mfu: 51.27 | epoch: 2 | total time: 158.47m | eta: 21.0m +step 14753/16704 (88.32%) | loss: 2.430683 | lrm: 0.23 | dt: 647.67ms | tok/sec: 809,503 | mfu: 50.60 | epoch: 2 | total time: 158.49m | eta: 21.0m +step 14754/16704 (88.33%) | loss: 2.436340 | lrm: 0.23 | dt: 647.33ms | tok/sec: 809,925 | mfu: 50.62 | epoch: 2 | 
total time: 158.50m | eta: 21.0m +step 14755/16704 (88.33%) | loss: 2.433199 | lrm: 0.23 | dt: 640.08ms | tok/sec: 819,094 | mfu: 51.19 | epoch: 2 | total time: 158.51m | eta: 21.0m +step 14756/16704 (88.34%) | loss: 2.462655 | lrm: 0.23 | dt: 646.88ms | tok/sec: 810,486 | mfu: 50.66 | epoch: 2 | total time: 158.52m | eta: 20.9m +step 14757/16704 (88.34%) | loss: 2.462382 | lrm: 0.23 | dt: 645.92ms | tok/sec: 811,693 | mfu: 50.73 | epoch: 2 | total time: 158.53m | eta: 20.9m +step 14758/16704 (88.35%) | loss: 2.460461 | lrm: 0.23 | dt: 644.17ms | tok/sec: 813,898 | mfu: 50.87 | epoch: 2 | total time: 158.54m | eta: 20.9m +step 14759/16704 (88.36%) | loss: 2.474736 | lrm: 0.23 | dt: 647.58ms | tok/sec: 809,616 | mfu: 50.60 | epoch: 2 | total time: 158.55m | eta: 20.9m +step 14760/16704 (88.36%) | loss: 2.473944 | lrm: 0.23 | dt: 643.30ms | tok/sec: 815,002 | mfu: 50.94 | epoch: 2 | total time: 158.56m | eta: 20.9m +step 14761/16704 (88.37%) | loss: 2.480607 | lrm: 0.23 | dt: 644.01ms | tok/sec: 814,096 | mfu: 50.88 | epoch: 2 | total time: 158.57m | eta: 20.9m +step 14762/16704 (88.37%) | loss: 2.462701 | lrm: 0.23 | dt: 647.11ms | tok/sec: 810,196 | mfu: 50.64 | epoch: 2 | total time: 158.58m | eta: 20.9m +step 14763/16704 (88.38%) | loss: 2.464787 | lrm: 0.23 | dt: 645.41ms | tok/sec: 812,328 | mfu: 50.77 | epoch: 2 | total time: 158.59m | eta: 20.9m +step 14764/16704 (88.39%) | loss: 2.472021 | lrm: 0.23 | dt: 643.77ms | tok/sec: 814,396 | mfu: 50.90 | epoch: 2 | total time: 158.60m | eta: 20.9m +step 14765/16704 (88.39%) | loss: 2.479248 | lrm: 0.23 | dt: 646.05ms | tok/sec: 811,525 | mfu: 50.72 | epoch: 2 | total time: 158.61m | eta: 20.8m +step 14766/16704 (88.40%) | loss: 2.469323 | lrm: 0.23 | dt: 646.50ms | tok/sec: 810,967 | mfu: 50.69 | epoch: 2 | total time: 158.63m | eta: 20.8m +step 14767/16704 (88.40%) | loss: 2.479043 | lrm: 0.23 | dt: 645.40ms | tok/sec: 812,345 | mfu: 50.77 | epoch: 2 | total time: 158.64m | eta: 20.8m +step 14768/16704 (88.41%) | 
loss: 2.475581 | lrm: 0.23 | dt: 645.57ms | tok/sec: 812,126 | mfu: 50.76 | epoch: 2 | total time: 158.65m | eta: 20.8m +step 14769/16704 (88.42%) | loss: 2.455752 | lrm: 0.23 | dt: 643.99ms | tok/sec: 814,123 | mfu: 50.88 | epoch: 2 | total time: 158.66m | eta: 20.8m +step 14770/16704 (88.42%) | loss: 2.466315 | lrm: 0.23 | dt: 646.24ms | tok/sec: 811,285 | mfu: 50.71 | epoch: 2 | total time: 158.67m | eta: 20.8m +step 14771/16704 (88.43%) | loss: 2.471357 | lrm: 0.23 | dt: 645.03ms | tok/sec: 812,814 | mfu: 50.80 | epoch: 2 | total time: 158.68m | eta: 20.8m +step 14772/16704 (88.43%) | loss: 2.470890 | lrm: 0.23 | dt: 645.13ms | tok/sec: 812,688 | mfu: 50.79 | epoch: 2 | total time: 158.69m | eta: 20.8m +step 14773/16704 (88.44%) | loss: 2.490741 | lrm: 0.23 | dt: 646.74ms | tok/sec: 810,659 | mfu: 50.67 | epoch: 2 | total time: 158.70m | eta: 20.8m +step 14774/16704 (88.45%) | loss: 2.482707 | lrm: 0.23 | dt: 645.34ms | tok/sec: 812,423 | mfu: 50.78 | epoch: 2 | total time: 158.71m | eta: 20.7m +step 14775/16704 (88.45%) | loss: 2.479589 | lrm: 0.23 | dt: 646.07ms | tok/sec: 811,503 | mfu: 50.72 | epoch: 2 | total time: 158.72m | eta: 20.7m +step 14776/16704 (88.46%) | loss: 2.476783 | lrm: 0.23 | dt: 645.35ms | tok/sec: 812,413 | mfu: 50.78 | epoch: 2 | total time: 158.73m | eta: 20.7m +step 14777/16704 (88.46%) | loss: 2.464305 | lrm: 0.23 | dt: 645.84ms | tok/sec: 811,790 | mfu: 50.74 | epoch: 2 | total time: 158.74m | eta: 20.7m +step 14778/16704 (88.47%) | loss: 2.464937 | lrm: 0.23 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 2 | total time: 158.75m | eta: 20.7m +step 14779/16704 (88.48%) | loss: 2.475287 | lrm: 0.23 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 2 | total time: 158.76m | eta: 20.7m +step 14780/16704 (88.48%) | loss: 2.469186 | lrm: 0.23 | dt: 644.33ms | tok/sec: 813,700 | mfu: 50.86 | epoch: 2 | total time: 158.78m | eta: 20.7m +step 14781/16704 (88.49%) | loss: 2.455288 | lrm: 0.23 | dt: 647.13ms | tok/sec: 810,179 | 
mfu: 50.64 | epoch: 2 | total time: 158.79m | eta: 20.7m +step 14782/16704 (88.49%) | loss: 2.454374 | lrm: 0.23 | dt: 644.15ms | tok/sec: 813,916 | mfu: 50.87 | epoch: 2 | total time: 158.80m | eta: 20.7m +step 14783/16704 (88.50%) | loss: 2.455188 | lrm: 0.23 | dt: 649.45ms | tok/sec: 807,278 | mfu: 50.46 | epoch: 2 | total time: 158.81m | eta: 20.7m +step 14784/16704 (88.51%) | loss: 2.455211 | lrm: 0.23 | dt: 644.03ms | tok/sec: 814,076 | mfu: 50.88 | epoch: 2 | total time: 158.82m | eta: 20.6m +step 14785/16704 (88.51%) | loss: 2.460261 | lrm: 0.23 | dt: 646.34ms | tok/sec: 811,163 | mfu: 50.70 | epoch: 2 | total time: 158.83m | eta: 20.6m +step 14786/16704 (88.52%) | loss: 2.464695 | lrm: 0.23 | dt: 644.43ms | tok/sec: 813,567 | mfu: 50.85 | epoch: 2 | total time: 158.84m | eta: 20.6m +step 14787/16704 (88.52%) | loss: 2.462482 | lrm: 0.23 | dt: 645.67ms | tok/sec: 812,006 | mfu: 50.75 | epoch: 2 | total time: 158.85m | eta: 20.6m +step 14788/16704 (88.53%) | loss: 2.460181 | lrm: 0.23 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 2 | total time: 158.86m | eta: 20.6m +step 14789/16704 (88.54%) | loss: 2.469544 | lrm: 0.23 | dt: 646.10ms | tok/sec: 811,469 | mfu: 50.72 | epoch: 2 | total time: 158.87m | eta: 20.6m +step 14790/16704 (88.54%) | loss: 2.472978 | lrm: 0.23 | dt: 643.83ms | tok/sec: 814,324 | mfu: 50.90 | epoch: 2 | total time: 158.88m | eta: 20.6m +step 14791/16704 (88.55%) | loss: 2.476250 | lrm: 0.23 | dt: 644.41ms | tok/sec: 813,595 | mfu: 50.85 | epoch: 2 | total time: 158.89m | eta: 20.6m +step 14792/16704 (88.55%) | loss: 2.468567 | lrm: 0.23 | dt: 645.44ms | tok/sec: 812,299 | mfu: 50.77 | epoch: 2 | total time: 158.90m | eta: 20.6m +step 14793/16704 (88.56%) | loss: 2.472493 | lrm: 0.23 | dt: 644.79ms | tok/sec: 813,110 | mfu: 50.82 | epoch: 2 | total time: 158.92m | eta: 20.5m +step 14794/16704 (88.57%) | loss: 2.459582 | lrm: 0.23 | dt: 646.04ms | tok/sec: 811,536 | mfu: 50.72 | epoch: 2 | total time: 158.93m | eta: 20.5m +step 
14795/16704 (88.57%) | loss: 2.470604 | lrm: 0.23 | dt: 644.70ms | tok/sec: 813,226 | mfu: 50.83 | epoch: 2 | total time: 158.94m | eta: 20.5m +step 14796/16704 (88.58%) | loss: 2.459732 | lrm: 0.23 | dt: 645.29ms | tok/sec: 812,482 | mfu: 50.78 | epoch: 2 | total time: 158.95m | eta: 20.5m +step 14797/16704 (88.58%) | loss: 2.471951 | lrm: 0.23 | dt: 644.00ms | tok/sec: 814,111 | mfu: 50.88 | epoch: 2 | total time: 158.96m | eta: 20.5m +step 14798/16704 (88.59%) | loss: 2.473522 | lrm: 0.23 | dt: 646.11ms | tok/sec: 811,447 | mfu: 50.72 | epoch: 2 | total time: 158.97m | eta: 20.5m +step 14799/16704 (88.60%) | loss: 2.470471 | lrm: 0.23 | dt: 642.65ms | tok/sec: 815,816 | mfu: 50.99 | epoch: 2 | total time: 158.98m | eta: 20.5m +step 14800/16704 (88.60%) | loss: 2.470644 | lrm: 0.23 | dt: 645.30ms | tok/sec: 812,471 | mfu: 50.78 | epoch: 2 | total time: 158.99m | eta: 20.5m +step 14801/16704 (88.61%) | loss: 2.462225 | lrm: 0.23 | dt: 644.36ms | tok/sec: 813,653 | mfu: 50.85 | epoch: 2 | total time: 159.00m | eta: 20.5m +step 14802/16704 (88.61%) | loss: 2.445237 | lrm: 0.23 | dt: 645.09ms | tok/sec: 812,739 | mfu: 50.80 | epoch: 2 | total time: 159.01m | eta: 20.4m +step 14803/16704 (88.62%) | loss: 2.450488 | lrm: 0.23 | dt: 644.94ms | tok/sec: 812,922 | mfu: 50.81 | epoch: 2 | total time: 159.02m | eta: 20.4m +step 14804/16704 (88.63%) | loss: 2.455172 | lrm: 0.23 | dt: 649.74ms | tok/sec: 806,922 | mfu: 50.43 | epoch: 2 | total time: 159.03m | eta: 20.4m +step 14805/16704 (88.63%) | loss: 2.453580 | lrm: 0.23 | dt: 642.99ms | tok/sec: 815,396 | mfu: 50.96 | epoch: 2 | total time: 159.04m | eta: 20.4m +step 14806/16704 (88.64%) | loss: 2.456169 | lrm: 0.23 | dt: 645.45ms | tok/sec: 812,280 | mfu: 50.77 | epoch: 2 | total time: 159.06m | eta: 20.4m +step 14807/16704 (88.64%) | loss: 2.461639 | lrm: 0.23 | dt: 647.28ms | tok/sec: 809,990 | mfu: 50.63 | epoch: 2 | total time: 159.07m | eta: 20.4m +step 14808/16704 (88.65%) | loss: 2.467546 | lrm: 0.23 | dt: 
643.88ms | tok/sec: 814,260 | mfu: 50.89 | epoch: 2 | total time: 159.08m | eta: 20.4m +step 14809/16704 (88.66%) | loss: 2.473930 | lrm: 0.23 | dt: 645.04ms | tok/sec: 812,803 | mfu: 50.80 | epoch: 2 | total time: 159.09m | eta: 20.4m +step 14810/16704 (88.66%) | loss: 2.477158 | lrm: 0.23 | dt: 644.32ms | tok/sec: 813,705 | mfu: 50.86 | epoch: 2 | total time: 159.10m | eta: 20.4m +step 14811/16704 (88.67%) | loss: 2.480175 | lrm: 0.23 | dt: 643.87ms | tok/sec: 814,278 | mfu: 50.89 | epoch: 2 | total time: 159.11m | eta: 20.3m +step 14812/16704 (88.67%) | loss: 2.496635 | lrm: 0.23 | dt: 645.96ms | tok/sec: 811,639 | mfu: 50.73 | epoch: 2 | total time: 159.12m | eta: 20.3m +step 14813/16704 (88.68%) | loss: 2.497977 | lrm: 0.23 | dt: 645.89ms | tok/sec: 811,735 | mfu: 50.73 | epoch: 2 | total time: 159.13m | eta: 20.3m +step 14814/16704 (88.69%) | loss: 2.500917 | lrm: 0.23 | dt: 645.03ms | tok/sec: 812,807 | mfu: 50.80 | epoch: 2 | total time: 159.14m | eta: 20.3m +step 14815/16704 (88.69%) | loss: 2.505334 | lrm: 0.23 | dt: 645.34ms | tok/sec: 812,415 | mfu: 50.78 | epoch: 2 | total time: 159.15m | eta: 20.3m +step 14816/16704 (88.70%) | loss: 2.503779 | lrm: 0.23 | dt: 643.49ms | tok/sec: 814,752 | mfu: 50.92 | epoch: 2 | total time: 159.16m | eta: 20.3m +step 14817/16704 (88.70%) | loss: 2.498249 | lrm: 0.23 | dt: 645.45ms | tok/sec: 812,285 | mfu: 50.77 | epoch: 2 | total time: 159.17m | eta: 20.3m +step 14818/16704 (88.71%) | loss: 2.500469 | lrm: 0.23 | dt: 644.45ms | tok/sec: 813,537 | mfu: 50.85 | epoch: 2 | total time: 159.18m | eta: 20.3m +step 14819/16704 (88.72%) | loss: 2.483116 | lrm: 0.23 | dt: 646.51ms | tok/sec: 810,945 | mfu: 50.69 | epoch: 2 | total time: 159.20m | eta: 20.3m +step 14820/16704 (88.72%) | loss: 2.488103 | lrm: 0.23 | dt: 644.30ms | tok/sec: 813,737 | mfu: 50.86 | epoch: 2 | total time: 159.21m | eta: 20.3m +step 14821/16704 (88.73%) | loss: 2.487332 | lrm: 0.23 | dt: 643.32ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 2 | total 
time: 159.22m | eta: 20.2m +step 14822/16704 (88.73%) | loss: 2.489868 | lrm: 0.23 | dt: 645.39ms | tok/sec: 812,361 | mfu: 50.77 | epoch: 2 | total time: 159.23m | eta: 20.2m +step 14823/16704 (88.74%) | loss: 2.489304 | lrm: 0.23 | dt: 645.46ms | tok/sec: 812,273 | mfu: 50.77 | epoch: 2 | total time: 159.24m | eta: 20.2m +step 14824/16704 (88.75%) | loss: 2.488387 | lrm: 0.23 | dt: 642.43ms | tok/sec: 816,103 | mfu: 51.01 | epoch: 2 | total time: 159.25m | eta: 20.2m +step 14825/16704 (88.75%) | loss: 2.483843 | lrm: 0.22 | dt: 645.17ms | tok/sec: 812,639 | mfu: 50.79 | epoch: 2 | total time: 159.26m | eta: 20.2m +step 14826/16704 (88.76%) | loss: 2.480941 | lrm: 0.22 | dt: 645.07ms | tok/sec: 812,756 | mfu: 50.80 | epoch: 2 | total time: 159.27m | eta: 20.2m +step 14827/16704 (88.76%) | loss: 2.483131 | lrm: 0.22 | dt: 645.34ms | tok/sec: 812,419 | mfu: 50.78 | epoch: 2 | total time: 159.28m | eta: 20.2m +step 14828/16704 (88.77%) | loss: 2.464386 | lrm: 0.22 | dt: 646.81ms | tok/sec: 810,580 | mfu: 50.66 | epoch: 2 | total time: 159.29m | eta: 20.2m +step 14829/16704 (88.78%) | loss: 2.464447 | lrm: 0.22 | dt: 645.09ms | tok/sec: 812,740 | mfu: 50.80 | epoch: 2 | total time: 159.30m | eta: 20.2m +step 14830/16704 (88.78%) | loss: 2.469680 | lrm: 0.22 | dt: 643.66ms | tok/sec: 814,546 | mfu: 50.91 | epoch: 2 | total time: 159.31m | eta: 20.1m +step 14831/16704 (88.79%) | loss: 2.468417 | lrm: 0.22 | dt: 645.72ms | tok/sec: 811,947 | mfu: 50.75 | epoch: 2 | total time: 159.32m | eta: 20.1m +step 14832/16704 (88.79%) | loss: 2.480476 | lrm: 0.22 | dt: 645.94ms | tok/sec: 811,663 | mfu: 50.73 | epoch: 2 | total time: 159.33m | eta: 20.1m +step 14833/16704 (88.80%) | loss: 2.471763 | lrm: 0.22 | dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 2 | total time: 159.35m | eta: 20.1m +step 14834/16704 (88.81%) | loss: 2.465696 | lrm: 0.22 | dt: 646.73ms | tok/sec: 810,670 | mfu: 50.67 | epoch: 2 | total time: 159.36m | eta: 20.1m +step 14835/16704 (88.81%) | loss: 
2.473225 | lrm: 0.22 | dt: 646.16ms | tok/sec: 811,396 | mfu: 50.71 | epoch: 2 | total time: 159.37m | eta: 20.1m +step 14836/16704 (88.82%) | loss: 2.464278 | lrm: 0.22 | dt: 645.67ms | tok/sec: 812,003 | mfu: 50.75 | epoch: 2 | total time: 159.38m | eta: 20.1m +step 14837/16704 (88.82%) | loss: 2.481000 | lrm: 0.22 | dt: 643.67ms | tok/sec: 814,523 | mfu: 50.91 | epoch: 2 | total time: 159.39m | eta: 20.1m +step 14838/16704 (88.83%) | loss: 2.484700 | lrm: 0.22 | dt: 644.06ms | tok/sec: 814,030 | mfu: 50.88 | epoch: 2 | total time: 159.40m | eta: 20.1m +step 14839/16704 (88.84%) | loss: 2.471805 | lrm: 0.22 | dt: 644.26ms | tok/sec: 813,787 | mfu: 50.86 | epoch: 2 | total time: 159.41m | eta: 20.0m +step 14840/16704 (88.84%) | loss: 2.473320 | lrm: 0.22 | dt: 644.44ms | tok/sec: 813,558 | mfu: 50.85 | epoch: 2 | total time: 159.42m | eta: 20.0m +step 14841/16704 (88.85%) | loss: 2.468508 | lrm: 0.22 | dt: 647.07ms | tok/sec: 810,249 | mfu: 50.64 | epoch: 2 | total time: 159.43m | eta: 20.0m +step 14842/16704 (88.85%) | loss: 2.476088 | lrm: 0.22 | dt: 645.14ms | tok/sec: 812,672 | mfu: 50.79 | epoch: 2 | total time: 159.44m | eta: 20.0m +step 14843/16704 (88.86%) | loss: 2.468278 | lrm: 0.22 | dt: 645.74ms | tok/sec: 811,917 | mfu: 50.75 | epoch: 2 | total time: 159.45m | eta: 20.0m +step 14844/16704 (88.86%) | loss: 2.467639 | lrm: 0.22 | dt: 644.41ms | tok/sec: 813,589 | mfu: 50.85 | epoch: 2 | total time: 159.46m | eta: 20.0m +step 14845/16704 (88.87%) | loss: 2.467685 | lrm: 0.22 | dt: 644.91ms | tok/sec: 812,962 | mfu: 50.81 | epoch: 2 | total time: 159.47m | eta: 20.0m +step 14846/16704 (88.88%) | loss: 2.463940 | lrm: 0.22 | dt: 645.40ms | tok/sec: 812,346 | mfu: 50.77 | epoch: 2 | total time: 159.49m | eta: 20.0m +step 14847/16704 (88.88%) | loss: 2.461935 | lrm: 0.22 | dt: 644.17ms | tok/sec: 813,897 | mfu: 50.87 | epoch: 2 | total time: 159.50m | eta: 20.0m +step 14848/16704 (88.89%) | loss: 2.456922 | lrm: 0.22 | dt: 644.79ms | tok/sec: 813,109 | mfu: 
50.82 | epoch: 2 | total time: 159.51m | eta: 20.0m +step 14849/16704 (88.89%) | loss: 2.447535 | lrm: 0.22 | dt: 645.40ms | tok/sec: 812,342 | mfu: 50.77 | epoch: 2 | total time: 159.52m | eta: 19.9m +step 14850/16704 (88.90%) | loss: 2.444481 | lrm: 0.22 | dt: 645.31ms | tok/sec: 812,461 | mfu: 50.78 | epoch: 2 | total time: 159.53m | eta: 19.9m +step 14851/16704 (88.91%) | loss: 2.452343 | lrm: 0.22 | dt: 645.17ms | tok/sec: 812,633 | mfu: 50.79 | epoch: 2 | total time: 159.54m | eta: 19.9m +step 14852/16704 (88.91%) | loss: 2.449846 | lrm: 0.22 | dt: 645.94ms | tok/sec: 811,663 | mfu: 50.73 | epoch: 2 | total time: 159.55m | eta: 19.9m +step 14853/16704 (88.92%) | loss: 2.456421 | lrm: 0.22 | dt: 646.03ms | tok/sec: 811,559 | mfu: 50.72 | epoch: 2 | total time: 159.56m | eta: 19.9m +step 14854/16704 (88.92%) | loss: 2.466702 | lrm: 0.22 | dt: 644.87ms | tok/sec: 813,017 | mfu: 50.81 | epoch: 2 | total time: 159.57m | eta: 19.9m +step 14855/16704 (88.93%) | loss: 2.460477 | lrm: 0.22 | dt: 643.33ms | tok/sec: 814,954 | mfu: 50.94 | epoch: 2 | total time: 159.58m | eta: 19.9m +step 14856/16704 (88.94%) | loss: 2.480001 | lrm: 0.22 | dt: 645.33ms | tok/sec: 812,427 | mfu: 50.78 | epoch: 2 | total time: 159.59m | eta: 19.9m +step 14857/16704 (88.94%) | loss: 2.471190 | lrm: 0.22 | dt: 647.39ms | tok/sec: 809,846 | mfu: 50.62 | epoch: 2 | total time: 159.60m | eta: 19.9m +step 14858/16704 (88.95%) | loss: 2.472885 | lrm: 0.22 | dt: 643.15ms | tok/sec: 815,186 | mfu: 50.95 | epoch: 2 | total time: 159.61m | eta: 19.8m +step 14859/16704 (88.95%) | loss: 2.457606 | lrm: 0.22 | dt: 645.73ms | tok/sec: 811,935 | mfu: 50.75 | epoch: 2 | total time: 159.63m | eta: 19.8m +step 14860/16704 (88.96%) | loss: 2.460199 | lrm: 0.22 | dt: 644.19ms | tok/sec: 813,872 | mfu: 50.87 | epoch: 2 | total time: 159.64m | eta: 19.8m +step 14861/16704 (88.97%) | loss: 2.451998 | lrm: 0.22 | dt: 644.25ms | tok/sec: 813,800 | mfu: 50.86 | epoch: 2 | total time: 159.65m | eta: 19.8m +step 
14862/16704 (88.97%) | loss: 2.450395 | lrm: 0.22 | dt: 645.29ms | tok/sec: 812,481 | mfu: 50.78 | epoch: 2 | total time: 159.66m | eta: 19.8m +step 14863/16704 (88.98%) | loss: 2.459350 | lrm: 0.22 | dt: 645.57ms | tok/sec: 812,130 | mfu: 50.76 | epoch: 2 | total time: 159.67m | eta: 19.8m +step 14864/16704 (88.98%) | loss: 2.463442 | lrm: 0.22 | dt: 645.02ms | tok/sec: 812,818 | mfu: 50.80 | epoch: 2 | total time: 159.68m | eta: 19.8m +step 14865/16704 (88.99%) | loss: 2.470761 | lrm: 0.22 | dt: 645.79ms | tok/sec: 811,849 | mfu: 50.74 | epoch: 2 | total time: 159.69m | eta: 19.8m +step 14866/16704 (89.00%) | loss: 2.467683 | lrm: 0.22 | dt: 646.13ms | tok/sec: 811,429 | mfu: 50.72 | epoch: 2 | total time: 159.70m | eta: 19.8m +step 14867/16704 (89.00%) | loss: 2.473767 | lrm: 0.22 | dt: 644.59ms | tok/sec: 813,361 | mfu: 50.84 | epoch: 2 | total time: 159.71m | eta: 19.7m +step 14868/16704 (89.01%) | loss: 2.472429 | lrm: 0.22 | dt: 643.44ms | tok/sec: 814,820 | mfu: 50.93 | epoch: 2 | total time: 159.72m | eta: 19.7m +step 14869/16704 (89.01%) | loss: 2.474666 | lrm: 0.22 | dt: 645.56ms | tok/sec: 812,148 | mfu: 50.76 | epoch: 2 | total time: 159.73m | eta: 19.7m +step 14870/16704 (89.02%) | loss: 2.472923 | lrm: 0.22 | dt: 644.08ms | tok/sec: 814,016 | mfu: 50.88 | epoch: 2 | total time: 159.74m | eta: 19.7m +step 14871/16704 (89.03%) | loss: 2.476906 | lrm: 0.22 | dt: 644.60ms | tok/sec: 813,349 | mfu: 50.84 | epoch: 2 | total time: 159.75m | eta: 19.7m +step 14872/16704 (89.03%) | loss: 2.490789 | lrm: 0.22 | dt: 646.06ms | tok/sec: 811,514 | mfu: 50.72 | epoch: 2 | total time: 159.76m | eta: 19.7m +step 14873/16704 (89.04%) | loss: 2.480344 | lrm: 0.22 | dt: 644.95ms | tok/sec: 812,914 | mfu: 50.81 | epoch: 2 | total time: 159.78m | eta: 19.7m +step 14874/16704 (89.04%) | loss: 2.476131 | lrm: 0.22 | dt: 645.32ms | tok/sec: 812,452 | mfu: 50.78 | epoch: 2 | total time: 159.79m | eta: 19.7m +step 14875/16704 (89.05%) | loss: 2.485466 | lrm: 0.22 | dt: 
644.41ms | tok/sec: 813,587 | mfu: 50.85 | epoch: 2 | total time: 159.80m | eta: 19.7m +step 14876/16704 (89.06%) | loss: 2.482516 | lrm: 0.22 | dt: 645.09ms | tok/sec: 812,734 | mfu: 50.80 | epoch: 2 | total time: 159.81m | eta: 19.7m +step 14877/16704 (89.06%) | loss: 2.484692 | lrm: 0.22 | dt: 645.88ms | tok/sec: 811,738 | mfu: 50.73 | epoch: 2 | total time: 159.82m | eta: 19.6m +step 14878/16704 (89.07%) | loss: 2.474873 | lrm: 0.22 | dt: 644.33ms | tok/sec: 813,700 | mfu: 50.86 | epoch: 2 | total time: 159.83m | eta: 19.6m +step 14879/16704 (89.07%) | loss: 2.472361 | lrm: 0.22 | dt: 642.85ms | tok/sec: 815,571 | mfu: 50.97 | epoch: 2 | total time: 159.84m | eta: 19.6m +step 14880/16704 (89.08%) | loss: 2.473744 | lrm: 0.22 | dt: 646.91ms | tok/sec: 810,453 | mfu: 50.65 | epoch: 2 | total time: 159.85m | eta: 19.6m +step 14881/16704 (89.09%) | loss: 2.476752 | lrm: 0.22 | dt: 644.97ms | tok/sec: 812,887 | mfu: 50.81 | epoch: 2 | total time: 159.86m | eta: 19.6m +step 14882/16704 (89.09%) | loss: 2.475483 | lrm: 0.22 | dt: 646.66ms | tok/sec: 810,760 | mfu: 50.67 | epoch: 2 | total time: 159.87m | eta: 19.6m +step 14883/16704 (89.10%) | loss: 2.479166 | lrm: 0.22 | dt: 644.47ms | tok/sec: 813,518 | mfu: 50.85 | epoch: 2 | total time: 159.88m | eta: 19.6m +step 14884/16704 (89.10%) | loss: 2.472398 | lrm: 0.22 | dt: 644.79ms | tok/sec: 813,114 | mfu: 50.82 | epoch: 2 | total time: 159.89m | eta: 19.6m +step 14885/16704 (89.11%) | loss: 2.474968 | lrm: 0.22 | dt: 645.12ms | tok/sec: 812,698 | mfu: 50.79 | epoch: 2 | total time: 159.90m | eta: 19.6m +step 14886/16704 (89.12%) | loss: 2.477868 | lrm: 0.22 | dt: 644.68ms | tok/sec: 813,255 | mfu: 50.83 | epoch: 2 | total time: 159.92m | eta: 19.5m +step 14887/16704 (89.12%) | loss: 2.492514 | lrm: 0.22 | dt: 646.46ms | tok/sec: 811,008 | mfu: 50.69 | epoch: 2 | total time: 159.93m | eta: 19.5m +step 14888/16704 (89.13%) | loss: 2.487787 | lrm: 0.22 | dt: 645.95ms | tok/sec: 811,651 | mfu: 50.73 | epoch: 2 | total 
time: 159.94m | eta: 19.5m +step 14889/16704 (89.13%) | loss: 2.484114 | lrm: 0.22 | dt: 645.18ms | tok/sec: 812,620 | mfu: 50.79 | epoch: 2 | total time: 159.95m | eta: 19.5m +step 14890/16704 (89.14%) | loss: 2.487097 | lrm: 0.22 | dt: 643.62ms | tok/sec: 814,595 | mfu: 50.91 | epoch: 2 | total time: 159.96m | eta: 19.5m +step 14891/16704 (89.15%) | loss: 2.484425 | lrm: 0.22 | dt: 644.13ms | tok/sec: 813,952 | mfu: 50.87 | epoch: 2 | total time: 159.97m | eta: 19.5m +step 14892/16704 (89.15%) | loss: 2.480601 | lrm: 0.22 | dt: 648.04ms | tok/sec: 809,033 | mfu: 50.57 | epoch: 2 | total time: 159.98m | eta: 19.5m +step 14893/16704 (89.16%) | loss: 2.467802 | lrm: 0.22 | dt: 642.08ms | tok/sec: 816,544 | mfu: 51.04 | epoch: 2 | total time: 159.99m | eta: 19.5m +step 14894/16704 (89.16%) | loss: 2.451744 | lrm: 0.22 | dt: 645.08ms | tok/sec: 812,748 | mfu: 50.80 | epoch: 2 | total time: 160.00m | eta: 19.5m +step 14895/16704 (89.17%) | loss: 2.471695 | lrm: 0.22 | dt: 644.65ms | tok/sec: 813,287 | mfu: 50.83 | epoch: 2 | total time: 160.01m | eta: 19.4m +step 14896/16704 (89.18%) | loss: 2.467044 | lrm: 0.22 | dt: 646.20ms | tok/sec: 811,337 | mfu: 50.71 | epoch: 2 | total time: 160.02m | eta: 19.4m +step 14897/16704 (89.18%) | loss: 2.465359 | lrm: 0.22 | dt: 643.72ms | tok/sec: 814,460 | mfu: 50.90 | epoch: 2 | total time: 160.03m | eta: 19.4m +step 14898/16704 (89.19%) | loss: 2.469489 | lrm: 0.22 | dt: 645.88ms | tok/sec: 811,737 | mfu: 50.73 | epoch: 2 | total time: 160.04m | eta: 19.4m +step 14899/16704 (89.19%) | loss: 2.472556 | lrm: 0.22 | dt: 644.39ms | tok/sec: 813,622 | mfu: 50.85 | epoch: 2 | total time: 160.06m | eta: 19.4m +step 14900/16704 (89.20%) | loss: 2.471778 | lrm: 0.22 | dt: 646.28ms | tok/sec: 811,235 | mfu: 50.70 | epoch: 2 | total time: 160.07m | eta: 19.4m +step 14901/16704 (89.21%) | loss: 2.467877 | lrm: 0.22 | dt: 644.84ms | tok/sec: 813,054 | mfu: 50.82 | epoch: 2 | total time: 160.08m | eta: 19.4m +step 14902/16704 (89.21%) | loss: 
2.470230 | lrm: 0.22 | dt: 644.15ms | tok/sec: 813,926 | mfu: 50.87 | epoch: 2 | total time: 160.09m | eta: 19.4m +step 14903/16704 (89.22%) | loss: 2.474962 | lrm: 0.22 | dt: 646.38ms | tok/sec: 811,109 | mfu: 50.70 | epoch: 2 | total time: 160.10m | eta: 19.4m +step 14904/16704 (89.22%) | loss: 2.481069 | lrm: 0.22 | dt: 646.27ms | tok/sec: 811,256 | mfu: 50.70 | epoch: 2 | total time: 160.11m | eta: 19.3m +step 14905/16704 (89.23%) | loss: 2.500099 | lrm: 0.22 | dt: 646.44ms | tok/sec: 811,042 | mfu: 50.69 | epoch: 2 | total time: 160.12m | eta: 19.3m +step 14906/16704 (89.24%) | loss: 2.492523 | lrm: 0.22 | dt: 644.17ms | tok/sec: 813,900 | mfu: 50.87 | epoch: 2 | total time: 160.13m | eta: 19.3m +step 14907/16704 (89.24%) | loss: 2.495247 | lrm: 0.22 | dt: 644.28ms | tok/sec: 813,760 | mfu: 50.86 | epoch: 2 | total time: 160.14m | eta: 19.3m +step 14908/16704 (89.25%) | loss: 2.483334 | lrm: 0.22 | dt: 646.03ms | tok/sec: 811,556 | mfu: 50.72 | epoch: 2 | total time: 160.15m | eta: 19.3m +step 14909/16704 (89.25%) | loss: 2.482587 | lrm: 0.21 | dt: 648.11ms | tok/sec: 808,949 | mfu: 50.56 | epoch: 2 | total time: 160.16m | eta: 19.3m +step 14910/16704 (89.26%) | loss: 2.474003 | lrm: 0.21 | dt: 644.50ms | tok/sec: 813,475 | mfu: 50.84 | epoch: 2 | total time: 160.17m | eta: 19.3m +step 14911/16704 (89.27%) | loss: 2.480704 | lrm: 0.21 | dt: 646.20ms | tok/sec: 811,345 | mfu: 50.71 | epoch: 2 | total time: 160.18m | eta: 19.3m +step 14912/16704 (89.27%) | loss: 2.486974 | lrm: 0.21 | dt: 648.04ms | tok/sec: 809,032 | mfu: 50.57 | epoch: 2 | total time: 160.20m | eta: 19.3m +step 14913/16704 (89.28%) | loss: 2.473669 | lrm: 0.21 | dt: 644.12ms | tok/sec: 813,957 | mfu: 50.87 | epoch: 2 | total time: 160.21m | eta: 19.3m +step 14914/16704 (89.28%) | loss: 2.482547 | lrm: 0.21 | dt: 645.20ms | tok/sec: 812,592 | mfu: 50.79 | epoch: 2 | total time: 160.22m | eta: 19.2m +step 14915/16704 (89.29%) | loss: 2.490966 | lrm: 0.21 | dt: 647.56ms | tok/sec: 809,638 | mfu: 
50.60 | epoch: 2 | total time: 160.23m | eta: 19.2m +step 14916/16704 (89.30%) | loss: 2.499286 | lrm: 0.21 | dt: 644.83ms | tok/sec: 813,068 | mfu: 50.82 | epoch: 2 | total time: 160.24m | eta: 19.2m +step 14917/16704 (89.30%) | loss: 2.486894 | lrm: 0.21 | dt: 651.28ms | tok/sec: 805,010 | mfu: 50.31 | epoch: 2 | total time: 160.25m | eta: 19.2m +step 14918/16704 (89.31%) | loss: 2.480987 | lrm: 0.21 | dt: 643.82ms | tok/sec: 814,336 | mfu: 50.90 | epoch: 2 | total time: 160.26m | eta: 19.2m +step 14919/16704 (89.31%) | loss: 2.488985 | lrm: 0.21 | dt: 646.92ms | tok/sec: 810,438 | mfu: 50.65 | epoch: 2 | total time: 160.27m | eta: 19.2m +step 14920/16704 (89.32%) | loss: 2.493304 | lrm: 0.21 | dt: 647.00ms | tok/sec: 810,333 | mfu: 50.65 | epoch: 2 | total time: 160.28m | eta: 19.2m +step 14921/16704 (89.33%) | loss: 2.493676 | lrm: 0.21 | dt: 642.78ms | tok/sec: 815,662 | mfu: 50.98 | epoch: 2 | total time: 160.29m | eta: 19.2m +step 14922/16704 (89.33%) | loss: 2.489034 | lrm: 0.21 | dt: 648.43ms | tok/sec: 808,548 | mfu: 50.54 | epoch: 2 | total time: 160.30m | eta: 19.2m +step 14923/16704 (89.34%) | loss: 2.490936 | lrm: 0.21 | dt: 647.73ms | tok/sec: 809,417 | mfu: 50.59 | epoch: 2 | total time: 160.31m | eta: 19.1m +step 14924/16704 (89.34%) | loss: 2.493270 | lrm: 0.21 | dt: 645.27ms | tok/sec: 812,505 | mfu: 50.78 | epoch: 2 | total time: 160.32m | eta: 19.1m +step 14925/16704 (89.35%) | loss: 2.491535 | lrm: 0.21 | dt: 645.34ms | tok/sec: 812,416 | mfu: 50.78 | epoch: 2 | total time: 160.34m | eta: 19.1m +step 14926/16704 (89.36%) | loss: 2.484321 | lrm: 0.21 | dt: 645.43ms | tok/sec: 812,310 | mfu: 50.77 | epoch: 2 | total time: 160.35m | eta: 19.1m +step 14927/16704 (89.36%) | loss: 2.474835 | lrm: 0.21 | dt: 644.76ms | tok/sec: 813,149 | mfu: 50.82 | epoch: 2 | total time: 160.36m | eta: 19.1m +step 14928/16704 (89.37%) | loss: 2.467291 | lrm: 0.21 | dt: 647.83ms | tok/sec: 809,301 | mfu: 50.58 | epoch: 2 | total time: 160.37m | eta: 19.1m +step 
14929/16704 (89.37%) | loss: 2.466219 | lrm: 0.21 | dt: 648.86ms | tok/sec: 808,012 | mfu: 50.50 | epoch: 2 | total time: 160.38m | eta: 19.1m +step 14930/16704 (89.38%) | loss: 2.469209 | lrm: 0.21 | dt: 646.19ms | tok/sec: 811,357 | mfu: 50.71 | epoch: 2 | total time: 160.39m | eta: 19.1m +step 14931/16704 (89.39%) | loss: 2.470213 | lrm: 0.21 | dt: 644.74ms | tok/sec: 813,177 | mfu: 50.82 | epoch: 2 | total time: 160.40m | eta: 19.1m +step 14932/16704 (89.39%) | loss: 2.471064 | lrm: 0.21 | dt: 648.00ms | tok/sec: 809,084 | mfu: 50.57 | epoch: 2 | total time: 160.41m | eta: 19.0m +step 14933/16704 (89.40%) | loss: 2.470863 | lrm: 0.21 | dt: 645.11ms | tok/sec: 812,709 | mfu: 50.80 | epoch: 2 | total time: 160.42m | eta: 19.0m +step 14934/16704 (89.40%) | loss: 2.478830 | lrm: 0.21 | dt: 645.84ms | tok/sec: 811,792 | mfu: 50.74 | epoch: 2 | total time: 160.43m | eta: 19.0m +step 14935/16704 (89.41%) | loss: 2.484090 | lrm: 0.21 | dt: 646.96ms | tok/sec: 810,382 | mfu: 50.65 | epoch: 2 | total time: 160.44m | eta: 19.0m +step 14936/16704 (89.42%) | loss: 2.483245 | lrm: 0.21 | dt: 645.73ms | tok/sec: 811,935 | mfu: 50.75 | epoch: 2 | total time: 160.45m | eta: 19.0m +step 14937/16704 (89.42%) | loss: 2.475331 | lrm: 0.21 | dt: 647.49ms | tok/sec: 809,726 | mfu: 50.61 | epoch: 2 | total time: 160.46m | eta: 19.0m +step 14938/16704 (89.43%) | loss: 2.466595 | lrm: 0.21 | dt: 648.86ms | tok/sec: 808,012 | mfu: 50.50 | epoch: 2 | total time: 160.48m | eta: 19.0m +step 14939/16704 (89.43%) | loss: 2.476048 | lrm: 0.21 | dt: 648.37ms | tok/sec: 808,619 | mfu: 50.54 | epoch: 2 | total time: 160.49m | eta: 19.0m +step 14940/16704 (89.44%) | loss: 2.463945 | lrm: 0.21 | dt: 646.03ms | tok/sec: 811,557 | mfu: 50.72 | epoch: 2 | total time: 160.50m | eta: 19.0m +step 14941/16704 (89.45%) | loss: 2.463999 | lrm: 0.21 | dt: 645.75ms | tok/sec: 811,901 | mfu: 50.74 | epoch: 2 | total time: 160.51m | eta: 19.0m +step 14942/16704 (89.45%) | loss: 2.457704 | lrm: 0.21 | dt: 
647.27ms | tok/sec: 809,999 | mfu: 50.63 | epoch: 2 | total time: 160.52m | eta: 18.9m +step 14943/16704 (89.46%) | loss: 2.466161 | lrm: 0.21 | dt: 645.28ms | tok/sec: 812,498 | mfu: 50.78 | epoch: 2 | total time: 160.53m | eta: 18.9m +step 14944/16704 (89.46%) | loss: 2.469908 | lrm: 0.21 | dt: 650.22ms | tok/sec: 806,324 | mfu: 50.40 | epoch: 2 | total time: 160.54m | eta: 18.9m +step 14945/16704 (89.47%) | loss: 2.465621 | lrm: 0.21 | dt: 646.67ms | tok/sec: 810,753 | mfu: 50.67 | epoch: 2 | total time: 160.55m | eta: 18.9m +step 14946/16704 (89.48%) | loss: 2.464519 | lrm: 0.21 | dt: 646.44ms | tok/sec: 811,039 | mfu: 50.69 | epoch: 2 | total time: 160.56m | eta: 18.9m +step 14947/16704 (89.48%) | loss: 2.459209 | lrm: 0.21 | dt: 648.46ms | tok/sec: 808,512 | mfu: 50.53 | epoch: 2 | total time: 160.57m | eta: 18.9m +step 14948/16704 (89.49%) | loss: 2.453089 | lrm: 0.21 | dt: 646.52ms | tok/sec: 810,936 | mfu: 50.68 | epoch: 2 | total time: 160.58m | eta: 18.9m +step 14949/16704 (89.49%) | loss: 2.459997 | lrm: 0.21 | dt: 650.24ms | tok/sec: 806,297 | mfu: 50.39 | epoch: 2 | total time: 160.59m | eta: 18.9m +step 14950/16704 (89.50%) | loss: 2.477080 | lrm: 0.21 | dt: 647.39ms | tok/sec: 809,852 | mfu: 50.62 | epoch: 2 | total time: 160.60m | eta: 18.9m +step 14951/16704 (89.51%) | loss: 2.473619 | lrm: 0.21 | dt: 647.12ms | tok/sec: 810,185 | mfu: 50.64 | epoch: 2 | total time: 160.62m | eta: 18.8m +step 14952/16704 (89.51%) | loss: 2.479804 | lrm: 0.21 | dt: 649.09ms | tok/sec: 807,722 | mfu: 50.48 | epoch: 2 | total time: 160.63m | eta: 18.8m +step 14953/16704 (89.52%) | loss: 2.466711 | lrm: 0.21 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 2 | total time: 160.64m | eta: 18.8m +step 14954/16704 (89.52%) | loss: 2.478439 | lrm: 0.21 | dt: 645.03ms | tok/sec: 812,812 | mfu: 50.80 | epoch: 2 | total time: 160.65m | eta: 18.8m +step 14955/16704 (89.53%) | loss: 2.465688 | lrm: 0.21 | dt: 645.17ms | tok/sec: 812,641 | mfu: 50.79 | epoch: 2 | total 
time: 160.66m | eta: 18.8m +step 14956/16704 (89.54%) | loss: 2.465604 | lrm: 0.21 | dt: 645.22ms | tok/sec: 812,574 | mfu: 50.79 | epoch: 2 | total time: 160.67m | eta: 18.8m +step 14957/16704 (89.54%) | loss: 2.458970 | lrm: 0.21 | dt: 648.41ms | tok/sec: 808,570 | mfu: 50.54 | epoch: 2 | total time: 160.68m | eta: 18.8m +step 14958/16704 (89.55%) | loss: 2.460301 | lrm: 0.21 | dt: 645.39ms | tok/sec: 812,358 | mfu: 50.77 | epoch: 2 | total time: 160.69m | eta: 18.8m +step 14959/16704 (89.55%) | loss: 2.470919 | lrm: 0.21 | dt: 646.84ms | tok/sec: 810,536 | mfu: 50.66 | epoch: 2 | total time: 160.70m | eta: 18.8m +step 14960/16704 (89.56%) | loss: 2.455720 | lrm: 0.21 | dt: 647.48ms | tok/sec: 809,741 | mfu: 50.61 | epoch: 2 | total time: 160.71m | eta: 18.7m +step 14961/16704 (89.57%) | loss: 2.459867 | lrm: 0.21 | dt: 649.22ms | tok/sec: 807,560 | mfu: 50.47 | epoch: 2 | total time: 160.72m | eta: 18.7m +step 14962/16704 (89.57%) | loss: 2.453026 | lrm: 0.21 | dt: 646.99ms | tok/sec: 810,353 | mfu: 50.65 | epoch: 2 | total time: 160.73m | eta: 18.7m +step 14963/16704 (89.58%) | loss: 2.462370 | lrm: 0.21 | dt: 647.61ms | tok/sec: 809,578 | mfu: 50.60 | epoch: 2 | total time: 160.74m | eta: 18.7m +step 14964/16704 (89.58%) | loss: 2.462441 | lrm: 0.21 | dt: 648.28ms | tok/sec: 808,737 | mfu: 50.55 | epoch: 2 | total time: 160.76m | eta: 18.7m +step 14965/16704 (89.59%) | loss: 2.458850 | lrm: 0.21 | dt: 646.45ms | tok/sec: 811,028 | mfu: 50.69 | epoch: 2 | total time: 160.77m | eta: 18.7m +step 14966/16704 (89.60%) | loss: 2.455524 | lrm: 0.21 | dt: 647.35ms | tok/sec: 809,901 | mfu: 50.62 | epoch: 2 | total time: 160.78m | eta: 18.7m +step 14967/16704 (89.60%) | loss: 2.458209 | lrm: 0.21 | dt: 647.07ms | tok/sec: 810,246 | mfu: 50.64 | epoch: 2 | total time: 160.79m | eta: 18.7m +step 14968/16704 (89.61%) | loss: 2.464324 | lrm: 0.21 | dt: 649.98ms | tok/sec: 806,624 | mfu: 50.42 | epoch: 2 | total time: 160.80m | eta: 18.7m +step 14969/16704 (89.61%) | loss: 
2.466538 | lrm: 0.21 | dt: 649.25ms | tok/sec: 807,528 | mfu: 50.47 | epoch: 2 | total time: 160.81m | eta: 18.7m +step 14970/16704 (89.62%) | loss: 2.471802 | lrm: 0.21 | dt: 647.42ms | tok/sec: 809,813 | mfu: 50.61 | epoch: 2 | total time: 160.82m | eta: 18.6m +step 14971/16704 (89.63%) | loss: 2.489259 | lrm: 0.21 | dt: 645.47ms | tok/sec: 812,259 | mfu: 50.77 | epoch: 2 | total time: 160.83m | eta: 18.6m +step 14972/16704 (89.63%) | loss: 2.483639 | lrm: 0.21 | dt: 647.17ms | tok/sec: 810,129 | mfu: 50.63 | epoch: 2 | total time: 160.84m | eta: 18.6m +step 14973/16704 (89.64%) | loss: 2.482747 | lrm: 0.21 | dt: 647.34ms | tok/sec: 809,908 | mfu: 50.62 | epoch: 2 | total time: 160.85m | eta: 18.6m +step 14974/16704 (89.64%) | loss: 2.496916 | lrm: 0.21 | dt: 647.29ms | tok/sec: 809,976 | mfu: 50.62 | epoch: 2 | total time: 160.86m | eta: 18.6m +step 14975/16704 (89.65%) | loss: 2.489296 | lrm: 0.21 | dt: 647.59ms | tok/sec: 809,595 | mfu: 50.60 | epoch: 2 | total time: 160.87m | eta: 18.6m +step 14976/16704 (89.66%) | loss: 2.485825 | lrm: 0.21 | dt: 648.59ms | tok/sec: 808,346 | mfu: 50.52 | epoch: 2 | total time: 160.89m | eta: 18.6m +step 14977/16704 (89.66%) | loss: 2.487362 | lrm: 0.21 | dt: 644.82ms | tok/sec: 813,074 | mfu: 50.82 | epoch: 2 | total time: 160.90m | eta: 18.6m +step 14978/16704 (89.67%) | loss: 2.471954 | lrm: 0.21 | dt: 646.78ms | tok/sec: 810,609 | mfu: 50.66 | epoch: 2 | total time: 160.91m | eta: 18.6m +step 14979/16704 (89.67%) | loss: 2.457836 | lrm: 0.21 | dt: 650.16ms | tok/sec: 806,395 | mfu: 50.40 | epoch: 2 | total time: 160.92m | eta: 18.5m +step 14980/16704 (89.68%) | loss: 2.458892 | lrm: 0.21 | dt: 645.84ms | tok/sec: 811,795 | mfu: 50.74 | epoch: 2 | total time: 160.93m | eta: 18.5m +step 14981/16704 (89.69%) | loss: 2.465179 | lrm: 0.21 | dt: 645.93ms | tok/sec: 811,685 | mfu: 50.73 | epoch: 2 | total time: 160.94m | eta: 18.5m +step 14982/16704 (89.69%) | loss: 2.465901 | lrm: 0.21 | dt: 647.98ms | tok/sec: 809,106 | mfu: 
50.57 | epoch: 2 | total time: 160.95m | eta: 18.5m +step 14983/16704 (89.70%) | loss: 2.458449 | lrm: 0.21 | dt: 643.42ms | tok/sec: 814,851 | mfu: 50.93 | epoch: 2 | total time: 160.96m | eta: 18.5m +step 14984/16704 (89.70%) | loss: 2.457384 | lrm: 0.21 | dt: 645.95ms | tok/sec: 811,659 | mfu: 50.73 | epoch: 2 | total time: 160.97m | eta: 18.5m +step 14985/16704 (89.71%) | loss: 2.468666 | lrm: 0.21 | dt: 647.69ms | tok/sec: 809,467 | mfu: 50.59 | epoch: 2 | total time: 160.98m | eta: 18.5m +step 14986/16704 (89.72%) | loss: 2.474959 | lrm: 0.21 | dt: 645.83ms | tok/sec: 811,799 | mfu: 50.74 | epoch: 2 | total time: 160.99m | eta: 18.5m +step 14987/16704 (89.72%) | loss: 2.470327 | lrm: 0.21 | dt: 646.56ms | tok/sec: 810,886 | mfu: 50.68 | epoch: 2 | total time: 161.00m | eta: 18.5m +step 14988/16704 (89.73%) | loss: 2.467419 | lrm: 0.21 | dt: 646.16ms | tok/sec: 811,388 | mfu: 50.71 | epoch: 2 | total time: 161.01m | eta: 18.4m +step 14989/16704 (89.73%) | loss: 2.464124 | lrm: 0.21 | dt: 647.45ms | tok/sec: 809,770 | mfu: 50.61 | epoch: 2 | total time: 161.03m | eta: 18.4m +step 14990/16704 (89.74%) | loss: 2.468814 | lrm: 0.21 | dt: 648.36ms | tok/sec: 808,635 | mfu: 50.54 | epoch: 2 | total time: 161.04m | eta: 18.4m +step 14991/16704 (89.74%) | loss: 2.450518 | lrm: 0.21 | dt: 648.36ms | tok/sec: 808,640 | mfu: 50.54 | epoch: 2 | total time: 161.05m | eta: 18.4m +step 14992/16704 (89.75%) | loss: 2.459166 | lrm: 0.20 | dt: 650.33ms | tok/sec: 806,181 | mfu: 50.39 | epoch: 2 | total time: 161.06m | eta: 18.4m +step 14993/16704 (89.76%) | loss: 2.468569 | lrm: 0.20 | dt: 644.12ms | tok/sec: 813,965 | mfu: 50.87 | epoch: 2 | total time: 161.07m | eta: 18.4m +step 14994/16704 (89.76%) | loss: 2.470925 | lrm: 0.20 | dt: 646.10ms | tok/sec: 811,469 | mfu: 50.72 | epoch: 2 | total time: 161.08m | eta: 18.4m +step 14995/16704 (89.77%) | loss: 2.464441 | lrm: 0.20 | dt: 646.47ms | tok/sec: 811,002 | mfu: 50.69 | epoch: 2 | total time: 161.09m | eta: 18.4m +step 
14996/16704 (89.77%) | loss: 2.460415 | lrm: 0.20 | dt: 645.63ms | tok/sec: 812,052 | mfu: 50.75 | epoch: 2 | total time: 161.10m | eta: 18.4m +step 14997/16704 (89.78%) | loss: 2.467390 | lrm: 0.20 | dt: 646.83ms | tok/sec: 810,551 | mfu: 50.66 | epoch: 2 | total time: 161.11m | eta: 18.4m +step 14998/16704 (89.79%) | loss: 2.463122 | lrm: 0.20 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 2 | total time: 161.12m | eta: 18.3m +step 14999/16704 (89.79%) | loss: 2.474579 | lrm: 0.20 | dt: 646.63ms | tok/sec: 810,802 | mfu: 50.68 | epoch: 2 | total time: 161.13m | eta: 18.3m +Step 15000 | Validation bpb: 0.761969 +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.5034 | centered: 0.3378 | time: 22.62s +Evaluating: jeopardy (10-shot, type: language_modeling)... accuracy: 0.1582 | centered: 0.1582 | time: 4.81s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.5041 | centered: 0.5041 | time: 47.30s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.6658 | centered: 0.5544 | time: 5.92s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.3703 | centered: 0.1604 | time: 2.93s +Evaluating: copa (0-shot, type: multiple_choice)... accuracy: 0.6600 | centered: 0.3200 | time: 0.23s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.2735 | centered: 0.0919 | time: 3.11s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.7062 | centered: 0.4124 | time: 4.36s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... accuracy: 0.3820 | centered: 0.1760 | time: 1.12s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.4134 | centered: 0.4134 | time: 11.48s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.5051 | centered: 0.3401 | time: 35.62s +Evaluating: winograd (0-shot, type: schema)... 
accuracy: 0.6777 | centered: 0.3553 | time: 0.61s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5762 | centered: 0.1523 | time: 2.76s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.1310 | centered: 0.1310 | time: 2.36s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.2565 | centered: 0.0707 | time: 0.79s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.4273 | centered: 0.4273 | time: 3.05s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... accuracy: 0.1714 | centered: 0.1714 | time: 0.49s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0000 | centered: 0.0000 | time: 0.08s +Evaluating: squad (10-shot, type: language_modeling)... accuracy: 0.3267 | centered: 0.3267 | time: 28.75s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.2455 | centered: 0.2455 | time: 18.42s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.5872 | centered: -0.0864 | time: 10.61s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... 
accuracy: 0.2517 | centered: 0.1768 | time: 59.35s +Step 15000 | CORE metric: 0.2472 +step 15000/16704 (89.80%) | loss: 2.477914 | lrm: 0.20 | dt: 627.85ms | tok/sec: 835,058 | mfu: 52.19 | epoch: 2 | total time: 161.14m | eta: 18.3m +step 15001/16704 (89.80%) | loss: 2.482463 | lrm: 0.20 | dt: 654.97ms | tok/sec: 800,481 | mfu: 50.03 | epoch: 2 | total time: 161.15m | eta: 18.3m +step 15002/16704 (89.81%) | loss: 2.480597 | lrm: 0.20 | dt: 644.00ms | tok/sec: 814,107 | mfu: 50.88 | epoch: 2 | total time: 161.17m | eta: 18.3m +step 15003/16704 (89.82%) | loss: 2.470595 | lrm: 0.20 | dt: 640.30ms | tok/sec: 818,819 | mfu: 51.18 | epoch: 2 | total time: 161.18m | eta: 18.3m +step 15004/16704 (89.82%) | loss: 2.459657 | lrm: 0.20 | dt: 648.61ms | tok/sec: 808,324 | mfu: 50.52 | epoch: 2 | total time: 161.19m | eta: 18.3m +step 15005/16704 (89.83%) | loss: 2.458326 | lrm: 0.20 | dt: 639.73ms | tok/sec: 819,551 | mfu: 51.22 | epoch: 2 | total time: 161.20m | eta: 18.3m +step 15006/16704 (89.83%) | loss: 2.472449 | lrm: 0.20 | dt: 645.83ms | tok/sec: 811,810 | mfu: 50.74 | epoch: 2 | total time: 161.21m | eta: 18.3m +step 15007/16704 (89.84%) | loss: 2.473159 | lrm: 0.20 | dt: 645.51ms | tok/sec: 812,212 | mfu: 50.76 | epoch: 2 | total time: 161.22m | eta: 18.2m +step 15008/16704 (89.85%) | loss: 2.463556 | lrm: 0.20 | dt: 641.89ms | tok/sec: 816,793 | mfu: 51.05 | epoch: 2 | total time: 161.23m | eta: 18.2m +step 15009/16704 (89.85%) | loss: 2.476287 | lrm: 0.20 | dt: 644.69ms | tok/sec: 813,241 | mfu: 50.83 | epoch: 2 | total time: 161.24m | eta: 18.2m +step 15010/16704 (89.86%) | loss: 2.460398 | lrm: 0.20 | dt: 644.29ms | tok/sec: 813,739 | mfu: 50.86 | epoch: 2 | total time: 161.25m | eta: 18.2m +step 15011/16704 (89.86%) | loss: 2.464266 | lrm: 0.20 | dt: 642.79ms | tok/sec: 815,639 | mfu: 50.98 | epoch: 2 | total time: 161.26m | eta: 18.2m +step 15012/16704 (89.87%) | loss: 2.452024 | lrm: 0.20 | dt: 643.97ms | tok/sec: 814,145 | mfu: 50.89 | epoch: 2 | total 
time: 161.27m | eta: 18.2m +step 15013/16704 (89.88%) | loss: 2.452845 | lrm: 0.20 | dt: 644.17ms | tok/sec: 813,898 | mfu: 50.87 | epoch: 2 | total time: 161.28m | eta: 18.2m +step 15014/16704 (89.88%) | loss: 2.459947 | lrm: 0.20 | dt: 647.04ms | tok/sec: 810,283 | mfu: 50.64 | epoch: 2 | total time: 161.29m | eta: 18.2m +step 15015/16704 (89.89%) | loss: 2.467086 | lrm: 0.20 | dt: 644.27ms | tok/sec: 813,776 | mfu: 50.86 | epoch: 2 | total time: 161.30m | eta: 18.2m +step 15016/16704 (89.89%) | loss: 2.444583 | lrm: 0.20 | dt: 642.56ms | tok/sec: 815,930 | mfu: 51.00 | epoch: 2 | total time: 161.32m | eta: 18.1m +step 15017/16704 (89.90%) | loss: 2.446987 | lrm: 0.20 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 2 | total time: 161.33m | eta: 18.1m +step 15018/16704 (89.91%) | loss: 2.459349 | lrm: 0.20 | dt: 643.83ms | tok/sec: 814,331 | mfu: 50.90 | epoch: 2 | total time: 161.34m | eta: 18.1m +step 15019/16704 (89.91%) | loss: 2.459757 | lrm: 0.20 | dt: 645.41ms | tok/sec: 812,327 | mfu: 50.77 | epoch: 2 | total time: 161.35m | eta: 18.1m +step 15020/16704 (89.92%) | loss: 2.460531 | lrm: 0.20 | dt: 646.02ms | tok/sec: 811,560 | mfu: 50.72 | epoch: 2 | total time: 161.36m | eta: 18.1m +step 15021/16704 (89.92%) | loss: 2.451277 | lrm: 0.20 | dt: 643.32ms | tok/sec: 814,978 | mfu: 50.94 | epoch: 2 | total time: 161.37m | eta: 18.1m +step 15022/16704 (89.93%) | loss: 2.464089 | lrm: 0.20 | dt: 645.63ms | tok/sec: 812,061 | mfu: 50.76 | epoch: 2 | total time: 161.38m | eta: 18.1m +step 15023/16704 (89.94%) | loss: 2.450922 | lrm: 0.20 | dt: 643.75ms | tok/sec: 814,429 | mfu: 50.90 | epoch: 2 | total time: 161.39m | eta: 18.1m +step 15024/16704 (89.94%) | loss: 2.458083 | lrm: 0.20 | dt: 642.91ms | tok/sec: 815,490 | mfu: 50.97 | epoch: 2 | total time: 161.40m | eta: 18.1m +step 15025/16704 (89.95%) | loss: 2.457654 | lrm: 0.20 | dt: 641.17ms | tok/sec: 817,701 | mfu: 51.11 | epoch: 2 | total time: 161.41m | eta: 18.0m +step 15026/16704 (89.95%) | loss: 
2.458951 | lrm: 0.20 | dt: 640.49ms | tok/sec: 818,567 | mfu: 51.16 | epoch: 2 | total time: 161.42m | eta: 18.0m +step 15027/16704 (89.96%) | loss: 2.455171 | lrm: 0.20 | dt: 642.84ms | tok/sec: 815,574 | mfu: 50.97 | epoch: 2 | total time: 161.43m | eta: 18.0m +step 15028/16704 (89.97%) | loss: 2.452716 | lrm: 0.20 | dt: 641.67ms | tok/sec: 817,066 | mfu: 51.07 | epoch: 2 | total time: 161.44m | eta: 18.0m +step 15029/16704 (89.97%) | loss: 2.457470 | lrm: 0.20 | dt: 643.38ms | tok/sec: 814,900 | mfu: 50.93 | epoch: 2 | total time: 161.45m | eta: 18.0m +step 15030/16704 (89.98%) | loss: 2.472252 | lrm: 0.20 | dt: 643.39ms | tok/sec: 814,879 | mfu: 50.93 | epoch: 2 | total time: 161.47m | eta: 18.0m +step 15031/16704 (89.98%) | loss: 2.459656 | lrm: 0.20 | dt: 643.68ms | tok/sec: 814,514 | mfu: 50.91 | epoch: 2 | total time: 161.48m | eta: 18.0m +step 15032/16704 (89.99%) | loss: 2.473100 | lrm: 0.20 | dt: 643.25ms | tok/sec: 815,064 | mfu: 50.94 | epoch: 2 | total time: 161.49m | eta: 18.0m +step 15033/16704 (90.00%) | loss: 2.465004 | lrm: 0.20 | dt: 641.86ms | tok/sec: 816,825 | mfu: 51.05 | epoch: 2 | total time: 161.50m | eta: 18.0m +step 15034/16704 (90.00%) | loss: 2.471471 | lrm: 0.20 | dt: 642.19ms | tok/sec: 816,409 | mfu: 51.03 | epoch: 2 | total time: 161.51m | eta: 18.0m +step 15035/16704 (90.01%) | loss: 2.469196 | lrm: 0.20 | dt: 641.60ms | tok/sec: 817,158 | mfu: 51.07 | epoch: 2 | total time: 161.52m | eta: 17.9m +step 15036/16704 (90.01%) | loss: 2.453525 | lrm: 0.20 | dt: 642.46ms | tok/sec: 816,065 | mfu: 51.01 | epoch: 2 | total time: 161.53m | eta: 17.9m +step 15037/16704 (90.02%) | loss: 2.449240 | lrm: 0.20 | dt: 643.12ms | tok/sec: 815,227 | mfu: 50.95 | epoch: 2 | total time: 161.54m | eta: 17.9m +step 15038/16704 (90.03%) | loss: 2.456291 | lrm: 0.20 | dt: 643.83ms | tok/sec: 814,324 | mfu: 50.90 | epoch: 2 | total time: 161.55m | eta: 17.9m +step 15039/16704 (90.03%) | loss: 2.473810 | lrm: 0.20 | dt: 643.19ms | tok/sec: 815,132 | mfu: 
50.95 | epoch: 2 | total time: 161.56m | eta: 17.9m +step 15040/16704 (90.04%) | loss: 2.476697 | lrm: 0.20 | dt: 640.91ms | tok/sec: 818,033 | mfu: 51.13 | epoch: 2 | total time: 161.57m | eta: 17.9m +step 15041/16704 (90.04%) | loss: 2.475584 | lrm: 0.20 | dt: 643.90ms | tok/sec: 814,238 | mfu: 50.89 | epoch: 2 | total time: 161.58m | eta: 17.9m +step 15042/16704 (90.05%) | loss: 2.476151 | lrm: 0.20 | dt: 640.62ms | tok/sec: 818,407 | mfu: 51.15 | epoch: 2 | total time: 161.59m | eta: 17.9m +step 15043/16704 (90.06%) | loss: 2.468744 | lrm: 0.20 | dt: 644.00ms | tok/sec: 814,114 | mfu: 50.88 | epoch: 2 | total time: 161.60m | eta: 17.9m +step 15044/16704 (90.06%) | loss: 2.462806 | lrm: 0.20 | dt: 645.87ms | tok/sec: 811,757 | mfu: 50.74 | epoch: 2 | total time: 161.62m | eta: 17.8m +step 15045/16704 (90.07%) | loss: 2.475673 | lrm: 0.20 | dt: 642.46ms | tok/sec: 816,066 | mfu: 51.01 | epoch: 2 | total time: 161.63m | eta: 17.8m +step 15046/16704 (90.07%) | loss: 2.480345 | lrm: 0.20 | dt: 641.42ms | tok/sec: 817,387 | mfu: 51.09 | epoch: 2 | total time: 161.64m | eta: 17.8m +step 15047/16704 (90.08%) | loss: 2.482744 | lrm: 0.20 | dt: 642.82ms | tok/sec: 815,611 | mfu: 50.98 | epoch: 2 | total time: 161.65m | eta: 17.8m +step 15048/16704 (90.09%) | loss: 2.487750 | lrm: 0.20 | dt: 641.91ms | tok/sec: 816,759 | mfu: 51.05 | epoch: 2 | total time: 161.66m | eta: 17.8m +step 15049/16704 (90.09%) | loss: 2.499752 | lrm: 0.20 | dt: 643.14ms | tok/sec: 815,204 | mfu: 50.95 | epoch: 2 | total time: 161.67m | eta: 17.8m +step 15050/16704 (90.10%) | loss: 2.487344 | lrm: 0.20 | dt: 643.53ms | tok/sec: 814,704 | mfu: 50.92 | epoch: 2 | total time: 161.68m | eta: 17.8m +step 15051/16704 (90.10%) | loss: 2.481061 | lrm: 0.20 | dt: 645.80ms | tok/sec: 811,847 | mfu: 50.74 | epoch: 2 | total time: 161.69m | eta: 17.8m +step 15052/16704 (90.11%) | loss: 2.479610 | lrm: 0.20 | dt: 644.12ms | tok/sec: 813,964 | mfu: 50.87 | epoch: 2 | total time: 161.70m | eta: 17.8m +step 
15053/16704 (90.12%) | loss: 2.459937 | lrm: 0.20 | dt: 642.28ms | tok/sec: 816,298 | mfu: 51.02 | epoch: 2 | total time: 161.71m | eta: 17.7m +step 15054/16704 (90.12%) | loss: 2.449562 | lrm: 0.20 | dt: 642.35ms | tok/sec: 816,206 | mfu: 51.01 | epoch: 2 | total time: 161.72m | eta: 17.7m +step 15055/16704 (90.13%) | loss: 2.460606 | lrm: 0.20 | dt: 641.92ms | tok/sec: 816,749 | mfu: 51.05 | epoch: 2 | total time: 161.73m | eta: 17.7m +step 15056/16704 (90.13%) | loss: 2.446540 | lrm: 0.20 | dt: 645.45ms | tok/sec: 812,283 | mfu: 50.77 | epoch: 2 | total time: 161.74m | eta: 17.7m +step 15057/16704 (90.14%) | loss: 2.444327 | lrm: 0.20 | dt: 642.67ms | tok/sec: 815,799 | mfu: 50.99 | epoch: 2 | total time: 161.75m | eta: 17.7m +step 15058/16704 (90.15%) | loss: 2.443118 | lrm: 0.20 | dt: 645.90ms | tok/sec: 811,712 | mfu: 50.73 | epoch: 2 | total time: 161.77m | eta: 17.7m +step 15059/16704 (90.15%) | loss: 2.463338 | lrm: 0.20 | dt: 644.84ms | tok/sec: 813,050 | mfu: 50.82 | epoch: 2 | total time: 161.78m | eta: 17.7m +step 15060/16704 (90.16%) | loss: 2.472009 | lrm: 0.20 | dt: 643.85ms | tok/sec: 814,305 | mfu: 50.90 | epoch: 2 | total time: 161.79m | eta: 17.7m +step 15061/16704 (90.16%) | loss: 2.471064 | lrm: 0.20 | dt: 643.32ms | tok/sec: 814,969 | mfu: 50.94 | epoch: 2 | total time: 161.80m | eta: 17.7m +step 15062/16704 (90.17%) | loss: 2.481610 | lrm: 0.20 | dt: 645.68ms | tok/sec: 811,991 | mfu: 50.75 | epoch: 2 | total time: 161.81m | eta: 17.7m +step 15063/16704 (90.18%) | loss: 2.485555 | lrm: 0.20 | dt: 643.43ms | tok/sec: 814,837 | mfu: 50.93 | epoch: 2 | total time: 161.82m | eta: 17.6m +step 15064/16704 (90.18%) | loss: 2.490655 | lrm: 0.20 | dt: 643.15ms | tok/sec: 815,192 | mfu: 50.95 | epoch: 2 | total time: 161.83m | eta: 17.6m +step 15065/16704 (90.19%) | loss: 2.491117 | lrm: 0.20 | dt: 643.02ms | tok/sec: 815,352 | mfu: 50.96 | epoch: 2 | total time: 161.84m | eta: 17.6m +step 15066/16704 (90.19%) | loss: 2.489675 | lrm: 0.20 | dt: 
644.43ms | tok/sec: 813,572 | mfu: 50.85 | epoch: 2 | total time: 161.85m | eta: 17.6m +step 15067/16704 (90.20%) | loss: 2.474459 | lrm: 0.20 | dt: 644.48ms | tok/sec: 813,502 | mfu: 50.85 | epoch: 2 | total time: 161.86m | eta: 17.6m +step 15068/16704 (90.21%) | loss: 2.474320 | lrm: 0.20 | dt: 642.98ms | tok/sec: 815,397 | mfu: 50.96 | epoch: 2 | total time: 161.87m | eta: 17.6m +step 15069/16704 (90.21%) | loss: 2.468007 | lrm: 0.20 | dt: 644.54ms | tok/sec: 813,428 | mfu: 50.84 | epoch: 2 | total time: 161.88m | eta: 17.6m +step 15070/16704 (90.22%) | loss: 2.459464 | lrm: 0.20 | dt: 644.32ms | tok/sec: 813,708 | mfu: 50.86 | epoch: 2 | total time: 161.89m | eta: 17.6m +step 15071/16704 (90.22%) | loss: 2.471331 | lrm: 0.20 | dt: 643.25ms | tok/sec: 815,063 | mfu: 50.94 | epoch: 2 | total time: 161.91m | eta: 17.6m +step 15072/16704 (90.23%) | loss: 2.480238 | lrm: 0.20 | dt: 645.71ms | tok/sec: 811,949 | mfu: 50.75 | epoch: 2 | total time: 161.92m | eta: 17.5m +step 15073/16704 (90.24%) | loss: 2.474086 | lrm: 0.20 | dt: 639.69ms | tok/sec: 819,602 | mfu: 51.23 | epoch: 2 | total time: 161.93m | eta: 17.5m +step 15074/16704 (90.24%) | loss: 2.475278 | lrm: 0.20 | dt: 644.42ms | tok/sec: 813,584 | mfu: 50.85 | epoch: 2 | total time: 161.94m | eta: 17.5m +step 15075/16704 (90.25%) | loss: 2.472610 | lrm: 0.20 | dt: 643.82ms | tok/sec: 814,342 | mfu: 50.90 | epoch: 2 | total time: 161.95m | eta: 17.5m +step 15076/16704 (90.25%) | loss: 2.460034 | lrm: 0.19 | dt: 642.53ms | tok/sec: 815,977 | mfu: 51.00 | epoch: 2 | total time: 161.96m | eta: 17.5m +step 15077/16704 (90.26%) | loss: 2.459503 | lrm: 0.19 | dt: 644.89ms | tok/sec: 812,990 | mfu: 50.81 | epoch: 2 | total time: 161.97m | eta: 17.5m +step 15078/16704 (90.27%) | loss: 2.455572 | lrm: 0.19 | dt: 642.82ms | tok/sec: 815,611 | mfu: 50.98 | epoch: 2 | total time: 161.98m | eta: 17.5m +step 15079/16704 (90.27%) | loss: 2.476061 | lrm: 0.19 | dt: 645.65ms | tok/sec: 812,025 | mfu: 50.75 | epoch: 2 | total 
time: 161.99m | eta: 17.5m +step 15080/16704 (90.28%) | loss: 2.469339 | lrm: 0.19 | dt: 643.60ms | tok/sec: 814,621 | mfu: 50.91 | epoch: 2 | total time: 162.00m | eta: 17.5m +step 15081/16704 (90.28%) | loss: 2.472924 | lrm: 0.19 | dt: 643.02ms | tok/sec: 815,349 | mfu: 50.96 | epoch: 2 | total time: 162.01m | eta: 17.4m +step 15082/16704 (90.29%) | loss: 2.472073 | lrm: 0.19 | dt: 643.05ms | tok/sec: 815,316 | mfu: 50.96 | epoch: 2 | total time: 162.02m | eta: 17.4m +step 15083/16704 (90.30%) | loss: 2.478488 | lrm: 0.19 | dt: 643.64ms | tok/sec: 814,573 | mfu: 50.91 | epoch: 2 | total time: 162.03m | eta: 17.4m +step 15084/16704 (90.30%) | loss: 2.476759 | lrm: 0.19 | dt: 642.06ms | tok/sec: 816,570 | mfu: 51.04 | epoch: 2 | total time: 162.04m | eta: 17.4m +step 15085/16704 (90.31%) | loss: 2.475086 | lrm: 0.19 | dt: 645.89ms | tok/sec: 811,725 | mfu: 50.73 | epoch: 2 | total time: 162.06m | eta: 17.4m +step 15086/16704 (90.31%) | loss: 2.472928 | lrm: 0.19 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 2 | total time: 162.07m | eta: 17.4m +step 15087/16704 (90.32%) | loss: 2.465878 | lrm: 0.19 | dt: 643.20ms | tok/sec: 815,124 | mfu: 50.95 | epoch: 2 | total time: 162.08m | eta: 17.4m +step 15088/16704 (90.33%) | loss: 2.455662 | lrm: 0.19 | dt: 643.68ms | tok/sec: 814,519 | mfu: 50.91 | epoch: 2 | total time: 162.09m | eta: 17.4m +step 15089/16704 (90.33%) | loss: 2.453921 | lrm: 0.19 | dt: 642.57ms | tok/sec: 815,923 | mfu: 51.00 | epoch: 2 | total time: 162.10m | eta: 17.4m +step 15090/16704 (90.34%) | loss: 2.462893 | lrm: 0.19 | dt: 644.32ms | tok/sec: 813,708 | mfu: 50.86 | epoch: 2 | total time: 162.11m | eta: 17.4m +step 15091/16704 (90.34%) | loss: 2.460175 | lrm: 0.19 | dt: 644.38ms | tok/sec: 813,635 | mfu: 50.85 | epoch: 2 | total time: 162.12m | eta: 17.3m +step 15092/16704 (90.35%) | loss: 2.447070 | lrm: 0.19 | dt: 643.21ms | tok/sec: 815,117 | mfu: 50.95 | epoch: 2 | total time: 162.13m | eta: 17.3m +step 15093/16704 (90.36%) | loss: 
2.452117 | lrm: 0.19 | dt: 644.59ms | tok/sec: 813,364 | mfu: 50.84 | epoch: 2 | total time: 162.14m | eta: 17.3m +step 15094/16704 (90.36%) | loss: 2.447675 | lrm: 0.19 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 2 | total time: 162.15m | eta: 17.3m +step 15095/16704 (90.37%) | loss: 2.447912 | lrm: 0.19 | dt: 641.45ms | tok/sec: 817,353 | mfu: 51.09 | epoch: 2 | total time: 162.16m | eta: 17.3m +step 15096/16704 (90.37%) | loss: 2.467548 | lrm: 0.19 | dt: 643.48ms | tok/sec: 814,775 | mfu: 50.92 | epoch: 2 | total time: 162.17m | eta: 17.3m +step 15097/16704 (90.38%) | loss: 2.469283 | lrm: 0.19 | dt: 642.07ms | tok/sec: 816,565 | mfu: 51.04 | epoch: 2 | total time: 162.18m | eta: 17.3m +step 15098/16704 (90.39%) | loss: 2.473251 | lrm: 0.19 | dt: 642.52ms | tok/sec: 815,992 | mfu: 51.00 | epoch: 2 | total time: 162.19m | eta: 17.3m +step 15099/16704 (90.39%) | loss: 2.459159 | lrm: 0.19 | dt: 643.24ms | tok/sec: 815,071 | mfu: 50.94 | epoch: 2 | total time: 162.21m | eta: 17.3m +step 15100/16704 (90.40%) | loss: 2.456228 | lrm: 0.19 | dt: 643.39ms | tok/sec: 814,887 | mfu: 50.93 | epoch: 2 | total time: 162.22m | eta: 17.2m +step 15101/16704 (90.40%) | loss: 2.463383 | lrm: 0.19 | dt: 645.67ms | tok/sec: 812,002 | mfu: 50.75 | epoch: 2 | total time: 162.23m | eta: 17.2m +step 15102/16704 (90.41%) | loss: 2.469797 | lrm: 0.19 | dt: 641.69ms | tok/sec: 817,048 | mfu: 51.07 | epoch: 2 | total time: 162.24m | eta: 17.2m +step 15103/16704 (90.42%) | loss: 2.467031 | lrm: 0.19 | dt: 643.44ms | tok/sec: 814,824 | mfu: 50.93 | epoch: 2 | total time: 162.25m | eta: 17.2m +step 15104/16704 (90.42%) | loss: 2.471614 | lrm: 0.19 | dt: 644.77ms | tok/sec: 813,141 | mfu: 50.82 | epoch: 2 | total time: 162.26m | eta: 17.2m +step 15105/16704 (90.43%) | loss: 2.477567 | lrm: 0.19 | dt: 644.18ms | tok/sec: 813,889 | mfu: 50.87 | epoch: 2 | total time: 162.27m | eta: 17.2m +step 15106/16704 (90.43%) | loss: 2.476617 | lrm: 0.19 | dt: 643.64ms | tok/sec: 814,568 | mfu: 
50.91 | epoch: 2 | total time: 162.28m | eta: 17.2m +step 15107/16704 (90.44%) | loss: 2.467504 | lrm: 0.19 | dt: 641.37ms | tok/sec: 817,455 | mfu: 51.09 | epoch: 2 | total time: 162.29m | eta: 17.2m +step 15108/16704 (90.45%) | loss: 2.458980 | lrm: 0.19 | dt: 642.37ms | tok/sec: 816,171 | mfu: 51.01 | epoch: 2 | total time: 162.30m | eta: 17.2m +step 15109/16704 (90.45%) | loss: 2.469941 | lrm: 0.19 | dt: 643.77ms | tok/sec: 814,405 | mfu: 50.90 | epoch: 2 | total time: 162.31m | eta: 17.1m +step 15110/16704 (90.46%) | loss: 2.472228 | lrm: 0.19 | dt: 643.94ms | tok/sec: 814,187 | mfu: 50.89 | epoch: 2 | total time: 162.32m | eta: 17.1m +step 15111/16704 (90.46%) | loss: 2.475439 | lrm: 0.19 | dt: 645.55ms | tok/sec: 812,163 | mfu: 50.76 | epoch: 2 | total time: 162.33m | eta: 17.1m +step 15112/16704 (90.47%) | loss: 2.488920 | lrm: 0.19 | dt: 641.68ms | tok/sec: 817,056 | mfu: 51.07 | epoch: 2 | total time: 162.34m | eta: 17.1m +step 15113/16704 (90.48%) | loss: 2.483733 | lrm: 0.19 | dt: 643.18ms | tok/sec: 815,152 | mfu: 50.95 | epoch: 2 | total time: 162.36m | eta: 17.1m +step 15114/16704 (90.48%) | loss: 2.465729 | lrm: 0.19 | dt: 644.04ms | tok/sec: 814,066 | mfu: 50.88 | epoch: 2 | total time: 162.37m | eta: 17.1m +step 15115/16704 (90.49%) | loss: 2.471687 | lrm: 0.19 | dt: 641.96ms | tok/sec: 816,702 | mfu: 51.05 | epoch: 2 | total time: 162.38m | eta: 17.1m +step 15116/16704 (90.49%) | loss: 2.454929 | lrm: 0.19 | dt: 644.86ms | tok/sec: 813,029 | mfu: 50.82 | epoch: 2 | total time: 162.39m | eta: 17.1m +step 15117/16704 (90.50%) | loss: 2.456303 | lrm: 0.19 | dt: 642.85ms | tok/sec: 815,568 | mfu: 50.97 | epoch: 2 | total time: 162.40m | eta: 17.1m +step 15118/16704 (90.51%) | loss: 2.452747 | lrm: 0.19 | dt: 642.16ms | tok/sec: 816,446 | mfu: 51.03 | epoch: 2 | total time: 162.41m | eta: 17.0m +step 15119/16704 (90.51%) | loss: 2.465851 | lrm: 0.19 | dt: 645.34ms | tok/sec: 812,417 | mfu: 50.78 | epoch: 2 | total time: 162.42m | eta: 17.0m +step 
15120/16704 (90.52%) | loss: 2.467908 | lrm: 0.19 | dt: 643.08ms | tok/sec: 815,271 | mfu: 50.96 | epoch: 2 | total time: 162.43m | eta: 17.0m +step 15121/16704 (90.52%) | loss: 2.465736 | lrm: 0.19 | dt: 642.48ms | tok/sec: 816,035 | mfu: 51.00 | epoch: 2 | total time: 162.44m | eta: 17.0m +step 15122/16704 (90.53%) | loss: 2.462038 | lrm: 0.19 | dt: 643.85ms | tok/sec: 814,306 | mfu: 50.90 | epoch: 2 | total time: 162.45m | eta: 17.0m +step 15123/16704 (90.54%) | loss: 2.458458 | lrm: 0.19 | dt: 643.37ms | tok/sec: 814,913 | mfu: 50.93 | epoch: 2 | total time: 162.46m | eta: 17.0m +step 15124/16704 (90.54%) | loss: 2.449171 | lrm: 0.19 | dt: 641.68ms | tok/sec: 817,055 | mfu: 51.07 | epoch: 2 | total time: 162.47m | eta: 17.0m +step 15125/16704 (90.55%) | loss: 2.451659 | lrm: 0.19 | dt: 642.42ms | tok/sec: 816,108 | mfu: 51.01 | epoch: 2 | total time: 162.48m | eta: 17.0m +step 15126/16704 (90.55%) | loss: 2.451422 | lrm: 0.19 | dt: 644.44ms | tok/sec: 813,556 | mfu: 50.85 | epoch: 2 | total time: 162.49m | eta: 17.0m +step 15127/16704 (90.56%) | loss: 2.443677 | lrm: 0.19 | dt: 643.97ms | tok/sec: 814,151 | mfu: 50.89 | epoch: 2 | total time: 162.51m | eta: 17.0m +step 15128/16704 (90.57%) | loss: 2.437022 | lrm: 0.19 | dt: 644.56ms | tok/sec: 813,399 | mfu: 50.84 | epoch: 2 | total time: 162.52m | eta: 16.9m +step 15129/16704 (90.57%) | loss: 2.439052 | lrm: 0.19 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total time: 162.53m | eta: 16.9m +step 15130/16704 (90.58%) | loss: 2.445347 | lrm: 0.19 | dt: 643.43ms | tok/sec: 814,833 | mfu: 50.93 | epoch: 2 | total time: 162.54m | eta: 16.9m +step 15131/16704 (90.58%) | loss: 2.440658 | lrm: 0.19 | dt: 644.75ms | tok/sec: 813,164 | mfu: 50.82 | epoch: 2 | total time: 162.55m | eta: 16.9m +step 15132/16704 (90.59%) | loss: 2.428925 | lrm: 0.19 | dt: 643.21ms | tok/sec: 815,110 | mfu: 50.95 | epoch: 2 | total time: 162.56m | eta: 16.9m +step 15133/16704 (90.60%) | loss: 2.421707 | lrm: 0.19 | dt: 
643.60ms | tok/sec: 814,614 | mfu: 50.91 | epoch: 2 | total time: 162.57m | eta: 16.9m +step 15134/16704 (90.60%) | loss: 2.424339 | lrm: 0.19 | dt: 643.76ms | tok/sec: 814,420 | mfu: 50.90 | epoch: 2 | total time: 162.58m | eta: 16.9m +step 15135/16704 (90.61%) | loss: 2.420691 | lrm: 0.19 | dt: 643.07ms | tok/sec: 815,292 | mfu: 50.96 | epoch: 2 | total time: 162.59m | eta: 16.9m +step 15136/16704 (90.61%) | loss: 2.424322 | lrm: 0.19 | dt: 643.18ms | tok/sec: 815,150 | mfu: 50.95 | epoch: 2 | total time: 162.60m | eta: 16.9m +step 15137/16704 (90.62%) | loss: 2.424814 | lrm: 0.19 | dt: 643.53ms | tok/sec: 814,700 | mfu: 50.92 | epoch: 2 | total time: 162.61m | eta: 16.8m +step 15138/16704 (90.62%) | loss: 2.430592 | lrm: 0.19 | dt: 643.59ms | tok/sec: 814,626 | mfu: 50.92 | epoch: 2 | total time: 162.62m | eta: 16.8m +step 15139/16704 (90.63%) | loss: 2.438365 | lrm: 0.19 | dt: 644.68ms | tok/sec: 813,248 | mfu: 50.83 | epoch: 2 | total time: 162.63m | eta: 16.8m +step 15140/16704 (90.64%) | loss: 2.446108 | lrm: 0.19 | dt: 643.23ms | tok/sec: 815,089 | mfu: 50.94 | epoch: 2 | total time: 162.65m | eta: 16.8m +step 15141/16704 (90.64%) | loss: 2.462319 | lrm: 0.19 | dt: 644.29ms | tok/sec: 813,749 | mfu: 50.86 | epoch: 2 | total time: 162.66m | eta: 16.8m +step 15142/16704 (90.65%) | loss: 2.467570 | lrm: 0.19 | dt: 642.62ms | tok/sec: 815,864 | mfu: 50.99 | epoch: 2 | total time: 162.67m | eta: 16.8m +step 15143/16704 (90.65%) | loss: 2.462477 | lrm: 0.19 | dt: 644.49ms | tok/sec: 813,487 | mfu: 50.84 | epoch: 2 | total time: 162.68m | eta: 16.8m +step 15144/16704 (90.66%) | loss: 2.468209 | lrm: 0.19 | dt: 641.30ms | tok/sec: 817,535 | mfu: 51.10 | epoch: 2 | total time: 162.69m | eta: 16.8m +step 15145/16704 (90.67%) | loss: 2.478024 | lrm: 0.19 | dt: 644.45ms | tok/sec: 813,548 | mfu: 50.85 | epoch: 2 | total time: 162.70m | eta: 16.8m +step 15146/16704 (90.67%) | loss: 2.472332 | lrm: 0.19 | dt: 642.27ms | tok/sec: 816,309 | mfu: 51.02 | epoch: 2 | total 
time: 162.71m | eta: 16.7m +step 15147/16704 (90.68%) | loss: 2.473836 | lrm: 0.19 | dt: 643.28ms | tok/sec: 815,024 | mfu: 50.94 | epoch: 2 | total time: 162.72m | eta: 16.7m +step 15148/16704 (90.68%) | loss: 2.467836 | lrm: 0.19 | dt: 644.08ms | tok/sec: 814,010 | mfu: 50.88 | epoch: 2 | total time: 162.73m | eta: 16.7m +step 15149/16704 (90.69%) | loss: 2.467885 | lrm: 0.19 | dt: 642.06ms | tok/sec: 816,567 | mfu: 51.04 | epoch: 2 | total time: 162.74m | eta: 16.7m +step 15150/16704 (90.70%) | loss: 2.461866 | lrm: 0.19 | dt: 644.84ms | tok/sec: 813,049 | mfu: 50.82 | epoch: 2 | total time: 162.75m | eta: 16.7m +step 15151/16704 (90.70%) | loss: 2.462382 | lrm: 0.19 | dt: 641.25ms | tok/sec: 817,603 | mfu: 51.10 | epoch: 2 | total time: 162.76m | eta: 16.7m +step 15152/16704 (90.71%) | loss: 2.467931 | lrm: 0.19 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 2 | total time: 162.77m | eta: 16.7m +step 15153/16704 (90.71%) | loss: 2.463383 | lrm: 0.19 | dt: 644.12ms | tok/sec: 813,961 | mfu: 50.87 | epoch: 2 | total time: 162.78m | eta: 16.7m +step 15154/16704 (90.72%) | loss: 2.468740 | lrm: 0.19 | dt: 642.82ms | tok/sec: 815,601 | mfu: 50.98 | epoch: 2 | total time: 162.80m | eta: 16.7m +step 15155/16704 (90.73%) | loss: 2.460611 | lrm: 0.19 | dt: 644.98ms | tok/sec: 812,877 | mfu: 50.81 | epoch: 2 | total time: 162.81m | eta: 16.7m +step 15156/16704 (90.73%) | loss: 2.468328 | lrm: 0.19 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 2 | total time: 162.82m | eta: 16.6m +step 15157/16704 (90.74%) | loss: 2.479154 | lrm: 0.19 | dt: 645.07ms | tok/sec: 812,763 | mfu: 50.80 | epoch: 2 | total time: 162.83m | eta: 16.6m +step 15158/16704 (90.74%) | loss: 2.489002 | lrm: 0.19 | dt: 641.22ms | tok/sec: 817,638 | mfu: 51.10 | epoch: 2 | total time: 162.84m | eta: 16.6m +step 15159/16704 (90.75%) | loss: 2.492880 | lrm: 0.18 | dt: 644.47ms | tok/sec: 813,518 | mfu: 50.85 | epoch: 2 | total time: 162.85m | eta: 16.6m +step 15160/16704 (90.76%) | loss: 
2.496705 | lrm: 0.18 | dt: 644.05ms | tok/sec: 814,052 | mfu: 50.88 | epoch: 2 | total time: 162.86m | eta: 16.6m +step 15161/16704 (90.76%) | loss: 2.489977 | lrm: 0.18 | dt: 642.18ms | tok/sec: 816,423 | mfu: 51.03 | epoch: 2 | total time: 162.87m | eta: 16.6m +step 15162/16704 (90.77%) | loss: 2.474612 | lrm: 0.18 | dt: 645.16ms | tok/sec: 812,648 | mfu: 50.79 | epoch: 2 | total time: 162.88m | eta: 16.6m +step 15163/16704 (90.77%) | loss: 2.468778 | lrm: 0.18 | dt: 644.73ms | tok/sec: 813,190 | mfu: 50.83 | epoch: 2 | total time: 162.89m | eta: 16.6m +step 15164/16704 (90.78%) | loss: 2.467617 | lrm: 0.18 | dt: 643.76ms | tok/sec: 814,413 | mfu: 50.90 | epoch: 2 | total time: 162.90m | eta: 16.6m +step 15165/16704 (90.79%) | loss: 2.472060 | lrm: 0.18 | dt: 644.69ms | tok/sec: 813,237 | mfu: 50.83 | epoch: 2 | total time: 162.91m | eta: 16.5m +step 15166/16704 (90.79%) | loss: 2.465529 | lrm: 0.18 | dt: 648.02ms | tok/sec: 809,056 | mfu: 50.57 | epoch: 2 | total time: 162.92m | eta: 16.5m +step 15167/16704 (90.80%) | loss: 2.471694 | lrm: 0.18 | dt: 641.80ms | tok/sec: 816,902 | mfu: 51.06 | epoch: 2 | total time: 162.93m | eta: 16.5m +step 15168/16704 (90.80%) | loss: 2.473268 | lrm: 0.18 | dt: 642.89ms | tok/sec: 815,512 | mfu: 50.97 | epoch: 2 | total time: 162.95m | eta: 16.5m +step 15169/16704 (90.81%) | loss: 2.477159 | lrm: 0.18 | dt: 643.47ms | tok/sec: 814,780 | mfu: 50.92 | epoch: 2 | total time: 162.96m | eta: 16.5m +step 15170/16704 (90.82%) | loss: 2.467601 | lrm: 0.18 | dt: 643.13ms | tok/sec: 815,213 | mfu: 50.95 | epoch: 2 | total time: 162.97m | eta: 16.5m +step 15171/16704 (90.82%) | loss: 2.473713 | lrm: 0.18 | dt: 643.66ms | tok/sec: 814,545 | mfu: 50.91 | epoch: 2 | total time: 162.98m | eta: 16.5m +step 15172/16704 (90.83%) | loss: 2.483215 | lrm: 0.18 | dt: 641.29ms | tok/sec: 817,558 | mfu: 51.10 | epoch: 2 | total time: 162.99m | eta: 16.5m +step 15173/16704 (90.83%) | loss: 2.477169 | lrm: 0.18 | dt: 643.41ms | tok/sec: 814,861 | mfu: 
50.93 | epoch: 2 | total time: 163.00m | eta: 16.5m +step 15174/16704 (90.84%) | loss: 2.474983 | lrm: 0.18 | dt: 643.11ms | tok/sec: 815,239 | mfu: 50.95 | epoch: 2 | total time: 163.01m | eta: 16.4m +step 15175/16704 (90.85%) | loss: 2.476702 | lrm: 0.18 | dt: 643.93ms | tok/sec: 814,202 | mfu: 50.89 | epoch: 2 | total time: 163.02m | eta: 16.4m +step 15176/16704 (90.85%) | loss: 2.484662 | lrm: 0.18 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 2 | total time: 163.03m | eta: 16.4m +step 15177/16704 (90.86%) | loss: 2.490870 | lrm: 0.18 | dt: 643.06ms | tok/sec: 815,307 | mfu: 50.96 | epoch: 2 | total time: 163.04m | eta: 16.4m +step 15178/16704 (90.86%) | loss: 2.488035 | lrm: 0.18 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 2 | total time: 163.05m | eta: 16.4m +step 15179/16704 (90.87%) | loss: 2.495397 | lrm: 0.18 | dt: 644.88ms | tok/sec: 813,002 | mfu: 50.81 | epoch: 2 | total time: 163.06m | eta: 16.4m +step 15180/16704 (90.88%) | loss: 2.498548 | lrm: 0.18 | dt: 645.90ms | tok/sec: 811,716 | mfu: 50.73 | epoch: 2 | total time: 163.07m | eta: 16.4m +step 15181/16704 (90.88%) | loss: 2.480008 | lrm: 0.18 | dt: 646.00ms | tok/sec: 811,597 | mfu: 50.73 | epoch: 2 | total time: 163.09m | eta: 16.4m +step 15182/16704 (90.89%) | loss: 2.471376 | lrm: 0.18 | dt: 643.59ms | tok/sec: 814,630 | mfu: 50.92 | epoch: 2 | total time: 163.10m | eta: 16.4m +step 15183/16704 (90.89%) | loss: 2.472975 | lrm: 0.18 | dt: 642.53ms | tok/sec: 815,978 | mfu: 51.00 | epoch: 2 | total time: 163.11m | eta: 16.4m +step 15184/16704 (90.90%) | loss: 2.473194 | lrm: 0.18 | dt: 642.46ms | tok/sec: 816,061 | mfu: 51.01 | epoch: 2 | total time: 163.12m | eta: 16.3m +step 15185/16704 (90.91%) | loss: 2.458855 | lrm: 0.18 | dt: 644.54ms | tok/sec: 813,435 | mfu: 50.84 | epoch: 2 | total time: 163.13m | eta: 16.3m +step 15186/16704 (90.91%) | loss: 2.446886 | lrm: 0.18 | dt: 643.57ms | tok/sec: 814,657 | mfu: 50.92 | epoch: 2 | total time: 163.14m | eta: 16.3m +step 
15187/16704 (90.92%) | loss: 2.448423 | lrm: 0.18 | dt: 643.47ms | tok/sec: 814,778 | mfu: 50.92 | epoch: 2 | total time: 163.15m | eta: 16.3m +step 15188/16704 (90.92%) | loss: 2.444189 | lrm: 0.18 | dt: 645.09ms | tok/sec: 812,731 | mfu: 50.80 | epoch: 2 | total time: 163.16m | eta: 16.3m +step 15189/16704 (90.93%) | loss: 2.457002 | lrm: 0.18 | dt: 643.20ms | tok/sec: 815,119 | mfu: 50.95 | epoch: 2 | total time: 163.17m | eta: 16.3m +step 15190/16704 (90.94%) | loss: 2.469466 | lrm: 0.18 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 2 | total time: 163.18m | eta: 16.3m +step 15191/16704 (90.94%) | loss: 2.471946 | lrm: 0.18 | dt: 642.44ms | tok/sec: 816,094 | mfu: 51.01 | epoch: 2 | total time: 163.19m | eta: 16.3m +step 15192/16704 (90.95%) | loss: 2.466916 | lrm: 0.18 | dt: 641.98ms | tok/sec: 816,671 | mfu: 51.04 | epoch: 2 | total time: 163.20m | eta: 16.3m +step 15193/16704 (90.95%) | loss: 2.471637 | lrm: 0.18 | dt: 642.24ms | tok/sec: 816,338 | mfu: 51.02 | epoch: 2 | total time: 163.21m | eta: 16.2m +step 15194/16704 (90.96%) | loss: 2.460028 | lrm: 0.18 | dt: 646.85ms | tok/sec: 810,526 | mfu: 50.66 | epoch: 2 | total time: 163.22m | eta: 16.2m +step 15195/16704 (90.97%) | loss: 2.456371 | lrm: 0.18 | dt: 643.01ms | tok/sec: 815,359 | mfu: 50.96 | epoch: 2 | total time: 163.24m | eta: 16.2m +step 15196/16704 (90.97%) | loss: 2.453247 | lrm: 0.18 | dt: 643.54ms | tok/sec: 814,698 | mfu: 50.92 | epoch: 2 | total time: 163.25m | eta: 16.2m +step 15197/16704 (90.98%) | loss: 2.449010 | lrm: 0.18 | dt: 644.17ms | tok/sec: 813,901 | mfu: 50.87 | epoch: 2 | total time: 163.26m | eta: 16.2m +step 15198/16704 (90.98%) | loss: 2.455565 | lrm: 0.18 | dt: 644.01ms | tok/sec: 814,095 | mfu: 50.88 | epoch: 2 | total time: 163.27m | eta: 16.2m +step 15199/16704 (90.99%) | loss: 2.450513 | lrm: 0.18 | dt: 645.70ms | tok/sec: 811,972 | mfu: 50.75 | epoch: 2 | total time: 163.28m | eta: 16.2m +step 15200/16704 (91.00%) | loss: 2.442733 | lrm: 0.18 | dt: 
644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 2 | total time: 163.29m | eta: 16.2m +step 15201/16704 (91.00%) | loss: 2.451566 | lrm: 0.18 | dt: 643.61ms | tok/sec: 814,604 | mfu: 50.91 | epoch: 2 | total time: 163.30m | eta: 16.2m +step 15202/16704 (91.01%) | loss: 2.449001 | lrm: 0.18 | dt: 646.78ms | tok/sec: 810,607 | mfu: 50.66 | epoch: 2 | total time: 163.31m | eta: 16.1m +step 15203/16704 (91.01%) | loss: 2.449542 | lrm: 0.18 | dt: 642.91ms | tok/sec: 815,496 | mfu: 50.97 | epoch: 2 | total time: 163.32m | eta: 16.1m +step 15204/16704 (91.02%) | loss: 2.459715 | lrm: 0.18 | dt: 643.78ms | tok/sec: 814,390 | mfu: 50.90 | epoch: 2 | total time: 163.33m | eta: 16.1m +step 15205/16704 (91.03%) | loss: 2.471893 | lrm: 0.18 | dt: 645.93ms | tok/sec: 811,678 | mfu: 50.73 | epoch: 2 | total time: 163.34m | eta: 16.1m +step 15206/16704 (91.03%) | loss: 2.474996 | lrm: 0.18 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 2 | total time: 163.35m | eta: 16.1m +step 15207/16704 (91.04%) | loss: 2.484401 | lrm: 0.18 | dt: 645.47ms | tok/sec: 812,255 | mfu: 50.77 | epoch: 2 | total time: 163.36m | eta: 16.1m +step 15208/16704 (91.04%) | loss: 2.482778 | lrm: 0.18 | dt: 641.96ms | tok/sec: 816,693 | mfu: 51.04 | epoch: 2 | total time: 163.37m | eta: 16.1m +step 15209/16704 (91.05%) | loss: 2.479628 | lrm: 0.18 | dt: 643.80ms | tok/sec: 814,367 | mfu: 50.90 | epoch: 2 | total time: 163.39m | eta: 16.1m +step 15210/16704 (91.06%) | loss: 2.474829 | lrm: 0.18 | dt: 641.81ms | tok/sec: 816,891 | mfu: 51.06 | epoch: 2 | total time: 163.40m | eta: 16.1m +step 15211/16704 (91.06%) | loss: 2.452934 | lrm: 0.18 | dt: 642.89ms | tok/sec: 815,518 | mfu: 50.97 | epoch: 2 | total time: 163.41m | eta: 16.0m +step 15212/16704 (91.07%) | loss: 2.441770 | lrm: 0.18 | dt: 645.37ms | tok/sec: 812,383 | mfu: 50.78 | epoch: 2 | total time: 163.42m | eta: 16.0m +step 15213/16704 (91.07%) | loss: 2.444232 | lrm: 0.18 | dt: 643.85ms | tok/sec: 814,297 | mfu: 50.89 | epoch: 2 | total 
time: 163.43m | eta: 16.0m +step 15214/16704 (91.08%) | loss: 2.442883 | lrm: 0.18 | dt: 641.44ms | tok/sec: 817,354 | mfu: 51.09 | epoch: 2 | total time: 163.44m | eta: 16.0m +step 15215/16704 (91.09%) | loss: 2.427964 | lrm: 0.18 | dt: 644.81ms | tok/sec: 813,090 | mfu: 50.82 | epoch: 2 | total time: 163.45m | eta: 16.0m +step 15216/16704 (91.09%) | loss: 2.433034 | lrm: 0.18 | dt: 643.21ms | tok/sec: 815,105 | mfu: 50.95 | epoch: 2 | total time: 163.46m | eta: 16.0m +step 15217/16704 (91.10%) | loss: 2.424693 | lrm: 0.18 | dt: 641.42ms | tok/sec: 817,392 | mfu: 51.09 | epoch: 2 | total time: 163.47m | eta: 16.0m +step 15218/16704 (91.10%) | loss: 2.429671 | lrm: 0.18 | dt: 645.09ms | tok/sec: 812,737 | mfu: 50.80 | epoch: 2 | total time: 163.48m | eta: 16.0m +step 15219/16704 (91.11%) | loss: 2.432246 | lrm: 0.18 | dt: 642.14ms | tok/sec: 816,472 | mfu: 51.03 | epoch: 2 | total time: 163.49m | eta: 16.0m +step 15220/16704 (91.12%) | loss: 2.431248 | lrm: 0.18 | dt: 644.15ms | tok/sec: 813,928 | mfu: 50.87 | epoch: 2 | total time: 163.50m | eta: 16.0m +step 15221/16704 (91.12%) | loss: 2.430664 | lrm: 0.18 | dt: 644.38ms | tok/sec: 813,632 | mfu: 50.85 | epoch: 2 | total time: 163.51m | eta: 15.9m +step 15222/16704 (91.13%) | loss: 2.420570 | lrm: 0.18 | dt: 643.43ms | tok/sec: 814,829 | mfu: 50.93 | epoch: 2 | total time: 163.53m | eta: 15.9m +step 15223/16704 (91.13%) | loss: 2.429262 | lrm: 0.18 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 2 | total time: 163.54m | eta: 15.9m +step 15224/16704 (91.14%) | loss: 2.440434 | lrm: 0.18 | dt: 642.15ms | tok/sec: 816,462 | mfu: 51.03 | epoch: 2 | total time: 163.55m | eta: 15.9m +step 15225/16704 (91.15%) | loss: 2.452940 | lrm: 0.18 | dt: 644.47ms | tok/sec: 813,514 | mfu: 50.85 | epoch: 2 | total time: 163.56m | eta: 15.9m +step 15226/16704 (91.15%) | loss: 2.448619 | lrm: 0.18 | dt: 643.74ms | tok/sec: 814,439 | mfu: 50.90 | epoch: 2 | total time: 163.57m | eta: 15.9m +step 15227/16704 (91.16%) | loss: 
2.458252 | lrm: 0.18 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 2 | total time: 163.58m | eta: 15.9m +step 15228/16704 (91.16%) | loss: 2.456915 | lrm: 0.18 | dt: 645.81ms | tok/sec: 811,832 | mfu: 50.74 | epoch: 2 | total time: 163.59m | eta: 15.9m +step 15229/16704 (91.17%) | loss: 2.464797 | lrm: 0.18 | dt: 642.98ms | tok/sec: 815,407 | mfu: 50.96 | epoch: 2 | total time: 163.60m | eta: 15.9m +step 15230/16704 (91.18%) | loss: 2.461576 | lrm: 0.18 | dt: 644.56ms | tok/sec: 813,406 | mfu: 50.84 | epoch: 2 | total time: 163.61m | eta: 15.8m +step 15231/16704 (91.18%) | loss: 2.462880 | lrm: 0.18 | dt: 643.54ms | tok/sec: 814,695 | mfu: 50.92 | epoch: 2 | total time: 163.62m | eta: 15.8m +step 15232/16704 (91.19%) | loss: 2.461129 | lrm: 0.18 | dt: 642.80ms | tok/sec: 815,628 | mfu: 50.98 | epoch: 2 | total time: 163.63m | eta: 15.8m +step 15233/16704 (91.19%) | loss: 2.463563 | lrm: 0.18 | dt: 643.06ms | tok/sec: 815,303 | mfu: 50.96 | epoch: 2 | total time: 163.64m | eta: 15.8m +step 15234/16704 (91.20%) | loss: 2.467655 | lrm: 0.18 | dt: 644.07ms | tok/sec: 814,025 | mfu: 50.88 | epoch: 2 | total time: 163.65m | eta: 15.8m +step 15235/16704 (91.21%) | loss: 2.468278 | lrm: 0.18 | dt: 641.82ms | tok/sec: 816,881 | mfu: 51.06 | epoch: 2 | total time: 163.66m | eta: 15.8m +step 15236/16704 (91.21%) | loss: 2.483531 | lrm: 0.18 | dt: 644.62ms | tok/sec: 813,324 | mfu: 50.83 | epoch: 2 | total time: 163.68m | eta: 15.8m +step 15237/16704 (91.22%) | loss: 2.484126 | lrm: 0.18 | dt: 642.65ms | tok/sec: 815,817 | mfu: 50.99 | epoch: 2 | total time: 163.69m | eta: 15.8m +step 15238/16704 (91.22%) | loss: 2.489119 | lrm: 0.18 | dt: 642.07ms | tok/sec: 816,560 | mfu: 51.04 | epoch: 2 | total time: 163.70m | eta: 15.8m +step 15239/16704 (91.23%) | loss: 2.484703 | lrm: 0.18 | dt: 644.59ms | tok/sec: 813,370 | mfu: 50.84 | epoch: 2 | total time: 163.71m | eta: 15.7m +step 15240/16704 (91.24%) | loss: 2.477365 | lrm: 0.18 | dt: 643.79ms | tok/sec: 814,376 | mfu: 
50.90 | epoch: 2 | total time: 163.72m | eta: 15.7m +step 15241/16704 (91.24%) | loss: 2.472294 | lrm: 0.18 | dt: 643.28ms | tok/sec: 815,023 | mfu: 50.94 | epoch: 2 | total time: 163.73m | eta: 15.7m +step 15242/16704 (91.25%) | loss: 2.467163 | lrm: 0.18 | dt: 644.54ms | tok/sec: 813,428 | mfu: 50.84 | epoch: 2 | total time: 163.74m | eta: 15.7m +step 15243/16704 (91.25%) | loss: 2.470003 | lrm: 0.17 | dt: 643.47ms | tok/sec: 814,782 | mfu: 50.93 | epoch: 2 | total time: 163.75m | eta: 15.7m +step 15244/16704 (91.26%) | loss: 2.477959 | lrm: 0.17 | dt: 647.00ms | tok/sec: 810,340 | mfu: 50.65 | epoch: 2 | total time: 163.76m | eta: 15.7m +step 15245/16704 (91.27%) | loss: 2.487456 | lrm: 0.17 | dt: 643.24ms | tok/sec: 815,068 | mfu: 50.94 | epoch: 2 | total time: 163.77m | eta: 15.7m +step 15246/16704 (91.27%) | loss: 2.487602 | lrm: 0.17 | dt: 644.95ms | tok/sec: 812,917 | mfu: 50.81 | epoch: 2 | total time: 163.78m | eta: 15.7m +step 15247/16704 (91.28%) | loss: 2.489925 | lrm: 0.17 | dt: 645.61ms | tok/sec: 812,080 | mfu: 50.76 | epoch: 2 | total time: 163.79m | eta: 15.7m +step 15248/16704 (91.28%) | loss: 2.486405 | lrm: 0.17 | dt: 643.71ms | tok/sec: 814,480 | mfu: 50.91 | epoch: 2 | total time: 163.80m | eta: 15.7m +step 15249/16704 (91.29%) | loss: 2.488918 | lrm: 0.17 | dt: 642.43ms | tok/sec: 816,101 | mfu: 51.01 | epoch: 2 | total time: 163.81m | eta: 15.6m +Step 15250 | Validation bpb: 0.760237 +step 15250/16704 (91.30%) | loss: 2.479469 | lrm: 0.17 | dt: 629.21ms | tok/sec: 833,247 | mfu: 52.08 | epoch: 2 | total time: 163.83m | eta: 15.6m +step 15251/16704 (91.30%) | loss: 2.475702 | lrm: 0.17 | dt: 651.21ms | tok/sec: 805,103 | mfu: 50.32 | epoch: 2 | total time: 163.84m | eta: 15.6m +step 15252/16704 (91.31%) | loss: 2.486807 | lrm: 0.17 | dt: 642.01ms | tok/sec: 816,641 | mfu: 51.04 | epoch: 2 | total time: 163.85m | eta: 15.6m +step 15253/16704 (91.31%) | loss: 2.489208 | lrm: 0.17 | dt: 642.41ms | tok/sec: 816,130 | mfu: 51.01 | epoch: 2 | 
total time: 163.86m | eta: 15.6m +step 15254/16704 (91.32%) | loss: 2.488219 | lrm: 0.17 | dt: 648.79ms | tok/sec: 808,106 | mfu: 50.51 | epoch: 2 | total time: 163.87m | eta: 15.6m +step 15255/16704 (91.33%) | loss: 2.504238 | lrm: 0.17 | dt: 639.13ms | tok/sec: 820,310 | mfu: 51.27 | epoch: 2 | total time: 163.88m | eta: 15.6m +step 15256/16704 (91.33%) | loss: 2.497284 | lrm: 0.17 | dt: 647.36ms | tok/sec: 809,881 | mfu: 50.62 | epoch: 2 | total time: 163.89m | eta: 15.6m +step 15257/16704 (91.34%) | loss: 2.508441 | lrm: 0.17 | dt: 644.89ms | tok/sec: 812,984 | mfu: 50.81 | epoch: 2 | total time: 163.90m | eta: 15.6m +step 15258/16704 (91.34%) | loss: 2.490809 | lrm: 0.17 | dt: 641.29ms | tok/sec: 817,550 | mfu: 51.10 | epoch: 2 | total time: 163.91m | eta: 15.5m +step 15259/16704 (91.35%) | loss: 2.479849 | lrm: 0.17 | dt: 645.89ms | tok/sec: 811,727 | mfu: 50.73 | epoch: 2 | total time: 163.92m | eta: 15.5m +step 15260/16704 (91.36%) | loss: 2.476439 | lrm: 0.17 | dt: 641.24ms | tok/sec: 817,613 | mfu: 51.10 | epoch: 2 | total time: 163.93m | eta: 15.5m +step 15261/16704 (91.36%) | loss: 2.484608 | lrm: 0.17 | dt: 644.25ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 2 | total time: 163.94m | eta: 15.5m +step 15262/16704 (91.37%) | loss: 2.484378 | lrm: 0.17 | dt: 643.52ms | tok/sec: 814,722 | mfu: 50.92 | epoch: 2 | total time: 163.95m | eta: 15.5m +step 15263/16704 (91.37%) | loss: 2.489893 | lrm: 0.17 | dt: 645.22ms | tok/sec: 812,574 | mfu: 50.79 | epoch: 2 | total time: 163.96m | eta: 15.5m +step 15264/16704 (91.38%) | loss: 2.482543 | lrm: 0.17 | dt: 641.40ms | tok/sec: 817,413 | mfu: 51.09 | epoch: 2 | total time: 163.98m | eta: 15.5m +step 15265/16704 (91.39%) | loss: 2.490990 | lrm: 0.17 | dt: 642.65ms | tok/sec: 815,827 | mfu: 50.99 | epoch: 2 | total time: 163.99m | eta: 15.5m +step 15266/16704 (91.39%) | loss: 2.489723 | lrm: 0.17 | dt: 643.67ms | tok/sec: 814,523 | mfu: 50.91 | epoch: 2 | total time: 164.00m | eta: 15.5m +step 15267/16704 (91.40%) | 
loss: 2.483465 | lrm: 0.17 | dt: 642.81ms | tok/sec: 815,619 | mfu: 50.98 | epoch: 2 | total time: 164.01m | eta: 15.4m +step 15268/16704 (91.40%) | loss: 2.507587 | lrm: 0.17 | dt: 644.82ms | tok/sec: 813,074 | mfu: 50.82 | epoch: 2 | total time: 164.02m | eta: 15.4m +step 15269/16704 (91.41%) | loss: 2.495535 | lrm: 0.17 | dt: 644.76ms | tok/sec: 813,155 | mfu: 50.82 | epoch: 2 | total time: 164.03m | eta: 15.4m +step 15270/16704 (91.42%) | loss: 2.499454 | lrm: 0.17 | dt: 641.75ms | tok/sec: 816,969 | mfu: 51.06 | epoch: 2 | total time: 164.04m | eta: 15.4m +step 15271/16704 (91.42%) | loss: 2.486985 | lrm: 0.17 | dt: 645.67ms | tok/sec: 812,005 | mfu: 50.75 | epoch: 2 | total time: 164.05m | eta: 15.4m +step 15272/16704 (91.43%) | loss: 2.491095 | lrm: 0.17 | dt: 644.06ms | tok/sec: 814,040 | mfu: 50.88 | epoch: 2 | total time: 164.06m | eta: 15.4m +step 15273/16704 (91.43%) | loss: 2.492039 | lrm: 0.17 | dt: 642.50ms | tok/sec: 816,011 | mfu: 51.00 | epoch: 2 | total time: 164.07m | eta: 15.4m +step 15274/16704 (91.44%) | loss: 2.485797 | lrm: 0.17 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 2 | total time: 164.08m | eta: 15.4m +step 15275/16704 (91.45%) | loss: 2.486597 | lrm: 0.17 | dt: 643.63ms | tok/sec: 814,577 | mfu: 50.91 | epoch: 2 | total time: 164.09m | eta: 15.4m +step 15276/16704 (91.45%) | loss: 2.472492 | lrm: 0.17 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 2 | total time: 164.10m | eta: 15.4m +step 15277/16704 (91.46%) | loss: 2.469077 | lrm: 0.17 | dt: 643.67ms | tok/sec: 814,525 | mfu: 50.91 | epoch: 2 | total time: 164.11m | eta: 15.3m +step 15278/16704 (91.46%) | loss: 2.467691 | lrm: 0.17 | dt: 643.11ms | tok/sec: 815,236 | mfu: 50.95 | epoch: 2 | total time: 164.13m | eta: 15.3m +step 15279/16704 (91.47%) | loss: 2.462345 | lrm: 0.17 | dt: 644.21ms | tok/sec: 813,848 | mfu: 50.87 | epoch: 2 | total time: 164.14m | eta: 15.3m +step 15280/16704 (91.48%) | loss: 2.463977 | lrm: 0.17 | dt: 643.69ms | tok/sec: 814,510 | 
mfu: 50.91 | epoch: 2 | total time: 164.15m | eta: 15.3m +step 15281/16704 (91.48%) | loss: 2.453564 | lrm: 0.17 | dt: 646.37ms | tok/sec: 811,130 | mfu: 50.70 | epoch: 2 | total time: 164.16m | eta: 15.3m +step 15282/16704 (91.49%) | loss: 2.446630 | lrm: 0.17 | dt: 645.05ms | tok/sec: 812,785 | mfu: 50.80 | epoch: 2 | total time: 164.17m | eta: 15.3m +step 15283/16704 (91.49%) | loss: 2.441314 | lrm: 0.17 | dt: 643.90ms | tok/sec: 814,244 | mfu: 50.89 | epoch: 2 | total time: 164.18m | eta: 15.3m +step 15284/16704 (91.50%) | loss: 2.443893 | lrm: 0.17 | dt: 642.64ms | tok/sec: 815,830 | mfu: 50.99 | epoch: 2 | total time: 164.19m | eta: 15.3m +step 15285/16704 (91.51%) | loss: 2.445651 | lrm: 0.17 | dt: 644.09ms | tok/sec: 813,997 | mfu: 50.88 | epoch: 2 | total time: 164.20m | eta: 15.3m +step 15286/16704 (91.51%) | loss: 2.454163 | lrm: 0.17 | dt: 644.36ms | tok/sec: 813,661 | mfu: 50.86 | epoch: 2 | total time: 164.21m | eta: 15.2m +step 15287/16704 (91.52%) | loss: 2.453943 | lrm: 0.17 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 2 | total time: 164.22m | eta: 15.2m +step 15288/16704 (91.52%) | loss: 2.475955 | lrm: 0.17 | dt: 644.23ms | tok/sec: 813,821 | mfu: 50.86 | epoch: 2 | total time: 164.23m | eta: 15.2m +step 15289/16704 (91.53%) | loss: 2.483367 | lrm: 0.17 | dt: 640.65ms | tok/sec: 818,366 | mfu: 51.15 | epoch: 2 | total time: 164.24m | eta: 15.2m +step 15290/16704 (91.53%) | loss: 2.479446 | lrm: 0.17 | dt: 644.30ms | tok/sec: 813,730 | mfu: 50.86 | epoch: 2 | total time: 164.25m | eta: 15.2m +step 15291/16704 (91.54%) | loss: 2.471942 | lrm: 0.17 | dt: 642.41ms | tok/sec: 816,129 | mfu: 51.01 | epoch: 2 | total time: 164.27m | eta: 15.2m +step 15292/16704 (91.55%) | loss: 2.461598 | lrm: 0.17 | dt: 644.38ms | tok/sec: 813,633 | mfu: 50.85 | epoch: 2 | total time: 164.28m | eta: 15.2m +step 15293/16704 (91.55%) | loss: 2.448955 | lrm: 0.17 | dt: 644.33ms | tok/sec: 813,690 | mfu: 50.86 | epoch: 2 | total time: 164.29m | eta: 15.2m +step 
15294/16704 (91.56%) | loss: 2.445122 | lrm: 0.17 | dt: 643.61ms | tok/sec: 814,606 | mfu: 50.91 | epoch: 2 | total time: 164.30m | eta: 15.2m +step 15295/16704 (91.56%) | loss: 2.446560 | lrm: 0.17 | dt: 644.13ms | tok/sec: 813,941 | mfu: 50.87 | epoch: 2 | total time: 164.31m | eta: 15.1m +step 15296/16704 (91.57%) | loss: 2.433700 | lrm: 0.17 | dt: 643.82ms | tok/sec: 814,335 | mfu: 50.90 | epoch: 2 | total time: 164.32m | eta: 15.1m +step 15297/16704 (91.58%) | loss: 2.433924 | lrm: 0.17 | dt: 644.32ms | tok/sec: 813,711 | mfu: 50.86 | epoch: 2 | total time: 164.33m | eta: 15.1m +step 15298/16704 (91.58%) | loss: 2.437931 | lrm: 0.17 | dt: 644.36ms | tok/sec: 813,651 | mfu: 50.85 | epoch: 2 | total time: 164.34m | eta: 15.1m +step 15299/16704 (91.59%) | loss: 2.426483 | lrm: 0.17 | dt: 644.23ms | tok/sec: 813,818 | mfu: 50.86 | epoch: 2 | total time: 164.35m | eta: 15.1m +step 15300/16704 (91.59%) | loss: 2.413133 | lrm: 0.17 | dt: 644.57ms | tok/sec: 813,388 | mfu: 50.84 | epoch: 2 | total time: 164.36m | eta: 15.1m +step 15301/16704 (91.60%) | loss: 2.416812 | lrm: 0.17 | dt: 644.84ms | tok/sec: 813,049 | mfu: 50.82 | epoch: 2 | total time: 164.37m | eta: 15.1m +step 15302/16704 (91.61%) | loss: 2.426334 | lrm: 0.17 | dt: 645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 2 | total time: 164.38m | eta: 15.1m +step 15303/16704 (91.61%) | loss: 2.409344 | lrm: 0.17 | dt: 640.79ms | tok/sec: 818,188 | mfu: 51.14 | epoch: 2 | total time: 164.39m | eta: 15.1m +step 15304/16704 (91.62%) | loss: 2.409678 | lrm: 0.17 | dt: 643.57ms | tok/sec: 814,649 | mfu: 50.92 | epoch: 2 | total time: 164.40m | eta: 15.0m +step 15305/16704 (91.62%) | loss: 2.421726 | lrm: 0.17 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 2 | total time: 164.42m | eta: 15.0m +step 15306/16704 (91.63%) | loss: 2.432566 | lrm: 0.17 | dt: 645.23ms | tok/sec: 812,557 | mfu: 50.79 | epoch: 2 | total time: 164.43m | eta: 15.0m +step 15307/16704 (91.64%) | loss: 2.435889 | lrm: 0.17 | dt: 
644.44ms | tok/sec: 813,556 | mfu: 50.85 | epoch: 2 | total time: 164.44m | eta: 15.0m +step 15308/16704 (91.64%) | loss: 2.438860 | lrm: 0.17 | dt: 642.69ms | tok/sec: 815,766 | mfu: 50.99 | epoch: 2 | total time: 164.45m | eta: 15.0m +step 15309/16704 (91.65%) | loss: 2.447266 | lrm: 0.17 | dt: 642.57ms | tok/sec: 815,924 | mfu: 51.00 | epoch: 2 | total time: 164.46m | eta: 15.0m +step 15310/16704 (91.65%) | loss: 2.440687 | lrm: 0.17 | dt: 643.82ms | tok/sec: 814,340 | mfu: 50.90 | epoch: 2 | total time: 164.47m | eta: 15.0m +step 15311/16704 (91.66%) | loss: 2.434620 | lrm: 0.17 | dt: 643.15ms | tok/sec: 815,183 | mfu: 50.95 | epoch: 2 | total time: 164.48m | eta: 15.0m +step 15312/16704 (91.67%) | loss: 2.434911 | lrm: 0.17 | dt: 644.61ms | tok/sec: 813,343 | mfu: 50.84 | epoch: 2 | total time: 164.49m | eta: 15.0m +step 15313/16704 (91.67%) | loss: 2.425834 | lrm: 0.17 | dt: 642.95ms | tok/sec: 815,446 | mfu: 50.97 | epoch: 2 | total time: 164.50m | eta: 15.0m +step 15314/16704 (91.68%) | loss: 2.437922 | lrm: 0.17 | dt: 643.52ms | tok/sec: 814,712 | mfu: 50.92 | epoch: 2 | total time: 164.51m | eta: 14.9m +step 15315/16704 (91.68%) | loss: 2.452725 | lrm: 0.17 | dt: 645.42ms | tok/sec: 812,321 | mfu: 50.77 | epoch: 2 | total time: 164.52m | eta: 14.9m +step 15316/16704 (91.69%) | loss: 2.449060 | lrm: 0.17 | dt: 643.87ms | tok/sec: 814,279 | mfu: 50.89 | epoch: 2 | total time: 164.53m | eta: 14.9m +step 15317/16704 (91.70%) | loss: 2.454366 | lrm: 0.17 | dt: 643.96ms | tok/sec: 814,158 | mfu: 50.89 | epoch: 2 | total time: 164.54m | eta: 14.9m +step 15318/16704 (91.70%) | loss: 2.455799 | lrm: 0.17 | dt: 644.00ms | tok/sec: 814,114 | mfu: 50.88 | epoch: 2 | total time: 164.56m | eta: 14.9m +step 15319/16704 (91.71%) | loss: 2.454918 | lrm: 0.17 | dt: 641.04ms | tok/sec: 817,869 | mfu: 51.12 | epoch: 2 | total time: 164.57m | eta: 14.9m +step 15320/16704 (91.71%) | loss: 2.455809 | lrm: 0.17 | dt: 642.78ms | tok/sec: 815,655 | mfu: 50.98 | epoch: 2 | total 
time: 164.58m | eta: 14.9m +step 15321/16704 (91.72%) | loss: 2.459066 | lrm: 0.17 | dt: 644.78ms | tok/sec: 813,130 | mfu: 50.82 | epoch: 2 | total time: 164.59m | eta: 14.9m +step 15322/16704 (91.73%) | loss: 2.460033 | lrm: 0.17 | dt: 642.11ms | tok/sec: 816,502 | mfu: 51.03 | epoch: 2 | total time: 164.60m | eta: 14.9m +step 15323/16704 (91.73%) | loss: 2.455466 | lrm: 0.17 | dt: 644.10ms | tok/sec: 813,989 | mfu: 50.88 | epoch: 2 | total time: 164.61m | eta: 14.8m +step 15324/16704 (91.74%) | loss: 2.453208 | lrm: 0.17 | dt: 643.50ms | tok/sec: 814,738 | mfu: 50.92 | epoch: 2 | total time: 164.62m | eta: 14.8m +step 15325/16704 (91.74%) | loss: 2.453543 | lrm: 0.17 | dt: 642.45ms | tok/sec: 816,080 | mfu: 51.01 | epoch: 2 | total time: 164.63m | eta: 14.8m +step 15326/16704 (91.75%) | loss: 2.448369 | lrm: 0.16 | dt: 645.72ms | tok/sec: 811,947 | mfu: 50.75 | epoch: 2 | total time: 164.64m | eta: 14.8m +step 15327/16704 (91.76%) | loss: 2.454574 | lrm: 0.16 | dt: 642.90ms | tok/sec: 815,503 | mfu: 50.97 | epoch: 2 | total time: 164.65m | eta: 14.8m +step 15328/16704 (91.76%) | loss: 2.456011 | lrm: 0.16 | dt: 643.31ms | tok/sec: 814,987 | mfu: 50.94 | epoch: 2 | total time: 164.66m | eta: 14.8m +step 15329/16704 (91.77%) | loss: 2.461646 | lrm: 0.16 | dt: 645.17ms | tok/sec: 812,637 | mfu: 50.79 | epoch: 2 | total time: 164.67m | eta: 14.8m +step 15330/16704 (91.77%) | loss: 2.472668 | lrm: 0.16 | dt: 643.35ms | tok/sec: 814,935 | mfu: 50.93 | epoch: 2 | total time: 164.68m | eta: 14.8m +step 15331/16704 (91.78%) | loss: 2.471216 | lrm: 0.16 | dt: 643.67ms | tok/sec: 814,532 | mfu: 50.91 | epoch: 2 | total time: 164.69m | eta: 14.8m +step 15332/16704 (91.79%) | loss: 2.472364 | lrm: 0.16 | dt: 643.86ms | tok/sec: 814,285 | mfu: 50.89 | epoch: 2 | total time: 164.71m | eta: 14.7m +step 15333/16704 (91.79%) | loss: 2.477977 | lrm: 0.16 | dt: 645.11ms | tok/sec: 812,715 | mfu: 50.80 | epoch: 2 | total time: 164.72m | eta: 14.7m +step 15334/16704 (91.80%) | loss: 
2.466265 | lrm: 0.16 | dt: 644.07ms | tok/sec: 814,025 | mfu: 50.88 | epoch: 2 | total time: 164.73m | eta: 14.7m +step 15335/16704 (91.80%) | loss: 2.467506 | lrm: 0.16 | dt: 642.10ms | tok/sec: 816,525 | mfu: 51.03 | epoch: 2 | total time: 164.74m | eta: 14.7m +step 15336/16704 (91.81%) | loss: 2.478110 | lrm: 0.16 | dt: 645.91ms | tok/sec: 811,703 | mfu: 50.73 | epoch: 2 | total time: 164.75m | eta: 14.7m +step 15337/16704 (91.82%) | loss: 2.490367 | lrm: 0.16 | dt: 644.42ms | tok/sec: 813,585 | mfu: 50.85 | epoch: 2 | total time: 164.76m | eta: 14.7m +step 15338/16704 (91.82%) | loss: 2.476435 | lrm: 0.16 | dt: 642.91ms | tok/sec: 815,498 | mfu: 50.97 | epoch: 2 | total time: 164.77m | eta: 14.7m +step 15339/16704 (91.83%) | loss: 2.481733 | lrm: 0.16 | dt: 644.17ms | tok/sec: 813,902 | mfu: 50.87 | epoch: 2 | total time: 164.78m | eta: 14.7m +step 15340/16704 (91.83%) | loss: 2.481570 | lrm: 0.16 | dt: 645.07ms | tok/sec: 812,764 | mfu: 50.80 | epoch: 2 | total time: 164.79m | eta: 14.7m +step 15341/16704 (91.84%) | loss: 2.479562 | lrm: 0.16 | dt: 641.87ms | tok/sec: 816,808 | mfu: 51.05 | epoch: 2 | total time: 164.80m | eta: 14.7m +step 15342/16704 (91.85%) | loss: 2.480042 | lrm: 0.16 | dt: 645.15ms | tok/sec: 812,661 | mfu: 50.79 | epoch: 2 | total time: 164.81m | eta: 14.6m +step 15343/16704 (91.85%) | loss: 2.486729 | lrm: 0.16 | dt: 644.00ms | tok/sec: 814,112 | mfu: 50.88 | epoch: 2 | total time: 164.82m | eta: 14.6m +step 15344/16704 (91.86%) | loss: 2.484815 | lrm: 0.16 | dt: 641.74ms | tok/sec: 816,980 | mfu: 51.06 | epoch: 2 | total time: 164.83m | eta: 14.6m +step 15345/16704 (91.86%) | loss: 2.472852 | lrm: 0.16 | dt: 644.95ms | tok/sec: 812,913 | mfu: 50.81 | epoch: 2 | total time: 164.84m | eta: 14.6m +step 15346/16704 (91.87%) | loss: 2.473336 | lrm: 0.16 | dt: 643.17ms | tok/sec: 815,163 | mfu: 50.95 | epoch: 2 | total time: 164.86m | eta: 14.6m +step 15347/16704 (91.88%) | loss: 2.469327 | lrm: 0.16 | dt: 644.14ms | tok/sec: 813,940 | mfu: 
50.87 | epoch: 2 | total time: 164.87m | eta: 14.6m +step 15348/16704 (91.88%) | loss: 2.469355 | lrm: 0.16 | dt: 643.90ms | tok/sec: 814,232 | mfu: 50.89 | epoch: 2 | total time: 164.88m | eta: 14.6m +step 15349/16704 (91.89%) | loss: 2.460445 | lrm: 0.16 | dt: 643.75ms | tok/sec: 814,429 | mfu: 50.90 | epoch: 2 | total time: 164.89m | eta: 14.6m +step 15350/16704 (91.89%) | loss: 2.457366 | lrm: 0.16 | dt: 644.75ms | tok/sec: 813,169 | mfu: 50.82 | epoch: 2 | total time: 164.90m | eta: 14.6m +step 15351/16704 (91.90%) | loss: 2.453238 | lrm: 0.16 | dt: 642.55ms | tok/sec: 815,955 | mfu: 51.00 | epoch: 2 | total time: 164.91m | eta: 14.5m +step 15352/16704 (91.91%) | loss: 2.455133 | lrm: 0.16 | dt: 642.68ms | tok/sec: 815,781 | mfu: 50.99 | epoch: 2 | total time: 164.92m | eta: 14.5m +step 15353/16704 (91.91%) | loss: 2.452008 | lrm: 0.16 | dt: 643.19ms | tok/sec: 815,140 | mfu: 50.95 | epoch: 2 | total time: 164.93m | eta: 14.5m +step 15354/16704 (91.92%) | loss: 2.456380 | lrm: 0.16 | dt: 645.37ms | tok/sec: 812,380 | mfu: 50.77 | epoch: 2 | total time: 164.94m | eta: 14.5m +step 15355/16704 (91.92%) | loss: 2.444239 | lrm: 0.16 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 2 | total time: 164.95m | eta: 14.5m +step 15356/16704 (91.93%) | loss: 2.453605 | lrm: 0.16 | dt: 645.13ms | tok/sec: 812,689 | mfu: 50.79 | epoch: 2 | total time: 164.96m | eta: 14.5m +step 15357/16704 (91.94%) | loss: 2.443979 | lrm: 0.16 | dt: 642.42ms | tok/sec: 816,117 | mfu: 51.01 | epoch: 2 | total time: 164.97m | eta: 14.5m +step 15358/16704 (91.94%) | loss: 2.439304 | lrm: 0.16 | dt: 643.60ms | tok/sec: 814,613 | mfu: 50.91 | epoch: 2 | total time: 164.98m | eta: 14.5m +step 15359/16704 (91.95%) | loss: 2.435402 | lrm: 0.16 | dt: 644.30ms | tok/sec: 813,727 | mfu: 50.86 | epoch: 2 | total time: 164.99m | eta: 14.5m +step 15360/16704 (91.95%) | loss: 2.431143 | lrm: 0.16 | dt: 643.88ms | tok/sec: 814,261 | mfu: 50.89 | epoch: 2 | total time: 165.01m | eta: 14.4m +step 
15361/16704 (91.96%) | loss: 2.408503 | lrm: 0.16 | dt: 646.53ms | tok/sec: 810,921 | mfu: 50.68 | epoch: 2 | total time: 165.02m | eta: 14.4m +step 15362/16704 (91.97%) | loss: 2.422694 | lrm: 0.16 | dt: 643.68ms | tok/sec: 814,514 | mfu: 50.91 | epoch: 2 | total time: 165.03m | eta: 14.4m +step 15363/16704 (91.97%) | loss: 2.437776 | lrm: 0.16 | dt: 642.97ms | tok/sec: 815,410 | mfu: 50.96 | epoch: 2 | total time: 165.04m | eta: 14.4m +step 15364/16704 (91.98%) | loss: 2.432580 | lrm: 0.16 | dt: 645.58ms | tok/sec: 812,121 | mfu: 50.76 | epoch: 2 | total time: 165.05m | eta: 14.4m +step 15365/16704 (91.98%) | loss: 2.436802 | lrm: 0.16 | dt: 643.00ms | tok/sec: 815,384 | mfu: 50.96 | epoch: 2 | total time: 165.06m | eta: 14.4m +step 15366/16704 (91.99%) | loss: 2.437549 | lrm: 0.16 | dt: 645.08ms | tok/sec: 812,746 | mfu: 50.80 | epoch: 2 | total time: 165.07m | eta: 14.4m +step 15367/16704 (92.00%) | loss: 2.435314 | lrm: 0.16 | dt: 644.81ms | tok/sec: 813,084 | mfu: 50.82 | epoch: 2 | total time: 165.08m | eta: 14.4m +step 15368/16704 (92.00%) | loss: 2.442014 | lrm: 0.16 | dt: 642.82ms | tok/sec: 815,600 | mfu: 50.98 | epoch: 2 | total time: 165.09m | eta: 14.4m +step 15369/16704 (92.01%) | loss: 2.455402 | lrm: 0.16 | dt: 642.36ms | tok/sec: 816,187 | mfu: 51.01 | epoch: 2 | total time: 165.10m | eta: 14.4m +step 15370/16704 (92.01%) | loss: 2.461023 | lrm: 0.16 | dt: 643.76ms | tok/sec: 814,418 | mfu: 50.90 | epoch: 2 | total time: 165.11m | eta: 14.3m +step 15371/16704 (92.02%) | loss: 2.452013 | lrm: 0.16 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 2 | total time: 165.12m | eta: 14.3m +step 15372/16704 (92.03%) | loss: 2.445594 | lrm: 0.16 | dt: 643.91ms | tok/sec: 814,222 | mfu: 50.89 | epoch: 2 | total time: 165.13m | eta: 14.3m +step 15373/16704 (92.03%) | loss: 2.446343 | lrm: 0.16 | dt: 644.88ms | tok/sec: 813,001 | mfu: 50.81 | epoch: 2 | total time: 165.15m | eta: 14.3m +step 15374/16704 (92.04%) | loss: 2.439249 | lrm: 0.16 | dt: 
644.47ms | tok/sec: 813,518 | mfu: 50.85 | epoch: 2 | total time: 165.16m | eta: 14.3m +step 15375/16704 (92.04%) | loss: 2.452601 | lrm: 0.16 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 2 | total time: 165.17m | eta: 14.3m +step 15376/16704 (92.05%) | loss: 2.440865 | lrm: 0.16 | dt: 643.56ms | tok/sec: 814,671 | mfu: 50.92 | epoch: 2 | total time: 165.18m | eta: 14.3m +step 15377/16704 (92.06%) | loss: 2.430680 | lrm: 0.16 | dt: 644.69ms | tok/sec: 813,246 | mfu: 50.83 | epoch: 2 | total time: 165.19m | eta: 14.3m +step 15378/16704 (92.06%) | loss: 2.446325 | lrm: 0.16 | dt: 643.53ms | tok/sec: 814,708 | mfu: 50.92 | epoch: 2 | total time: 165.20m | eta: 14.3m +step 15379/16704 (92.07%) | loss: 2.451785 | lrm: 0.16 | dt: 643.02ms | tok/sec: 815,356 | mfu: 50.96 | epoch: 2 | total time: 165.21m | eta: 14.2m +step 15380/16704 (92.07%) | loss: 2.455796 | lrm: 0.16 | dt: 643.12ms | tok/sec: 815,222 | mfu: 50.95 | epoch: 2 | total time: 165.22m | eta: 14.2m +step 15381/16704 (92.08%) | loss: 2.465062 | lrm: 0.16 | dt: 643.98ms | tok/sec: 814,137 | mfu: 50.88 | epoch: 2 | total time: 165.23m | eta: 14.2m +step 15382/16704 (92.09%) | loss: 2.465520 | lrm: 0.16 | dt: 644.07ms | tok/sec: 814,021 | mfu: 50.88 | epoch: 2 | total time: 165.24m | eta: 14.2m +step 15383/16704 (92.09%) | loss: 2.456940 | lrm: 0.16 | dt: 642.09ms | tok/sec: 816,528 | mfu: 51.03 | epoch: 2 | total time: 165.25m | eta: 14.2m +step 15384/16704 (92.10%) | loss: 2.459207 | lrm: 0.16 | dt: 642.48ms | tok/sec: 816,034 | mfu: 51.00 | epoch: 2 | total time: 165.26m | eta: 14.2m +step 15385/16704 (92.10%) | loss: 2.460196 | lrm: 0.16 | dt: 642.88ms | tok/sec: 815,530 | mfu: 50.97 | epoch: 2 | total time: 165.27m | eta: 14.2m +step 15386/16704 (92.11%) | loss: 2.451322 | lrm: 0.16 | dt: 645.93ms | tok/sec: 811,684 | mfu: 50.73 | epoch: 2 | total time: 165.28m | eta: 14.2m +step 15387/16704 (92.12%) | loss: 2.452976 | lrm: 0.16 | dt: 644.41ms | tok/sec: 813,598 | mfu: 50.85 | epoch: 2 | total 
time: 165.30m | eta: 14.2m +step 15388/16704 (92.12%) | loss: 2.460333 | lrm: 0.16 | dt: 644.21ms | tok/sec: 813,847 | mfu: 50.87 | epoch: 2 | total time: 165.31m | eta: 14.1m +step 15389/16704 (92.13%) | loss: 2.454724 | lrm: 0.16 | dt: 645.32ms | tok/sec: 812,440 | mfu: 50.78 | epoch: 2 | total time: 165.32m | eta: 14.1m +step 15390/16704 (92.13%) | loss: 2.446880 | lrm: 0.16 | dt: 644.36ms | tok/sec: 813,656 | mfu: 50.85 | epoch: 2 | total time: 165.33m | eta: 14.1m +step 15391/16704 (92.14%) | loss: 2.442332 | lrm: 0.16 | dt: 641.45ms | tok/sec: 817,343 | mfu: 51.09 | epoch: 2 | total time: 165.34m | eta: 14.1m +step 15392/16704 (92.15%) | loss: 2.447761 | lrm: 0.16 | dt: 644.26ms | tok/sec: 813,786 | mfu: 50.86 | epoch: 2 | total time: 165.35m | eta: 14.1m +step 15393/16704 (92.15%) | loss: 2.451481 | lrm: 0.16 | dt: 642.92ms | tok/sec: 815,482 | mfu: 50.97 | epoch: 2 | total time: 165.36m | eta: 14.1m +step 15394/16704 (92.16%) | loss: 2.461030 | lrm: 0.16 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 2 | total time: 165.37m | eta: 14.1m +step 15395/16704 (92.16%) | loss: 2.446102 | lrm: 0.16 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 2 | total time: 165.38m | eta: 14.1m +step 15396/16704 (92.17%) | loss: 2.430442 | lrm: 0.16 | dt: 642.91ms | tok/sec: 815,494 | mfu: 50.97 | epoch: 2 | total time: 165.39m | eta: 14.1m +step 15397/16704 (92.18%) | loss: 2.437196 | lrm: 0.16 | dt: 644.99ms | tok/sec: 812,866 | mfu: 50.81 | epoch: 2 | total time: 165.40m | eta: 14.0m +step 15398/16704 (92.18%) | loss: 2.431127 | lrm: 0.16 | dt: 642.72ms | tok/sec: 815,729 | mfu: 50.98 | epoch: 2 | total time: 165.41m | eta: 14.0m +step 15399/16704 (92.19%) | loss: 2.438687 | lrm: 0.16 | dt: 643.85ms | tok/sec: 814,297 | mfu: 50.89 | epoch: 2 | total time: 165.42m | eta: 14.0m +step 15400/16704 (92.19%) | loss: 2.431494 | lrm: 0.16 | dt: 641.51ms | tok/sec: 817,268 | mfu: 51.08 | epoch: 2 | total time: 165.43m | eta: 14.0m +step 15401/16704 (92.20%) | loss: 
2.435786 | lrm: 0.16 | dt: 645.39ms | tok/sec: 812,359 | mfu: 50.77 | epoch: 2 | total time: 165.45m | eta: 14.0m +step 15402/16704 (92.21%) | loss: 2.448268 | lrm: 0.16 | dt: 642.07ms | tok/sec: 816,554 | mfu: 51.04 | epoch: 2 | total time: 165.46m | eta: 14.0m +step 15403/16704 (92.21%) | loss: 2.445314 | lrm: 0.16 | dt: 644.99ms | tok/sec: 812,863 | mfu: 50.81 | epoch: 2 | total time: 165.47m | eta: 14.0m +step 15404/16704 (92.22%) | loss: 2.448905 | lrm: 0.16 | dt: 644.89ms | tok/sec: 812,988 | mfu: 50.81 | epoch: 2 | total time: 165.48m | eta: 14.0m +step 15405/16704 (92.22%) | loss: 2.455653 | lrm: 0.16 | dt: 644.02ms | tok/sec: 814,083 | mfu: 50.88 | epoch: 2 | total time: 165.49m | eta: 14.0m +step 15406/16704 (92.23%) | loss: 2.454943 | lrm: 0.16 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 2 | total time: 165.50m | eta: 14.0m +step 15407/16704 (92.24%) | loss: 2.446864 | lrm: 0.16 | dt: 642.41ms | tok/sec: 816,132 | mfu: 51.01 | epoch: 2 | total time: 165.51m | eta: 13.9m +step 15408/16704 (92.24%) | loss: 2.435289 | lrm: 0.16 | dt: 645.43ms | tok/sec: 812,307 | mfu: 50.77 | epoch: 2 | total time: 165.52m | eta: 13.9m +step 15409/16704 (92.25%) | loss: 2.428847 | lrm: 0.16 | dt: 645.48ms | tok/sec: 812,244 | mfu: 50.77 | epoch: 2 | total time: 165.53m | eta: 13.9m +step 15410/16704 (92.25%) | loss: 2.443864 | lrm: 0.15 | dt: 642.33ms | tok/sec: 816,225 | mfu: 51.02 | epoch: 2 | total time: 165.54m | eta: 13.9m +step 15411/16704 (92.26%) | loss: 2.448066 | lrm: 0.15 | dt: 642.65ms | tok/sec: 815,825 | mfu: 50.99 | epoch: 2 | total time: 165.55m | eta: 13.9m +step 15412/16704 (92.27%) | loss: 2.452789 | lrm: 0.15 | dt: 645.24ms | tok/sec: 812,543 | mfu: 50.79 | epoch: 2 | total time: 165.56m | eta: 13.9m +step 15413/16704 (92.27%) | loss: 2.445817 | lrm: 0.15 | dt: 645.19ms | tok/sec: 812,604 | mfu: 50.79 | epoch: 2 | total time: 165.57m | eta: 13.9m +step 15414/16704 (92.28%) | loss: 2.447228 | lrm: 0.15 | dt: 641.96ms | tok/sec: 816,699 | mfu: 
51.04 | epoch: 2 | total time: 165.59m | eta: 13.9m +step 15415/16704 (92.28%) | loss: 2.449911 | lrm: 0.15 | dt: 641.99ms | tok/sec: 816,663 | mfu: 51.04 | epoch: 2 | total time: 165.60m | eta: 13.9m +step 15416/16704 (92.29%) | loss: 2.445926 | lrm: 0.15 | dt: 644.20ms | tok/sec: 813,858 | mfu: 50.87 | epoch: 2 | total time: 165.61m | eta: 13.8m +step 15417/16704 (92.30%) | loss: 2.435149 | lrm: 0.15 | dt: 643.84ms | tok/sec: 814,320 | mfu: 50.90 | epoch: 2 | total time: 165.62m | eta: 13.8m +step 15418/16704 (92.30%) | loss: 2.441621 | lrm: 0.15 | dt: 643.46ms | tok/sec: 814,796 | mfu: 50.93 | epoch: 2 | total time: 165.63m | eta: 13.8m +step 15419/16704 (92.31%) | loss: 2.448719 | lrm: 0.15 | dt: 644.16ms | tok/sec: 813,906 | mfu: 50.87 | epoch: 2 | total time: 165.64m | eta: 13.8m +step 15420/16704 (92.31%) | loss: 2.447220 | lrm: 0.15 | dt: 642.41ms | tok/sec: 816,122 | mfu: 51.01 | epoch: 2 | total time: 165.65m | eta: 13.8m +step 15421/16704 (92.32%) | loss: 2.468350 | lrm: 0.15 | dt: 643.54ms | tok/sec: 814,689 | mfu: 50.92 | epoch: 2 | total time: 165.66m | eta: 13.8m +step 15422/16704 (92.33%) | loss: 2.469711 | lrm: 0.15 | dt: 644.33ms | tok/sec: 813,697 | mfu: 50.86 | epoch: 2 | total time: 165.67m | eta: 13.8m +step 15423/16704 (92.33%) | loss: 2.468146 | lrm: 0.15 | dt: 644.10ms | tok/sec: 813,991 | mfu: 50.88 | epoch: 2 | total time: 165.68m | eta: 13.8m +step 15424/16704 (92.34%) | loss: 2.470444 | lrm: 0.15 | dt: 643.34ms | tok/sec: 814,947 | mfu: 50.94 | epoch: 2 | total time: 165.69m | eta: 13.8m +step 15425/16704 (92.34%) | loss: 2.463087 | lrm: 0.15 | dt: 643.03ms | tok/sec: 815,340 | mfu: 50.96 | epoch: 2 | total time: 165.70m | eta: 13.7m +step 15426/16704 (92.35%) | loss: 2.458748 | lrm: 0.15 | dt: 645.85ms | tok/sec: 811,785 | mfu: 50.74 | epoch: 2 | total time: 165.71m | eta: 13.7m +step 15427/16704 (92.36%) | loss: 2.454463 | lrm: 0.15 | dt: 643.29ms | tok/sec: 815,013 | mfu: 50.94 | epoch: 2 | total time: 165.72m | eta: 13.7m +step 
15428/16704 (92.36%) | loss: 2.460304 | lrm: 0.15 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 2 | total time: 165.74m | eta: 13.7m +step 15429/16704 (92.37%) | loss: 2.455150 | lrm: 0.15 | dt: 644.03ms | tok/sec: 814,070 | mfu: 50.88 | epoch: 2 | total time: 165.75m | eta: 13.7m +step 15430/16704 (92.37%) | loss: 2.460311 | lrm: 0.15 | dt: 645.53ms | tok/sec: 812,178 | mfu: 50.76 | epoch: 2 | total time: 165.76m | eta: 13.7m +step 15431/16704 (92.38%) | loss: 2.468278 | lrm: 0.15 | dt: 643.09ms | tok/sec: 815,265 | mfu: 50.96 | epoch: 2 | total time: 165.77m | eta: 13.7m +step 15432/16704 (92.39%) | loss: 2.466446 | lrm: 0.15 | dt: 641.88ms | tok/sec: 816,798 | mfu: 51.05 | epoch: 2 | total time: 165.78m | eta: 13.7m +step 15433/16704 (92.39%) | loss: 2.474044 | lrm: 0.15 | dt: 644.37ms | tok/sec: 813,646 | mfu: 50.85 | epoch: 2 | total time: 165.79m | eta: 13.7m +step 15434/16704 (92.40%) | loss: 2.467663 | lrm: 0.15 | dt: 644.23ms | tok/sec: 813,825 | mfu: 50.87 | epoch: 2 | total time: 165.80m | eta: 13.7m +step 15435/16704 (92.40%) | loss: 2.461393 | lrm: 0.15 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 2 | total time: 165.81m | eta: 13.6m +step 15436/16704 (92.41%) | loss: 2.463983 | lrm: 0.15 | dt: 645.55ms | tok/sec: 812,161 | mfu: 50.76 | epoch: 2 | total time: 165.82m | eta: 13.6m +step 15437/16704 (92.41%) | loss: 2.460316 | lrm: 0.15 | dt: 643.86ms | tok/sec: 814,282 | mfu: 50.89 | epoch: 2 | total time: 165.83m | eta: 13.6m +step 15438/16704 (92.42%) | loss: 2.465160 | lrm: 0.15 | dt: 645.84ms | tok/sec: 811,798 | mfu: 50.74 | epoch: 2 | total time: 165.84m | eta: 13.6m +step 15439/16704 (92.43%) | loss: 2.468093 | lrm: 0.15 | dt: 644.42ms | tok/sec: 813,580 | mfu: 50.85 | epoch: 2 | total time: 165.85m | eta: 13.6m +step 15440/16704 (92.43%) | loss: 2.464348 | lrm: 0.15 | dt: 643.48ms | tok/sec: 814,773 | mfu: 50.92 | epoch: 2 | total time: 165.86m | eta: 13.6m +step 15441/16704 (92.44%) | loss: 2.472734 | lrm: 0.15 | dt: 
644.31ms | tok/sec: 813,718 | mfu: 50.86 | epoch: 2 | total time: 165.87m | eta: 13.6m +step 15442/16704 (92.44%) | loss: 2.470872 | lrm: 0.15 | dt: 646.14ms | tok/sec: 811,416 | mfu: 50.71 | epoch: 2 | total time: 165.89m | eta: 13.6m +step 15443/16704 (92.45%) | loss: 2.465037 | lrm: 0.15 | dt: 642.48ms | tok/sec: 816,042 | mfu: 51.00 | epoch: 2 | total time: 165.90m | eta: 13.6m +step 15444/16704 (92.46%) | loss: 2.459225 | lrm: 0.15 | dt: 645.72ms | tok/sec: 811,942 | mfu: 50.75 | epoch: 2 | total time: 165.91m | eta: 13.5m +step 15445/16704 (92.46%) | loss: 2.457262 | lrm: 0.15 | dt: 642.77ms | tok/sec: 815,663 | mfu: 50.98 | epoch: 2 | total time: 165.92m | eta: 13.5m +step 15446/16704 (92.47%) | loss: 2.457982 | lrm: 0.15 | dt: 644.49ms | tok/sec: 813,491 | mfu: 50.84 | epoch: 2 | total time: 165.93m | eta: 13.5m +step 15447/16704 (92.47%) | loss: 2.466293 | lrm: 0.15 | dt: 645.10ms | tok/sec: 812,723 | mfu: 50.80 | epoch: 2 | total time: 165.94m | eta: 13.5m +step 15448/16704 (92.48%) | loss: 2.453670 | lrm: 0.15 | dt: 645.12ms | tok/sec: 812,695 | mfu: 50.79 | epoch: 2 | total time: 165.95m | eta: 13.5m +step 15449/16704 (92.49%) | loss: 2.456696 | lrm: 0.15 | dt: 643.85ms | tok/sec: 814,298 | mfu: 50.89 | epoch: 2 | total time: 165.96m | eta: 13.5m +step 15450/16704 (92.49%) | loss: 2.445824 | lrm: 0.15 | dt: 644.38ms | tok/sec: 813,630 | mfu: 50.85 | epoch: 2 | total time: 165.97m | eta: 13.5m +step 15451/16704 (92.50%) | loss: 2.446058 | lrm: 0.15 | dt: 642.85ms | tok/sec: 815,566 | mfu: 50.97 | epoch: 2 | total time: 165.98m | eta: 13.5m +step 15452/16704 (92.50%) | loss: 2.444835 | lrm: 0.15 | dt: 646.75ms | tok/sec: 810,653 | mfu: 50.67 | epoch: 2 | total time: 165.99m | eta: 13.5m +step 15453/16704 (92.51%) | loss: 2.438379 | lrm: 0.15 | dt: 642.31ms | tok/sec: 816,256 | mfu: 51.02 | epoch: 2 | total time: 166.00m | eta: 13.4m +step 15454/16704 (92.52%) | loss: 2.435327 | lrm: 0.15 | dt: 644.52ms | tok/sec: 813,448 | mfu: 50.84 | epoch: 2 | total 
time: 166.01m | eta: 13.4m +step 15455/16704 (92.52%) | loss: 2.443296 | lrm: 0.15 | dt: 643.24ms | tok/sec: 815,077 | mfu: 50.94 | epoch: 2 | total time: 166.03m | eta: 13.4m +step 15456/16704 (92.53%) | loss: 2.450198 | lrm: 0.15 | dt: 645.86ms | tok/sec: 811,770 | mfu: 50.74 | epoch: 2 | total time: 166.04m | eta: 13.4m +step 15457/16704 (92.53%) | loss: 2.462359 | lrm: 0.15 | dt: 644.43ms | tok/sec: 813,564 | mfu: 50.85 | epoch: 2 | total time: 166.05m | eta: 13.4m +step 15458/16704 (92.54%) | loss: 2.463054 | lrm: 0.15 | dt: 643.36ms | tok/sec: 814,923 | mfu: 50.93 | epoch: 2 | total time: 166.06m | eta: 13.4m +step 15459/16704 (92.55%) | loss: 2.455795 | lrm: 0.15 | dt: 643.48ms | tok/sec: 814,772 | mfu: 50.92 | epoch: 2 | total time: 166.07m | eta: 13.4m +step 15460/16704 (92.55%) | loss: 2.444199 | lrm: 0.15 | dt: 643.28ms | tok/sec: 815,019 | mfu: 50.94 | epoch: 2 | total time: 166.08m | eta: 13.4m +step 15461/16704 (92.56%) | loss: 2.452096 | lrm: 0.15 | dt: 644.25ms | tok/sec: 813,794 | mfu: 50.86 | epoch: 2 | total time: 166.09m | eta: 13.4m +step 15462/16704 (92.56%) | loss: 2.455128 | lrm: 0.15 | dt: 644.65ms | tok/sec: 813,285 | mfu: 50.83 | epoch: 2 | total time: 166.10m | eta: 13.4m +step 15463/16704 (92.57%) | loss: 2.457245 | lrm: 0.15 | dt: 645.27ms | tok/sec: 812,515 | mfu: 50.78 | epoch: 2 | total time: 166.11m | eta: 13.3m +step 15464/16704 (92.58%) | loss: 2.448645 | lrm: 0.15 | dt: 643.38ms | tok/sec: 814,896 | mfu: 50.93 | epoch: 2 | total time: 166.12m | eta: 13.3m +step 15465/16704 (92.58%) | loss: 2.457227 | lrm: 0.15 | dt: 644.26ms | tok/sec: 813,780 | mfu: 50.86 | epoch: 2 | total time: 166.13m | eta: 13.3m +step 15466/16704 (92.59%) | loss: 2.452089 | lrm: 0.15 | dt: 646.40ms | tok/sec: 811,084 | mfu: 50.69 | epoch: 2 | total time: 166.14m | eta: 13.3m +step 15467/16704 (92.59%) | loss: 2.461267 | lrm: 0.15 | dt: 643.19ms | tok/sec: 815,137 | mfu: 50.95 | epoch: 2 | total time: 166.15m | eta: 13.3m +step 15468/16704 (92.60%) | loss: 
2.465024 | lrm: 0.15 | dt: 645.53ms | tok/sec: 812,180 | mfu: 50.76 | epoch: 2 | total time: 166.16m | eta: 13.3m +step 15469/16704 (92.61%) | loss: 2.467152 | lrm: 0.15 | dt: 642.43ms | tok/sec: 816,104 | mfu: 51.01 | epoch: 2 | total time: 166.18m | eta: 13.3m +step 15470/16704 (92.61%) | loss: 2.460769 | lrm: 0.15 | dt: 646.05ms | tok/sec: 811,527 | mfu: 50.72 | epoch: 2 | total time: 166.19m | eta: 13.3m +step 15471/16704 (92.62%) | loss: 2.466068 | lrm: 0.15 | dt: 642.29ms | tok/sec: 816,280 | mfu: 51.02 | epoch: 2 | total time: 166.20m | eta: 13.3m +step 15472/16704 (92.62%) | loss: 2.455734 | lrm: 0.15 | dt: 647.58ms | tok/sec: 809,606 | mfu: 50.60 | epoch: 2 | total time: 166.21m | eta: 13.2m +step 15473/16704 (92.63%) | loss: 2.463811 | lrm: 0.15 | dt: 643.97ms | tok/sec: 814,147 | mfu: 50.89 | epoch: 2 | total time: 166.22m | eta: 13.2m +step 15474/16704 (92.64%) | loss: 2.469967 | lrm: 0.15 | dt: 644.46ms | tok/sec: 813,529 | mfu: 50.85 | epoch: 2 | total time: 166.23m | eta: 13.2m +step 15475/16704 (92.64%) | loss: 2.460056 | lrm: 0.15 | dt: 646.71ms | tok/sec: 810,695 | mfu: 50.67 | epoch: 2 | total time: 166.24m | eta: 13.2m +step 15476/16704 (92.65%) | loss: 2.477703 | lrm: 0.15 | dt: 642.09ms | tok/sec: 816,538 | mfu: 51.03 | epoch: 2 | total time: 166.25m | eta: 13.2m +step 15477/16704 (92.65%) | loss: 2.481518 | lrm: 0.15 | dt: 645.09ms | tok/sec: 812,738 | mfu: 50.80 | epoch: 2 | total time: 166.26m | eta: 13.2m +step 15478/16704 (92.66%) | loss: 2.483907 | lrm: 0.15 | dt: 643.28ms | tok/sec: 815,025 | mfu: 50.94 | epoch: 2 | total time: 166.27m | eta: 13.2m +step 15479/16704 (92.67%) | loss: 2.479400 | lrm: 0.15 | dt: 643.15ms | tok/sec: 815,189 | mfu: 50.95 | epoch: 2 | total time: 166.28m | eta: 13.2m +step 15480/16704 (92.67%) | loss: 2.489639 | lrm: 0.15 | dt: 644.50ms | tok/sec: 813,485 | mfu: 50.84 | epoch: 2 | total time: 166.29m | eta: 13.2m +step 15481/16704 (92.68%) | loss: 2.486072 | lrm: 0.15 | dt: 645.98ms | tok/sec: 811,620 | mfu: 
50.73 | epoch: 2 | total time: 166.30m | eta: 13.1m +step 15482/16704 (92.68%) | loss: 2.489046 | lrm: 0.15 | dt: 641.67ms | tok/sec: 817,073 | mfu: 51.07 | epoch: 2 | total time: 166.32m | eta: 13.1m +step 15483/16704 (92.69%) | loss: 2.481791 | lrm: 0.15 | dt: 646.38ms | tok/sec: 811,115 | mfu: 50.70 | epoch: 2 | total time: 166.33m | eta: 13.1m +step 15484/16704 (92.70%) | loss: 2.477654 | lrm: 0.15 | dt: 647.17ms | tok/sec: 810,123 | mfu: 50.63 | epoch: 2 | total time: 166.34m | eta: 13.1m +step 15485/16704 (92.70%) | loss: 2.466994 | lrm: 0.15 | dt: 643.14ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 2 | total time: 166.35m | eta: 13.1m +step 15486/16704 (92.71%) | loss: 2.447794 | lrm: 0.15 | dt: 645.97ms | tok/sec: 811,627 | mfu: 50.73 | epoch: 2 | total time: 166.36m | eta: 13.1m +step 15487/16704 (92.71%) | loss: 2.449527 | lrm: 0.15 | dt: 644.34ms | tok/sec: 813,686 | mfu: 50.86 | epoch: 2 | total time: 166.37m | eta: 13.1m +step 15488/16704 (92.72%) | loss: 2.459280 | lrm: 0.15 | dt: 645.07ms | tok/sec: 812,767 | mfu: 50.80 | epoch: 2 | total time: 166.38m | eta: 13.1m +step 15489/16704 (92.73%) | loss: 2.462633 | lrm: 0.15 | dt: 642.87ms | tok/sec: 815,540 | mfu: 50.97 | epoch: 2 | total time: 166.39m | eta: 13.1m +step 15490/16704 (92.73%) | loss: 2.460973 | lrm: 0.15 | dt: 641.49ms | tok/sec: 817,291 | mfu: 51.08 | epoch: 2 | total time: 166.40m | eta: 13.0m +step 15491/16704 (92.74%) | loss: 2.454303 | lrm: 0.15 | dt: 645.83ms | tok/sec: 811,799 | mfu: 50.74 | epoch: 2 | total time: 166.41m | eta: 13.0m +step 15492/16704 (92.74%) | loss: 2.451160 | lrm: 0.15 | dt: 644.48ms | tok/sec: 813,507 | mfu: 50.85 | epoch: 2 | total time: 166.42m | eta: 13.0m +step 15493/16704 (92.75%) | loss: 2.457476 | lrm: 0.14 | dt: 646.28ms | tok/sec: 811,234 | mfu: 50.70 | epoch: 2 | total time: 166.43m | eta: 13.0m +step 15494/16704 (92.76%) | loss: 2.459546 | lrm: 0.14 | dt: 642.96ms | tok/sec: 815,423 | mfu: 50.97 | epoch: 2 | total time: 166.44m | eta: 13.0m +step 
15495/16704 (92.76%) | loss: 2.456660 | lrm: 0.14 | dt: 642.63ms | tok/sec: 815,843 | mfu: 50.99 | epoch: 2 | total time: 166.45m | eta: 13.0m +step 15496/16704 (92.77%) | loss: 2.452894 | lrm: 0.14 | dt: 644.03ms | tok/sec: 814,073 | mfu: 50.88 | epoch: 2 | total time: 166.47m | eta: 13.0m +step 15497/16704 (92.77%) | loss: 2.457643 | lrm: 0.14 | dt: 644.91ms | tok/sec: 812,966 | mfu: 50.81 | epoch: 2 | total time: 166.48m | eta: 13.0m +step 15498/16704 (92.78%) | loss: 2.467089 | lrm: 0.14 | dt: 644.65ms | tok/sec: 813,286 | mfu: 50.83 | epoch: 2 | total time: 166.49m | eta: 13.0m +step 15499/16704 (92.79%) | loss: 2.471220 | lrm: 0.14 | dt: 645.13ms | tok/sec: 812,690 | mfu: 50.79 | epoch: 2 | total time: 166.50m | eta: 13.0m +Step 15500 | Validation bpb: 0.758593 +step 15500/16704 (92.79%) | loss: 2.460579 | lrm: 0.14 | dt: 629.75ms | tok/sec: 832,533 | mfu: 52.03 | epoch: 2 | total time: 166.51m | eta: 12.9m +step 15501/16704 (92.80%) | loss: 2.468889 | lrm: 0.14 | dt: 652.74ms | tok/sec: 803,206 | mfu: 50.20 | epoch: 2 | total time: 166.52m | eta: 12.9m +step 15502/16704 (92.80%) | loss: 2.473945 | lrm: 0.14 | dt: 643.39ms | tok/sec: 814,879 | mfu: 50.93 | epoch: 2 | total time: 166.53m | eta: 12.9m +step 15503/16704 (92.81%) | loss: 2.482279 | lrm: 0.14 | dt: 642.44ms | tok/sec: 816,084 | mfu: 51.01 | epoch: 2 | total time: 166.54m | eta: 12.9m +step 15504/16704 (92.82%) | loss: 2.483658 | lrm: 0.14 | dt: 647.96ms | tok/sec: 809,140 | mfu: 50.57 | epoch: 2 | total time: 166.55m | eta: 12.9m +step 15505/16704 (92.82%) | loss: 2.491290 | lrm: 0.14 | dt: 639.85ms | tok/sec: 819,386 | mfu: 51.21 | epoch: 2 | total time: 166.56m | eta: 12.9m +step 15506/16704 (92.83%) | loss: 2.498773 | lrm: 0.14 | dt: 646.15ms | tok/sec: 811,405 | mfu: 50.71 | epoch: 2 | total time: 166.57m | eta: 12.9m +step 15507/16704 (92.83%) | loss: 2.484524 | lrm: 0.14 | dt: 643.46ms | tok/sec: 814,793 | mfu: 50.93 | epoch: 2 | total time: 166.58m | eta: 12.9m +step 15508/16704 (92.84%) | 
loss: 2.483522 | lrm: 0.14 | dt: 640.28ms | tok/sec: 818,844 | mfu: 51.18 | epoch: 2 | total time: 166.59m | eta: 12.9m +step 15509/16704 (92.85%) | loss: 2.495273 | lrm: 0.14 | dt: 646.42ms | tok/sec: 811,062 | mfu: 50.69 | epoch: 2 | total time: 166.60m | eta: 12.8m +step 15510/16704 (92.85%) | loss: 2.491735 | lrm: 0.14 | dt: 644.88ms | tok/sec: 812,997 | mfu: 50.81 | epoch: 2 | total time: 166.62m | eta: 12.8m +step 15511/16704 (92.86%) | loss: 2.489380 | lrm: 0.14 | dt: 642.47ms | tok/sec: 816,047 | mfu: 51.00 | epoch: 2 | total time: 166.63m | eta: 12.8m +step 15512/16704 (92.86%) | loss: 2.482880 | lrm: 0.14 | dt: 646.48ms | tok/sec: 810,993 | mfu: 50.69 | epoch: 2 | total time: 166.64m | eta: 12.8m +step 15513/16704 (92.87%) | loss: 2.476905 | lrm: 0.14 | dt: 641.28ms | tok/sec: 817,567 | mfu: 51.10 | epoch: 2 | total time: 166.65m | eta: 12.8m +step 15514/16704 (92.88%) | loss: 2.460680 | lrm: 0.14 | dt: 646.31ms | tok/sec: 811,200 | mfu: 50.70 | epoch: 2 | total time: 166.66m | eta: 12.8m +step 15515/16704 (92.88%) | loss: 2.453123 | lrm: 0.14 | dt: 644.96ms | tok/sec: 812,904 | mfu: 50.81 | epoch: 2 | total time: 166.67m | eta: 12.8m +step 15516/16704 (92.89%) | loss: 2.450787 | lrm: 0.14 | dt: 642.05ms | tok/sec: 816,584 | mfu: 51.04 | epoch: 2 | total time: 166.68m | eta: 12.8m +step 15517/16704 (92.89%) | loss: 2.447044 | lrm: 0.14 | dt: 643.68ms | tok/sec: 814,516 | mfu: 50.91 | epoch: 2 | total time: 166.69m | eta: 12.8m +step 15518/16704 (92.90%) | loss: 2.467224 | lrm: 0.14 | dt: 643.19ms | tok/sec: 815,132 | mfu: 50.95 | epoch: 2 | total time: 166.70m | eta: 12.7m +step 15519/16704 (92.91%) | loss: 2.467475 | lrm: 0.14 | dt: 643.21ms | tok/sec: 815,105 | mfu: 50.95 | epoch: 2 | total time: 166.71m | eta: 12.7m +step 15520/16704 (92.91%) | loss: 2.469650 | lrm: 0.14 | dt: 645.88ms | tok/sec: 811,739 | mfu: 50.73 | epoch: 2 | total time: 166.72m | eta: 12.7m +step 15521/16704 (92.92%) | loss: 2.459793 | lrm: 0.14 | dt: 644.87ms | tok/sec: 813,015 | 
mfu: 50.81 | epoch: 2 | total time: 166.73m | eta: 12.7m +step 15522/16704 (92.92%) | loss: 2.460340 | lrm: 0.14 | dt: 641.56ms | tok/sec: 817,210 | mfu: 51.08 | epoch: 2 | total time: 166.74m | eta: 12.7m +step 15523/16704 (92.93%) | loss: 2.461476 | lrm: 0.14 | dt: 643.19ms | tok/sec: 815,142 | mfu: 50.95 | epoch: 2 | total time: 166.76m | eta: 12.7m +step 15524/16704 (92.94%) | loss: 2.453036 | lrm: 0.14 | dt: 641.89ms | tok/sec: 816,793 | mfu: 51.05 | epoch: 2 | total time: 166.77m | eta: 12.7m +step 15525/16704 (92.94%) | loss: 2.460450 | lrm: 0.14 | dt: 644.56ms | tok/sec: 813,408 | mfu: 50.84 | epoch: 2 | total time: 166.78m | eta: 12.7m +step 15526/16704 (92.95%) | loss: 2.472106 | lrm: 0.14 | dt: 643.86ms | tok/sec: 814,294 | mfu: 50.89 | epoch: 2 | total time: 166.79m | eta: 12.7m +step 15527/16704 (92.95%) | loss: 2.442188 | lrm: 0.14 | dt: 643.70ms | tok/sec: 814,487 | mfu: 50.91 | epoch: 2 | total time: 166.80m | eta: 12.7m +step 15528/16704 (92.96%) | loss: 2.443672 | lrm: 0.14 | dt: 643.46ms | tok/sec: 814,797 | mfu: 50.93 | epoch: 2 | total time: 166.81m | eta: 12.6m +step 15529/16704 (92.97%) | loss: 2.437989 | lrm: 0.14 | dt: 643.31ms | tok/sec: 814,990 | mfu: 50.94 | epoch: 2 | total time: 166.82m | eta: 12.6m +step 15530/16704 (92.97%) | loss: 2.430760 | lrm: 0.14 | dt: 645.01ms | tok/sec: 812,840 | mfu: 50.80 | epoch: 2 | total time: 166.83m | eta: 12.6m +step 15531/16704 (92.98%) | loss: 2.443012 | lrm: 0.14 | dt: 642.43ms | tok/sec: 816,104 | mfu: 51.01 | epoch: 2 | total time: 166.84m | eta: 12.6m +step 15532/16704 (92.98%) | loss: 2.446590 | lrm: 0.14 | dt: 643.80ms | tok/sec: 814,370 | mfu: 50.90 | epoch: 2 | total time: 166.85m | eta: 12.6m +step 15533/16704 (92.99%) | loss: 2.441166 | lrm: 0.14 | dt: 641.88ms | tok/sec: 816,806 | mfu: 51.05 | epoch: 2 | total time: 166.86m | eta: 12.6m +step 15534/16704 (93.00%) | loss: 2.427378 | lrm: 0.14 | dt: 643.99ms | tok/sec: 814,124 | mfu: 50.88 | epoch: 2 | total time: 166.87m | eta: 12.6m +step 
15535/16704 (93.00%) | loss: 2.431555 | lrm: 0.14 | dt: 644.50ms | tok/sec: 813,476 | mfu: 50.84 | epoch: 2 | total time: 166.88m | eta: 12.6m +step 15536/16704 (93.01%) | loss: 2.445686 | lrm: 0.14 | dt: 642.49ms | tok/sec: 816,030 | mfu: 51.00 | epoch: 2 | total time: 166.89m | eta: 12.6m +step 15537/16704 (93.01%) | loss: 2.452389 | lrm: 0.14 | dt: 643.02ms | tok/sec: 815,350 | mfu: 50.96 | epoch: 2 | total time: 166.91m | eta: 12.5m +step 15538/16704 (93.02%) | loss: 2.475503 | lrm: 0.14 | dt: 643.26ms | tok/sec: 815,045 | mfu: 50.94 | epoch: 2 | total time: 166.92m | eta: 12.5m +step 15539/16704 (93.03%) | loss: 2.484501 | lrm: 0.14 | dt: 642.54ms | tok/sec: 815,956 | mfu: 51.00 | epoch: 2 | total time: 166.93m | eta: 12.5m +step 15540/16704 (93.03%) | loss: 2.486745 | lrm: 0.14 | dt: 643.65ms | tok/sec: 814,549 | mfu: 50.91 | epoch: 2 | total time: 166.94m | eta: 12.5m +step 15541/16704 (93.04%) | loss: 2.506884 | lrm: 0.14 | dt: 645.27ms | tok/sec: 812,505 | mfu: 50.78 | epoch: 2 | total time: 166.95m | eta: 12.5m +step 15542/16704 (93.04%) | loss: 2.497932 | lrm: 0.14 | dt: 643.15ms | tok/sec: 815,189 | mfu: 50.95 | epoch: 2 | total time: 166.96m | eta: 12.5m +step 15543/16704 (93.05%) | loss: 2.502135 | lrm: 0.14 | dt: 644.32ms | tok/sec: 813,709 | mfu: 50.86 | epoch: 2 | total time: 166.97m | eta: 12.5m +step 15544/16704 (93.06%) | loss: 2.478335 | lrm: 0.14 | dt: 645.88ms | tok/sec: 811,741 | mfu: 50.73 | epoch: 2 | total time: 166.98m | eta: 12.5m +step 15545/16704 (93.06%) | loss: 2.478104 | lrm: 0.14 | dt: 644.01ms | tok/sec: 814,100 | mfu: 50.88 | epoch: 2 | total time: 166.99m | eta: 12.5m +step 15546/16704 (93.07%) | loss: 2.481432 | lrm: 0.14 | dt: 644.86ms | tok/sec: 813,028 | mfu: 50.82 | epoch: 2 | total time: 167.00m | eta: 12.4m +step 15547/16704 (93.07%) | loss: 2.487752 | lrm: 0.14 | dt: 644.50ms | tok/sec: 813,484 | mfu: 50.84 | epoch: 2 | total time: 167.01m | eta: 12.4m +step 15548/16704 (93.08%) | loss: 2.483231 | lrm: 0.14 | dt: 
643.18ms | tok/sec: 815,153 | mfu: 50.95 | epoch: 2 | total time: 167.02m | eta: 12.4m +step 15549/16704 (93.09%) | loss: 2.487152 | lrm: 0.14 | dt: 645.10ms | tok/sec: 812,718 | mfu: 50.80 | epoch: 2 | total time: 167.03m | eta: 12.4m +step 15550/16704 (93.09%) | loss: 2.478154 | lrm: 0.14 | dt: 643.67ms | tok/sec: 814,528 | mfu: 50.91 | epoch: 2 | total time: 167.04m | eta: 12.4m +step 15551/16704 (93.10%) | loss: 2.476215 | lrm: 0.14 | dt: 645.39ms | tok/sec: 812,357 | mfu: 50.77 | epoch: 2 | total time: 167.06m | eta: 12.4m +step 15552/16704 (93.10%) | loss: 2.466509 | lrm: 0.14 | dt: 644.00ms | tok/sec: 814,112 | mfu: 50.88 | epoch: 2 | total time: 167.07m | eta: 12.4m +step 15553/16704 (93.11%) | loss: 2.461378 | lrm: 0.14 | dt: 643.40ms | tok/sec: 814,871 | mfu: 50.93 | epoch: 2 | total time: 167.08m | eta: 12.4m +step 15554/16704 (93.12%) | loss: 2.455240 | lrm: 0.14 | dt: 645.29ms | tok/sec: 812,478 | mfu: 50.78 | epoch: 2 | total time: 167.09m | eta: 12.4m +step 15555/16704 (93.12%) | loss: 2.458277 | lrm: 0.14 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 2 | total time: 167.10m | eta: 12.4m +step 15556/16704 (93.13%) | loss: 2.455234 | lrm: 0.14 | dt: 644.06ms | tok/sec: 814,040 | mfu: 50.88 | epoch: 2 | total time: 167.11m | eta: 12.3m +step 15557/16704 (93.13%) | loss: 2.451837 | lrm: 0.14 | dt: 643.52ms | tok/sec: 814,717 | mfu: 50.92 | epoch: 2 | total time: 167.12m | eta: 12.3m +step 15558/16704 (93.14%) | loss: 2.450913 | lrm: 0.14 | dt: 642.37ms | tok/sec: 816,182 | mfu: 51.01 | epoch: 2 | total time: 167.13m | eta: 12.3m +step 15559/16704 (93.15%) | loss: 2.454092 | lrm: 0.14 | dt: 643.40ms | tok/sec: 814,868 | mfu: 50.93 | epoch: 2 | total time: 167.14m | eta: 12.3m +step 15560/16704 (93.15%) | loss: 2.451469 | lrm: 0.14 | dt: 646.21ms | tok/sec: 811,327 | mfu: 50.71 | epoch: 2 | total time: 167.15m | eta: 12.3m +step 15561/16704 (93.16%) | loss: 2.453320 | lrm: 0.14 | dt: 641.09ms | tok/sec: 817,803 | mfu: 51.11 | epoch: 2 | total 
time: 167.16m | eta: 12.3m +step 15562/16704 (93.16%) | loss: 2.452998 | lrm: 0.14 | dt: 644.95ms | tok/sec: 812,916 | mfu: 50.81 | epoch: 2 | total time: 167.17m | eta: 12.3m +step 15563/16704 (93.17%) | loss: 2.450803 | lrm: 0.14 | dt: 643.42ms | tok/sec: 814,850 | mfu: 50.93 | epoch: 2 | total time: 167.18m | eta: 12.3m +step 15564/16704 (93.18%) | loss: 2.453961 | lrm: 0.14 | dt: 643.94ms | tok/sec: 814,185 | mfu: 50.89 | epoch: 2 | total time: 167.20m | eta: 12.3m +step 15565/16704 (93.18%) | loss: 2.446419 | lrm: 0.14 | dt: 645.94ms | tok/sec: 811,666 | mfu: 50.73 | epoch: 2 | total time: 167.21m | eta: 12.2m +step 15566/16704 (93.19%) | loss: 2.436444 | lrm: 0.14 | dt: 644.55ms | tok/sec: 813,415 | mfu: 50.84 | epoch: 2 | total time: 167.22m | eta: 12.2m +step 15567/16704 (93.19%) | loss: 2.418072 | lrm: 0.14 | dt: 644.68ms | tok/sec: 813,258 | mfu: 50.83 | epoch: 2 | total time: 167.23m | eta: 12.2m +step 15568/16704 (93.20%) | loss: 2.430632 | lrm: 0.14 | dt: 644.63ms | tok/sec: 813,315 | mfu: 50.83 | epoch: 2 | total time: 167.24m | eta: 12.2m +step 15569/16704 (93.21%) | loss: 2.428934 | lrm: 0.14 | dt: 644.56ms | tok/sec: 813,400 | mfu: 50.84 | epoch: 2 | total time: 167.25m | eta: 12.2m +step 15570/16704 (93.21%) | loss: 2.429061 | lrm: 0.14 | dt: 644.59ms | tok/sec: 813,371 | mfu: 50.84 | epoch: 2 | total time: 167.26m | eta: 12.2m +step 15571/16704 (93.22%) | loss: 2.445518 | lrm: 0.14 | dt: 646.80ms | tok/sec: 810,593 | mfu: 50.66 | epoch: 2 | total time: 167.27m | eta: 12.2m +step 15572/16704 (93.22%) | loss: 2.435771 | lrm: 0.14 | dt: 644.97ms | tok/sec: 812,883 | mfu: 50.81 | epoch: 2 | total time: 167.28m | eta: 12.2m +step 15573/16704 (93.23%) | loss: 2.453696 | lrm: 0.14 | dt: 644.90ms | tok/sec: 812,975 | mfu: 50.81 | epoch: 2 | total time: 167.29m | eta: 12.2m +step 15574/16704 (93.24%) | loss: 2.444774 | lrm: 0.14 | dt: 645.62ms | tok/sec: 812,071 | mfu: 50.76 | epoch: 2 | total time: 167.30m | eta: 12.1m +step 15575/16704 (93.24%) | loss: 
2.436387 | lrm: 0.14 | dt: 645.50ms | tok/sec: 812,220 | mfu: 50.76 | epoch: 2 | total time: 167.31m | eta: 12.1m +step 15576/16704 (93.25%) | loss: 2.436846 | lrm: 0.14 | dt: 643.91ms | tok/sec: 814,222 | mfu: 50.89 | epoch: 2 | total time: 167.32m | eta: 12.1m +step 15577/16704 (93.25%) | loss: 2.450762 | lrm: 0.13 | dt: 645.86ms | tok/sec: 811,769 | mfu: 50.74 | epoch: 2 | total time: 167.33m | eta: 12.1m +step 15578/16704 (93.26%) | loss: 2.447133 | lrm: 0.13 | dt: 643.54ms | tok/sec: 814,689 | mfu: 50.92 | epoch: 2 | total time: 167.35m | eta: 12.1m +step 15579/16704 (93.27%) | loss: 2.443568 | lrm: 0.13 | dt: 645.76ms | tok/sec: 811,894 | mfu: 50.74 | epoch: 2 | total time: 167.36m | eta: 12.1m +step 15580/16704 (93.27%) | loss: 2.466358 | lrm: 0.13 | dt: 642.63ms | tok/sec: 815,846 | mfu: 50.99 | epoch: 2 | total time: 167.37m | eta: 12.1m +step 15581/16704 (93.28%) | loss: 2.456826 | lrm: 0.13 | dt: 644.29ms | tok/sec: 813,746 | mfu: 50.86 | epoch: 2 | total time: 167.38m | eta: 12.1m +step 15582/16704 (93.28%) | loss: 2.452687 | lrm: 0.13 | dt: 644.81ms | tok/sec: 813,088 | mfu: 50.82 | epoch: 2 | total time: 167.39m | eta: 12.1m +step 15583/16704 (93.29%) | loss: 2.456816 | lrm: 0.13 | dt: 646.02ms | tok/sec: 811,563 | mfu: 50.72 | epoch: 2 | total time: 167.40m | eta: 12.1m +step 15584/16704 (93.30%) | loss: 2.454204 | lrm: 0.13 | dt: 643.13ms | tok/sec: 815,209 | mfu: 50.95 | epoch: 2 | total time: 167.41m | eta: 12.0m +step 15585/16704 (93.30%) | loss: 2.452410 | lrm: 0.13 | dt: 643.16ms | tok/sec: 815,174 | mfu: 50.95 | epoch: 2 | total time: 167.42m | eta: 12.0m +step 15586/16704 (93.31%) | loss: 2.454755 | lrm: 0.13 | dt: 644.91ms | tok/sec: 812,964 | mfu: 50.81 | epoch: 2 | total time: 167.43m | eta: 12.0m +step 15587/16704 (93.31%) | loss: 2.442219 | lrm: 0.13 | dt: 641.66ms | tok/sec: 817,075 | mfu: 51.07 | epoch: 2 | total time: 167.44m | eta: 12.0m +step 15588/16704 (93.32%) | loss: 2.443190 | lrm: 0.13 | dt: 645.45ms | tok/sec: 812,283 | mfu: 
50.77 | epoch: 2 | total time: 167.45m | eta: 12.0m +step 15589/16704 (93.32%) | loss: 2.450349 | lrm: 0.13 | dt: 643.78ms | tok/sec: 814,388 | mfu: 50.90 | epoch: 2 | total time: 167.46m | eta: 12.0m +step 15590/16704 (93.33%) | loss: 2.448018 | lrm: 0.13 | dt: 644.43ms | tok/sec: 813,572 | mfu: 50.85 | epoch: 2 | total time: 167.47m | eta: 12.0m +step 15591/16704 (93.34%) | loss: 2.433791 | lrm: 0.13 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 2 | total time: 167.49m | eta: 12.0m +step 15592/16704 (93.34%) | loss: 2.441791 | lrm: 0.13 | dt: 645.09ms | tok/sec: 812,739 | mfu: 50.80 | epoch: 2 | total time: 167.50m | eta: 12.0m +step 15593/16704 (93.35%) | loss: 2.440399 | lrm: 0.13 | dt: 643.83ms | tok/sec: 814,333 | mfu: 50.90 | epoch: 2 | total time: 167.51m | eta: 11.9m +step 15594/16704 (93.35%) | loss: 2.429263 | lrm: 0.13 | dt: 642.55ms | tok/sec: 815,947 | mfu: 51.00 | epoch: 2 | total time: 167.52m | eta: 11.9m +step 15595/16704 (93.36%) | loss: 2.420508 | lrm: 0.13 | dt: 644.40ms | tok/sec: 813,602 | mfu: 50.85 | epoch: 2 | total time: 167.53m | eta: 11.9m +step 15596/16704 (93.37%) | loss: 2.416199 | lrm: 0.13 | dt: 646.06ms | tok/sec: 811,513 | mfu: 50.72 | epoch: 2 | total time: 167.54m | eta: 11.9m +step 15597/16704 (93.37%) | loss: 2.431117 | lrm: 0.13 | dt: 643.07ms | tok/sec: 815,294 | mfu: 50.96 | epoch: 2 | total time: 167.55m | eta: 11.9m +step 15598/16704 (93.38%) | loss: 2.427814 | lrm: 0.13 | dt: 644.04ms | tok/sec: 814,065 | mfu: 50.88 | epoch: 2 | total time: 167.56m | eta: 11.9m +step 15599/16704 (93.38%) | loss: 2.424036 | lrm: 0.13 | dt: 642.86ms | tok/sec: 815,557 | mfu: 50.97 | epoch: 2 | total time: 167.57m | eta: 11.9m +step 15600/16704 (93.39%) | loss: 2.432454 | lrm: 0.13 | dt: 646.39ms | tok/sec: 811,106 | mfu: 50.70 | epoch: 2 | total time: 167.58m | eta: 11.9m +step 15601/16704 (93.40%) | loss: 2.428009 | lrm: 0.13 | dt: 644.01ms | tok/sec: 814,100 | mfu: 50.88 | epoch: 2 | total time: 167.59m | eta: 11.9m +step 
15602/16704 (93.40%) | loss: 2.414588 | lrm: 0.13 | dt: 643.50ms | tok/sec: 814,744 | mfu: 50.92 | epoch: 2 | total time: 167.60m | eta: 11.8m +step 15603/16704 (93.41%) | loss: 2.420943 | lrm: 0.13 | dt: 643.27ms | tok/sec: 815,034 | mfu: 50.94 | epoch: 2 | total time: 167.61m | eta: 11.8m +step 15604/16704 (93.41%) | loss: 2.424852 | lrm: 0.13 | dt: 646.47ms | tok/sec: 810,998 | mfu: 50.69 | epoch: 2 | total time: 167.62m | eta: 11.8m +step 15605/16704 (93.42%) | loss: 2.418812 | lrm: 0.13 | dt: 642.62ms | tok/sec: 815,854 | mfu: 50.99 | epoch: 2 | total time: 167.64m | eta: 11.8m +step 15606/16704 (93.43%) | loss: 2.431712 | lrm: 0.13 | dt: 643.89ms | tok/sec: 814,246 | mfu: 50.89 | epoch: 2 | total time: 167.65m | eta: 11.8m +step 15607/16704 (93.43%) | loss: 2.429133 | lrm: 0.13 | dt: 647.42ms | tok/sec: 809,815 | mfu: 50.61 | epoch: 2 | total time: 167.66m | eta: 11.8m +step 15608/16704 (93.44%) | loss: 2.436369 | lrm: 0.13 | dt: 642.70ms | tok/sec: 815,753 | mfu: 50.99 | epoch: 2 | total time: 167.67m | eta: 11.8m +step 15609/16704 (93.44%) | loss: 2.437398 | lrm: 0.13 | dt: 645.91ms | tok/sec: 811,700 | mfu: 50.73 | epoch: 2 | total time: 167.68m | eta: 11.8m +step 15610/16704 (93.45%) | loss: 2.437676 | lrm: 0.13 | dt: 643.12ms | tok/sec: 815,230 | mfu: 50.95 | epoch: 2 | total time: 167.69m | eta: 11.8m +step 15611/16704 (93.46%) | loss: 2.429502 | lrm: 0.13 | dt: 644.14ms | tok/sec: 813,934 | mfu: 50.87 | epoch: 2 | total time: 167.70m | eta: 11.7m +step 15612/16704 (93.46%) | loss: 2.431428 | lrm: 0.13 | dt: 645.61ms | tok/sec: 812,083 | mfu: 50.76 | epoch: 2 | total time: 167.71m | eta: 11.7m +step 15613/16704 (93.47%) | loss: 2.441452 | lrm: 0.13 | dt: 645.33ms | tok/sec: 812,434 | mfu: 50.78 | epoch: 2 | total time: 167.72m | eta: 11.7m +step 15614/16704 (93.47%) | loss: 2.433713 | lrm: 0.13 | dt: 644.49ms | tok/sec: 813,494 | mfu: 50.84 | epoch: 2 | total time: 167.73m | eta: 11.7m +step 15615/16704 (93.48%) | loss: 2.428972 | lrm: 0.13 | dt: 
645.91ms | tok/sec: 811,708 | mfu: 50.73 | epoch: 2 | total time: 167.74m | eta: 11.7m +step 15616/16704 (93.49%) | loss: 2.430947 | lrm: 0.13 | dt: 642.64ms | tok/sec: 815,839 | mfu: 50.99 | epoch: 2 | total time: 167.75m | eta: 11.7m +step 15617/16704 (93.49%) | loss: 2.445871 | lrm: 0.13 | dt: 645.74ms | tok/sec: 811,916 | mfu: 50.75 | epoch: 2 | total time: 167.76m | eta: 11.7m +step 15618/16704 (93.50%) | loss: 2.436615 | lrm: 0.13 | dt: 644.94ms | tok/sec: 812,924 | mfu: 50.81 | epoch: 2 | total time: 167.78m | eta: 11.7m +step 15619/16704 (93.50%) | loss: 2.429211 | lrm: 0.13 | dt: 643.36ms | tok/sec: 814,922 | mfu: 50.93 | epoch: 2 | total time: 167.79m | eta: 11.7m +step 15620/16704 (93.51%) | loss: 2.432249 | lrm: 0.13 | dt: 645.25ms | tok/sec: 812,532 | mfu: 50.78 | epoch: 2 | total time: 167.80m | eta: 11.7m +step 15621/16704 (93.52%) | loss: 2.432940 | lrm: 0.13 | dt: 642.13ms | tok/sec: 816,480 | mfu: 51.03 | epoch: 2 | total time: 167.81m | eta: 11.6m +step 15622/16704 (93.52%) | loss: 2.440432 | lrm: 0.13 | dt: 644.22ms | tok/sec: 813,834 | mfu: 50.87 | epoch: 2 | total time: 167.82m | eta: 11.6m +step 15623/16704 (93.53%) | loss: 2.429041 | lrm: 0.13 | dt: 645.18ms | tok/sec: 812,625 | mfu: 50.79 | epoch: 2 | total time: 167.83m | eta: 11.6m +step 15624/16704 (93.53%) | loss: 2.436178 | lrm: 0.13 | dt: 643.80ms | tok/sec: 814,367 | mfu: 50.90 | epoch: 2 | total time: 167.84m | eta: 11.6m +step 15625/16704 (93.54%) | loss: 2.429439 | lrm: 0.13 | dt: 645.81ms | tok/sec: 811,826 | mfu: 50.74 | epoch: 2 | total time: 167.85m | eta: 11.6m +step 15626/16704 (93.55%) | loss: 2.440170 | lrm: 0.13 | dt: 643.47ms | tok/sec: 814,776 | mfu: 50.92 | epoch: 2 | total time: 167.86m | eta: 11.6m +step 15627/16704 (93.55%) | loss: 2.449032 | lrm: 0.13 | dt: 646.56ms | tok/sec: 810,884 | mfu: 50.68 | epoch: 2 | total time: 167.87m | eta: 11.6m +step 15628/16704 (93.56%) | loss: 2.446140 | lrm: 0.13 | dt: 646.81ms | tok/sec: 810,571 | mfu: 50.66 | epoch: 2 | total 
time: 167.88m | eta: 11.6m +step 15629/16704 (93.56%) | loss: 2.445112 | lrm: 0.13 | dt: 643.23ms | tok/sec: 815,080 | mfu: 50.94 | epoch: 2 | total time: 167.89m | eta: 11.6m +step 15630/16704 (93.57%) | loss: 2.444662 | lrm: 0.13 | dt: 647.34ms | tok/sec: 809,913 | mfu: 50.62 | epoch: 2 | total time: 167.90m | eta: 11.5m +step 15631/16704 (93.58%) | loss: 2.436460 | lrm: 0.13 | dt: 643.66ms | tok/sec: 814,539 | mfu: 50.91 | epoch: 2 | total time: 167.91m | eta: 11.5m +step 15632/16704 (93.58%) | loss: 2.435123 | lrm: 0.13 | dt: 645.56ms | tok/sec: 812,146 | mfu: 50.76 | epoch: 2 | total time: 167.93m | eta: 11.5m +step 15633/16704 (93.59%) | loss: 2.446208 | lrm: 0.13 | dt: 645.63ms | tok/sec: 812,061 | mfu: 50.76 | epoch: 2 | total time: 167.94m | eta: 11.5m +step 15634/16704 (93.59%) | loss: 2.454161 | lrm: 0.13 | dt: 644.98ms | tok/sec: 812,879 | mfu: 50.81 | epoch: 2 | total time: 167.95m | eta: 11.5m +step 15635/16704 (93.60%) | loss: 2.443415 | lrm: 0.13 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 2 | total time: 167.96m | eta: 11.5m +step 15636/16704 (93.61%) | loss: 2.455472 | lrm: 0.13 | dt: 643.97ms | tok/sec: 814,148 | mfu: 50.89 | epoch: 2 | total time: 167.97m | eta: 11.5m +step 15637/16704 (93.61%) | loss: 2.447690 | lrm: 0.13 | dt: 644.21ms | tok/sec: 813,845 | mfu: 50.87 | epoch: 2 | total time: 167.98m | eta: 11.5m +step 15638/16704 (93.62%) | loss: 2.431403 | lrm: 0.13 | dt: 644.14ms | tok/sec: 813,937 | mfu: 50.87 | epoch: 2 | total time: 167.99m | eta: 11.5m +step 15639/16704 (93.62%) | loss: 2.435729 | lrm: 0.13 | dt: 644.05ms | tok/sec: 814,046 | mfu: 50.88 | epoch: 2 | total time: 168.00m | eta: 11.4m +step 15640/16704 (93.63%) | loss: 2.429818 | lrm: 0.13 | dt: 642.93ms | tok/sec: 815,468 | mfu: 50.97 | epoch: 2 | total time: 168.01m | eta: 11.4m +step 15641/16704 (93.64%) | loss: 2.417060 | lrm: 0.13 | dt: 644.74ms | tok/sec: 813,180 | mfu: 50.82 | epoch: 2 | total time: 168.02m | eta: 11.4m +step 15642/16704 (93.64%) | loss: 
2.419918 | lrm: 0.13 | dt: 643.60ms | tok/sec: 814,622 | mfu: 50.92 | epoch: 2 | total time: 168.03m | eta: 11.4m +step 15643/16704 (93.65%) | loss: 2.432591 | lrm: 0.13 | dt: 643.85ms | tok/sec: 814,299 | mfu: 50.89 | epoch: 2 | total time: 168.04m | eta: 11.4m +step 15644/16704 (93.65%) | loss: 2.429314 | lrm: 0.13 | dt: 645.67ms | tok/sec: 812,006 | mfu: 50.75 | epoch: 2 | total time: 168.05m | eta: 11.4m +step 15645/16704 (93.66%) | loss: 2.429351 | lrm: 0.13 | dt: 642.74ms | tok/sec: 815,707 | mfu: 50.98 | epoch: 2 | total time: 168.07m | eta: 11.4m +step 15646/16704 (93.67%) | loss: 2.433080 | lrm: 0.13 | dt: 645.42ms | tok/sec: 812,316 | mfu: 50.77 | epoch: 2 | total time: 168.08m | eta: 11.4m +step 15647/16704 (93.67%) | loss: 2.431984 | lrm: 0.13 | dt: 644.42ms | tok/sec: 813,584 | mfu: 50.85 | epoch: 2 | total time: 168.09m | eta: 11.4m +step 15648/16704 (93.68%) | loss: 2.439140 | lrm: 0.13 | dt: 644.60ms | tok/sec: 813,359 | mfu: 50.84 | epoch: 2 | total time: 168.10m | eta: 11.4m +step 15649/16704 (93.68%) | loss: 2.440146 | lrm: 0.13 | dt: 645.40ms | tok/sec: 812,347 | mfu: 50.77 | epoch: 2 | total time: 168.11m | eta: 11.3m +step 15650/16704 (93.69%) | loss: 2.432471 | lrm: 0.13 | dt: 642.97ms | tok/sec: 815,411 | mfu: 50.96 | epoch: 2 | total time: 168.12m | eta: 11.3m +step 15651/16704 (93.70%) | loss: 2.429489 | lrm: 0.13 | dt: 643.32ms | tok/sec: 814,971 | mfu: 50.94 | epoch: 2 | total time: 168.13m | eta: 11.3m +step 15652/16704 (93.70%) | loss: 2.426924 | lrm: 0.13 | dt: 646.10ms | tok/sec: 811,460 | mfu: 50.72 | epoch: 2 | total time: 168.14m | eta: 11.3m +step 15653/16704 (93.71%) | loss: 2.428672 | lrm: 0.13 | dt: 642.04ms | tok/sec: 816,596 | mfu: 51.04 | epoch: 2 | total time: 168.15m | eta: 11.3m +step 15654/16704 (93.71%) | loss: 2.429867 | lrm: 0.13 | dt: 645.34ms | tok/sec: 812,418 | mfu: 50.78 | epoch: 2 | total time: 168.16m | eta: 11.3m +step 15655/16704 (93.72%) | loss: 2.433390 | lrm: 0.13 | dt: 642.18ms | tok/sec: 816,423 | mfu: 
51.03 | epoch: 2 | total time: 168.17m | eta: 11.3m +step 15656/16704 (93.73%) | loss: 2.428074 | lrm: 0.13 | dt: 643.97ms | tok/sec: 814,155 | mfu: 50.89 | epoch: 2 | total time: 168.18m | eta: 11.3m +step 15657/16704 (93.73%) | loss: 2.428773 | lrm: 0.13 | dt: 646.27ms | tok/sec: 811,247 | mfu: 50.70 | epoch: 2 | total time: 168.19m | eta: 11.3m +step 15658/16704 (93.74%) | loss: 2.433081 | lrm: 0.13 | dt: 641.60ms | tok/sec: 817,161 | mfu: 51.07 | epoch: 2 | total time: 168.20m | eta: 11.2m +step 15659/16704 (93.74%) | loss: 2.437456 | lrm: 0.13 | dt: 646.82ms | tok/sec: 810,565 | mfu: 50.66 | epoch: 2 | total time: 168.22m | eta: 11.2m +step 15660/16704 (93.75%) | loss: 2.427273 | lrm: 0.12 | dt: 642.75ms | tok/sec: 815,699 | mfu: 50.98 | epoch: 2 | total time: 168.23m | eta: 11.2m +step 15661/16704 (93.76%) | loss: 2.416113 | lrm: 0.12 | dt: 643.98ms | tok/sec: 814,136 | mfu: 50.88 | epoch: 2 | total time: 168.24m | eta: 11.2m +step 15662/16704 (93.76%) | loss: 2.431046 | lrm: 0.12 | dt: 644.23ms | tok/sec: 813,825 | mfu: 50.87 | epoch: 2 | total time: 168.25m | eta: 11.2m +step 15663/16704 (93.77%) | loss: 2.426816 | lrm: 0.12 | dt: 642.09ms | tok/sec: 816,529 | mfu: 51.03 | epoch: 2 | total time: 168.26m | eta: 11.2m +step 15664/16704 (93.77%) | loss: 2.434287 | lrm: 0.12 | dt: 646.99ms | tok/sec: 810,344 | mfu: 50.65 | epoch: 2 | total time: 168.27m | eta: 11.2m +step 15665/16704 (93.78%) | loss: 2.441452 | lrm: 0.12 | dt: 646.47ms | tok/sec: 811,003 | mfu: 50.69 | epoch: 2 | total time: 168.28m | eta: 11.2m +step 15666/16704 (93.79%) | loss: 2.433111 | lrm: 0.12 | dt: 644.17ms | tok/sec: 813,900 | mfu: 50.87 | epoch: 2 | total time: 168.29m | eta: 11.2m +step 15667/16704 (93.79%) | loss: 2.428296 | lrm: 0.12 | dt: 646.46ms | tok/sec: 811,013 | mfu: 50.69 | epoch: 2 | total time: 168.30m | eta: 11.1m +step 15668/16704 (93.80%) | loss: 2.429428 | lrm: 0.12 | dt: 643.87ms | tok/sec: 814,281 | mfu: 50.89 | epoch: 2 | total time: 168.31m | eta: 11.1m +step 
15669/16704 (93.80%) | loss: 2.424081 | lrm: 0.12 | dt: 643.98ms | tok/sec: 814,135 | mfu: 50.88 | epoch: 2 | total time: 168.32m | eta: 11.1m +step 15670/16704 (93.81%) | loss: 2.414087 | lrm: 0.12 | dt: 644.09ms | tok/sec: 813,995 | mfu: 50.88 | epoch: 2 | total time: 168.33m | eta: 11.1m +step 15671/16704 (93.82%) | loss: 2.395618 | lrm: 0.12 | dt: 644.27ms | tok/sec: 813,772 | mfu: 50.86 | epoch: 2 | total time: 168.34m | eta: 11.1m +step 15672/16704 (93.82%) | loss: 2.407679 | lrm: 0.12 | dt: 644.47ms | tok/sec: 813,521 | mfu: 50.85 | epoch: 2 | total time: 168.36m | eta: 11.1m +step 15673/16704 (93.83%) | loss: 2.415454 | lrm: 0.12 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 2 | total time: 168.37m | eta: 11.1m +step 15674/16704 (93.83%) | loss: 2.420896 | lrm: 0.12 | dt: 642.99ms | tok/sec: 815,393 | mfu: 50.96 | epoch: 2 | total time: 168.38m | eta: 11.1m +step 15675/16704 (93.84%) | loss: 2.418904 | lrm: 0.12 | dt: 643.65ms | tok/sec: 814,558 | mfu: 50.91 | epoch: 2 | total time: 168.39m | eta: 11.1m +step 15676/16704 (93.85%) | loss: 2.415892 | lrm: 0.12 | dt: 646.46ms | tok/sec: 811,016 | mfu: 50.69 | epoch: 2 | total time: 168.40m | eta: 11.1m +step 15677/16704 (93.85%) | loss: 2.419757 | lrm: 0.12 | dt: 641.17ms | tok/sec: 817,705 | mfu: 51.11 | epoch: 2 | total time: 168.41m | eta: 11.0m +step 15678/16704 (93.86%) | loss: 2.421313 | lrm: 0.12 | dt: 644.86ms | tok/sec: 813,020 | mfu: 50.81 | epoch: 2 | total time: 168.42m | eta: 11.0m +step 15679/16704 (93.86%) | loss: 2.429692 | lrm: 0.12 | dt: 643.80ms | tok/sec: 814,368 | mfu: 50.90 | epoch: 2 | total time: 168.43m | eta: 11.0m +step 15680/16704 (93.87%) | loss: 2.435670 | lrm: 0.12 | dt: 643.09ms | tok/sec: 815,261 | mfu: 50.95 | epoch: 2 | total time: 168.44m | eta: 11.0m +step 15681/16704 (93.88%) | loss: 2.442582 | lrm: 0.12 | dt: 644.28ms | tok/sec: 813,763 | mfu: 50.86 | epoch: 2 | total time: 168.45m | eta: 11.0m +step 15682/16704 (93.88%) | loss: 2.452901 | lrm: 0.12 | dt: 
641.51ms | tok/sec: 817,269 | mfu: 51.08 | epoch: 2 | total time: 168.46m | eta: 11.0m +step 15683/16704 (93.89%) | loss: 2.445975 | lrm: 0.12 | dt: 643.38ms | tok/sec: 814,894 | mfu: 50.93 | epoch: 2 | total time: 168.47m | eta: 11.0m +step 15684/16704 (93.89%) | loss: 2.443118 | lrm: 0.12 | dt: 645.37ms | tok/sec: 812,384 | mfu: 50.78 | epoch: 2 | total time: 168.48m | eta: 11.0m +step 15685/16704 (93.90%) | loss: 2.434805 | lrm: 0.12 | dt: 642.96ms | tok/sec: 815,427 | mfu: 50.97 | epoch: 2 | total time: 168.49m | eta: 11.0m +step 15686/16704 (93.91%) | loss: 2.430479 | lrm: 0.12 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 2 | total time: 168.51m | eta: 10.9m +step 15687/16704 (93.91%) | loss: 2.430240 | lrm: 0.12 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 2 | total time: 168.52m | eta: 10.9m +step 15688/16704 (93.92%) | loss: 2.436953 | lrm: 0.12 | dt: 641.88ms | tok/sec: 816,803 | mfu: 51.05 | epoch: 2 | total time: 168.53m | eta: 10.9m +step 15689/16704 (93.92%) | loss: 2.441018 | lrm: 0.12 | dt: 643.11ms | tok/sec: 815,233 | mfu: 50.95 | epoch: 2 | total time: 168.54m | eta: 10.9m +step 15690/16704 (93.93%) | loss: 2.442764 | lrm: 0.12 | dt: 643.33ms | tok/sec: 814,961 | mfu: 50.94 | epoch: 2 | total time: 168.55m | eta: 10.9m +step 15691/16704 (93.94%) | loss: 2.436476 | lrm: 0.12 | dt: 643.84ms | tok/sec: 814,318 | mfu: 50.90 | epoch: 2 | total time: 168.56m | eta: 10.9m +step 15692/16704 (93.94%) | loss: 2.435000 | lrm: 0.12 | dt: 645.50ms | tok/sec: 812,217 | mfu: 50.76 | epoch: 2 | total time: 168.57m | eta: 10.9m +step 15693/16704 (93.95%) | loss: 2.435701 | lrm: 0.12 | dt: 642.38ms | tok/sec: 816,170 | mfu: 51.01 | epoch: 2 | total time: 168.58m | eta: 10.9m +step 15694/16704 (93.95%) | loss: 2.436035 | lrm: 0.12 | dt: 644.39ms | tok/sec: 813,614 | mfu: 50.85 | epoch: 2 | total time: 168.59m | eta: 10.9m +step 15695/16704 (93.96%) | loss: 2.438276 | lrm: 0.12 | dt: 643.32ms | tok/sec: 814,968 | mfu: 50.94 | epoch: 2 | total 
time: 168.60m | eta: 10.8m +step 15696/16704 (93.97%) | loss: 2.453592 | lrm: 0.12 | dt: 644.46ms | tok/sec: 813,535 | mfu: 50.85 | epoch: 2 | total time: 168.61m | eta: 10.8m +step 15697/16704 (93.97%) | loss: 2.462487 | lrm: 0.12 | dt: 643.49ms | tok/sec: 814,760 | mfu: 50.92 | epoch: 2 | total time: 168.62m | eta: 10.8m +step 15698/16704 (93.98%) | loss: 2.463364 | lrm: 0.12 | dt: 643.08ms | tok/sec: 815,277 | mfu: 50.96 | epoch: 2 | total time: 168.63m | eta: 10.8m +step 15699/16704 (93.98%) | loss: 2.463013 | lrm: 0.12 | dt: 644.08ms | tok/sec: 814,012 | mfu: 50.88 | epoch: 2 | total time: 168.64m | eta: 10.8m +step 15700/16704 (93.99%) | loss: 2.456578 | lrm: 0.12 | dt: 641.37ms | tok/sec: 817,456 | mfu: 51.09 | epoch: 2 | total time: 168.66m | eta: 10.8m +step 15701/16704 (94.00%) | loss: 2.461301 | lrm: 0.12 | dt: 645.08ms | tok/sec: 812,744 | mfu: 50.80 | epoch: 2 | total time: 168.67m | eta: 10.8m +step 15702/16704 (94.00%) | loss: 2.451075 | lrm: 0.12 | dt: 645.18ms | tok/sec: 812,616 | mfu: 50.79 | epoch: 2 | total time: 168.68m | eta: 10.8m +step 15703/16704 (94.01%) | loss: 2.443756 | lrm: 0.12 | dt: 642.57ms | tok/sec: 815,926 | mfu: 51.00 | epoch: 2 | total time: 168.69m | eta: 10.8m +step 15704/16704 (94.01%) | loss: 2.439137 | lrm: 0.12 | dt: 643.97ms | tok/sec: 814,153 | mfu: 50.89 | epoch: 2 | total time: 168.70m | eta: 10.7m +step 15705/16704 (94.02%) | loss: 2.438782 | lrm: 0.12 | dt: 642.26ms | tok/sec: 816,322 | mfu: 51.02 | epoch: 2 | total time: 168.71m | eta: 10.7m +step 15706/16704 (94.03%) | loss: 2.446694 | lrm: 0.12 | dt: 643.14ms | tok/sec: 815,206 | mfu: 50.95 | epoch: 2 | total time: 168.72m | eta: 10.7m +step 15707/16704 (94.03%) | loss: 2.450327 | lrm: 0.12 | dt: 642.88ms | tok/sec: 815,525 | mfu: 50.97 | epoch: 2 | total time: 168.73m | eta: 10.7m +step 15708/16704 (94.04%) | loss: 2.446463 | lrm: 0.12 | dt: 644.67ms | tok/sec: 813,266 | mfu: 50.83 | epoch: 2 | total time: 168.74m | eta: 10.7m +step 15709/16704 (94.04%) | loss: 
2.440776 | lrm: 0.12 | dt: 643.71ms | tok/sec: 814,484 | mfu: 50.91 | epoch: 2 | total time: 168.75m | eta: 10.7m +step 15710/16704 (94.05%) | loss: 2.445593 | lrm: 0.12 | dt: 644.55ms | tok/sec: 813,423 | mfu: 50.84 | epoch: 2 | total time: 168.76m | eta: 10.7m +step 15711/16704 (94.06%) | loss: 2.438162 | lrm: 0.12 | dt: 644.13ms | tok/sec: 813,944 | mfu: 50.87 | epoch: 2 | total time: 168.77m | eta: 10.7m +step 15712/16704 (94.06%) | loss: 2.443761 | lrm: 0.12 | dt: 643.91ms | tok/sec: 814,231 | mfu: 50.89 | epoch: 2 | total time: 168.78m | eta: 10.7m +step 15713/16704 (94.07%) | loss: 2.429238 | lrm: 0.12 | dt: 642.96ms | tok/sec: 815,431 | mfu: 50.97 | epoch: 2 | total time: 168.80m | eta: 10.7m +step 15714/16704 (94.07%) | loss: 2.426546 | lrm: 0.12 | dt: 646.04ms | tok/sec: 811,542 | mfu: 50.72 | epoch: 2 | total time: 168.81m | eta: 10.6m +step 15715/16704 (94.08%) | loss: 2.428018 | lrm: 0.12 | dt: 643.86ms | tok/sec: 814,292 | mfu: 50.89 | epoch: 2 | total time: 168.82m | eta: 10.6m +step 15716/16704 (94.09%) | loss: 2.419862 | lrm: 0.12 | dt: 643.68ms | tok/sec: 814,514 | mfu: 50.91 | epoch: 2 | total time: 168.83m | eta: 10.6m +step 15717/16704 (94.09%) | loss: 2.425850 | lrm: 0.12 | dt: 644.70ms | tok/sec: 813,222 | mfu: 50.83 | epoch: 2 | total time: 168.84m | eta: 10.6m +step 15718/16704 (94.10%) | loss: 2.422627 | lrm: 0.12 | dt: 643.86ms | tok/sec: 814,283 | mfu: 50.89 | epoch: 2 | total time: 168.85m | eta: 10.6m +step 15719/16704 (94.10%) | loss: 2.427579 | lrm: 0.12 | dt: 643.85ms | tok/sec: 814,300 | mfu: 50.89 | epoch: 2 | total time: 168.86m | eta: 10.6m +step 15720/16704 (94.11%) | loss: 2.430294 | lrm: 0.12 | dt: 644.06ms | tok/sec: 814,037 | mfu: 50.88 | epoch: 2 | total time: 168.87m | eta: 10.6m +step 15721/16704 (94.12%) | loss: 2.437696 | lrm: 0.12 | dt: 645.49ms | tok/sec: 812,228 | mfu: 50.77 | epoch: 2 | total time: 168.88m | eta: 10.6m +step 15722/16704 (94.12%) | loss: 2.438287 | lrm: 0.12 | dt: 643.86ms | tok/sec: 814,291 | mfu: 
50.89 | epoch: 2 | total time: 168.89m | eta: 10.6m +step 15723/16704 (94.13%) | loss: 2.442420 | lrm: 0.12 | dt: 640.23ms | tok/sec: 818,908 | mfu: 51.18 | epoch: 2 | total time: 168.90m | eta: 10.5m +step 15724/16704 (94.13%) | loss: 2.437276 | lrm: 0.12 | dt: 646.05ms | tok/sec: 811,526 | mfu: 50.72 | epoch: 2 | total time: 168.91m | eta: 10.5m +step 15725/16704 (94.14%) | loss: 2.449195 | lrm: 0.12 | dt: 641.97ms | tok/sec: 816,683 | mfu: 51.04 | epoch: 2 | total time: 168.92m | eta: 10.5m +step 15726/16704 (94.15%) | loss: 2.458542 | lrm: 0.12 | dt: 644.91ms | tok/sec: 812,957 | mfu: 50.81 | epoch: 2 | total time: 168.93m | eta: 10.5m +step 15727/16704 (94.15%) | loss: 2.468350 | lrm: 0.12 | dt: 644.45ms | tok/sec: 813,542 | mfu: 50.85 | epoch: 2 | total time: 168.95m | eta: 10.5m +step 15728/16704 (94.16%) | loss: 2.476717 | lrm: 0.12 | dt: 642.40ms | tok/sec: 816,138 | mfu: 51.01 | epoch: 2 | total time: 168.96m | eta: 10.5m +step 15729/16704 (94.16%) | loss: 2.470525 | lrm: 0.12 | dt: 643.12ms | tok/sec: 815,225 | mfu: 50.95 | epoch: 2 | total time: 168.97m | eta: 10.5m +step 15730/16704 (94.17%) | loss: 2.465483 | lrm: 0.12 | dt: 646.34ms | tok/sec: 811,159 | mfu: 50.70 | epoch: 2 | total time: 168.98m | eta: 10.5m +step 15731/16704 (94.18%) | loss: 2.458396 | lrm: 0.12 | dt: 642.67ms | tok/sec: 815,801 | mfu: 50.99 | epoch: 2 | total time: 168.99m | eta: 10.5m +step 15732/16704 (94.18%) | loss: 2.459681 | lrm: 0.12 | dt: 646.49ms | tok/sec: 810,977 | mfu: 50.69 | epoch: 2 | total time: 169.00m | eta: 10.4m +step 15733/16704 (94.19%) | loss: 2.454452 | lrm: 0.12 | dt: 643.54ms | tok/sec: 814,687 | mfu: 50.92 | epoch: 2 | total time: 169.01m | eta: 10.4m +step 15734/16704 (94.19%) | loss: 2.450813 | lrm: 0.12 | dt: 644.00ms | tok/sec: 814,116 | mfu: 50.88 | epoch: 2 | total time: 169.02m | eta: 10.4m +step 15735/16704 (94.20%) | loss: 2.452705 | lrm: 0.12 | dt: 644.52ms | tok/sec: 813,453 | mfu: 50.84 | epoch: 2 | total time: 169.03m | eta: 10.4m +step 
15736/16704 (94.20%) | loss: 2.452366 | lrm: 0.12 | dt: 643.45ms | tok/sec: 814,809 | mfu: 50.93 | epoch: 2 | total time: 169.04m | eta: 10.4m +step 15737/16704 (94.21%) | loss: 2.437719 | lrm: 0.12 | dt: 643.35ms | tok/sec: 814,933 | mfu: 50.93 | epoch: 2 | total time: 169.05m | eta: 10.4m +step 15738/16704 (94.22%) | loss: 2.430248 | lrm: 0.12 | dt: 641.70ms | tok/sec: 817,028 | mfu: 51.07 | epoch: 2 | total time: 169.06m | eta: 10.4m +step 15739/16704 (94.22%) | loss: 2.432278 | lrm: 0.12 | dt: 645.24ms | tok/sec: 812,543 | mfu: 50.79 | epoch: 2 | total time: 169.07m | eta: 10.4m +step 15740/16704 (94.23%) | loss: 2.449079 | lrm: 0.12 | dt: 645.93ms | tok/sec: 811,682 | mfu: 50.73 | epoch: 2 | total time: 169.08m | eta: 10.4m +step 15741/16704 (94.23%) | loss: 2.436643 | lrm: 0.12 | dt: 645.07ms | tok/sec: 812,766 | mfu: 50.80 | epoch: 2 | total time: 169.10m | eta: 10.4m +step 15742/16704 (94.24%) | loss: 2.457297 | lrm: 0.12 | dt: 645.01ms | tok/sec: 812,842 | mfu: 50.80 | epoch: 2 | total time: 169.11m | eta: 10.3m +step 15743/16704 (94.25%) | loss: 2.448232 | lrm: 0.12 | dt: 643.54ms | tok/sec: 814,697 | mfu: 50.92 | epoch: 2 | total time: 169.12m | eta: 10.3m +step 15744/16704 (94.25%) | loss: 2.457162 | lrm: 0.11 | dt: 645.06ms | tok/sec: 812,770 | mfu: 50.80 | epoch: 2 | total time: 169.13m | eta: 10.3m +step 15745/16704 (94.26%) | loss: 2.445891 | lrm: 0.11 | dt: 642.85ms | tok/sec: 815,562 | mfu: 50.97 | epoch: 2 | total time: 169.14m | eta: 10.3m +step 15746/16704 (94.26%) | loss: 2.454537 | lrm: 0.11 | dt: 647.92ms | tok/sec: 809,187 | mfu: 50.58 | epoch: 2 | total time: 169.15m | eta: 10.3m +step 15747/16704 (94.27%) | loss: 2.460449 | lrm: 0.11 | dt: 642.35ms | tok/sec: 816,199 | mfu: 51.01 | epoch: 2 | total time: 169.16m | eta: 10.3m +step 15748/16704 (94.28%) | loss: 2.475498 | lrm: 0.11 | dt: 644.81ms | tok/sec: 813,094 | mfu: 50.82 | epoch: 2 | total time: 169.17m | eta: 10.3m +step 15749/16704 (94.28%) | loss: 2.451160 | lrm: 0.11 | dt: 
645.76ms | tok/sec: 811,899 | mfu: 50.74 | epoch: 2 | total time: 169.18m | eta: 10.3m +Step 15750 | Validation bpb: 0.756988 +step 15750/16704 (94.29%) | loss: 2.445057 | lrm: 0.11 | dt: 634.56ms | tok/sec: 826,220 | mfu: 51.64 | epoch: 2 | total time: 169.19m | eta: 10.3m +step 15751/16704 (94.29%) | loss: 2.449324 | lrm: 0.11 | dt: 652.62ms | tok/sec: 803,361 | mfu: 50.21 | epoch: 2 | total time: 169.20m | eta: 10.2m +step 15752/16704 (94.30%) | loss: 2.448285 | lrm: 0.11 | dt: 640.85ms | tok/sec: 818,114 | mfu: 51.13 | epoch: 2 | total time: 169.21m | eta: 10.2m +step 15753/16704 (94.31%) | loss: 2.443627 | lrm: 0.11 | dt: 644.61ms | tok/sec: 813,339 | mfu: 50.83 | epoch: 2 | total time: 169.22m | eta: 10.2m +step 15754/16704 (94.31%) | loss: 2.447864 | lrm: 0.11 | dt: 646.62ms | tok/sec: 810,808 | mfu: 50.68 | epoch: 2 | total time: 169.24m | eta: 10.2m +step 15755/16704 (94.32%) | loss: 2.444425 | lrm: 0.11 | dt: 639.67ms | tok/sec: 819,627 | mfu: 51.23 | epoch: 2 | total time: 169.25m | eta: 10.2m +step 15756/16704 (94.32%) | loss: 2.462537 | lrm: 0.11 | dt: 645.08ms | tok/sec: 812,744 | mfu: 50.80 | epoch: 2 | total time: 169.26m | eta: 10.2m +step 15757/16704 (94.33%) | loss: 2.459841 | lrm: 0.11 | dt: 645.55ms | tok/sec: 812,161 | mfu: 50.76 | epoch: 2 | total time: 169.27m | eta: 10.2m +step 15758/16704 (94.34%) | loss: 2.462012 | lrm: 0.11 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 2 | total time: 169.28m | eta: 10.2m +step 15759/16704 (94.34%) | loss: 2.473987 | lrm: 0.11 | dt: 646.70ms | tok/sec: 810,718 | mfu: 50.67 | epoch: 2 | total time: 169.29m | eta: 10.2m +step 15760/16704 (94.35%) | loss: 2.471853 | lrm: 0.11 | dt: 643.47ms | tok/sec: 814,786 | mfu: 50.93 | epoch: 2 | total time: 169.30m | eta: 10.1m +step 15761/16704 (94.35%) | loss: 2.468686 | lrm: 0.11 | dt: 643.52ms | tok/sec: 814,718 | mfu: 50.92 | epoch: 2 | total time: 169.31m | eta: 10.1m +step 15762/16704 (94.36%) | loss: 2.469107 | lrm: 0.11 | dt: 646.51ms | tok/sec: 
810,947 | mfu: 50.69 | epoch: 2 | total time: 169.32m | eta: 10.1m +step 15763/16704 (94.37%) | loss: 2.466171 | lrm: 0.11 | dt: 641.48ms | tok/sec: 817,306 | mfu: 51.08 | epoch: 2 | total time: 169.33m | eta: 10.1m +step 15764/16704 (94.37%) | loss: 2.468749 | lrm: 0.11 | dt: 643.15ms | tok/sec: 815,189 | mfu: 50.95 | epoch: 2 | total time: 169.34m | eta: 10.1m +step 15765/16704 (94.38%) | loss: 2.453356 | lrm: 0.11 | dt: 644.55ms | tok/sec: 813,412 | mfu: 50.84 | epoch: 2 | total time: 169.35m | eta: 10.1m +step 15766/16704 (94.38%) | loss: 2.451581 | lrm: 0.11 | dt: 644.91ms | tok/sec: 812,965 | mfu: 50.81 | epoch: 2 | total time: 169.36m | eta: 10.1m +step 15767/16704 (94.39%) | loss: 2.452322 | lrm: 0.11 | dt: 646.30ms | tok/sec: 811,220 | mfu: 50.70 | epoch: 2 | total time: 169.37m | eta: 10.1m +step 15768/16704 (94.40%) | loss: 2.467455 | lrm: 0.11 | dt: 646.65ms | tok/sec: 810,776 | mfu: 50.67 | epoch: 2 | total time: 169.39m | eta: 10.1m +step 15769/16704 (94.40%) | loss: 2.470616 | lrm: 0.11 | dt: 643.93ms | tok/sec: 814,198 | mfu: 50.89 | epoch: 2 | total time: 169.40m | eta: 10.1m +step 15770/16704 (94.41%) | loss: 2.455842 | lrm: 0.11 | dt: 644.59ms | tok/sec: 813,371 | mfu: 50.84 | epoch: 2 | total time: 169.41m | eta: 10.0m +step 15771/16704 (94.41%) | loss: 2.463109 | lrm: 0.11 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 2 | total time: 169.42m | eta: 10.0m +step 15772/16704 (94.42%) | loss: 2.450066 | lrm: 0.11 | dt: 644.55ms | tok/sec: 813,421 | mfu: 50.84 | epoch: 2 | total time: 169.43m | eta: 10.0m +step 15773/16704 (94.43%) | loss: 2.445119 | lrm: 0.11 | dt: 646.07ms | tok/sec: 811,504 | mfu: 50.72 | epoch: 2 | total time: 169.44m | eta: 10.0m +step 15774/16704 (94.43%) | loss: 2.453997 | lrm: 0.11 | dt: 641.41ms | tok/sec: 817,398 | mfu: 51.09 | epoch: 2 | total time: 169.45m | eta: 10.0m +step 15775/16704 (94.44%) | loss: 2.445747 | lrm: 0.11 | dt: 644.28ms | tok/sec: 813,752 | mfu: 50.86 | epoch: 2 | total time: 169.46m | eta: 
10.0m +step 15776/16704 (94.44%) | loss: 2.446333 | lrm: 0.11 | dt: 645.39ms | tok/sec: 812,354 | mfu: 50.77 | epoch: 2 | total time: 169.47m | eta: 10.0m +step 15777/16704 (94.45%) | loss: 2.439265 | lrm: 0.11 | dt: 643.55ms | tok/sec: 814,678 | mfu: 50.92 | epoch: 2 | total time: 169.48m | eta: 10.0m +step 15778/16704 (94.46%) | loss: 2.426128 | lrm: 0.11 | dt: 645.22ms | tok/sec: 812,568 | mfu: 50.79 | epoch: 2 | total time: 169.49m | eta: 10.0m +step 15779/16704 (94.46%) | loss: 2.438545 | lrm: 0.11 | dt: 642.72ms | tok/sec: 815,730 | mfu: 50.98 | epoch: 2 | total time: 169.50m | eta: 9.9m +step 15780/16704 (94.47%) | loss: 2.448166 | lrm: 0.11 | dt: 644.10ms | tok/sec: 813,987 | mfu: 50.88 | epoch: 2 | total time: 169.51m | eta: 9.9m +step 15781/16704 (94.47%) | loss: 2.449712 | lrm: 0.11 | dt: 644.29ms | tok/sec: 813,744 | mfu: 50.86 | epoch: 2 | total time: 169.53m | eta: 9.9m +step 15782/16704 (94.48%) | loss: 2.442072 | lrm: 0.11 | dt: 644.17ms | tok/sec: 813,894 | mfu: 50.87 | epoch: 2 | total time: 169.54m | eta: 9.9m +step 15783/16704 (94.49%) | loss: 2.450251 | lrm: 0.11 | dt: 644.11ms | tok/sec: 813,972 | mfu: 50.87 | epoch: 2 | total time: 169.55m | eta: 9.9m +step 15784/16704 (94.49%) | loss: 2.443385 | lrm: 0.11 | dt: 644.48ms | tok/sec: 813,506 | mfu: 50.85 | epoch: 2 | total time: 169.56m | eta: 9.9m +step 15785/16704 (94.50%) | loss: 2.435051 | lrm: 0.11 | dt: 646.43ms | tok/sec: 811,045 | mfu: 50.69 | epoch: 2 | total time: 169.57m | eta: 9.9m +step 15786/16704 (94.50%) | loss: 2.437875 | lrm: 0.11 | dt: 643.80ms | tok/sec: 814,365 | mfu: 50.90 | epoch: 2 | total time: 169.58m | eta: 9.9m +step 15787/16704 (94.51%) | loss: 2.443534 | lrm: 0.11 | dt: 645.01ms | tok/sec: 812,832 | mfu: 50.80 | epoch: 2 | total time: 169.59m | eta: 9.9m +step 15788/16704 (94.52%) | loss: 2.441102 | lrm: 0.11 | dt: 643.80ms | tok/sec: 814,369 | mfu: 50.90 | epoch: 2 | total time: 169.60m | eta: 9.8m +step 15789/16704 (94.52%) | loss: 2.424454 | lrm: 0.11 | dt: 
644.24ms | tok/sec: 813,810 | mfu: 50.86 | epoch: 2 | total time: 169.61m | eta: 9.8m +step 15790/16704 (94.53%) | loss: 2.438475 | lrm: 0.11 | dt: 644.57ms | tok/sec: 813,386 | mfu: 50.84 | epoch: 2 | total time: 169.62m | eta: 9.8m +step 15791/16704 (94.53%) | loss: 2.431562 | lrm: 0.11 | dt: 644.99ms | tok/sec: 812,859 | mfu: 50.80 | epoch: 2 | total time: 169.63m | eta: 9.8m +step 15792/16704 (94.54%) | loss: 2.441733 | lrm: 0.11 | dt: 642.76ms | tok/sec: 815,688 | mfu: 50.98 | epoch: 2 | total time: 169.64m | eta: 9.8m +step 15793/16704 (94.55%) | loss: 2.446096 | lrm: 0.11 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 2 | total time: 169.65m | eta: 9.8m +step 15794/16704 (94.55%) | loss: 2.436002 | lrm: 0.11 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 2 | total time: 169.66m | eta: 9.8m +step 15795/16704 (94.56%) | loss: 2.446786 | lrm: 0.11 | dt: 643.46ms | tok/sec: 814,795 | mfu: 50.93 | epoch: 2 | total time: 169.68m | eta: 9.8m +step 15796/16704 (94.56%) | loss: 2.445403 | lrm: 0.11 | dt: 644.12ms | tok/sec: 813,965 | mfu: 50.87 | epoch: 2 | total time: 169.69m | eta: 9.8m +step 15797/16704 (94.57%) | loss: 2.441257 | lrm: 0.11 | dt: 645.16ms | tok/sec: 812,650 | mfu: 50.79 | epoch: 2 | total time: 169.70m | eta: 9.7m +step 15798/16704 (94.58%) | loss: 2.427973 | lrm: 0.11 | dt: 644.73ms | tok/sec: 813,185 | mfu: 50.83 | epoch: 2 | total time: 169.71m | eta: 9.7m +step 15799/16704 (94.58%) | loss: 2.435663 | lrm: 0.11 | dt: 644.97ms | tok/sec: 812,887 | mfu: 50.81 | epoch: 2 | total time: 169.72m | eta: 9.7m +step 15800/16704 (94.59%) | loss: 2.422449 | lrm: 0.11 | dt: 643.52ms | tok/sec: 814,724 | mfu: 50.92 | epoch: 2 | total time: 169.73m | eta: 9.7m +step 15801/16704 (94.59%) | loss: 2.401987 | lrm: 0.11 | dt: 644.22ms | tok/sec: 813,834 | mfu: 50.87 | epoch: 2 | total time: 169.74m | eta: 9.7m +step 15802/16704 (94.60%) | loss: 2.383676 | lrm: 0.11 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 2 | total time: 169.75m | 
eta: 9.7m +step 15803/16704 (94.61%) | loss: 2.385842 | lrm: 0.11 | dt: 642.61ms | tok/sec: 815,870 | mfu: 50.99 | epoch: 2 | total time: 169.76m | eta: 9.7m +step 15804/16704 (94.61%) | loss: 2.404920 | lrm: 0.11 | dt: 646.08ms | tok/sec: 811,489 | mfu: 50.72 | epoch: 2 | total time: 169.77m | eta: 9.7m +step 15805/16704 (94.62%) | loss: 2.400301 | lrm: 0.11 | dt: 643.21ms | tok/sec: 815,112 | mfu: 50.95 | epoch: 2 | total time: 169.78m | eta: 9.7m +step 15806/16704 (94.62%) | loss: 2.407605 | lrm: 0.11 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 2 | total time: 169.79m | eta: 9.7m +step 15807/16704 (94.63%) | loss: 2.409526 | lrm: 0.11 | dt: 645.05ms | tok/sec: 812,792 | mfu: 50.80 | epoch: 2 | total time: 169.80m | eta: 9.6m +step 15808/16704 (94.64%) | loss: 2.405717 | lrm: 0.11 | dt: 642.96ms | tok/sec: 815,430 | mfu: 50.97 | epoch: 2 | total time: 169.82m | eta: 9.6m +step 15809/16704 (94.64%) | loss: 2.420428 | lrm: 0.11 | dt: 645.39ms | tok/sec: 812,361 | mfu: 50.77 | epoch: 2 | total time: 169.83m | eta: 9.6m +step 15810/16704 (94.65%) | loss: 2.422629 | lrm: 0.11 | dt: 642.62ms | tok/sec: 815,856 | mfu: 50.99 | epoch: 2 | total time: 169.84m | eta: 9.6m +step 15811/16704 (94.65%) | loss: 2.430635 | lrm: 0.11 | dt: 642.47ms | tok/sec: 816,046 | mfu: 51.00 | epoch: 2 | total time: 169.85m | eta: 9.6m +step 15812/16704 (94.66%) | loss: 2.442638 | lrm: 0.11 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 2 | total time: 169.86m | eta: 9.6m +step 15813/16704 (94.67%) | loss: 2.433260 | lrm: 0.11 | dt: 644.35ms | tok/sec: 813,674 | mfu: 50.86 | epoch: 2 | total time: 169.87m | eta: 9.6m +step 15814/16704 (94.67%) | loss: 2.442020 | lrm: 0.11 | dt: 644.02ms | tok/sec: 814,088 | mfu: 50.88 | epoch: 2 | total time: 169.88m | eta: 9.6m +step 15815/16704 (94.68%) | loss: 2.432867 | lrm: 0.11 | dt: 646.43ms | tok/sec: 811,053 | mfu: 50.69 | epoch: 2 | total time: 169.89m | eta: 9.6m +step 15816/16704 (94.68%) | loss: 2.416809 | lrm: 0.11 | dt: 
645.76ms | tok/sec: 811,895 | mfu: 50.74 | epoch: 2 | total time: 169.90m | eta: 9.5m +step 15817/16704 (94.69%) | loss: 2.415837 | lrm: 0.11 | dt: 642.83ms | tok/sec: 815,592 | mfu: 50.98 | epoch: 2 | total time: 169.91m | eta: 9.5m +step 15818/16704 (94.70%) | loss: 2.420308 | lrm: 0.11 | dt: 644.69ms | tok/sec: 813,238 | mfu: 50.83 | epoch: 2 | total time: 169.92m | eta: 9.5m +step 15819/16704 (94.70%) | loss: 2.431153 | lrm: 0.11 | dt: 642.26ms | tok/sec: 816,321 | mfu: 51.02 | epoch: 2 | total time: 169.93m | eta: 9.5m +step 15820/16704 (94.71%) | loss: 2.444047 | lrm: 0.11 | dt: 645.18ms | tok/sec: 812,616 | mfu: 50.79 | epoch: 2 | total time: 169.94m | eta: 9.5m +step 15821/16704 (94.71%) | loss: 2.439146 | lrm: 0.11 | dt: 644.20ms | tok/sec: 813,862 | mfu: 50.87 | epoch: 2 | total time: 169.95m | eta: 9.5m +step 15822/16704 (94.72%) | loss: 2.448607 | lrm: 0.11 | dt: 643.61ms | tok/sec: 814,608 | mfu: 50.91 | epoch: 2 | total time: 169.97m | eta: 9.5m +step 15823/16704 (94.73%) | loss: 2.441799 | lrm: 0.11 | dt: 642.66ms | tok/sec: 815,810 | mfu: 50.99 | epoch: 2 | total time: 169.98m | eta: 9.5m +step 15824/16704 (94.73%) | loss: 2.442958 | lrm: 0.11 | dt: 646.78ms | tok/sec: 810,607 | mfu: 50.66 | epoch: 2 | total time: 169.99m | eta: 9.5m +step 15825/16704 (94.74%) | loss: 2.428534 | lrm: 0.11 | dt: 642.25ms | tok/sec: 816,326 | mfu: 51.02 | epoch: 2 | total time: 170.00m | eta: 9.4m +step 15826/16704 (94.74%) | loss: 2.409166 | lrm: 0.11 | dt: 645.35ms | tok/sec: 812,408 | mfu: 50.78 | epoch: 2 | total time: 170.01m | eta: 9.4m +step 15827/16704 (94.75%) | loss: 2.396592 | lrm: 0.11 | dt: 644.99ms | tok/sec: 812,859 | mfu: 50.80 | epoch: 2 | total time: 170.02m | eta: 9.4m +step 15828/16704 (94.76%) | loss: 2.394374 | lrm: 0.10 | dt: 644.32ms | tok/sec: 813,702 | mfu: 50.86 | epoch: 2 | total time: 170.03m | eta: 9.4m +step 15829/16704 (94.76%) | loss: 2.400146 | lrm: 0.10 | dt: 644.37ms | tok/sec: 813,650 | mfu: 50.85 | epoch: 2 | total time: 170.04m | 
eta: 9.4m +step 15830/16704 (94.77%) | loss: 2.400818 | lrm: 0.10 | dt: 645.04ms | tok/sec: 812,797 | mfu: 50.80 | epoch: 2 | total time: 170.05m | eta: 9.4m +step 15831/16704 (94.77%) | loss: 2.405465 | lrm: 0.10 | dt: 644.20ms | tok/sec: 813,854 | mfu: 50.87 | epoch: 2 | total time: 170.06m | eta: 9.4m +step 15832/16704 (94.78%) | loss: 2.412304 | lrm: 0.10 | dt: 643.26ms | tok/sec: 815,051 | mfu: 50.94 | epoch: 2 | total time: 170.07m | eta: 9.4m +step 15833/16704 (94.79%) | loss: 2.417841 | lrm: 0.10 | dt: 644.34ms | tok/sec: 813,686 | mfu: 50.86 | epoch: 2 | total time: 170.08m | eta: 9.4m +step 15834/16704 (94.79%) | loss: 2.418566 | lrm: 0.10 | dt: 645.05ms | tok/sec: 812,791 | mfu: 50.80 | epoch: 2 | total time: 170.09m | eta: 9.4m +step 15835/16704 (94.80%) | loss: 2.420264 | lrm: 0.10 | dt: 644.16ms | tok/sec: 813,904 | mfu: 50.87 | epoch: 2 | total time: 170.11m | eta: 9.3m +step 15836/16704 (94.80%) | loss: 2.429494 | lrm: 0.10 | dt: 643.01ms | tok/sec: 815,369 | mfu: 50.96 | epoch: 2 | total time: 170.12m | eta: 9.3m +step 15837/16704 (94.81%) | loss: 2.438446 | lrm: 0.10 | dt: 645.63ms | tok/sec: 812,053 | mfu: 50.75 | epoch: 2 | total time: 170.13m | eta: 9.3m +step 15838/16704 (94.82%) | loss: 2.435926 | lrm: 0.10 | dt: 643.52ms | tok/sec: 814,713 | mfu: 50.92 | epoch: 2 | total time: 170.14m | eta: 9.3m +step 15839/16704 (94.82%) | loss: 2.438914 | lrm: 0.10 | dt: 644.16ms | tok/sec: 813,908 | mfu: 50.87 | epoch: 2 | total time: 170.15m | eta: 9.3m +step 15840/16704 (94.83%) | loss: 2.431785 | lrm: 0.10 | dt: 643.48ms | tok/sec: 814,774 | mfu: 50.92 | epoch: 2 | total time: 170.16m | eta: 9.3m +step 15841/16704 (94.83%) | loss: 2.429976 | lrm: 0.10 | dt: 643.98ms | tok/sec: 814,136 | mfu: 50.88 | epoch: 2 | total time: 170.17m | eta: 9.3m +step 15842/16704 (94.84%) | loss: 2.424565 | lrm: 0.10 | dt: 645.87ms | tok/sec: 811,753 | mfu: 50.74 | epoch: 2 | total time: 170.18m | eta: 9.3m +step 15843/16704 (94.85%) | loss: 2.432051 | lrm: 0.10 | dt: 
645.35ms | tok/sec: 812,406 | mfu: 50.78 | epoch: 2 | total time: 170.19m | eta: 9.3m +step 15844/16704 (94.85%) | loss: 2.422547 | lrm: 0.10 | dt: 646.95ms | tok/sec: 810,394 | mfu: 50.65 | epoch: 2 | total time: 170.20m | eta: 9.2m +step 15845/16704 (94.86%) | loss: 2.431550 | lrm: 0.10 | dt: 643.38ms | tok/sec: 814,896 | mfu: 50.93 | epoch: 2 | total time: 170.21m | eta: 9.2m +step 15846/16704 (94.86%) | loss: 2.438512 | lrm: 0.10 | dt: 644.31ms | tok/sec: 813,717 | mfu: 50.86 | epoch: 2 | total time: 170.22m | eta: 9.2m +step 15847/16704 (94.87%) | loss: 2.432188 | lrm: 0.10 | dt: 646.31ms | tok/sec: 811,203 | mfu: 50.70 | epoch: 2 | total time: 170.23m | eta: 9.2m +step 15848/16704 (94.88%) | loss: 2.426153 | lrm: 0.10 | dt: 643.62ms | tok/sec: 814,587 | mfu: 50.91 | epoch: 2 | total time: 170.24m | eta: 9.2m +step 15849/16704 (94.88%) | loss: 2.419551 | lrm: 0.10 | dt: 644.62ms | tok/sec: 813,326 | mfu: 50.83 | epoch: 2 | total time: 170.26m | eta: 9.2m +step 15850/16704 (94.89%) | loss: 2.406764 | lrm: 0.10 | dt: 644.06ms | tok/sec: 814,039 | mfu: 50.88 | epoch: 2 | total time: 170.27m | eta: 9.2m +step 15851/16704 (94.89%) | loss: 2.403639 | lrm: 0.10 | dt: 646.18ms | tok/sec: 811,362 | mfu: 50.71 | epoch: 2 | total time: 170.28m | eta: 9.2m +step 15852/16704 (94.90%) | loss: 2.397146 | lrm: 0.10 | dt: 646.13ms | tok/sec: 811,427 | mfu: 50.72 | epoch: 2 | total time: 170.29m | eta: 9.2m +step 15853/16704 (94.91%) | loss: 2.397000 | lrm: 0.10 | dt: 642.97ms | tok/sec: 815,421 | mfu: 50.97 | epoch: 2 | total time: 170.30m | eta: 9.1m +step 15854/16704 (94.91%) | loss: 2.399308 | lrm: 0.10 | dt: 646.74ms | tok/sec: 810,668 | mfu: 50.67 | epoch: 2 | total time: 170.31m | eta: 9.1m +step 15855/16704 (94.92%) | loss: 2.395606 | lrm: 0.10 | dt: 645.77ms | tok/sec: 811,879 | mfu: 50.74 | epoch: 2 | total time: 170.32m | eta: 9.1m +step 15856/16704 (94.92%) | loss: 2.398434 | lrm: 0.10 | dt: 644.16ms | tok/sec: 813,913 | mfu: 50.87 | epoch: 2 | total time: 170.33m | 
eta: 9.1m +step 15857/16704 (94.93%) | loss: 2.380844 | lrm: 0.10 | dt: 643.63ms | tok/sec: 814,580 | mfu: 50.91 | epoch: 2 | total time: 170.34m | eta: 9.1m +step 15858/16704 (94.94%) | loss: 2.393430 | lrm: 0.10 | dt: 644.23ms | tok/sec: 813,815 | mfu: 50.86 | epoch: 2 | total time: 170.35m | eta: 9.1m +step 15859/16704 (94.94%) | loss: 2.410653 | lrm: 0.10 | dt: 644.51ms | tok/sec: 813,466 | mfu: 50.84 | epoch: 2 | total time: 170.36m | eta: 9.1m +step 15860/16704 (94.95%) | loss: 2.414523 | lrm: 0.10 | dt: 642.81ms | tok/sec: 815,620 | mfu: 50.98 | epoch: 2 | total time: 170.37m | eta: 9.1m +step 15861/16704 (94.95%) | loss: 2.434880 | lrm: 0.10 | dt: 644.92ms | tok/sec: 812,947 | mfu: 50.81 | epoch: 2 | total time: 170.38m | eta: 9.1m +step 15862/16704 (94.96%) | loss: 2.436948 | lrm: 0.10 | dt: 644.80ms | tok/sec: 813,103 | mfu: 50.82 | epoch: 2 | total time: 170.40m | eta: 9.1m +step 15863/16704 (94.97%) | loss: 2.430436 | lrm: 0.10 | dt: 645.88ms | tok/sec: 811,746 | mfu: 50.74 | epoch: 2 | total time: 170.41m | eta: 9.0m +step 15864/16704 (94.97%) | loss: 2.443637 | lrm: 0.10 | dt: 646.15ms | tok/sec: 811,402 | mfu: 50.71 | epoch: 2 | total time: 170.42m | eta: 9.0m +step 15865/16704 (94.98%) | loss: 2.448845 | lrm: 0.10 | dt: 644.30ms | tok/sec: 813,736 | mfu: 50.86 | epoch: 2 | total time: 170.43m | eta: 9.0m +step 15866/16704 (94.98%) | loss: 2.456491 | lrm: 0.10 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 2 | total time: 170.44m | eta: 9.0m +step 15867/16704 (94.99%) | loss: 2.464524 | lrm: 0.10 | dt: 644.10ms | tok/sec: 813,988 | mfu: 50.88 | epoch: 2 | total time: 170.45m | eta: 9.0m +step 15868/16704 (95.00%) | loss: 2.460229 | lrm: 0.10 | dt: 644.39ms | tok/sec: 813,615 | mfu: 50.85 | epoch: 2 | total time: 170.46m | eta: 9.0m +step 15869/16704 (95.00%) | loss: 2.470597 | lrm: 0.10 | dt: 646.96ms | tok/sec: 810,392 | mfu: 50.65 | epoch: 2 | total time: 170.47m | eta: 9.0m +step 15870/16704 (95.01%) | loss: 2.465229 | lrm: 0.10 | dt: 
642.52ms | tok/sec: 815,984 | mfu: 51.00 | epoch: 2 | total time: 170.48m | eta: 9.0m +step 15871/16704 (95.01%) | loss: 2.471680 | lrm: 0.10 | dt: 645.89ms | tok/sec: 811,727 | mfu: 50.73 | epoch: 2 | total time: 170.49m | eta: 9.0m +step 15872/16704 (95.02%) | loss: 2.473172 | lrm: 0.10 | dt: 644.47ms | tok/sec: 813,516 | mfu: 50.85 | epoch: 2 | total time: 170.50m | eta: 8.9m +step 15873/16704 (95.03%) | loss: 2.469679 | lrm: 0.10 | dt: 643.31ms | tok/sec: 814,985 | mfu: 50.94 | epoch: 2 | total time: 170.51m | eta: 8.9m +step 15874/16704 (95.03%) | loss: 2.456224 | lrm: 0.10 | dt: 643.28ms | tok/sec: 815,027 | mfu: 50.94 | epoch: 2 | total time: 170.52m | eta: 8.9m +step 15875/16704 (95.04%) | loss: 2.455243 | lrm: 0.10 | dt: 643.59ms | tok/sec: 814,630 | mfu: 50.92 | epoch: 2 | total time: 170.53m | eta: 8.9m +step 15876/16704 (95.04%) | loss: 2.449604 | lrm: 0.10 | dt: 645.68ms | tok/sec: 811,987 | mfu: 50.75 | epoch: 2 | total time: 170.55m | eta: 8.9m +step 15877/16704 (95.05%) | loss: 2.454939 | lrm: 0.10 | dt: 645.37ms | tok/sec: 812,378 | mfu: 50.77 | epoch: 2 | total time: 170.56m | eta: 8.9m +step 15878/16704 (95.06%) | loss: 2.460151 | lrm: 0.10 | dt: 644.04ms | tok/sec: 814,064 | mfu: 50.88 | epoch: 2 | total time: 170.57m | eta: 8.9m +step 15879/16704 (95.06%) | loss: 2.468603 | lrm: 0.10 | dt: 644.40ms | tok/sec: 813,604 | mfu: 50.85 | epoch: 2 | total time: 170.58m | eta: 8.9m +step 15880/16704 (95.07%) | loss: 2.467799 | lrm: 0.10 | dt: 647.53ms | tok/sec: 809,669 | mfu: 50.61 | epoch: 2 | total time: 170.59m | eta: 8.9m +step 15881/16704 (95.07%) | loss: 2.485357 | lrm: 0.10 | dt: 645.85ms | tok/sec: 811,781 | mfu: 50.74 | epoch: 2 | total time: 170.60m | eta: 8.8m +step 15882/16704 (95.08%) | loss: 2.481338 | lrm: 0.10 | dt: 642.90ms | tok/sec: 815,510 | mfu: 50.97 | epoch: 2 | total time: 170.61m | eta: 8.8m +step 15883/16704 (95.09%) | loss: 2.470138 | lrm: 0.10 | dt: 647.01ms | tok/sec: 810,323 | mfu: 50.65 | epoch: 2 | total time: 170.62m | 
eta: 8.8m +step 15884/16704 (95.09%) | loss: 2.458454 | lrm: 0.10 | dt: 644.07ms | tok/sec: 814,023 | mfu: 50.88 | epoch: 2 | total time: 170.63m | eta: 8.8m +step 15885/16704 (95.10%) | loss: 2.446527 | lrm: 0.10 | dt: 646.02ms | tok/sec: 811,563 | mfu: 50.72 | epoch: 2 | total time: 170.64m | eta: 8.8m +step 15886/16704 (95.10%) | loss: 2.432480 | lrm: 0.10 | dt: 643.00ms | tok/sec: 815,378 | mfu: 50.96 | epoch: 2 | total time: 170.65m | eta: 8.8m +step 15887/16704 (95.11%) | loss: 2.428250 | lrm: 0.10 | dt: 643.92ms | tok/sec: 814,209 | mfu: 50.89 | epoch: 2 | total time: 170.66m | eta: 8.8m +step 15888/16704 (95.11%) | loss: 2.427446 | lrm: 0.10 | dt: 646.21ms | tok/sec: 811,323 | mfu: 50.71 | epoch: 2 | total time: 170.67m | eta: 8.8m +step 15889/16704 (95.12%) | loss: 2.426194 | lrm: 0.10 | dt: 645.65ms | tok/sec: 812,037 | mfu: 50.75 | epoch: 2 | total time: 170.69m | eta: 8.8m +step 15890/16704 (95.13%) | loss: 2.424712 | lrm: 0.10 | dt: 643.32ms | tok/sec: 814,977 | mfu: 50.94 | epoch: 2 | total time: 170.70m | eta: 8.7m +step 15891/16704 (95.13%) | loss: 2.422056 | lrm: 0.10 | dt: 644.42ms | tok/sec: 813,583 | mfu: 50.85 | epoch: 2 | total time: 170.71m | eta: 8.7m +step 15892/16704 (95.14%) | loss: 2.430889 | lrm: 0.10 | dt: 643.60ms | tok/sec: 814,618 | mfu: 50.91 | epoch: 2 | total time: 170.72m | eta: 8.7m +step 15893/16704 (95.14%) | loss: 2.437502 | lrm: 0.10 | dt: 643.29ms | tok/sec: 815,010 | mfu: 50.94 | epoch: 2 | total time: 170.73m | eta: 8.7m +step 15894/16704 (95.15%) | loss: 2.438094 | lrm: 0.10 | dt: 645.67ms | tok/sec: 812,000 | mfu: 50.75 | epoch: 2 | total time: 170.74m | eta: 8.7m +step 15895/16704 (95.16%) | loss: 2.433019 | lrm: 0.10 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 2 | total time: 170.75m | eta: 8.7m +step 15896/16704 (95.16%) | loss: 2.441755 | lrm: 0.10 | dt: 644.16ms | tok/sec: 813,905 | mfu: 50.87 | epoch: 2 | total time: 170.76m | eta: 8.7m +step 15897/16704 (95.17%) | loss: 2.441627 | lrm: 0.10 | dt: 
646.37ms | tok/sec: 811,131 | mfu: 50.70 | epoch: 2 | total time: 170.77m | eta: 8.7m +step 15898/16704 (95.17%) | loss: 2.447101 | lrm: 0.10 | dt: 643.86ms | tok/sec: 814,289 | mfu: 50.89 | epoch: 2 | total time: 170.78m | eta: 8.7m +step 15899/16704 (95.18%) | loss: 2.437031 | lrm: 0.10 | dt: 645.00ms | tok/sec: 812,852 | mfu: 50.80 | epoch: 2 | total time: 170.79m | eta: 8.7m +step 15900/16704 (95.19%) | loss: 2.419971 | lrm: 0.10 | dt: 644.85ms | tok/sec: 813,038 | mfu: 50.82 | epoch: 2 | total time: 170.80m | eta: 8.6m +step 15901/16704 (95.19%) | loss: 2.413488 | lrm: 0.10 | dt: 645.68ms | tok/sec: 811,999 | mfu: 50.75 | epoch: 2 | total time: 170.81m | eta: 8.6m +step 15902/16704 (95.20%) | loss: 2.415671 | lrm: 0.10 | dt: 646.41ms | tok/sec: 811,070 | mfu: 50.69 | epoch: 2 | total time: 170.83m | eta: 8.6m +step 15903/16704 (95.20%) | loss: 2.411949 | lrm: 0.10 | dt: 644.36ms | tok/sec: 813,652 | mfu: 50.85 | epoch: 2 | total time: 170.84m | eta: 8.6m +step 15904/16704 (95.21%) | loss: 2.417274 | lrm: 0.10 | dt: 644.36ms | tok/sec: 813,659 | mfu: 50.85 | epoch: 2 | total time: 170.85m | eta: 8.6m +step 15905/16704 (95.22%) | loss: 2.412972 | lrm: 0.10 | dt: 645.28ms | tok/sec: 812,492 | mfu: 50.78 | epoch: 2 | total time: 170.86m | eta: 8.6m +step 15906/16704 (95.22%) | loss: 2.414311 | lrm: 0.10 | dt: 644.86ms | tok/sec: 813,020 | mfu: 50.81 | epoch: 2 | total time: 170.87m | eta: 8.6m +step 15907/16704 (95.23%) | loss: 2.427086 | lrm: 0.10 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.84 | epoch: 2 | total time: 170.88m | eta: 8.6m +step 15908/16704 (95.23%) | loss: 2.429362 | lrm: 0.10 | dt: 644.67ms | tok/sec: 813,267 | mfu: 50.83 | epoch: 2 | total time: 170.89m | eta: 8.6m +step 15909/16704 (95.24%) | loss: 2.440126 | lrm: 0.10 | dt: 644.75ms | tok/sec: 813,165 | mfu: 50.82 | epoch: 2 | total time: 170.90m | eta: 8.5m +step 15910/16704 (95.25%) | loss: 2.435175 | lrm: 0.10 | dt: 643.87ms | tok/sec: 814,273 | mfu: 50.89 | epoch: 2 | total time: 170.91m | 
eta: 8.5m +step 15911/16704 (95.25%) | loss: 2.431532 | lrm: 0.09 | dt: 643.80ms | tok/sec: 814,359 | mfu: 50.90 | epoch: 2 | total time: 170.92m | eta: 8.5m +step 15912/16704 (95.26%) | loss: 2.427506 | lrm: 0.09 | dt: 642.30ms | tok/sec: 816,263 | mfu: 51.02 | epoch: 2 | total time: 170.93m | eta: 8.5m +step 15913/16704 (95.26%) | loss: 2.426703 | lrm: 0.09 | dt: 642.43ms | tok/sec: 816,104 | mfu: 51.01 | epoch: 2 | total time: 170.94m | eta: 8.5m +step 15914/16704 (95.27%) | loss: 2.416156 | lrm: 0.09 | dt: 651.18ms | tok/sec: 805,129 | mfu: 50.32 | epoch: 2 | total time: 170.95m | eta: 8.5m +step 15915/16704 (95.28%) | loss: 2.427229 | lrm: 0.09 | dt: 642.52ms | tok/sec: 815,986 | mfu: 51.00 | epoch: 2 | total time: 170.96m | eta: 8.5m +step 15916/16704 (95.28%) | loss: 2.422424 | lrm: 0.09 | dt: 645.59ms | tok/sec: 812,110 | mfu: 50.76 | epoch: 3 | total time: 170.98m | eta: 8.5m +step 15917/16704 (95.29%) | loss: 2.427623 | lrm: 0.09 | dt: 645.93ms | tok/sec: 811,684 | mfu: 50.73 | epoch: 3 | total time: 170.99m | eta: 8.5m +step 15918/16704 (95.29%) | loss: 2.416108 | lrm: 0.09 | dt: 642.22ms | tok/sec: 816,372 | mfu: 51.02 | epoch: 3 | total time: 171.00m | eta: 8.4m +step 15919/16704 (95.30%) | loss: 2.422607 | lrm: 0.09 | dt: 647.30ms | tok/sec: 809,958 | mfu: 50.62 | epoch: 3 | total time: 171.01m | eta: 8.4m +step 15920/16704 (95.31%) | loss: 2.423859 | lrm: 0.09 | dt: 643.45ms | tok/sec: 814,813 | mfu: 50.93 | epoch: 3 | total time: 171.02m | eta: 8.4m +step 15921/16704 (95.31%) | loss: 2.422501 | lrm: 0.09 | dt: 645.31ms | tok/sec: 812,458 | mfu: 50.78 | epoch: 3 | total time: 171.03m | eta: 8.4m +step 15922/16704 (95.32%) | loss: 2.428101 | lrm: 0.09 | dt: 644.10ms | tok/sec: 813,983 | mfu: 50.88 | epoch: 3 | total time: 171.04m | eta: 8.4m +step 15923/16704 (95.32%) | loss: 2.433384 | lrm: 0.09 | dt: 642.89ms | tok/sec: 815,514 | mfu: 50.97 | epoch: 3 | total time: 171.05m | eta: 8.4m +step 15924/16704 (95.33%) | loss: 2.433751 | lrm: 0.09 | dt: 
646.63ms | tok/sec: 810,800 | mfu: 50.68 | epoch: 3 | total time: 171.06m | eta: 8.4m +step 15925/16704 (95.34%) | loss: 2.436638 | lrm: 0.09 | dt: 647.30ms | tok/sec: 809,961 | mfu: 50.62 | epoch: 3 | total time: 171.07m | eta: 8.4m +step 15926/16704 (95.34%) | loss: 2.446001 | lrm: 0.09 | dt: 646.11ms | tok/sec: 811,454 | mfu: 50.72 | epoch: 3 | total time: 171.08m | eta: 8.4m +step 15927/16704 (95.35%) | loss: 2.441229 | lrm: 0.09 | dt: 644.54ms | tok/sec: 813,426 | mfu: 50.84 | epoch: 3 | total time: 171.09m | eta: 8.4m +step 15928/16704 (95.35%) | loss: 2.433848 | lrm: 0.09 | dt: 644.24ms | tok/sec: 813,811 | mfu: 50.86 | epoch: 3 | total time: 171.10m | eta: 8.3m +step 15929/16704 (95.36%) | loss: 2.433230 | lrm: 0.09 | dt: 643.66ms | tok/sec: 814,546 | mfu: 50.91 | epoch: 3 | total time: 171.12m | eta: 8.3m +step 15930/16704 (95.37%) | loss: 2.435985 | lrm: 0.09 | dt: 646.12ms | tok/sec: 811,441 | mfu: 50.72 | epoch: 3 | total time: 171.13m | eta: 8.3m +step 15931/16704 (95.37%) | loss: 2.436171 | lrm: 0.09 | dt: 641.93ms | tok/sec: 816,732 | mfu: 51.05 | epoch: 3 | total time: 171.14m | eta: 8.3m +step 15932/16704 (95.38%) | loss: 2.436038 | lrm: 0.09 | dt: 643.60ms | tok/sec: 814,611 | mfu: 50.91 | epoch: 3 | total time: 171.15m | eta: 8.3m +step 15933/16704 (95.38%) | loss: 2.440854 | lrm: 0.09 | dt: 644.13ms | tok/sec: 813,946 | mfu: 50.87 | epoch: 3 | total time: 171.16m | eta: 8.3m +step 15934/16704 (95.39%) | loss: 2.451138 | lrm: 0.09 | dt: 643.68ms | tok/sec: 814,522 | mfu: 50.91 | epoch: 3 | total time: 171.17m | eta: 8.3m +step 15935/16704 (95.40%) | loss: 2.439696 | lrm: 0.09 | dt: 645.61ms | tok/sec: 812,076 | mfu: 50.76 | epoch: 3 | total time: 171.18m | eta: 8.3m +step 15936/16704 (95.40%) | loss: 2.436763 | lrm: 0.09 | dt: 642.67ms | tok/sec: 815,791 | mfu: 50.99 | epoch: 3 | total time: 171.19m | eta: 8.3m +step 15937/16704 (95.41%) | loss: 2.446064 | lrm: 0.09 | dt: 644.04ms | tok/sec: 814,058 | mfu: 50.88 | epoch: 3 | total time: 171.20m | 
eta: 8.2m +step 15938/16704 (95.41%) | loss: 2.446741 | lrm: 0.09 | dt: 646.71ms | tok/sec: 810,698 | mfu: 50.67 | epoch: 3 | total time: 171.21m | eta: 8.2m +step 15939/16704 (95.42%) | loss: 2.444250 | lrm: 0.09 | dt: 643.66ms | tok/sec: 814,546 | mfu: 50.91 | epoch: 3 | total time: 171.22m | eta: 8.2m +step 15940/16704 (95.43%) | loss: 2.448153 | lrm: 0.09 | dt: 643.86ms | tok/sec: 814,286 | mfu: 50.89 | epoch: 3 | total time: 171.23m | eta: 8.2m +step 15941/16704 (95.43%) | loss: 2.441009 | lrm: 0.09 | dt: 644.53ms | tok/sec: 813,439 | mfu: 50.84 | epoch: 3 | total time: 171.24m | eta: 8.2m +step 15942/16704 (95.44%) | loss: 2.427981 | lrm: 0.09 | dt: 647.00ms | tok/sec: 810,333 | mfu: 50.65 | epoch: 3 | total time: 171.25m | eta: 8.2m +step 15943/16704 (95.44%) | loss: 2.435052 | lrm: 0.09 | dt: 642.83ms | tok/sec: 815,587 | mfu: 50.98 | epoch: 3 | total time: 171.27m | eta: 8.2m +step 15944/16704 (95.45%) | loss: 2.429815 | lrm: 0.09 | dt: 646.02ms | tok/sec: 811,563 | mfu: 50.72 | epoch: 3 | total time: 171.28m | eta: 8.2m +step 15945/16704 (95.46%) | loss: 2.428264 | lrm: 0.09 | dt: 644.48ms | tok/sec: 813,501 | mfu: 50.84 | epoch: 3 | total time: 171.29m | eta: 8.2m +step 15946/16704 (95.46%) | loss: 2.422014 | lrm: 0.09 | dt: 648.80ms | tok/sec: 808,085 | mfu: 50.51 | epoch: 3 | total time: 171.30m | eta: 8.1m +step 15947/16704 (95.47%) | loss: 2.418088 | lrm: 0.09 | dt: 645.61ms | tok/sec: 812,085 | mfu: 50.76 | epoch: 3 | total time: 171.31m | eta: 8.1m +step 15948/16704 (95.47%) | loss: 2.421199 | lrm: 0.09 | dt: 643.71ms | tok/sec: 814,475 | mfu: 50.91 | epoch: 3 | total time: 171.32m | eta: 8.1m +step 15949/16704 (95.48%) | loss: 2.428986 | lrm: 0.09 | dt: 645.47ms | tok/sec: 812,253 | mfu: 50.77 | epoch: 3 | total time: 171.33m | eta: 8.1m +step 15950/16704 (95.49%) | loss: 2.412895 | lrm: 0.09 | dt: 643.55ms | tok/sec: 814,679 | mfu: 50.92 | epoch: 3 | total time: 171.34m | eta: 8.1m +step 15951/16704 (95.49%) | loss: 2.418413 | lrm: 0.09 | dt: 
645.57ms | tok/sec: 812,130 | mfu: 50.76 | epoch: 3 | total time: 171.35m | eta: 8.1m +step 15952/16704 (95.50%) | loss: 2.405759 | lrm: 0.09 | dt: 644.39ms | tok/sec: 813,624 | mfu: 50.85 | epoch: 3 | total time: 171.36m | eta: 8.1m +step 15953/16704 (95.50%) | loss: 2.403950 | lrm: 0.09 | dt: 644.78ms | tok/sec: 813,126 | mfu: 50.82 | epoch: 3 | total time: 171.37m | eta: 8.1m +step 15954/16704 (95.51%) | loss: 2.403453 | lrm: 0.09 | dt: 644.30ms | tok/sec: 813,734 | mfu: 50.86 | epoch: 3 | total time: 171.38m | eta: 8.1m +step 15955/16704 (95.52%) | loss: 2.416908 | lrm: 0.09 | dt: 643.70ms | tok/sec: 814,486 | mfu: 50.91 | epoch: 3 | total time: 171.39m | eta: 8.1m +step 15956/16704 (95.52%) | loss: 2.416194 | lrm: 0.09 | dt: 646.28ms | tok/sec: 811,244 | mfu: 50.70 | epoch: 3 | total time: 171.41m | eta: 8.0m +step 15957/16704 (95.53%) | loss: 2.422483 | lrm: 0.09 | dt: 644.36ms | tok/sec: 813,651 | mfu: 50.85 | epoch: 3 | total time: 171.42m | eta: 8.0m +step 15958/16704 (95.53%) | loss: 2.427632 | lrm: 0.09 | dt: 645.78ms | tok/sec: 811,866 | mfu: 50.74 | epoch: 3 | total time: 171.43m | eta: 8.0m +step 15959/16704 (95.54%) | loss: 2.417509 | lrm: 0.09 | dt: 644.87ms | tok/sec: 813,019 | mfu: 50.81 | epoch: 3 | total time: 171.44m | eta: 8.0m +step 15960/16704 (95.55%) | loss: 2.408192 | lrm: 0.09 | dt: 645.65ms | tok/sec: 812,033 | mfu: 50.75 | epoch: 3 | total time: 171.45m | eta: 8.0m +step 15961/16704 (95.55%) | loss: 2.407121 | lrm: 0.09 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 3 | total time: 171.46m | eta: 8.0m +step 15962/16704 (95.56%) | loss: 2.404362 | lrm: 0.09 | dt: 645.35ms | tok/sec: 812,409 | mfu: 50.78 | epoch: 3 | total time: 171.47m | eta: 8.0m +step 15963/16704 (95.56%) | loss: 2.399248 | lrm: 0.09 | dt: 645.26ms | tok/sec: 812,527 | mfu: 50.78 | epoch: 3 | total time: 171.48m | eta: 8.0m +step 15964/16704 (95.57%) | loss: 2.404151 | lrm: 0.09 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 3 | total time: 171.49m | 
eta: 8.0m +step 15965/16704 (95.58%) | loss: 2.397119 | lrm: 0.09 | dt: 644.93ms | tok/sec: 812,937 | mfu: 50.81 | epoch: 3 | total time: 171.50m | eta: 7.9m +step 15966/16704 (95.58%) | loss: 2.391506 | lrm: 0.09 | dt: 645.00ms | tok/sec: 812,855 | mfu: 50.80 | epoch: 3 | total time: 171.51m | eta: 7.9m +step 15967/16704 (95.59%) | loss: 2.391537 | lrm: 0.09 | dt: 644.84ms | tok/sec: 813,055 | mfu: 50.82 | epoch: 3 | total time: 171.52m | eta: 7.9m +step 15968/16704 (95.59%) | loss: 2.396615 | lrm: 0.09 | dt: 644.66ms | tok/sec: 813,275 | mfu: 50.83 | epoch: 3 | total time: 171.53m | eta: 7.9m +step 15969/16704 (95.60%) | loss: 2.404948 | lrm: 0.09 | dt: 644.49ms | tok/sec: 813,489 | mfu: 50.84 | epoch: 3 | total time: 171.55m | eta: 7.9m +step 15970/16704 (95.61%) | loss: 2.401985 | lrm: 0.09 | dt: 645.18ms | tok/sec: 812,625 | mfu: 50.79 | epoch: 3 | total time: 171.56m | eta: 7.9m +step 15971/16704 (95.61%) | loss: 2.398804 | lrm: 0.09 | dt: 642.67ms | tok/sec: 815,802 | mfu: 50.99 | epoch: 3 | total time: 171.57m | eta: 7.9m +step 15972/16704 (95.62%) | loss: 2.414078 | lrm: 0.09 | dt: 643.62ms | tok/sec: 814,586 | mfu: 50.91 | epoch: 3 | total time: 171.58m | eta: 7.9m +step 15973/16704 (95.62%) | loss: 2.410597 | lrm: 0.09 | dt: 645.16ms | tok/sec: 812,642 | mfu: 50.79 | epoch: 3 | total time: 171.59m | eta: 7.9m +step 15974/16704 (95.63%) | loss: 2.412581 | lrm: 0.09 | dt: 643.84ms | tok/sec: 814,313 | mfu: 50.90 | epoch: 3 | total time: 171.60m | eta: 7.8m +step 15975/16704 (95.64%) | loss: 2.410786 | lrm: 0.09 | dt: 643.75ms | tok/sec: 814,427 | mfu: 50.90 | epoch: 3 | total time: 171.61m | eta: 7.8m +step 15976/16704 (95.64%) | loss: 2.412762 | lrm: 0.09 | dt: 643.71ms | tok/sec: 814,479 | mfu: 50.91 | epoch: 3 | total time: 171.62m | eta: 7.8m +step 15977/16704 (95.65%) | loss: 2.405316 | lrm: 0.09 | dt: 644.85ms | tok/sec: 813,043 | mfu: 50.82 | epoch: 3 | total time: 171.63m | eta: 7.8m +step 15978/16704 (95.65%) | loss: 2.412335 | lrm: 0.09 | dt: 
643.77ms | tok/sec: 814,396 | mfu: 50.90 | epoch: 3 | total time: 171.64m | eta: 7.8m +step 15979/16704 (95.66%) | loss: 2.427414 | lrm: 0.09 | dt: 644.06ms | tok/sec: 814,039 | mfu: 50.88 | epoch: 3 | total time: 171.65m | eta: 7.8m +step 15980/16704 (95.67%) | loss: 2.423341 | lrm: 0.09 | dt: 645.06ms | tok/sec: 812,773 | mfu: 50.80 | epoch: 3 | total time: 171.66m | eta: 7.8m +step 15981/16704 (95.67%) | loss: 2.414993 | lrm: 0.09 | dt: 643.10ms | tok/sec: 815,245 | mfu: 50.95 | epoch: 3 | total time: 171.67m | eta: 7.8m +step 15982/16704 (95.68%) | loss: 2.425140 | lrm: 0.09 | dt: 642.28ms | tok/sec: 816,291 | mfu: 51.02 | epoch: 3 | total time: 171.68m | eta: 7.8m +step 15983/16704 (95.68%) | loss: 2.424376 | lrm: 0.09 | dt: 645.56ms | tok/sec: 812,148 | mfu: 50.76 | epoch: 3 | total time: 171.70m | eta: 7.8m +step 15984/16704 (95.69%) | loss: 2.432077 | lrm: 0.09 | dt: 644.89ms | tok/sec: 812,993 | mfu: 50.81 | epoch: 3 | total time: 171.71m | eta: 7.7m +step 15985/16704 (95.70%) | loss: 2.419667 | lrm: 0.09 | dt: 645.69ms | tok/sec: 811,983 | mfu: 50.75 | epoch: 3 | total time: 171.72m | eta: 7.7m +step 15986/16704 (95.70%) | loss: 2.435689 | lrm: 0.09 | dt: 643.45ms | tok/sec: 814,807 | mfu: 50.93 | epoch: 3 | total time: 171.73m | eta: 7.7m +step 15987/16704 (95.71%) | loss: 2.444973 | lrm: 0.09 | dt: 643.94ms | tok/sec: 814,188 | mfu: 50.89 | epoch: 3 | total time: 171.74m | eta: 7.7m +step 15988/16704 (95.71%) | loss: 2.446803 | lrm: 0.09 | dt: 645.96ms | tok/sec: 811,637 | mfu: 50.73 | epoch: 3 | total time: 171.75m | eta: 7.7m +step 15989/16704 (95.72%) | loss: 2.423664 | lrm: 0.09 | dt: 645.32ms | tok/sec: 812,442 | mfu: 50.78 | epoch: 3 | total time: 171.76m | eta: 7.7m +step 15990/16704 (95.73%) | loss: 2.432826 | lrm: 0.09 | dt: 643.21ms | tok/sec: 815,114 | mfu: 50.95 | epoch: 3 | total time: 171.77m | eta: 7.7m +step 15991/16704 (95.73%) | loss: 2.428939 | lrm: 0.09 | dt: 644.40ms | tok/sec: 813,601 | mfu: 50.85 | epoch: 3 | total time: 171.78m | 
eta: 7.7m +step 15992/16704 (95.74%) | loss: 2.438809 | lrm: 0.09 | dt: 644.76ms | tok/sec: 813,158 | mfu: 50.82 | epoch: 3 | total time: 171.79m | eta: 7.7m +step 15993/16704 (95.74%) | loss: 2.439216 | lrm: 0.09 | dt: 644.59ms | tok/sec: 813,367 | mfu: 50.84 | epoch: 3 | total time: 171.80m | eta: 7.6m +step 15994/16704 (95.75%) | loss: 2.444688 | lrm: 0.09 | dt: 645.91ms | tok/sec: 811,699 | mfu: 50.73 | epoch: 3 | total time: 171.81m | eta: 7.6m +step 15995/16704 (95.76%) | loss: 2.437251 | lrm: 0.08 | dt: 642.51ms | tok/sec: 816,002 | mfu: 51.00 | epoch: 3 | total time: 171.82m | eta: 7.6m +step 15996/16704 (95.76%) | loss: 2.434278 | lrm: 0.08 | dt: 644.48ms | tok/sec: 813,505 | mfu: 50.85 | epoch: 3 | total time: 171.83m | eta: 7.6m +step 15997/16704 (95.77%) | loss: 2.434976 | lrm: 0.08 | dt: 645.07ms | tok/sec: 812,758 | mfu: 50.80 | epoch: 3 | total time: 171.85m | eta: 7.6m +step 15998/16704 (95.77%) | loss: 2.430871 | lrm: 0.08 | dt: 641.30ms | tok/sec: 817,539 | mfu: 51.10 | epoch: 3 | total time: 171.86m | eta: 7.6m +step 15999/16704 (95.78%) | loss: 2.435482 | lrm: 0.08 | dt: 646.92ms | tok/sec: 810,436 | mfu: 50.65 | epoch: 3 | total time: 171.87m | eta: 7.6m +[GC rank1] gen2: 361.3ms collected 91088 objects +[GC rank6] gen2: 368.9ms collected 90984 objects +[GC rank2] gen2: 374.3ms collected 91056 objects[GC rank0] gen2: 374.3ms collected 91112 objects + +[GC rank5] gen2: 375.9ms collected 91008 objects +[GC rank4] gen2: 480.3ms collected 91024 objects +[GC rank7] gen2: 485.1ms collected 90976 objects +[GC rank3] gen2: 491.4ms collected 91048 objects +Step 16000 | Validation bpb: 0.755753 +step 16000/16704 (95.79%) | loss: 2.437783 | lrm: 0.08 | dt: 635.00ms | tok/sec: 825,651 | mfu: 51.60 | epoch: 3 | total time: 171.88m | eta: 7.6m +step 16001/16704 (95.79%) | loss: 2.429405 | lrm: 0.08 | dt: 651.14ms | tok/sec: 805,180 | mfu: 50.32 | epoch: 3 | total time: 171.89m | eta: 7.6m +step 16002/16704 (95.80%) | loss: 2.420320 | lrm: 0.08 | dt: 642.49ms 
| tok/sec: 816,024 | mfu: 51.00 | epoch: 3 | total time: 171.90m | eta: 7.5m +step 16003/16704 (95.80%) | loss: 2.428856 | lrm: 0.08 | dt: 644.53ms | tok/sec: 813,436 | mfu: 50.84 | epoch: 3 | total time: 171.91m | eta: 7.5m +step 16004/16704 (95.81%) | loss: 2.419150 | lrm: 0.08 | dt: 645.65ms | tok/sec: 812,027 | mfu: 50.75 | epoch: 3 | total time: 171.92m | eta: 7.5m +step 16005/16704 (95.82%) | loss: 2.434643 | lrm: 0.08 | dt: 641.77ms | tok/sec: 816,942 | mfu: 51.06 | epoch: 3 | total time: 171.93m | eta: 7.5m +step 16006/16704 (95.82%) | loss: 2.436544 | lrm: 0.08 | dt: 645.15ms | tok/sec: 812,663 | mfu: 50.79 | epoch: 3 | total time: 171.94m | eta: 7.5m +step 16007/16704 (95.83%) | loss: 2.430347 | lrm: 0.08 | dt: 644.66ms | tok/sec: 813,277 | mfu: 50.83 | epoch: 3 | total time: 171.95m | eta: 7.5m +step 16008/16704 (95.83%) | loss: 2.424203 | lrm: 0.08 | dt: 642.27ms | tok/sec: 816,306 | mfu: 51.02 | epoch: 3 | total time: 171.96m | eta: 7.5m +step 16009/16704 (95.84%) | loss: 2.426359 | lrm: 0.08 | dt: 644.75ms | tok/sec: 813,163 | mfu: 50.82 | epoch: 3 | total time: 171.97m | eta: 7.5m +step 16010/16704 (95.85%) | loss: 2.422290 | lrm: 0.08 | dt: 644.34ms | tok/sec: 813,684 | mfu: 50.86 | epoch: 3 | total time: 171.99m | eta: 7.5m +step 16011/16704 (95.85%) | loss: 2.410882 | lrm: 0.08 | dt: 645.34ms | tok/sec: 812,416 | mfu: 50.78 | epoch: 3 | total time: 172.00m | eta: 7.4m +step 16012/16704 (95.86%) | loss: 2.419830 | lrm: 0.08 | dt: 644.44ms | tok/sec: 813,554 | mfu: 50.85 | epoch: 3 | total time: 172.01m | eta: 7.4m +step 16013/16704 (95.86%) | loss: 2.432196 | lrm: 0.08 | dt: 643.20ms | tok/sec: 815,120 | mfu: 50.95 | epoch: 3 | total time: 172.02m | eta: 7.4m +step 16014/16704 (95.87%) | loss: 2.426314 | lrm: 0.08 | dt: 643.28ms | tok/sec: 815,028 | mfu: 50.94 | epoch: 3 | total time: 172.03m | eta: 7.4m +step 16015/16704 (95.88%) | loss: 2.421603 | lrm: 0.08 | dt: 643.53ms | tok/sec: 814,701 | mfu: 50.92 | epoch: 3 | total time: 172.04m | eta: 
7.4m +step 16016/16704 (95.88%) | loss: 2.427497 | lrm: 0.08 | dt: 643.11ms | tok/sec: 815,244 | mfu: 50.95 | epoch: 3 | total time: 172.05m | eta: 7.4m +step 16017/16704 (95.89%) | loss: 2.420407 | lrm: 0.08 | dt: 644.08ms | tok/sec: 814,013 | mfu: 50.88 | epoch: 3 | total time: 172.06m | eta: 7.4m +step 16018/16704 (95.89%) | loss: 2.423029 | lrm: 0.08 | dt: 645.19ms | tok/sec: 812,611 | mfu: 50.79 | epoch: 3 | total time: 172.07m | eta: 7.4m +step 16019/16704 (95.90%) | loss: 2.433870 | lrm: 0.08 | dt: 642.08ms | tok/sec: 816,542 | mfu: 51.04 | epoch: 3 | total time: 172.08m | eta: 7.4m +step 16020/16704 (95.91%) | loss: 2.420005 | lrm: 0.08 | dt: 644.64ms | tok/sec: 813,299 | mfu: 50.83 | epoch: 3 | total time: 172.09m | eta: 7.4m +step 16021/16704 (95.91%) | loss: 2.424757 | lrm: 0.08 | dt: 645.53ms | tok/sec: 812,178 | mfu: 50.76 | epoch: 3 | total time: 172.10m | eta: 7.3m +step 16022/16704 (95.92%) | loss: 2.429690 | lrm: 0.08 | dt: 640.70ms | tok/sec: 818,307 | mfu: 51.15 | epoch: 3 | total time: 172.11m | eta: 7.3m +step 16023/16704 (95.92%) | loss: 2.434316 | lrm: 0.08 | dt: 645.73ms | tok/sec: 811,927 | mfu: 50.75 | epoch: 3 | total time: 172.12m | eta: 7.3m +step 16024/16704 (95.93%) | loss: 2.431921 | lrm: 0.08 | dt: 646.01ms | tok/sec: 811,582 | mfu: 50.73 | epoch: 3 | total time: 172.14m | eta: 7.3m +step 16025/16704 (95.94%) | loss: 2.423268 | lrm: 0.08 | dt: 642.00ms | tok/sec: 816,642 | mfu: 51.04 | epoch: 3 | total time: 172.15m | eta: 7.3m +step 16026/16704 (95.94%) | loss: 2.425847 | lrm: 0.08 | dt: 645.22ms | tok/sec: 812,578 | mfu: 50.79 | epoch: 3 | total time: 172.16m | eta: 7.3m +step 16027/16704 (95.95%) | loss: 2.429467 | lrm: 0.08 | dt: 643.20ms | tok/sec: 815,123 | mfu: 50.95 | epoch: 3 | total time: 172.17m | eta: 7.3m +step 16028/16704 (95.95%) | loss: 2.440239 | lrm: 0.08 | dt: 644.15ms | tok/sec: 813,920 | mfu: 50.87 | epoch: 3 | total time: 172.18m | eta: 7.3m +step 16029/16704 (95.96%) | loss: 2.452532 | lrm: 0.08 | dt: 644.13ms 
| tok/sec: 813,951 | mfu: 50.87 | epoch: 3 | total time: 172.19m | eta: 7.3m +step 16030/16704 (95.97%) | loss: 2.446004 | lrm: 0.08 | dt: 644.35ms | tok/sec: 813,673 | mfu: 50.86 | epoch: 3 | total time: 172.20m | eta: 7.2m +step 16031/16704 (95.97%) | loss: 2.442927 | lrm: 0.08 | dt: 642.90ms | tok/sec: 815,508 | mfu: 50.97 | epoch: 3 | total time: 172.21m | eta: 7.2m +step 16032/16704 (95.98%) | loss: 2.427460 | lrm: 0.08 | dt: 643.98ms | tok/sec: 814,134 | mfu: 50.88 | epoch: 3 | total time: 172.22m | eta: 7.2m +step 16033/16704 (95.98%) | loss: 2.420878 | lrm: 0.08 | dt: 642.26ms | tok/sec: 816,320 | mfu: 51.02 | epoch: 3 | total time: 172.23m | eta: 7.2m +step 16034/16704 (95.99%) | loss: 2.428928 | lrm: 0.08 | dt: 645.05ms | tok/sec: 812,784 | mfu: 50.80 | epoch: 3 | total time: 172.24m | eta: 7.2m +step 16035/16704 (95.99%) | loss: 2.437439 | lrm: 0.08 | dt: 644.94ms | tok/sec: 812,919 | mfu: 50.81 | epoch: 3 | total time: 172.25m | eta: 7.2m +step 16036/16704 (96.00%) | loss: 2.446278 | lrm: 0.08 | dt: 646.31ms | tok/sec: 811,196 | mfu: 50.70 | epoch: 3 | total time: 172.26m | eta: 7.2m +step 16037/16704 (96.01%) | loss: 2.443354 | lrm: 0.08 | dt: 644.75ms | tok/sec: 813,167 | mfu: 50.82 | epoch: 3 | total time: 172.28m | eta: 7.2m +step 16038/16704 (96.01%) | loss: 2.438647 | lrm: 0.08 | dt: 646.24ms | tok/sec: 811,291 | mfu: 50.71 | epoch: 3 | total time: 172.29m | eta: 7.2m +step 16039/16704 (96.02%) | loss: 2.428573 | lrm: 0.08 | dt: 641.99ms | tok/sec: 816,659 | mfu: 51.04 | epoch: 3 | total time: 172.30m | eta: 7.1m +step 16040/16704 (96.02%) | loss: 2.420170 | lrm: 0.08 | dt: 645.53ms | tok/sec: 812,188 | mfu: 50.76 | epoch: 3 | total time: 172.31m | eta: 7.1m +step 16041/16704 (96.03%) | loss: 2.415941 | lrm: 0.08 | dt: 644.22ms | tok/sec: 813,839 | mfu: 50.87 | epoch: 3 | total time: 172.32m | eta: 7.1m +step 16042/16704 (96.04%) | loss: 2.424114 | lrm: 0.08 | dt: 643.80ms | tok/sec: 814,358 | mfu: 50.90 | epoch: 3 | total time: 172.33m | eta: 
7.1m +step 16043/16704 (96.04%) | loss: 2.430828 | lrm: 0.08 | dt: 645.76ms | tok/sec: 811,893 | mfu: 50.74 | epoch: 3 | total time: 172.34m | eta: 7.1m +step 16044/16704 (96.05%) | loss: 2.437604 | lrm: 0.08 | dt: 645.48ms | tok/sec: 812,238 | mfu: 50.77 | epoch: 3 | total time: 172.35m | eta: 7.1m +step 16045/16704 (96.05%) | loss: 2.427594 | lrm: 0.08 | dt: 644.02ms | tok/sec: 814,089 | mfu: 50.88 | epoch: 3 | total time: 172.36m | eta: 7.1m +step 16046/16704 (96.06%) | loss: 2.423778 | lrm: 0.08 | dt: 645.41ms | tok/sec: 812,336 | mfu: 50.77 | epoch: 3 | total time: 172.37m | eta: 7.1m +step 16047/16704 (96.07%) | loss: 2.426278 | lrm: 0.08 | dt: 643.44ms | tok/sec: 814,823 | mfu: 50.93 | epoch: 3 | total time: 172.38m | eta: 7.1m +step 16048/16704 (96.07%) | loss: 2.432108 | lrm: 0.08 | dt: 643.60ms | tok/sec: 814,616 | mfu: 50.91 | epoch: 3 | total time: 172.39m | eta: 7.1m +step 16049/16704 (96.08%) | loss: 2.434026 | lrm: 0.08 | dt: 644.05ms | tok/sec: 814,043 | mfu: 50.88 | epoch: 3 | total time: 172.40m | eta: 7.0m +step 16050/16704 (96.08%) | loss: 2.437185 | lrm: 0.08 | dt: 644.61ms | tok/sec: 813,344 | mfu: 50.84 | epoch: 3 | total time: 172.41m | eta: 7.0m +step 16051/16704 (96.09%) | loss: 2.439972 | lrm: 0.08 | dt: 644.51ms | tok/sec: 813,467 | mfu: 50.84 | epoch: 3 | total time: 172.43m | eta: 7.0m +step 16052/16704 (96.10%) | loss: 2.430784 | lrm: 0.08 | dt: 644.52ms | tok/sec: 813,458 | mfu: 50.84 | epoch: 3 | total time: 172.44m | eta: 7.0m +step 16053/16704 (96.10%) | loss: 2.433549 | lrm: 0.08 | dt: 645.90ms | tok/sec: 811,713 | mfu: 50.73 | epoch: 3 | total time: 172.45m | eta: 7.0m +step 16054/16704 (96.11%) | loss: 2.424882 | lrm: 0.08 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 3 | total time: 172.46m | eta: 7.0m +step 16055/16704 (96.11%) | loss: 2.426578 | lrm: 0.08 | dt: 646.73ms | tok/sec: 810,679 | mfu: 50.67 | epoch: 3 | total time: 172.47m | eta: 7.0m +step 16056/16704 (96.12%) | loss: 2.425365 | lrm: 0.08 | dt: 641.59ms 
| tok/sec: 817,166 | mfu: 51.07 | epoch: 3 | total time: 172.48m | eta: 7.0m +step 16057/16704 (96.13%) | loss: 2.434183 | lrm: 0.08 | dt: 647.57ms | tok/sec: 809,617 | mfu: 50.60 | epoch: 3 | total time: 172.49m | eta: 7.0m +step 16058/16704 (96.13%) | loss: 2.442820 | lrm: 0.08 | dt: 644.19ms | tok/sec: 813,873 | mfu: 50.87 | epoch: 3 | total time: 172.50m | eta: 6.9m +step 16059/16704 (96.14%) | loss: 2.428843 | lrm: 0.08 | dt: 643.80ms | tok/sec: 814,369 | mfu: 50.90 | epoch: 3 | total time: 172.51m | eta: 6.9m +step 16060/16704 (96.14%) | loss: 2.432894 | lrm: 0.08 | dt: 643.51ms | tok/sec: 814,725 | mfu: 50.92 | epoch: 3 | total time: 172.52m | eta: 6.9m +step 16061/16704 (96.15%) | loss: 2.437793 | lrm: 0.08 | dt: 643.76ms | tok/sec: 814,418 | mfu: 50.90 | epoch: 3 | total time: 172.53m | eta: 6.9m +step 16062/16704 (96.16%) | loss: 2.434074 | lrm: 0.08 | dt: 647.87ms | tok/sec: 809,254 | mfu: 50.58 | epoch: 3 | total time: 172.54m | eta: 6.9m +step 16063/16704 (96.16%) | loss: 2.429852 | lrm: 0.08 | dt: 643.99ms | tok/sec: 814,120 | mfu: 50.88 | epoch: 3 | total time: 172.55m | eta: 6.9m +step 16064/16704 (96.17%) | loss: 2.435010 | lrm: 0.08 | dt: 646.10ms | tok/sec: 811,462 | mfu: 50.72 | epoch: 3 | total time: 172.57m | eta: 6.9m +step 16065/16704 (96.17%) | loss: 2.435289 | lrm: 0.08 | dt: 645.85ms | tok/sec: 811,782 | mfu: 50.74 | epoch: 3 | total time: 172.58m | eta: 6.9m +step 16066/16704 (96.18%) | loss: 2.438530 | lrm: 0.08 | dt: 643.42ms | tok/sec: 814,842 | mfu: 50.93 | epoch: 3 | total time: 172.59m | eta: 6.9m +step 16067/16704 (96.19%) | loss: 2.412864 | lrm: 0.08 | dt: 642.87ms | tok/sec: 815,537 | mfu: 50.97 | epoch: 3 | total time: 172.60m | eta: 6.8m +step 16068/16704 (96.19%) | loss: 2.419398 | lrm: 0.08 | dt: 646.41ms | tok/sec: 811,075 | mfu: 50.69 | epoch: 3 | total time: 172.61m | eta: 6.8m +step 16069/16704 (96.20%) | loss: 2.432547 | lrm: 0.08 | dt: 644.55ms | tok/sec: 813,414 | mfu: 50.84 | epoch: 3 | total time: 172.62m | eta: 
6.8m +step 16070/16704 (96.20%) | loss: 2.434989 | lrm: 0.08 | dt: 647.10ms | tok/sec: 810,215 | mfu: 50.64 | epoch: 3 | total time: 172.63m | eta: 6.8m +step 16071/16704 (96.21%) | loss: 2.426620 | lrm: 0.08 | dt: 646.14ms | tok/sec: 811,420 | mfu: 50.71 | epoch: 3 | total time: 172.64m | eta: 6.8m +step 16072/16704 (96.22%) | loss: 2.437382 | lrm: 0.08 | dt: 643.60ms | tok/sec: 814,619 | mfu: 50.91 | epoch: 3 | total time: 172.65m | eta: 6.8m +step 16073/16704 (96.22%) | loss: 2.434161 | lrm: 0.08 | dt: 647.03ms | tok/sec: 810,302 | mfu: 50.65 | epoch: 3 | total time: 172.66m | eta: 6.8m +step 16074/16704 (96.23%) | loss: 2.440196 | lrm: 0.08 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 3 | total time: 172.67m | eta: 6.8m +step 16075/16704 (96.23%) | loss: 2.445570 | lrm: 0.08 | dt: 648.61ms | tok/sec: 808,329 | mfu: 50.52 | epoch: 3 | total time: 172.68m | eta: 6.8m +step 16076/16704 (96.24%) | loss: 2.458885 | lrm: 0.08 | dt: 644.03ms | tok/sec: 814,078 | mfu: 50.88 | epoch: 3 | total time: 172.69m | eta: 6.8m +step 16077/16704 (96.25%) | loss: 2.446982 | lrm: 0.08 | dt: 645.32ms | tok/sec: 812,447 | mfu: 50.78 | epoch: 3 | total time: 172.70m | eta: 6.7m +step 16078/16704 (96.25%) | loss: 2.446170 | lrm: 0.07 | dt: 646.34ms | tok/sec: 811,161 | mfu: 50.70 | epoch: 3 | total time: 172.72m | eta: 6.7m +step 16079/16704 (96.26%) | loss: 2.442412 | lrm: 0.07 | dt: 646.72ms | tok/sec: 810,693 | mfu: 50.67 | epoch: 3 | total time: 172.73m | eta: 6.7m +step 16080/16704 (96.26%) | loss: 2.436134 | lrm: 0.07 | dt: 644.68ms | tok/sec: 813,251 | mfu: 50.83 | epoch: 3 | total time: 172.74m | eta: 6.7m +step 16081/16704 (96.27%) | loss: 2.439921 | lrm: 0.07 | dt: 644.78ms | tok/sec: 813,124 | mfu: 50.82 | epoch: 3 | total time: 172.75m | eta: 6.7m +step 16082/16704 (96.28%) | loss: 2.444795 | lrm: 0.07 | dt: 646.54ms | tok/sec: 810,910 | mfu: 50.68 | epoch: 3 | total time: 172.76m | eta: 6.7m +step 16083/16704 (96.28%) | loss: 2.438327 | lrm: 0.07 | dt: 646.14ms 
| tok/sec: 811,421 | mfu: 50.72 | epoch: 3 | total time: 172.77m | eta: 6.7m +step 16084/16704 (96.29%) | loss: 2.435390 | lrm: 0.07 | dt: 642.61ms | tok/sec: 815,875 | mfu: 50.99 | epoch: 3 | total time: 172.78m | eta: 6.7m +step 16085/16704 (96.29%) | loss: 2.440566 | lrm: 0.07 | dt: 646.23ms | tok/sec: 811,297 | mfu: 50.71 | epoch: 3 | total time: 172.79m | eta: 6.7m +step 16086/16704 (96.30%) | loss: 2.447849 | lrm: 0.07 | dt: 643.91ms | tok/sec: 814,224 | mfu: 50.89 | epoch: 3 | total time: 172.80m | eta: 6.6m +step 16087/16704 (96.31%) | loss: 2.454400 | lrm: 0.07 | dt: 646.57ms | tok/sec: 810,872 | mfu: 50.68 | epoch: 3 | total time: 172.81m | eta: 6.6m +step 16088/16704 (96.31%) | loss: 2.456037 | lrm: 0.07 | dt: 646.69ms | tok/sec: 810,721 | mfu: 50.67 | epoch: 3 | total time: 172.82m | eta: 6.6m +step 16089/16704 (96.32%) | loss: 2.469389 | lrm: 0.07 | dt: 646.37ms | tok/sec: 811,125 | mfu: 50.70 | epoch: 3 | total time: 172.83m | eta: 6.6m +step 16090/16704 (96.32%) | loss: 2.458211 | lrm: 0.07 | dt: 648.62ms | tok/sec: 808,307 | mfu: 50.52 | epoch: 3 | total time: 172.84m | eta: 6.6m +step 16091/16704 (96.33%) | loss: 2.458323 | lrm: 0.07 | dt: 644.38ms | tok/sec: 813,631 | mfu: 50.85 | epoch: 3 | total time: 172.86m | eta: 6.6m +step 16092/16704 (96.34%) | loss: 2.465369 | lrm: 0.07 | dt: 645.51ms | tok/sec: 812,211 | mfu: 50.76 | epoch: 3 | total time: 172.87m | eta: 6.6m +step 16093/16704 (96.34%) | loss: 2.466472 | lrm: 0.07 | dt: 647.47ms | tok/sec: 809,753 | mfu: 50.61 | epoch: 3 | total time: 172.88m | eta: 6.6m +step 16094/16704 (96.35%) | loss: 2.467280 | lrm: 0.07 | dt: 644.02ms | tok/sec: 814,080 | mfu: 50.88 | epoch: 3 | total time: 172.89m | eta: 6.6m +step 16095/16704 (96.35%) | loss: 2.473267 | lrm: 0.07 | dt: 643.88ms | tok/sec: 814,263 | mfu: 50.89 | epoch: 3 | total time: 172.90m | eta: 6.5m +step 16096/16704 (96.36%) | loss: 2.470942 | lrm: 0.07 | dt: 645.88ms | tok/sec: 811,744 | mfu: 50.74 | epoch: 3 | total time: 172.91m | eta: 
6.5m +step 16097/16704 (96.37%) | loss: 2.485041 | lrm: 0.07 | dt: 644.07ms | tok/sec: 814,019 | mfu: 50.88 | epoch: 3 | total time: 172.92m | eta: 6.5m +step 16098/16704 (96.37%) | loss: 2.471336 | lrm: 0.07 | dt: 646.41ms | tok/sec: 811,076 | mfu: 50.69 | epoch: 3 | total time: 172.93m | eta: 6.5m +step 16099/16704 (96.38%) | loss: 2.490827 | lrm: 0.07 | dt: 645.22ms | tok/sec: 812,567 | mfu: 50.79 | epoch: 3 | total time: 172.94m | eta: 6.5m +step 16100/16704 (96.38%) | loss: 2.491600 | lrm: 0.07 | dt: 645.19ms | tok/sec: 812,612 | mfu: 50.79 | epoch: 3 | total time: 172.95m | eta: 6.5m +step 16101/16704 (96.39%) | loss: 2.486212 | lrm: 0.07 | dt: 646.48ms | tok/sec: 810,985 | mfu: 50.69 | epoch: 3 | total time: 172.96m | eta: 6.5m +step 16102/16704 (96.40%) | loss: 2.468828 | lrm: 0.07 | dt: 645.07ms | tok/sec: 812,755 | mfu: 50.80 | epoch: 3 | total time: 172.97m | eta: 6.5m +step 16103/16704 (96.40%) | loss: 2.464617 | lrm: 0.07 | dt: 646.99ms | tok/sec: 810,355 | mfu: 50.65 | epoch: 3 | total time: 172.98m | eta: 6.5m +step 16104/16704 (96.41%) | loss: 2.461639 | lrm: 0.07 | dt: 645.98ms | tok/sec: 811,613 | mfu: 50.73 | epoch: 3 | total time: 173.00m | eta: 6.4m +step 16105/16704 (96.41%) | loss: 2.453815 | lrm: 0.07 | dt: 646.43ms | tok/sec: 811,052 | mfu: 50.69 | epoch: 3 | total time: 173.01m | eta: 6.4m +step 16106/16704 (96.42%) | loss: 2.467745 | lrm: 0.07 | dt: 645.54ms | tok/sec: 812,173 | mfu: 50.76 | epoch: 3 | total time: 173.02m | eta: 6.4m +step 16107/16704 (96.43%) | loss: 2.472039 | lrm: 0.07 | dt: 645.28ms | tok/sec: 812,494 | mfu: 50.78 | epoch: 3 | total time: 173.03m | eta: 6.4m +step 16108/16704 (96.43%) | loss: 2.464264 | lrm: 0.07 | dt: 642.95ms | tok/sec: 815,437 | mfu: 50.97 | epoch: 3 | total time: 173.04m | eta: 6.4m +step 16109/16704 (96.44%) | loss: 2.464426 | lrm: 0.07 | dt: 646.88ms | tok/sec: 810,485 | mfu: 50.66 | epoch: 3 | total time: 173.05m | eta: 6.4m +step 16110/16704 (96.44%) | loss: 2.464599 | lrm: 0.07 | dt: 645.43ms 
| tok/sec: 812,307 | mfu: 50.77 | epoch: 3 | total time: 173.06m | eta: 6.4m +step 16111/16704 (96.45%) | loss: 2.470747 | lrm: 0.07 | dt: 648.29ms | tok/sec: 808,729 | mfu: 50.55 | epoch: 3 | total time: 173.07m | eta: 6.4m +step 16112/16704 (96.46%) | loss: 2.460209 | lrm: 0.07 | dt: 645.44ms | tok/sec: 812,300 | mfu: 50.77 | epoch: 3 | total time: 173.08m | eta: 6.4m +step 16113/16704 (96.46%) | loss: 2.449345 | lrm: 0.07 | dt: 645.18ms | tok/sec: 812,619 | mfu: 50.79 | epoch: 3 | total time: 173.09m | eta: 6.4m +step 16114/16704 (96.47%) | loss: 2.441466 | lrm: 0.07 | dt: 645.49ms | tok/sec: 812,234 | mfu: 50.77 | epoch: 3 | total time: 173.10m | eta: 6.3m +step 16115/16704 (96.47%) | loss: 2.433426 | lrm: 0.07 | dt: 647.03ms | tok/sec: 810,295 | mfu: 50.64 | epoch: 3 | total time: 173.11m | eta: 6.3m +step 16116/16704 (96.48%) | loss: 2.442074 | lrm: 0.07 | dt: 645.96ms | tok/sec: 811,636 | mfu: 50.73 | epoch: 3 | total time: 173.12m | eta: 6.3m +step 16117/16704 (96.49%) | loss: 2.440679 | lrm: 0.07 | dt: 646.01ms | tok/sec: 811,578 | mfu: 50.72 | epoch: 3 | total time: 173.14m | eta: 6.3m +step 16118/16704 (96.49%) | loss: 2.446995 | lrm: 0.07 | dt: 645.35ms | tok/sec: 812,404 | mfu: 50.78 | epoch: 3 | total time: 173.15m | eta: 6.3m +step 16119/16704 (96.50%) | loss: 2.443738 | lrm: 0.07 | dt: 645.83ms | tok/sec: 811,799 | mfu: 50.74 | epoch: 3 | total time: 173.16m | eta: 6.3m +step 16120/16704 (96.50%) | loss: 2.430273 | lrm: 0.07 | dt: 643.99ms | tok/sec: 814,124 | mfu: 50.88 | epoch: 3 | total time: 173.17m | eta: 6.3m +step 16121/16704 (96.51%) | loss: 2.429527 | lrm: 0.07 | dt: 645.43ms | tok/sec: 812,304 | mfu: 50.77 | epoch: 3 | total time: 173.18m | eta: 6.3m +step 16122/16704 (96.52%) | loss: 2.430412 | lrm: 0.07 | dt: 644.21ms | tok/sec: 813,842 | mfu: 50.87 | epoch: 3 | total time: 173.19m | eta: 6.3m +step 16123/16704 (96.52%) | loss: 2.414353 | lrm: 0.07 | dt: 645.21ms | tok/sec: 812,583 | mfu: 50.79 | epoch: 3 | total time: 173.20m | eta: 
6.2m +step 16124/16704 (96.53%) | loss: 2.414378 | lrm: 0.07 | dt: 644.09ms | tok/sec: 813,998 | mfu: 50.88 | epoch: 3 | total time: 173.21m | eta: 6.2m +step 16125/16704 (96.53%) | loss: 2.414587 | lrm: 0.07 | dt: 647.35ms | tok/sec: 809,904 | mfu: 50.62 | epoch: 3 | total time: 173.22m | eta: 6.2m +step 16126/16704 (96.54%) | loss: 2.414423 | lrm: 0.07 | dt: 643.00ms | tok/sec: 815,377 | mfu: 50.96 | epoch: 3 | total time: 173.23m | eta: 6.2m +step 16127/16704 (96.55%) | loss: 2.416978 | lrm: 0.07 | dt: 644.28ms | tok/sec: 813,755 | mfu: 50.86 | epoch: 3 | total time: 173.24m | eta: 6.2m +step 16128/16704 (96.55%) | loss: 2.423035 | lrm: 0.07 | dt: 646.98ms | tok/sec: 810,364 | mfu: 50.65 | epoch: 3 | total time: 173.25m | eta: 6.2m +step 16129/16704 (96.56%) | loss: 2.427078 | lrm: 0.07 | dt: 645.28ms | tok/sec: 812,499 | mfu: 50.78 | epoch: 3 | total time: 173.26m | eta: 6.2m +step 16130/16704 (96.56%) | loss: 2.423687 | lrm: 0.07 | dt: 642.84ms | tok/sec: 815,575 | mfu: 50.97 | epoch: 3 | total time: 173.28m | eta: 6.2m +step 16131/16704 (96.57%) | loss: 2.435345 | lrm: 0.07 | dt: 647.59ms | tok/sec: 809,597 | mfu: 50.60 | epoch: 3 | total time: 173.29m | eta: 6.2m +step 16132/16704 (96.58%) | loss: 2.435732 | lrm: 0.07 | dt: 645.59ms | tok/sec: 812,103 | mfu: 50.76 | epoch: 3 | total time: 173.30m | eta: 6.1m +step 16133/16704 (96.58%) | loss: 2.438154 | lrm: 0.07 | dt: 644.67ms | tok/sec: 813,263 | mfu: 50.83 | epoch: 3 | total time: 173.31m | eta: 6.1m +step 16134/16704 (96.59%) | loss: 2.450640 | lrm: 0.07 | dt: 645.57ms | tok/sec: 812,131 | mfu: 50.76 | epoch: 3 | total time: 173.32m | eta: 6.1m +step 16135/16704 (96.59%) | loss: 2.455672 | lrm: 0.07 | dt: 645.63ms | tok/sec: 812,052 | mfu: 50.75 | epoch: 3 | total time: 173.33m | eta: 6.1m +step 16136/16704 (96.60%) | loss: 2.458667 | lrm: 0.07 | dt: 645.51ms | tok/sec: 812,205 | mfu: 50.76 | epoch: 3 | total time: 173.34m | eta: 6.1m +step 16137/16704 (96.61%) | loss: 2.455118 | lrm: 0.07 | dt: 644.52ms 
| tok/sec: 813,460 | mfu: 50.84 | epoch: 3 | total time: 173.35m | eta: 6.1m +step 16138/16704 (96.61%) | loss: 2.447481 | lrm: 0.07 | dt: 645.07ms | tok/sec: 812,761 | mfu: 50.80 | epoch: 3 | total time: 173.36m | eta: 6.1m +step 16139/16704 (96.62%) | loss: 2.449251 | lrm: 0.07 | dt: 645.71ms | tok/sec: 811,952 | mfu: 50.75 | epoch: 3 | total time: 173.37m | eta: 6.1m +step 16140/16704 (96.62%) | loss: 2.440517 | lrm: 0.07 | dt: 645.32ms | tok/sec: 812,441 | mfu: 50.78 | epoch: 3 | total time: 173.38m | eta: 6.1m +step 16141/16704 (96.63%) | loss: 2.445578 | lrm: 0.07 | dt: 644.37ms | tok/sec: 813,640 | mfu: 50.85 | epoch: 3 | total time: 173.39m | eta: 6.1m +step 16142/16704 (96.64%) | loss: 2.443295 | lrm: 0.07 | dt: 645.00ms | tok/sec: 812,846 | mfu: 50.80 | epoch: 3 | total time: 173.40m | eta: 6.0m +step 16143/16704 (96.64%) | loss: 2.437414 | lrm: 0.07 | dt: 644.31ms | tok/sec: 813,724 | mfu: 50.86 | epoch: 3 | total time: 173.42m | eta: 6.0m +step 16144/16704 (96.65%) | loss: 2.441767 | lrm: 0.07 | dt: 645.30ms | tok/sec: 812,473 | mfu: 50.78 | epoch: 3 | total time: 173.43m | eta: 6.0m +step 16145/16704 (96.65%) | loss: 2.433227 | lrm: 0.07 | dt: 646.71ms | tok/sec: 810,706 | mfu: 50.67 | epoch: 3 | total time: 173.44m | eta: 6.0m +step 16146/16704 (96.66%) | loss: 2.440310 | lrm: 0.07 | dt: 649.09ms | tok/sec: 807,724 | mfu: 50.48 | epoch: 3 | total time: 173.45m | eta: 6.0m +step 16147/16704 (96.67%) | loss: 2.433386 | lrm: 0.07 | dt: 643.67ms | tok/sec: 814,529 | mfu: 50.91 | epoch: 3 | total time: 173.46m | eta: 6.0m +step 16148/16704 (96.67%) | loss: 2.436540 | lrm: 0.07 | dt: 646.01ms | tok/sec: 811,581 | mfu: 50.72 | epoch: 3 | total time: 173.47m | eta: 6.0m +step 16149/16704 (96.68%) | loss: 2.443301 | lrm: 0.07 | dt: 645.61ms | tok/sec: 812,084 | mfu: 50.76 | epoch: 3 | total time: 173.48m | eta: 6.0m +step 16150/16704 (96.68%) | loss: 2.439747 | lrm: 0.07 | dt: 644.97ms | tok/sec: 812,885 | mfu: 50.81 | epoch: 3 | total time: 173.49m | eta: 
6.0m +step 16151/16704 (96.69%) | loss: 2.422341 | lrm: 0.07 | dt: 644.55ms | tok/sec: 813,415 | mfu: 50.84 | epoch: 3 | total time: 173.50m | eta: 5.9m +step 16152/16704 (96.70%) | loss: 2.427737 | lrm: 0.07 | dt: 643.89ms | tok/sec: 814,245 | mfu: 50.89 | epoch: 3 | total time: 173.51m | eta: 5.9m +step 16153/16704 (96.70%) | loss: 2.434819 | lrm: 0.07 | dt: 646.27ms | tok/sec: 811,247 | mfu: 50.70 | epoch: 3 | total time: 173.52m | eta: 5.9m +step 16154/16704 (96.71%) | loss: 2.431590 | lrm: 0.07 | dt: 645.43ms | tok/sec: 812,308 | mfu: 50.77 | epoch: 3 | total time: 173.53m | eta: 5.9m +step 16155/16704 (96.71%) | loss: 2.434337 | lrm: 0.07 | dt: 643.79ms | tok/sec: 814,382 | mfu: 50.90 | epoch: 3 | total time: 173.54m | eta: 5.9m +step 16156/16704 (96.72%) | loss: 2.428956 | lrm: 0.07 | dt: 644.42ms | tok/sec: 813,585 | mfu: 50.85 | epoch: 3 | total time: 173.55m | eta: 5.9m +step 16157/16704 (96.73%) | loss: 2.431728 | lrm: 0.07 | dt: 646.54ms | tok/sec: 810,915 | mfu: 50.68 | epoch: 3 | total time: 173.57m | eta: 5.9m +step 16158/16704 (96.73%) | loss: 2.435488 | lrm: 0.07 | dt: 644.18ms | tok/sec: 813,883 | mfu: 50.87 | epoch: 3 | total time: 173.58m | eta: 5.9m +step 16159/16704 (96.74%) | loss: 2.435573 | lrm: 0.07 | dt: 644.14ms | tok/sec: 813,928 | mfu: 50.87 | epoch: 3 | total time: 173.59m | eta: 5.9m +step 16160/16704 (96.74%) | loss: 2.438022 | lrm: 0.07 | dt: 647.97ms | tok/sec: 809,118 | mfu: 50.57 | epoch: 3 | total time: 173.60m | eta: 5.8m +step 16161/16704 (96.75%) | loss: 2.426497 | lrm: 0.07 | dt: 643.40ms | tok/sec: 814,867 | mfu: 50.93 | epoch: 3 | total time: 173.61m | eta: 5.8m +step 16162/16704 (96.76%) | loss: 2.407817 | lrm: 0.06 | dt: 647.93ms | tok/sec: 809,179 | mfu: 50.57 | epoch: 3 | total time: 173.62m | eta: 5.8m +step 16163/16704 (96.76%) | loss: 2.410870 | lrm: 0.06 | dt: 645.75ms | tok/sec: 811,907 | mfu: 50.75 | epoch: 3 | total time: 173.63m | eta: 5.8m +step 16164/16704 (96.77%) | loss: 2.413617 | lrm: 0.06 | dt: 647.23ms 
| tok/sec: 810,050 | mfu: 50.63 | epoch: 3 | total time: 173.64m | eta: 5.8m +step 16165/16704 (96.77%) | loss: 2.418076 | lrm: 0.06 | dt: 646.64ms | tok/sec: 810,790 | mfu: 50.68 | epoch: 3 | total time: 173.65m | eta: 5.8m +step 16166/16704 (96.78%) | loss: 2.418187 | lrm: 0.06 | dt: 642.65ms | tok/sec: 815,822 | mfu: 50.99 | epoch: 3 | total time: 173.66m | eta: 5.8m +step 16167/16704 (96.79%) | loss: 2.409334 | lrm: 0.06 | dt: 643.33ms | tok/sec: 814,960 | mfu: 50.94 | epoch: 3 | total time: 173.67m | eta: 5.8m +step 16168/16704 (96.79%) | loss: 2.412011 | lrm: 0.06 | dt: 646.09ms | tok/sec: 811,480 | mfu: 50.72 | epoch: 3 | total time: 173.68m | eta: 5.8m +step 16169/16704 (96.80%) | loss: 2.423430 | lrm: 0.06 | dt: 643.82ms | tok/sec: 814,337 | mfu: 50.90 | epoch: 3 | total time: 173.69m | eta: 5.8m +step 16170/16704 (96.80%) | loss: 2.420001 | lrm: 0.06 | dt: 645.00ms | tok/sec: 812,852 | mfu: 50.80 | epoch: 3 | total time: 173.71m | eta: 5.7m +step 16171/16704 (96.81%) | loss: 2.404759 | lrm: 0.06 | dt: 644.44ms | tok/sec: 813,556 | mfu: 50.85 | epoch: 3 | total time: 173.72m | eta: 5.7m +step 16172/16704 (96.82%) | loss: 2.409720 | lrm: 0.06 | dt: 643.95ms | tok/sec: 814,180 | mfu: 50.89 | epoch: 3 | total time: 173.73m | eta: 5.7m +step 16173/16704 (96.82%) | loss: 2.418994 | lrm: 0.06 | dt: 645.14ms | tok/sec: 812,667 | mfu: 50.79 | epoch: 3 | total time: 173.74m | eta: 5.7m +step 16174/16704 (96.83%) | loss: 2.409762 | lrm: 0.06 | dt: 644.63ms | tok/sec: 813,319 | mfu: 50.83 | epoch: 3 | total time: 173.75m | eta: 5.7m +step 16175/16704 (96.83%) | loss: 2.407392 | lrm: 0.06 | dt: 646.10ms | tok/sec: 811,462 | mfu: 50.72 | epoch: 3 | total time: 173.76m | eta: 5.7m +step 16176/16704 (96.84%) | loss: 2.408978 | lrm: 0.06 | dt: 646.59ms | tok/sec: 810,851 | mfu: 50.68 | epoch: 3 | total time: 173.77m | eta: 5.7m +step 16177/16704 (96.85%) | loss: 2.388784 | lrm: 0.06 | dt: 648.95ms | tok/sec: 807,908 | mfu: 50.50 | epoch: 3 | total time: 173.78m | eta: 
5.7m +step 16178/16704 (96.85%) | loss: 2.399613 | lrm: 0.06 | dt: 644.34ms | tok/sec: 813,684 | mfu: 50.86 | epoch: 3 | total time: 173.79m | eta: 5.7m +step 16179/16704 (96.86%) | loss: 2.422081 | lrm: 0.06 | dt: 647.44ms | tok/sec: 809,784 | mfu: 50.61 | epoch: 3 | total time: 173.80m | eta: 5.6m +step 16180/16704 (96.86%) | loss: 2.415440 | lrm: 0.06 | dt: 645.49ms | tok/sec: 812,226 | mfu: 50.77 | epoch: 3 | total time: 173.81m | eta: 5.6m +step 16181/16704 (96.87%) | loss: 2.424247 | lrm: 0.06 | dt: 642.62ms | tok/sec: 815,860 | mfu: 50.99 | epoch: 3 | total time: 173.82m | eta: 5.6m +step 16182/16704 (96.88%) | loss: 2.424106 | lrm: 0.06 | dt: 645.06ms | tok/sec: 812,780 | mfu: 50.80 | epoch: 3 | total time: 173.83m | eta: 5.6m +step 16183/16704 (96.88%) | loss: 2.414574 | lrm: 0.06 | dt: 647.31ms | tok/sec: 809,947 | mfu: 50.62 | epoch: 3 | total time: 173.85m | eta: 5.6m +step 16184/16704 (96.89%) | loss: 2.409143 | lrm: 0.06 | dt: 645.59ms | tok/sec: 812,102 | mfu: 50.76 | epoch: 3 | total time: 173.86m | eta: 5.6m +step 16185/16704 (96.89%) | loss: 2.421332 | lrm: 0.06 | dt: 645.15ms | tok/sec: 812,658 | mfu: 50.79 | epoch: 3 | total time: 173.87m | eta: 5.6m +step 16186/16704 (96.90%) | loss: 2.423916 | lrm: 0.06 | dt: 644.58ms | tok/sec: 813,376 | mfu: 50.84 | epoch: 3 | total time: 173.88m | eta: 5.6m +step 16187/16704 (96.90%) | loss: 2.428762 | lrm: 0.06 | dt: 644.80ms | tok/sec: 813,104 | mfu: 50.82 | epoch: 3 | total time: 173.89m | eta: 5.6m +step 16188/16704 (96.91%) | loss: 2.420539 | lrm: 0.06 | dt: 647.74ms | tok/sec: 809,412 | mfu: 50.59 | epoch: 3 | total time: 173.90m | eta: 5.5m +step 16189/16704 (96.92%) | loss: 2.410072 | lrm: 0.06 | dt: 644.41ms | tok/sec: 813,594 | mfu: 50.85 | epoch: 3 | total time: 173.91m | eta: 5.5m +step 16190/16704 (96.92%) | loss: 2.412159 | lrm: 0.06 | dt: 645.79ms | tok/sec: 811,849 | mfu: 50.74 | epoch: 3 | total time: 173.92m | eta: 5.5m +step 16191/16704 (96.93%) | loss: 2.403685 | lrm: 0.06 | dt: 645.16ms 
| tok/sec: 812,651 | mfu: 50.79 | epoch: 3 | total time: 173.93m | eta: 5.5m +step 16192/16704 (96.93%) | loss: 2.386404 | lrm: 0.06 | dt: 645.04ms | tok/sec: 812,803 | mfu: 50.80 | epoch: 3 | total time: 173.94m | eta: 5.5m +step 16193/16704 (96.94%) | loss: 2.385960 | lrm: 0.06 | dt: 645.62ms | tok/sec: 812,066 | mfu: 50.76 | epoch: 3 | total time: 173.95m | eta: 5.5m +step 16194/16704 (96.95%) | loss: 2.396903 | lrm: 0.06 | dt: 642.92ms | tok/sec: 815,474 | mfu: 50.97 | epoch: 3 | total time: 173.96m | eta: 5.5m +step 16195/16704 (96.95%) | loss: 2.394843 | lrm: 0.06 | dt: 644.89ms | tok/sec: 812,984 | mfu: 50.81 | epoch: 3 | total time: 173.97m | eta: 5.5m +step 16196/16704 (96.96%) | loss: 2.398933 | lrm: 0.06 | dt: 646.52ms | tok/sec: 810,936 | mfu: 50.68 | epoch: 3 | total time: 173.99m | eta: 5.5m +step 16197/16704 (96.96%) | loss: 2.397840 | lrm: 0.06 | dt: 644.62ms | tok/sec: 813,327 | mfu: 50.83 | epoch: 3 | total time: 174.00m | eta: 5.4m +step 16198/16704 (96.97%) | loss: 2.416601 | lrm: 0.06 | dt: 645.74ms | tok/sec: 811,913 | mfu: 50.75 | epoch: 3 | total time: 174.01m | eta: 5.4m +step 16199/16704 (96.98%) | loss: 2.416231 | lrm: 0.06 | dt: 645.02ms | tok/sec: 812,825 | mfu: 50.80 | epoch: 3 | total time: 174.02m | eta: 5.4m +step 16200/16704 (96.98%) | loss: 2.416579 | lrm: 0.06 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 3 | total time: 174.03m | eta: 5.4m +step 16201/16704 (96.99%) | loss: 2.406781 | lrm: 0.06 | dt: 643.04ms | tok/sec: 815,328 | mfu: 50.96 | epoch: 3 | total time: 174.04m | eta: 5.4m +step 16202/16704 (96.99%) | loss: 2.416695 | lrm: 0.06 | dt: 643.48ms | tok/sec: 814,768 | mfu: 50.92 | epoch: 3 | total time: 174.05m | eta: 5.4m +step 16203/16704 (97.00%) | loss: 2.421669 | lrm: 0.06 | dt: 644.03ms | tok/sec: 814,067 | mfu: 50.88 | epoch: 3 | total time: 174.06m | eta: 5.4m +step 16204/16704 (97.01%) | loss: 2.416725 | lrm: 0.06 | dt: 646.95ms | tok/sec: 810,402 | mfu: 50.65 | epoch: 3 | total time: 174.07m | eta: 
5.4m +step 16205/16704 (97.01%) | loss: 2.423093 | lrm: 0.06 | dt: 644.76ms | tok/sec: 813,157 | mfu: 50.82 | epoch: 3 | total time: 174.08m | eta: 5.4m +step 16206/16704 (97.02%) | loss: 2.422824 | lrm: 0.06 | dt: 647.14ms | tok/sec: 810,166 | mfu: 50.64 | epoch: 3 | total time: 174.09m | eta: 5.4m +step 16207/16704 (97.02%) | loss: 2.412184 | lrm: 0.06 | dt: 644.56ms | tok/sec: 813,400 | mfu: 50.84 | epoch: 3 | total time: 174.10m | eta: 5.3m +step 16208/16704 (97.03%) | loss: 2.414432 | lrm: 0.06 | dt: 645.08ms | tok/sec: 812,749 | mfu: 50.80 | epoch: 3 | total time: 174.11m | eta: 5.3m +step 16209/16704 (97.04%) | loss: 2.422738 | lrm: 0.06 | dt: 644.82ms | tok/sec: 813,071 | mfu: 50.82 | epoch: 3 | total time: 174.12m | eta: 5.3m +step 16210/16704 (97.04%) | loss: 2.415790 | lrm: 0.06 | dt: 644.67ms | tok/sec: 813,269 | mfu: 50.83 | epoch: 3 | total time: 174.14m | eta: 5.3m +step 16211/16704 (97.05%) | loss: 2.406589 | lrm: 0.06 | dt: 646.50ms | tok/sec: 810,968 | mfu: 50.69 | epoch: 3 | total time: 174.15m | eta: 5.3m +step 16212/16704 (97.05%) | loss: 2.412234 | lrm: 0.06 | dt: 644.42ms | tok/sec: 813,584 | mfu: 50.85 | epoch: 3 | total time: 174.16m | eta: 5.3m +step 16213/16704 (97.06%) | loss: 2.412086 | lrm: 0.06 | dt: 644.78ms | tok/sec: 813,132 | mfu: 50.82 | epoch: 3 | total time: 174.17m | eta: 5.3m +step 16214/16704 (97.07%) | loss: 2.419055 | lrm: 0.06 | dt: 648.10ms | tok/sec: 808,966 | mfu: 50.56 | epoch: 3 | total time: 174.18m | eta: 5.3m +step 16215/16704 (97.07%) | loss: 2.430804 | lrm: 0.06 | dt: 643.39ms | tok/sec: 814,882 | mfu: 50.93 | epoch: 3 | total time: 174.19m | eta: 5.3m +step 16216/16704 (97.08%) | loss: 2.429862 | lrm: 0.06 | dt: 645.98ms | tok/sec: 811,621 | mfu: 50.73 | epoch: 3 | total time: 174.20m | eta: 5.2m +step 16217/16704 (97.08%) | loss: 2.426348 | lrm: 0.06 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 3 | total time: 174.21m | eta: 5.2m +step 16218/16704 (97.09%) | loss: 2.433084 | lrm: 0.06 | dt: 645.38ms 
| tok/sec: 812,371 | mfu: 50.77 | epoch: 3 | total time: 174.22m | eta: 5.2m +step 16219/16704 (97.10%) | loss: 2.432474 | lrm: 0.06 | dt: 644.46ms | tok/sec: 813,525 | mfu: 50.85 | epoch: 3 | total time: 174.23m | eta: 5.2m +step 16220/16704 (97.10%) | loss: 2.419287 | lrm: 0.06 | dt: 643.68ms | tok/sec: 814,514 | mfu: 50.91 | epoch: 3 | total time: 174.24m | eta: 5.2m +step 16221/16704 (97.11%) | loss: 2.395349 | lrm: 0.06 | dt: 645.63ms | tok/sec: 812,053 | mfu: 50.75 | epoch: 3 | total time: 174.25m | eta: 5.2m +step 16222/16704 (97.11%) | loss: 2.400063 | lrm: 0.06 | dt: 644.86ms | tok/sec: 813,024 | mfu: 50.82 | epoch: 3 | total time: 174.26m | eta: 5.2m +step 16223/16704 (97.12%) | loss: 2.401743 | lrm: 0.06 | dt: 644.61ms | tok/sec: 813,344 | mfu: 50.84 | epoch: 3 | total time: 174.28m | eta: 5.2m +step 16224/16704 (97.13%) | loss: 2.392301 | lrm: 0.06 | dt: 645.90ms | tok/sec: 811,721 | mfu: 50.73 | epoch: 3 | total time: 174.29m | eta: 5.2m +step 16225/16704 (97.13%) | loss: 2.379538 | lrm: 0.06 | dt: 645.16ms | tok/sec: 812,653 | mfu: 50.79 | epoch: 3 | total time: 174.30m | eta: 5.1m +step 16226/16704 (97.14%) | loss: 2.394994 | lrm: 0.06 | dt: 644.76ms | tok/sec: 813,153 | mfu: 50.82 | epoch: 3 | total time: 174.31m | eta: 5.1m +step 16227/16704 (97.14%) | loss: 2.396208 | lrm: 0.06 | dt: 644.09ms | tok/sec: 813,997 | mfu: 50.88 | epoch: 3 | total time: 174.32m | eta: 5.1m +step 16228/16704 (97.15%) | loss: 2.410717 | lrm: 0.06 | dt: 643.70ms | tok/sec: 814,490 | mfu: 50.91 | epoch: 3 | total time: 174.33m | eta: 5.1m +step 16229/16704 (97.16%) | loss: 2.424983 | lrm: 0.06 | dt: 646.87ms | tok/sec: 810,503 | mfu: 50.66 | epoch: 3 | total time: 174.34m | eta: 5.1m +step 16230/16704 (97.16%) | loss: 2.430254 | lrm: 0.06 | dt: 643.47ms | tok/sec: 814,780 | mfu: 50.92 | epoch: 3 | total time: 174.35m | eta: 5.1m +step 16231/16704 (97.17%) | loss: 2.436122 | lrm: 0.06 | dt: 645.50ms | tok/sec: 812,215 | mfu: 50.76 | epoch: 3 | total time: 174.36m | eta: 
5.1m +step 16232/16704 (97.17%) | loss: 2.422619 | lrm: 0.06 | dt: 643.36ms | tok/sec: 814,924 | mfu: 50.93 | epoch: 3 | total time: 174.37m | eta: 5.1m +step 16233/16704 (97.18%) | loss: 2.428721 | lrm: 0.06 | dt: 646.53ms | tok/sec: 810,928 | mfu: 50.68 | epoch: 3 | total time: 174.38m | eta: 5.1m +step 16234/16704 (97.19%) | loss: 2.416656 | lrm: 0.06 | dt: 644.83ms | tok/sec: 813,065 | mfu: 50.82 | epoch: 3 | total time: 174.39m | eta: 5.1m +step 16235/16704 (97.19%) | loss: 2.436086 | lrm: 0.06 | dt: 641.93ms | tok/sec: 816,733 | mfu: 51.05 | epoch: 3 | total time: 174.40m | eta: 5.0m +step 16236/16704 (97.20%) | loss: 2.440889 | lrm: 0.06 | dt: 645.30ms | tok/sec: 812,473 | mfu: 50.78 | epoch: 3 | total time: 174.42m | eta: 5.0m +step 16237/16704 (97.20%) | loss: 2.436790 | lrm: 0.06 | dt: 646.17ms | tok/sec: 811,382 | mfu: 50.71 | epoch: 3 | total time: 174.43m | eta: 5.0m +step 16238/16704 (97.21%) | loss: 2.437689 | lrm: 0.06 | dt: 647.02ms | tok/sec: 810,313 | mfu: 50.65 | epoch: 3 | total time: 174.44m | eta: 5.0m +step 16239/16704 (97.22%) | loss: 2.416147 | lrm: 0.06 | dt: 647.10ms | tok/sec: 810,205 | mfu: 50.64 | epoch: 3 | total time: 174.45m | eta: 5.0m +step 16240/16704 (97.22%) | loss: 2.415330 | lrm: 0.06 | dt: 644.76ms | tok/sec: 813,154 | mfu: 50.82 | epoch: 3 | total time: 174.46m | eta: 5.0m +step 16241/16704 (97.23%) | loss: 2.411949 | lrm: 0.06 | dt: 644.21ms | tok/sec: 813,843 | mfu: 50.87 | epoch: 3 | total time: 174.47m | eta: 5.0m +step 16242/16704 (97.23%) | loss: 2.418654 | lrm: 0.06 | dt: 642.93ms | tok/sec: 815,470 | mfu: 50.97 | epoch: 3 | total time: 174.48m | eta: 5.0m +step 16243/16704 (97.24%) | loss: 2.419387 | lrm: 0.06 | dt: 643.94ms | tok/sec: 814,186 | mfu: 50.89 | epoch: 3 | total time: 174.49m | eta: 5.0m +step 16244/16704 (97.25%) | loss: 2.422560 | lrm: 0.06 | dt: 648.73ms | tok/sec: 808,172 | mfu: 50.51 | epoch: 3 | total time: 174.50m | eta: 4.9m +step 16245/16704 (97.25%) | loss: 2.416919 | lrm: 0.05 | dt: 643.46ms 
| tok/sec: 814,799 | mfu: 50.93 | epoch: 3 | total time: 174.51m | eta: 4.9m +step 16246/16704 (97.26%) | loss: 2.417092 | lrm: 0.05 | dt: 645.95ms | tok/sec: 811,657 | mfu: 50.73 | epoch: 3 | total time: 174.52m | eta: 4.9m +step 16247/16704 (97.26%) | loss: 2.408912 | lrm: 0.05 | dt: 645.24ms | tok/sec: 812,541 | mfu: 50.78 | epoch: 3 | total time: 174.53m | eta: 4.9m +step 16248/16704 (97.27%) | loss: 2.415509 | lrm: 0.05 | dt: 644.16ms | tok/sec: 813,912 | mfu: 50.87 | epoch: 3 | total time: 174.54m | eta: 4.9m +step 16249/16704 (97.28%) | loss: 2.417174 | lrm: 0.05 | dt: 646.77ms | tok/sec: 810,629 | mfu: 50.67 | epoch: 3 | total time: 174.55m | eta: 4.9m +Step 16250 | Validation bpb: 0.754576 +step 16250/16704 (97.28%) | loss: 2.412731 | lrm: 0.05 | dt: 635.39ms | tok/sec: 825,142 | mfu: 51.57 | epoch: 3 | total time: 174.57m | eta: 4.9m +step 16251/16704 (97.29%) | loss: 2.412296 | lrm: 0.05 | dt: 653.00ms | tok/sec: 802,886 | mfu: 50.18 | epoch: 3 | total time: 174.58m | eta: 4.9m +step 16252/16704 (97.29%) | loss: 2.414969 | lrm: 0.05 | dt: 641.64ms | tok/sec: 817,107 | mfu: 51.07 | epoch: 3 | total time: 174.59m | eta: 4.9m +step 16253/16704 (97.30%) | loss: 2.423435 | lrm: 0.05 | dt: 644.66ms | tok/sec: 813,280 | mfu: 50.83 | epoch: 3 | total time: 174.60m | eta: 4.8m +step 16254/16704 (97.31%) | loss: 2.443466 | lrm: 0.05 | dt: 649.19ms | tok/sec: 807,607 | mfu: 50.48 | epoch: 3 | total time: 174.61m | eta: 4.8m +step 16255/16704 (97.31%) | loss: 2.437028 | lrm: 0.05 | dt: 640.19ms | tok/sec: 818,954 | mfu: 51.19 | epoch: 3 | total time: 174.62m | eta: 4.8m +step 16256/16704 (97.32%) | loss: 2.441837 | lrm: 0.05 | dt: 645.58ms | tok/sec: 812,118 | mfu: 50.76 | epoch: 3 | total time: 174.63m | eta: 4.8m +step 16257/16704 (97.32%) | loss: 2.435226 | lrm: 0.05 | dt: 645.45ms | tok/sec: 812,282 | mfu: 50.77 | epoch: 3 | total time: 174.64m | eta: 4.8m +step 16258/16704 (97.33%) | loss: 2.434661 | lrm: 0.05 | dt: 642.90ms | tok/sec: 815,498 | mfu: 50.97 | 
epoch: 3 | total time: 174.65m | eta: 4.8m +step 16259/16704 (97.34%) | loss: 2.424274 | lrm: 0.05 | dt: 645.18ms | tok/sec: 812,618 | mfu: 50.79 | epoch: 3 | total time: 174.66m | eta: 4.8m +step 16260/16704 (97.34%) | loss: 2.409546 | lrm: 0.05 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 3 | total time: 174.67m | eta: 4.8m +step 16261/16704 (97.35%) | loss: 2.411320 | lrm: 0.05 | dt: 647.78ms | tok/sec: 809,364 | mfu: 50.59 | epoch: 3 | total time: 174.68m | eta: 4.8m +step 16262/16704 (97.35%) | loss: 2.420284 | lrm: 0.05 | dt: 645.94ms | tok/sec: 811,672 | mfu: 50.73 | epoch: 3 | total time: 174.69m | eta: 4.8m +step 16263/16704 (97.36%) | loss: 2.417198 | lrm: 0.05 | dt: 644.38ms | tok/sec: 813,632 | mfu: 50.85 | epoch: 3 | total time: 174.71m | eta: 4.7m +step 16264/16704 (97.37%) | loss: 2.416847 | lrm: 0.05 | dt: 645.02ms | tok/sec: 812,826 | mfu: 50.80 | epoch: 3 | total time: 174.72m | eta: 4.7m +step 16265/16704 (97.37%) | loss: 2.428126 | lrm: 0.05 | dt: 646.19ms | tok/sec: 811,356 | mfu: 50.71 | epoch: 3 | total time: 174.73m | eta: 4.7m +step 16266/16704 (97.38%) | loss: 2.436374 | lrm: 0.05 | dt: 645.46ms | tok/sec: 812,265 | mfu: 50.77 | epoch: 3 | total time: 174.74m | eta: 4.7m +step 16267/16704 (97.38%) | loss: 2.437194 | lrm: 0.05 | dt: 644.63ms | tok/sec: 813,314 | mfu: 50.83 | epoch: 3 | total time: 174.75m | eta: 4.7m +step 16268/16704 (97.39%) | loss: 2.439321 | lrm: 0.05 | dt: 644.97ms | tok/sec: 812,882 | mfu: 50.81 | epoch: 3 | total time: 174.76m | eta: 4.7m +step 16269/16704 (97.40%) | loss: 2.425785 | lrm: 0.05 | dt: 644.93ms | tok/sec: 812,935 | mfu: 50.81 | epoch: 3 | total time: 174.77m | eta: 4.7m +step 16270/16704 (97.40%) | loss: 2.422186 | lrm: 0.05 | dt: 644.03ms | tok/sec: 814,070 | mfu: 50.88 | epoch: 3 | total time: 174.78m | eta: 4.7m +step 16271/16704 (97.41%) | loss: 2.417097 | lrm: 0.05 | dt: 646.31ms | tok/sec: 811,198 | mfu: 50.70 | epoch: 3 | total time: 174.79m | eta: 4.7m +step 16272/16704 (97.41%) | 
loss: 2.413421 | lrm: 0.05 | dt: 643.89ms | tok/sec: 814,256 | mfu: 50.89 | epoch: 3 | total time: 174.80m | eta: 4.6m +step 16273/16704 (97.42%) | loss: 2.413062 | lrm: 0.05 | dt: 645.32ms | tok/sec: 812,452 | mfu: 50.78 | epoch: 3 | total time: 174.81m | eta: 4.6m +step 16274/16704 (97.43%) | loss: 2.408014 | lrm: 0.05 | dt: 647.99ms | tok/sec: 809,103 | mfu: 50.57 | epoch: 3 | total time: 174.82m | eta: 4.6m +step 16275/16704 (97.43%) | loss: 2.414471 | lrm: 0.05 | dt: 643.72ms | tok/sec: 814,466 | mfu: 50.91 | epoch: 3 | total time: 174.83m | eta: 4.6m +step 16276/16704 (97.44%) | loss: 2.417457 | lrm: 0.05 | dt: 643.49ms | tok/sec: 814,753 | mfu: 50.92 | epoch: 3 | total time: 174.85m | eta: 4.6m +step 16277/16704 (97.44%) | loss: 2.420051 | lrm: 0.05 | dt: 644.37ms | tok/sec: 813,641 | mfu: 50.85 | epoch: 3 | total time: 174.86m | eta: 4.6m +step 16278/16704 (97.45%) | loss: 2.423732 | lrm: 0.05 | dt: 643.71ms | tok/sec: 814,478 | mfu: 50.91 | epoch: 3 | total time: 174.87m | eta: 4.6m +step 16279/16704 (97.46%) | loss: 2.416432 | lrm: 0.05 | dt: 647.52ms | tok/sec: 809,691 | mfu: 50.61 | epoch: 3 | total time: 174.88m | eta: 4.6m +step 16280/16704 (97.46%) | loss: 2.404374 | lrm: 0.05 | dt: 643.57ms | tok/sec: 814,654 | mfu: 50.92 | epoch: 3 | total time: 174.89m | eta: 4.6m +step 16281/16704 (97.47%) | loss: 2.416005 | lrm: 0.05 | dt: 644.40ms | tok/sec: 813,605 | mfu: 50.85 | epoch: 3 | total time: 174.90m | eta: 4.5m +step 16282/16704 (97.47%) | loss: 2.430333 | lrm: 0.05 | dt: 644.23ms | tok/sec: 813,822 | mfu: 50.87 | epoch: 3 | total time: 174.91m | eta: 4.5m +step 16283/16704 (97.48%) | loss: 2.425006 | lrm: 0.05 | dt: 645.13ms | tok/sec: 812,686 | mfu: 50.79 | epoch: 3 | total time: 174.92m | eta: 4.5m +step 16284/16704 (97.49%) | loss: 2.432010 | lrm: 0.05 | dt: 645.76ms | tok/sec: 811,891 | mfu: 50.74 | epoch: 3 | total time: 174.93m | eta: 4.5m +step 16285/16704 (97.49%) | loss: 2.425383 | lrm: 0.05 | dt: 644.28ms | tok/sec: 813,759 | mfu: 50.86 | 
epoch: 3 | total time: 174.94m | eta: 4.5m +step 16286/16704 (97.50%) | loss: 2.423298 | lrm: 0.05 | dt: 643.92ms | tok/sec: 814,208 | mfu: 50.89 | epoch: 3 | total time: 174.95m | eta: 4.5m +step 16287/16704 (97.50%) | loss: 2.425027 | lrm: 0.05 | dt: 644.79ms | tok/sec: 813,113 | mfu: 50.82 | epoch: 3 | total time: 174.96m | eta: 4.5m +step 16288/16704 (97.51%) | loss: 2.415325 | lrm: 0.05 | dt: 644.68ms | tok/sec: 813,256 | mfu: 50.83 | epoch: 3 | total time: 174.97m | eta: 4.5m +step 16289/16704 (97.52%) | loss: 2.402998 | lrm: 0.05 | dt: 644.16ms | tok/sec: 813,905 | mfu: 50.87 | epoch: 3 | total time: 174.98m | eta: 4.5m +step 16290/16704 (97.52%) | loss: 2.398544 | lrm: 0.05 | dt: 645.66ms | tok/sec: 812,015 | mfu: 50.75 | epoch: 3 | total time: 175.00m | eta: 4.5m +step 16291/16704 (97.53%) | loss: 2.403958 | lrm: 0.05 | dt: 645.68ms | tok/sec: 811,988 | mfu: 50.75 | epoch: 3 | total time: 175.01m | eta: 4.4m +step 16292/16704 (97.53%) | loss: 2.410307 | lrm: 0.05 | dt: 645.61ms | tok/sec: 812,081 | mfu: 50.76 | epoch: 3 | total time: 175.02m | eta: 4.4m +step 16293/16704 (97.54%) | loss: 2.412186 | lrm: 0.05 | dt: 643.33ms | tok/sec: 814,963 | mfu: 50.94 | epoch: 3 | total time: 175.03m | eta: 4.4m +step 16294/16704 (97.55%) | loss: 2.416872 | lrm: 0.05 | dt: 645.75ms | tok/sec: 811,901 | mfu: 50.74 | epoch: 3 | total time: 175.04m | eta: 4.4m +step 16295/16704 (97.55%) | loss: 2.411367 | lrm: 0.05 | dt: 644.91ms | tok/sec: 812,958 | mfu: 50.81 | epoch: 3 | total time: 175.05m | eta: 4.4m +step 16296/16704 (97.56%) | loss: 2.405767 | lrm: 0.05 | dt: 643.27ms | tok/sec: 815,031 | mfu: 50.94 | epoch: 3 | total time: 175.06m | eta: 4.4m +step 16297/16704 (97.56%) | loss: 2.414989 | lrm: 0.05 | dt: 644.01ms | tok/sec: 814,101 | mfu: 50.88 | epoch: 3 | total time: 175.07m | eta: 4.4m +step 16298/16704 (97.57%) | loss: 2.425896 | lrm: 0.05 | dt: 645.88ms | tok/sec: 811,739 | mfu: 50.73 | epoch: 3 | total time: 175.08m | eta: 4.4m +step 16299/16704 (97.58%) | 
loss: 2.409958 | lrm: 0.05 | dt: 644.21ms | tok/sec: 813,852 | mfu: 50.87 | epoch: 3 | total time: 175.09m | eta: 4.4m +step 16300/16704 (97.58%) | loss: 2.402164 | lrm: 0.05 | dt: 646.29ms | tok/sec: 811,231 | mfu: 50.70 | epoch: 3 | total time: 175.10m | eta: 4.3m +step 16301/16704 (97.59%) | loss: 2.404095 | lrm: 0.05 | dt: 644.93ms | tok/sec: 812,936 | mfu: 50.81 | epoch: 3 | total time: 175.11m | eta: 4.3m +step 16302/16704 (97.59%) | loss: 2.401761 | lrm: 0.05 | dt: 646.26ms | tok/sec: 811,262 | mfu: 50.71 | epoch: 3 | total time: 175.12m | eta: 4.3m +step 16303/16704 (97.60%) | loss: 2.398272 | lrm: 0.05 | dt: 645.87ms | tok/sec: 811,750 | mfu: 50.74 | epoch: 3 | total time: 175.14m | eta: 4.3m +step 16304/16704 (97.61%) | loss: 2.424823 | lrm: 0.05 | dt: 646.41ms | tok/sec: 811,073 | mfu: 50.69 | epoch: 3 | total time: 175.15m | eta: 4.3m +step 16305/16704 (97.61%) | loss: 2.428191 | lrm: 0.05 | dt: 646.46ms | tok/sec: 811,019 | mfu: 50.69 | epoch: 3 | total time: 175.16m | eta: 4.3m +step 16306/16704 (97.62%) | loss: 2.405713 | lrm: 0.05 | dt: 647.28ms | tok/sec: 809,983 | mfu: 50.63 | epoch: 3 | total time: 175.17m | eta: 4.3m +step 16307/16704 (97.62%) | loss: 2.413758 | lrm: 0.05 | dt: 645.40ms | tok/sec: 812,349 | mfu: 50.77 | epoch: 3 | total time: 175.18m | eta: 4.3m +step 16308/16704 (97.63%) | loss: 2.431536 | lrm: 0.05 | dt: 647.02ms | tok/sec: 810,306 | mfu: 50.65 | epoch: 3 | total time: 175.19m | eta: 4.3m +step 16309/16704 (97.64%) | loss: 2.421304 | lrm: 0.05 | dt: 644.76ms | tok/sec: 813,153 | mfu: 50.82 | epoch: 3 | total time: 175.20m | eta: 4.2m +step 16310/16704 (97.64%) | loss: 2.434859 | lrm: 0.05 | dt: 645.53ms | tok/sec: 812,185 | mfu: 50.76 | epoch: 3 | total time: 175.21m | eta: 4.2m +step 16311/16704 (97.65%) | loss: 2.428206 | lrm: 0.05 | dt: 644.88ms | tok/sec: 812,997 | mfu: 50.81 | epoch: 3 | total time: 175.22m | eta: 4.2m +step 16312/16704 (97.65%) | loss: 2.425179 | lrm: 0.05 | dt: 646.26ms | tok/sec: 811,268 | mfu: 50.71 | 
epoch: 3 | total time: 175.23m | eta: 4.2m +step 16313/16704 (97.66%) | loss: 2.424135 | lrm: 0.05 | dt: 645.85ms | tok/sec: 811,785 | mfu: 50.74 | epoch: 3 | total time: 175.24m | eta: 4.2m +step 16314/16704 (97.67%) | loss: 2.433471 | lrm: 0.05 | dt: 646.26ms | tok/sec: 811,258 | mfu: 50.70 | epoch: 3 | total time: 175.25m | eta: 4.2m +step 16315/16704 (97.67%) | loss: 2.430794 | lrm: 0.05 | dt: 650.08ms | tok/sec: 806,500 | mfu: 50.41 | epoch: 3 | total time: 175.26m | eta: 4.2m +step 16316/16704 (97.68%) | loss: 2.417234 | lrm: 0.05 | dt: 643.91ms | tok/sec: 814,222 | mfu: 50.89 | epoch: 3 | total time: 175.28m | eta: 4.2m +step 16317/16704 (97.68%) | loss: 2.430941 | lrm: 0.05 | dt: 646.72ms | tok/sec: 810,692 | mfu: 50.67 | epoch: 3 | total time: 175.29m | eta: 4.2m +step 16318/16704 (97.69%) | loss: 2.431723 | lrm: 0.05 | dt: 649.36ms | tok/sec: 807,389 | mfu: 50.46 | epoch: 3 | total time: 175.30m | eta: 4.1m +step 16319/16704 (97.70%) | loss: 2.414908 | lrm: 0.05 | dt: 643.74ms | tok/sec: 814,435 | mfu: 50.90 | epoch: 3 | total time: 175.31m | eta: 4.1m +step 16320/16704 (97.70%) | loss: 2.414622 | lrm: 0.05 | dt: 646.60ms | tok/sec: 810,836 | mfu: 50.68 | epoch: 3 | total time: 175.32m | eta: 4.1m +step 16321/16704 (97.71%) | loss: 2.411726 | lrm: 0.05 | dt: 645.29ms | tok/sec: 812,481 | mfu: 50.78 | epoch: 3 | total time: 175.33m | eta: 4.1m +step 16322/16704 (97.71%) | loss: 2.417059 | lrm: 0.05 | dt: 645.34ms | tok/sec: 812,417 | mfu: 50.78 | epoch: 3 | total time: 175.34m | eta: 4.1m +step 16323/16704 (97.72%) | loss: 2.421656 | lrm: 0.05 | dt: 644.80ms | tok/sec: 813,096 | mfu: 50.82 | epoch: 3 | total time: 175.35m | eta: 4.1m +step 16324/16704 (97.73%) | loss: 2.427248 | lrm: 0.05 | dt: 648.13ms | tok/sec: 808,922 | mfu: 50.56 | epoch: 3 | total time: 175.36m | eta: 4.1m +step 16325/16704 (97.73%) | loss: 2.429038 | lrm: 0.05 | dt: 646.96ms | tok/sec: 810,389 | mfu: 50.65 | epoch: 3 | total time: 175.37m | eta: 4.1m +step 16326/16704 (97.74%) | 
loss: 2.433863 | lrm: 0.05 | dt: 646.08ms | tok/sec: 811,492 | mfu: 50.72 | epoch: 3 | total time: 175.38m | eta: 4.1m +step 16327/16704 (97.74%) | loss: 2.429557 | lrm: 0.05 | dt: 644.51ms | tok/sec: 813,467 | mfu: 50.84 | epoch: 3 | total time: 175.39m | eta: 4.1m +step 16328/16704 (97.75%) | loss: 2.419970 | lrm: 0.05 | dt: 645.72ms | tok/sec: 811,940 | mfu: 50.75 | epoch: 3 | total time: 175.40m | eta: 4.0m +step 16329/16704 (97.76%) | loss: 2.410729 | lrm: 0.04 | dt: 647.64ms | tok/sec: 809,535 | mfu: 50.60 | epoch: 3 | total time: 175.42m | eta: 4.0m +step 16330/16704 (97.76%) | loss: 2.427907 | lrm: 0.04 | dt: 642.72ms | tok/sec: 815,732 | mfu: 50.98 | epoch: 3 | total time: 175.43m | eta: 4.0m +step 16331/16704 (97.77%) | loss: 2.426045 | lrm: 0.04 | dt: 650.63ms | tok/sec: 805,817 | mfu: 50.36 | epoch: 3 | total time: 175.44m | eta: 4.0m +step 16332/16704 (97.77%) | loss: 2.416404 | lrm: 0.04 | dt: 646.09ms | tok/sec: 811,477 | mfu: 50.72 | epoch: 3 | total time: 175.45m | eta: 4.0m +step 16333/16704 (97.78%) | loss: 2.418100 | lrm: 0.04 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 3 | total time: 175.46m | eta: 4.0m +step 16334/16704 (97.78%) | loss: 2.417805 | lrm: 0.04 | dt: 648.48ms | tok/sec: 808,492 | mfu: 50.53 | epoch: 3 | total time: 175.47m | eta: 4.0m +step 16335/16704 (97.79%) | loss: 2.418083 | lrm: 0.04 | dt: 643.95ms | tok/sec: 814,171 | mfu: 50.89 | epoch: 3 | total time: 175.48m | eta: 4.0m +step 16336/16704 (97.80%) | loss: 2.440121 | lrm: 0.04 | dt: 647.34ms | tok/sec: 809,909 | mfu: 50.62 | epoch: 3 | total time: 175.49m | eta: 4.0m +step 16337/16704 (97.80%) | loss: 2.455070 | lrm: 0.04 | dt: 644.64ms | tok/sec: 813,305 | mfu: 50.83 | epoch: 3 | total time: 175.50m | eta: 3.9m +step 16338/16704 (97.81%) | loss: 2.455639 | lrm: 0.04 | dt: 646.05ms | tok/sec: 811,524 | mfu: 50.72 | epoch: 3 | total time: 175.51m | eta: 3.9m +step 16339/16704 (97.81%) | loss: 2.451594 | lrm: 0.04 | dt: 644.89ms | tok/sec: 812,987 | mfu: 50.81 | 
epoch: 3 | total time: 175.52m | eta: 3.9m +step 16340/16704 (97.82%) | loss: 2.446692 | lrm: 0.04 | dt: 647.36ms | tok/sec: 809,884 | mfu: 50.62 | epoch: 3 | total time: 175.53m | eta: 3.9m +step 16341/16704 (97.83%) | loss: 2.449230 | lrm: 0.04 | dt: 646.88ms | tok/sec: 810,487 | mfu: 50.66 | epoch: 3 | total time: 175.54m | eta: 3.9m +step 16342/16704 (97.83%) | loss: 2.445020 | lrm: 0.04 | dt: 645.50ms | tok/sec: 812,223 | mfu: 50.77 | epoch: 3 | total time: 175.56m | eta: 3.9m +step 16343/16704 (97.84%) | loss: 2.432199 | lrm: 0.04 | dt: 645.52ms | tok/sec: 812,189 | mfu: 50.76 | epoch: 3 | total time: 175.57m | eta: 3.9m +step 16344/16704 (97.84%) | loss: 2.427492 | lrm: 0.04 | dt: 645.22ms | tok/sec: 812,574 | mfu: 50.79 | epoch: 3 | total time: 175.58m | eta: 3.9m +step 16345/16704 (97.85%) | loss: 2.419130 | lrm: 0.04 | dt: 646.95ms | tok/sec: 810,395 | mfu: 50.65 | epoch: 3 | total time: 175.59m | eta: 3.9m +step 16346/16704 (97.86%) | loss: 2.422478 | lrm: 0.04 | dt: 647.48ms | tok/sec: 809,733 | mfu: 50.61 | epoch: 3 | total time: 175.60m | eta: 3.8m +step 16347/16704 (97.86%) | loss: 2.418669 | lrm: 0.04 | dt: 643.88ms | tok/sec: 814,257 | mfu: 50.89 | epoch: 3 | total time: 175.61m | eta: 3.8m +step 16348/16704 (97.87%) | loss: 2.413292 | lrm: 0.04 | dt: 645.91ms | tok/sec: 811,704 | mfu: 50.73 | epoch: 3 | total time: 175.62m | eta: 3.8m +step 16349/16704 (97.87%) | loss: 2.412482 | lrm: 0.04 | dt: 646.29ms | tok/sec: 811,223 | mfu: 50.70 | epoch: 3 | total time: 175.63m | eta: 3.8m +step 16350/16704 (97.88%) | loss: 2.415108 | lrm: 0.04 | dt: 645.94ms | tok/sec: 811,668 | mfu: 50.73 | epoch: 3 | total time: 175.64m | eta: 3.8m +step 16351/16704 (97.89%) | loss: 2.419705 | lrm: 0.04 | dt: 644.01ms | tok/sec: 814,095 | mfu: 50.88 | epoch: 3 | total time: 175.65m | eta: 3.8m +step 16352/16704 (97.89%) | loss: 2.415247 | lrm: 0.04 | dt: 648.14ms | tok/sec: 808,908 | mfu: 50.56 | epoch: 3 | total time: 175.66m | eta: 3.8m +step 16353/16704 (97.90%) | 
loss: 2.415970 | lrm: 0.04 | dt: 645.07ms | tok/sec: 812,761 | mfu: 50.80 | epoch: 3 | total time: 175.67m | eta: 3.8m +step 16354/16704 (97.90%) | loss: 2.421067 | lrm: 0.04 | dt: 645.84ms | tok/sec: 811,798 | mfu: 50.74 | epoch: 3 | total time: 175.68m | eta: 3.8m +step 16355/16704 (97.91%) | loss: 2.432244 | lrm: 0.04 | dt: 649.49ms | tok/sec: 807,233 | mfu: 50.45 | epoch: 3 | total time: 175.70m | eta: 3.8m +step 16356/16704 (97.92%) | loss: 2.436223 | lrm: 0.04 | dt: 645.45ms | tok/sec: 812,284 | mfu: 50.77 | epoch: 3 | total time: 175.71m | eta: 3.7m +step 16357/16704 (97.92%) | loss: 2.439739 | lrm: 0.04 | dt: 647.81ms | tok/sec: 809,321 | mfu: 50.58 | epoch: 3 | total time: 175.72m | eta: 3.7m +step 16358/16704 (97.93%) | loss: 2.448714 | lrm: 0.04 | dt: 645.31ms | tok/sec: 812,459 | mfu: 50.78 | epoch: 3 | total time: 175.73m | eta: 3.7m +step 16359/16704 (97.93%) | loss: 2.434956 | lrm: 0.04 | dt: 649.08ms | tok/sec: 807,740 | mfu: 50.48 | epoch: 3 | total time: 175.74m | eta: 3.7m +step 16360/16704 (97.94%) | loss: 2.448296 | lrm: 0.04 | dt: 647.84ms | tok/sec: 809,284 | mfu: 50.58 | epoch: 3 | total time: 175.75m | eta: 3.7m +step 16361/16704 (97.95%) | loss: 2.430221 | lrm: 0.04 | dt: 644.57ms | tok/sec: 813,386 | mfu: 50.84 | epoch: 3 | total time: 175.76m | eta: 3.7m +step 16362/16704 (97.95%) | loss: 2.439144 | lrm: 0.04 | dt: 647.44ms | tok/sec: 809,780 | mfu: 50.61 | epoch: 3 | total time: 175.77m | eta: 3.7m +step 16363/16704 (97.96%) | loss: 2.431777 | lrm: 0.04 | dt: 647.75ms | tok/sec: 809,402 | mfu: 50.59 | epoch: 3 | total time: 175.78m | eta: 3.7m +step 16364/16704 (97.96%) | loss: 2.426832 | lrm: 0.04 | dt: 643.63ms | tok/sec: 814,574 | mfu: 50.91 | epoch: 3 | total time: 175.79m | eta: 3.7m +step 16365/16704 (97.97%) | loss: 2.422414 | lrm: 0.04 | dt: 645.67ms | tok/sec: 812,011 | mfu: 50.75 | epoch: 3 | total time: 175.80m | eta: 3.6m +step 16366/16704 (97.98%) | loss: 2.439767 | lrm: 0.04 | dt: 645.73ms | tok/sec: 811,924 | mfu: 50.75 | 
epoch: 3 | total time: 175.81m | eta: 3.6m +step 16367/16704 (97.98%) | loss: 2.437173 | lrm: 0.04 | dt: 644.65ms | tok/sec: 813,287 | mfu: 50.83 | epoch: 3 | total time: 175.82m | eta: 3.6m +step 16368/16704 (97.99%) | loss: 2.434535 | lrm: 0.04 | dt: 647.33ms | tok/sec: 809,926 | mfu: 50.62 | epoch: 3 | total time: 175.84m | eta: 3.6m +step 16369/16704 (97.99%) | loss: 2.424711 | lrm: 0.04 | dt: 643.75ms | tok/sec: 814,429 | mfu: 50.90 | epoch: 3 | total time: 175.85m | eta: 3.6m +step 16370/16704 (98.00%) | loss: 2.430290 | lrm: 0.04 | dt: 645.46ms | tok/sec: 812,268 | mfu: 50.77 | epoch: 3 | total time: 175.86m | eta: 3.6m +step 16371/16704 (98.01%) | loss: 2.422822 | lrm: 0.04 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 3 | total time: 175.87m | eta: 3.6m +step 16372/16704 (98.01%) | loss: 2.446363 | lrm: 0.04 | dt: 647.39ms | tok/sec: 809,850 | mfu: 50.62 | epoch: 3 | total time: 175.88m | eta: 3.6m +step 16373/16704 (98.02%) | loss: 2.437251 | lrm: 0.04 | dt: 646.56ms | tok/sec: 810,887 | mfu: 50.68 | epoch: 3 | total time: 175.89m | eta: 3.6m +step 16374/16704 (98.02%) | loss: 2.426765 | lrm: 0.04 | dt: 645.59ms | tok/sec: 812,104 | mfu: 50.76 | epoch: 3 | total time: 175.90m | eta: 3.5m +step 16375/16704 (98.03%) | loss: 2.429442 | lrm: 0.04 | dt: 644.50ms | tok/sec: 813,480 | mfu: 50.84 | epoch: 3 | total time: 175.91m | eta: 3.5m +step 16376/16704 (98.04%) | loss: 2.428450 | lrm: 0.04 | dt: 643.97ms | tok/sec: 814,150 | mfu: 50.89 | epoch: 3 | total time: 175.92m | eta: 3.5m +step 16377/16704 (98.04%) | loss: 2.431031 | lrm: 0.04 | dt: 645.91ms | tok/sec: 811,707 | mfu: 50.73 | epoch: 3 | total time: 175.93m | eta: 3.5m +step 16378/16704 (98.05%) | loss: 2.417757 | lrm: 0.04 | dt: 644.36ms | tok/sec: 813,662 | mfu: 50.86 | epoch: 3 | total time: 175.94m | eta: 3.5m +step 16379/16704 (98.05%) | loss: 2.408310 | lrm: 0.04 | dt: 647.70ms | tok/sec: 809,455 | mfu: 50.59 | epoch: 3 | total time: 175.95m | eta: 3.5m +step 16380/16704 (98.06%) | 
loss: 2.406551 | lrm: 0.04 | dt: 648.61ms | tok/sec: 808,330 | mfu: 50.52 | epoch: 3 | total time: 175.96m | eta: 3.5m +step 16381/16704 (98.07%) | loss: 2.412344 | lrm: 0.04 | dt: 646.62ms | tok/sec: 810,819 | mfu: 50.68 | epoch: 3 | total time: 175.98m | eta: 3.5m +step 16382/16704 (98.07%) | loss: 2.411755 | lrm: 0.04 | dt: 646.33ms | tok/sec: 811,173 | mfu: 50.70 | epoch: 3 | total time: 175.99m | eta: 3.5m +step 16383/16704 (98.08%) | loss: 2.409349 | lrm: 0.04 | dt: 645.31ms | tok/sec: 812,456 | mfu: 50.78 | epoch: 3 | total time: 176.00m | eta: 3.5m +step 16384/16704 (98.08%) | loss: 2.410685 | lrm: 0.04 | dt: 644.96ms | tok/sec: 812,893 | mfu: 50.81 | epoch: 3 | total time: 176.01m | eta: 3.4m +step 16385/16704 (98.09%) | loss: 2.418000 | lrm: 0.04 | dt: 643.51ms | tok/sec: 814,726 | mfu: 50.92 | epoch: 3 | total time: 176.02m | eta: 3.4m +step 16386/16704 (98.10%) | loss: 2.425065 | lrm: 0.04 | dt: 645.54ms | tok/sec: 812,175 | mfu: 50.76 | epoch: 3 | total time: 176.03m | eta: 3.4m +step 16387/16704 (98.10%) | loss: 2.430827 | lrm: 0.04 | dt: 645.09ms | tok/sec: 812,731 | mfu: 50.80 | epoch: 3 | total time: 176.04m | eta: 3.4m +step 16388/16704 (98.11%) | loss: 2.429274 | lrm: 0.04 | dt: 646.01ms | tok/sec: 811,575 | mfu: 50.72 | epoch: 3 | total time: 176.05m | eta: 3.4m +step 16389/16704 (98.11%) | loss: 2.425824 | lrm: 0.04 | dt: 645.78ms | tok/sec: 811,868 | mfu: 50.74 | epoch: 3 | total time: 176.06m | eta: 3.4m +step 16390/16704 (98.12%) | loss: 2.429300 | lrm: 0.04 | dt: 644.39ms | tok/sec: 813,618 | mfu: 50.85 | epoch: 3 | total time: 176.07m | eta: 3.4m +step 16391/16704 (98.13%) | loss: 2.432392 | lrm: 0.04 | dt: 646.91ms | tok/sec: 810,444 | mfu: 50.65 | epoch: 3 | total time: 176.08m | eta: 3.4m +step 16392/16704 (98.13%) | loss: 2.436835 | lrm: 0.04 | dt: 645.07ms | tok/sec: 812,763 | mfu: 50.80 | epoch: 3 | total time: 176.09m | eta: 3.4m +step 16393/16704 (98.14%) | loss: 2.431998 | lrm: 0.04 | dt: 649.12ms | tok/sec: 807,688 | mfu: 50.48 | 
epoch: 3 | total time: 176.10m | eta: 3.3m +step 16394/16704 (98.14%) | loss: 2.435302 | lrm: 0.04 | dt: 645.06ms | tok/sec: 812,776 | mfu: 50.80 | epoch: 3 | total time: 176.12m | eta: 3.3m +step 16395/16704 (98.15%) | loss: 2.426033 | lrm: 0.04 | dt: 642.74ms | tok/sec: 815,708 | mfu: 50.98 | epoch: 3 | total time: 176.13m | eta: 3.3m +step 16396/16704 (98.16%) | loss: 2.434122 | lrm: 0.04 | dt: 645.17ms | tok/sec: 812,635 | mfu: 50.79 | epoch: 3 | total time: 176.14m | eta: 3.3m +step 16397/16704 (98.16%) | loss: 2.426902 | lrm: 0.04 | dt: 645.48ms | tok/sec: 812,248 | mfu: 50.77 | epoch: 3 | total time: 176.15m | eta: 3.3m +step 16398/16704 (98.17%) | loss: 2.410795 | lrm: 0.04 | dt: 644.73ms | tok/sec: 813,186 | mfu: 50.83 | epoch: 3 | total time: 176.16m | eta: 3.3m +step 16399/16704 (98.17%) | loss: 2.415276 | lrm: 0.04 | dt: 645.04ms | tok/sec: 812,793 | mfu: 50.80 | epoch: 3 | total time: 176.17m | eta: 3.3m +step 16400/16704 (98.18%) | loss: 2.419404 | lrm: 0.04 | dt: 644.77ms | tok/sec: 813,144 | mfu: 50.82 | epoch: 3 | total time: 176.18m | eta: 3.3m +step 16401/16704 (98.19%) | loss: 2.417856 | lrm: 0.04 | dt: 647.24ms | tok/sec: 810,041 | mfu: 50.63 | epoch: 3 | total time: 176.19m | eta: 3.3m +step 16402/16704 (98.19%) | loss: 2.419248 | lrm: 0.04 | dt: 644.06ms | tok/sec: 814,035 | mfu: 50.88 | epoch: 3 | total time: 176.20m | eta: 3.2m +step 16403/16704 (98.20%) | loss: 2.412244 | lrm: 0.04 | dt: 646.15ms | tok/sec: 811,407 | mfu: 50.71 | epoch: 3 | total time: 176.21m | eta: 3.2m +step 16404/16704 (98.20%) | loss: 2.417986 | lrm: 0.04 | dt: 643.14ms | tok/sec: 815,199 | mfu: 50.95 | epoch: 3 | total time: 176.22m | eta: 3.2m +step 16405/16704 (98.21%) | loss: 2.420062 | lrm: 0.04 | dt: 645.54ms | tok/sec: 812,172 | mfu: 50.76 | epoch: 3 | total time: 176.23m | eta: 3.2m +step 16406/16704 (98.22%) | loss: 2.416271 | lrm: 0.04 | dt: 644.18ms | tok/sec: 813,883 | mfu: 50.87 | epoch: 3 | total time: 176.24m | eta: 3.2m +step 16407/16704 (98.22%) | 
loss: 2.405707 | lrm: 0.04 | dt: 643.89ms | tok/sec: 814,257 | mfu: 50.89 | epoch: 3 | total time: 176.25m | eta: 3.2m +step 16408/16704 (98.23%) | loss: 2.408388 | lrm: 0.04 | dt: 645.18ms | tok/sec: 812,623 | mfu: 50.79 | epoch: 3 | total time: 176.27m | eta: 3.2m +step 16409/16704 (98.23%) | loss: 2.414620 | lrm: 0.04 | dt: 644.73ms | tok/sec: 813,189 | mfu: 50.83 | epoch: 3 | total time: 176.28m | eta: 3.2m +step 16410/16704 (98.24%) | loss: 2.415850 | lrm: 0.04 | dt: 645.25ms | tok/sec: 812,533 | mfu: 50.78 | epoch: 3 | total time: 176.29m | eta: 3.2m +step 16411/16704 (98.25%) | loss: 2.425050 | lrm: 0.04 | dt: 644.58ms | tok/sec: 813,383 | mfu: 50.84 | epoch: 3 | total time: 176.30m | eta: 3.1m +step 16412/16704 (98.25%) | loss: 2.423524 | lrm: 0.03 | dt: 643.42ms | tok/sec: 814,842 | mfu: 50.93 | epoch: 3 | total time: 176.31m | eta: 3.1m +step 16413/16704 (98.26%) | loss: 2.425815 | lrm: 0.03 | dt: 645.30ms | tok/sec: 812,473 | mfu: 50.78 | epoch: 3 | total time: 176.32m | eta: 3.1m +step 16414/16704 (98.26%) | loss: 2.417066 | lrm: 0.03 | dt: 645.20ms | tok/sec: 812,598 | mfu: 50.79 | epoch: 3 | total time: 176.33m | eta: 3.1m +step 16415/16704 (98.27%) | loss: 2.413540 | lrm: 0.03 | dt: 645.58ms | tok/sec: 812,122 | mfu: 50.76 | epoch: 3 | total time: 176.34m | eta: 3.1m +step 16416/16704 (98.28%) | loss: 2.412718 | lrm: 0.03 | dt: 646.72ms | tok/sec: 810,682 | mfu: 50.67 | epoch: 3 | total time: 176.35m | eta: 3.1m +step 16417/16704 (98.28%) | loss: 2.427364 | lrm: 0.03 | dt: 645.56ms | tok/sec: 812,144 | mfu: 50.76 | epoch: 3 | total time: 176.36m | eta: 3.1m +step 16418/16704 (98.29%) | loss: 2.430216 | lrm: 0.03 | dt: 643.55ms | tok/sec: 814,686 | mfu: 50.92 | epoch: 3 | total time: 176.37m | eta: 3.1m +step 16419/16704 (98.29%) | loss: 2.434626 | lrm: 0.03 | dt: 646.25ms | tok/sec: 811,280 | mfu: 50.71 | epoch: 3 | total time: 176.38m | eta: 3.1m +step 16420/16704 (98.30%) | loss: 2.450203 | lrm: 0.03 | dt: 645.45ms | tok/sec: 812,285 | mfu: 50.77 | 
epoch: 3 | total time: 176.39m | eta: 3.1m +step 16421/16704 (98.31%) | loss: 2.455290 | lrm: 0.03 | dt: 644.89ms | tok/sec: 812,986 | mfu: 50.81 | epoch: 3 | total time: 176.41m | eta: 3.0m +step 16422/16704 (98.31%) | loss: 2.453079 | lrm: 0.03 | dt: 643.91ms | tok/sec: 814,221 | mfu: 50.89 | epoch: 3 | total time: 176.42m | eta: 3.0m +step 16423/16704 (98.32%) | loss: 2.437993 | lrm: 0.03 | dt: 645.06ms | tok/sec: 812,772 | mfu: 50.80 | epoch: 3 | total time: 176.43m | eta: 3.0m +step 16424/16704 (98.32%) | loss: 2.443015 | lrm: 0.03 | dt: 644.34ms | tok/sec: 813,681 | mfu: 50.86 | epoch: 3 | total time: 176.44m | eta: 3.0m +step 16425/16704 (98.33%) | loss: 2.444404 | lrm: 0.03 | dt: 646.11ms | tok/sec: 811,455 | mfu: 50.72 | epoch: 3 | total time: 176.45m | eta: 3.0m +step 16426/16704 (98.34%) | loss: 2.441703 | lrm: 0.03 | dt: 645.46ms | tok/sec: 812,272 | mfu: 50.77 | epoch: 3 | total time: 176.46m | eta: 3.0m +step 16427/16704 (98.34%) | loss: 2.425166 | lrm: 0.03 | dt: 645.26ms | tok/sec: 812,522 | mfu: 50.78 | epoch: 3 | total time: 176.47m | eta: 3.0m +step 16428/16704 (98.35%) | loss: 2.425135 | lrm: 0.03 | dt: 647.01ms | tok/sec: 810,318 | mfu: 50.65 | epoch: 3 | total time: 176.48m | eta: 3.0m +step 16429/16704 (98.35%) | loss: 2.431084 | lrm: 0.03 | dt: 644.74ms | tok/sec: 813,173 | mfu: 50.82 | epoch: 3 | total time: 176.49m | eta: 3.0m +step 16430/16704 (98.36%) | loss: 2.442608 | lrm: 0.03 | dt: 645.74ms | tok/sec: 811,924 | mfu: 50.75 | epoch: 3 | total time: 176.50m | eta: 2.9m +step 16431/16704 (98.37%) | loss: 2.437567 | lrm: 0.03 | dt: 646.08ms | tok/sec: 811,495 | mfu: 50.72 | epoch: 3 | total time: 176.51m | eta: 2.9m +step 16432/16704 (98.37%) | loss: 2.437890 | lrm: 0.03 | dt: 646.71ms | tok/sec: 810,704 | mfu: 50.67 | epoch: 3 | total time: 176.52m | eta: 2.9m +step 16433/16704 (98.38%) | loss: 2.436045 | lrm: 0.03 | dt: 646.39ms | tok/sec: 811,100 | mfu: 50.69 | epoch: 3 | total time: 176.53m | eta: 2.9m +step 16434/16704 (98.38%) | 
loss: 2.426285 | lrm: 0.03 | dt: 648.18ms | tok/sec: 808,856 | mfu: 50.55 | epoch: 3 | total time: 176.55m | eta: 2.9m +step 16435/16704 (98.39%) | loss: 2.433479 | lrm: 0.03 | dt: 644.64ms | tok/sec: 813,307 | mfu: 50.83 | epoch: 3 | total time: 176.56m | eta: 2.9m +step 16436/16704 (98.40%) | loss: 2.420811 | lrm: 0.03 | dt: 645.06ms | tok/sec: 812,779 | mfu: 50.80 | epoch: 3 | total time: 176.57m | eta: 2.9m +step 16437/16704 (98.40%) | loss: 2.434017 | lrm: 0.03 | dt: 644.38ms | tok/sec: 813,636 | mfu: 50.85 | epoch: 3 | total time: 176.58m | eta: 2.9m +step 16438/16704 (98.41%) | loss: 2.436479 | lrm: 0.03 | dt: 644.79ms | tok/sec: 813,112 | mfu: 50.82 | epoch: 3 | total time: 176.59m | eta: 2.9m +step 16439/16704 (98.41%) | loss: 2.432554 | lrm: 0.03 | dt: 643.71ms | tok/sec: 814,473 | mfu: 50.91 | epoch: 3 | total time: 176.60m | eta: 2.8m +step 16440/16704 (98.42%) | loss: 2.443715 | lrm: 0.03 | dt: 644.94ms | tok/sec: 812,919 | mfu: 50.81 | epoch: 3 | total time: 176.61m | eta: 2.8m +step 16441/16704 (98.43%) | loss: 2.441227 | lrm: 0.03 | dt: 645.55ms | tok/sec: 812,154 | mfu: 50.76 | epoch: 3 | total time: 176.62m | eta: 2.8m +step 16442/16704 (98.43%) | loss: 2.459258 | lrm: 0.03 | dt: 644.55ms | tok/sec: 813,421 | mfu: 50.84 | epoch: 3 | total time: 176.63m | eta: 2.8m +step 16443/16704 (98.44%) | loss: 2.461053 | lrm: 0.03 | dt: 644.48ms | tok/sec: 813,510 | mfu: 50.85 | epoch: 3 | total time: 176.64m | eta: 2.8m +step 16444/16704 (98.44%) | loss: 2.451802 | lrm: 0.03 | dt: 645.44ms | tok/sec: 812,295 | mfu: 50.77 | epoch: 3 | total time: 176.65m | eta: 2.8m +step 16445/16704 (98.45%) | loss: 2.448407 | lrm: 0.03 | dt: 646.41ms | tok/sec: 811,077 | mfu: 50.69 | epoch: 3 | total time: 176.66m | eta: 2.8m +step 16446/16704 (98.46%) | loss: 2.448814 | lrm: 0.03 | dt: 646.05ms | tok/sec: 811,531 | mfu: 50.72 | epoch: 3 | total time: 176.67m | eta: 2.8m +step 16447/16704 (98.46%) | loss: 2.449823 | lrm: 0.03 | dt: 646.68ms | tok/sec: 810,737 | mfu: 50.67 | 
epoch: 3 | total time: 176.69m | eta: 2.8m +step 16448/16704 (98.47%) | loss: 2.445664 | lrm: 0.03 | dt: 647.68ms | tok/sec: 809,486 | mfu: 50.59 | epoch: 3 | total time: 176.70m | eta: 2.8m +step 16449/16704 (98.47%) | loss: 2.444383 | lrm: 0.03 | dt: 646.68ms | tok/sec: 810,733 | mfu: 50.67 | epoch: 3 | total time: 176.71m | eta: 2.7m +step 16450/16704 (98.48%) | loss: 2.444039 | lrm: 0.03 | dt: 645.15ms | tok/sec: 812,661 | mfu: 50.79 | epoch: 3 | total time: 176.72m | eta: 2.7m +step 16451/16704 (98.49%) | loss: 2.444329 | lrm: 0.03 | dt: 646.37ms | tok/sec: 811,121 | mfu: 50.70 | epoch: 3 | total time: 176.73m | eta: 2.7m +step 16452/16704 (98.49%) | loss: 2.438047 | lrm: 0.03 | dt: 648.71ms | tok/sec: 808,200 | mfu: 50.51 | epoch: 3 | total time: 176.74m | eta: 2.7m +step 16453/16704 (98.50%) | loss: 2.430542 | lrm: 0.03 | dt: 643.11ms | tok/sec: 815,232 | mfu: 50.95 | epoch: 3 | total time: 176.75m | eta: 2.7m +step 16454/16704 (98.50%) | loss: 2.428168 | lrm: 0.03 | dt: 645.98ms | tok/sec: 811,612 | mfu: 50.73 | epoch: 3 | total time: 176.76m | eta: 2.7m +step 16455/16704 (98.51%) | loss: 2.427245 | lrm: 0.03 | dt: 646.63ms | tok/sec: 810,798 | mfu: 50.68 | epoch: 3 | total time: 176.77m | eta: 2.7m +step 16456/16704 (98.52%) | loss: 2.424764 | lrm: 0.03 | dt: 647.20ms | tok/sec: 810,085 | mfu: 50.63 | epoch: 3 | total time: 176.78m | eta: 2.7m +step 16457/16704 (98.52%) | loss: 2.433700 | lrm: 0.03 | dt: 646.81ms | tok/sec: 810,573 | mfu: 50.66 | epoch: 3 | total time: 176.79m | eta: 2.7m +step 16458/16704 (98.53%) | loss: 2.429883 | lrm: 0.03 | dt: 644.57ms | tok/sec: 813,389 | mfu: 50.84 | epoch: 3 | total time: 176.80m | eta: 2.6m +step 16459/16704 (98.53%) | loss: 2.436105 | lrm: 0.03 | dt: 645.37ms | tok/sec: 812,379 | mfu: 50.77 | epoch: 3 | total time: 176.81m | eta: 2.6m +step 16460/16704 (98.54%) | loss: 2.456385 | lrm: 0.03 | dt: 646.09ms | tok/sec: 811,480 | mfu: 50.72 | epoch: 3 | total time: 176.83m | eta: 2.6m +step 16461/16704 (98.55%) | 
loss: 2.456896 | lrm: 0.03 | dt: 648.84ms | tok/sec: 808,039 | mfu: 50.50 | epoch: 3 | total time: 176.84m | eta: 2.6m +step 16462/16704 (98.55%) | loss: 2.445212 | lrm: 0.03 | dt: 646.46ms | tok/sec: 811,013 | mfu: 50.69 | epoch: 3 | total time: 176.85m | eta: 2.6m +step 16463/16704 (98.56%) | loss: 2.447983 | lrm: 0.03 | dt: 644.68ms | tok/sec: 813,247 | mfu: 50.83 | epoch: 3 | total time: 176.86m | eta: 2.6m +step 16464/16704 (98.56%) | loss: 2.454806 | lrm: 0.03 | dt: 646.79ms | tok/sec: 810,596 | mfu: 50.66 | epoch: 3 | total time: 176.87m | eta: 2.6m +step 16465/16704 (98.57%) | loss: 2.451303 | lrm: 0.03 | dt: 646.87ms | tok/sec: 810,499 | mfu: 50.66 | epoch: 3 | total time: 176.88m | eta: 2.6m +step 16466/16704 (98.58%) | loss: 2.444232 | lrm: 0.03 | dt: 645.93ms | tok/sec: 811,682 | mfu: 50.73 | epoch: 3 | total time: 176.89m | eta: 2.6m +step 16467/16704 (98.58%) | loss: 2.449804 | lrm: 0.03 | dt: 646.26ms | tok/sec: 811,268 | mfu: 50.71 | epoch: 3 | total time: 176.90m | eta: 2.5m +step 16468/16704 (98.59%) | loss: 2.447263 | lrm: 0.03 | dt: 645.12ms | tok/sec: 812,699 | mfu: 50.79 | epoch: 3 | total time: 176.91m | eta: 2.5m +step 16469/16704 (98.59%) | loss: 2.444488 | lrm: 0.03 | dt: 645.87ms | tok/sec: 811,754 | mfu: 50.74 | epoch: 3 | total time: 176.92m | eta: 2.5m +step 16470/16704 (98.60%) | loss: 2.425610 | lrm: 0.03 | dt: 645.55ms | tok/sec: 812,151 | mfu: 50.76 | epoch: 3 | total time: 176.93m | eta: 2.5m +step 16471/16704 (98.61%) | loss: 2.419671 | lrm: 0.03 | dt: 644.50ms | tok/sec: 813,478 | mfu: 50.84 | epoch: 3 | total time: 176.94m | eta: 2.5m +step 16472/16704 (98.61%) | loss: 2.419060 | lrm: 0.03 | dt: 646.80ms | tok/sec: 810,592 | mfu: 50.66 | epoch: 3 | total time: 176.95m | eta: 2.5m +step 16473/16704 (98.62%) | loss: 2.412772 | lrm: 0.03 | dt: 646.28ms | tok/sec: 811,239 | mfu: 50.70 | epoch: 3 | total time: 176.97m | eta: 2.5m +step 16474/16704 (98.62%) | loss: 2.396018 | lrm: 0.03 | dt: 644.91ms | tok/sec: 812,960 | mfu: 50.81 | 
epoch: 3 | total time: 176.98m | eta: 2.5m +step 16475/16704 (98.63%) | loss: 2.398604 | lrm: 0.03 | dt: 646.77ms | tok/sec: 810,619 | mfu: 50.66 | epoch: 3 | total time: 176.99m | eta: 2.5m +step 16476/16704 (98.64%) | loss: 2.396117 | lrm: 0.03 | dt: 646.88ms | tok/sec: 810,484 | mfu: 50.66 | epoch: 3 | total time: 177.00m | eta: 2.5m +step 16477/16704 (98.64%) | loss: 2.393789 | lrm: 0.03 | dt: 647.16ms | tok/sec: 810,135 | mfu: 50.63 | epoch: 3 | total time: 177.01m | eta: 2.4m +step 16478/16704 (98.65%) | loss: 2.402080 | lrm: 0.03 | dt: 645.38ms | tok/sec: 812,372 | mfu: 50.77 | epoch: 3 | total time: 177.02m | eta: 2.4m +step 16479/16704 (98.65%) | loss: 2.393372 | lrm: 0.03 | dt: 647.10ms | tok/sec: 810,209 | mfu: 50.64 | epoch: 3 | total time: 177.03m | eta: 2.4m +step 16480/16704 (98.66%) | loss: 2.410207 | lrm: 0.03 | dt: 645.07ms | tok/sec: 812,766 | mfu: 50.80 | epoch: 3 | total time: 177.04m | eta: 2.4m +step 16481/16704 (98.66%) | loss: 2.411872 | lrm: 0.03 | dt: 644.45ms | tok/sec: 813,544 | mfu: 50.85 | epoch: 3 | total time: 177.05m | eta: 2.4m +step 16482/16704 (98.67%) | loss: 2.422157 | lrm: 0.03 | dt: 645.99ms | tok/sec: 811,599 | mfu: 50.73 | epoch: 3 | total time: 177.06m | eta: 2.4m +step 16483/16704 (98.68%) | loss: 2.415839 | lrm: 0.03 | dt: 646.13ms | tok/sec: 811,427 | mfu: 50.72 | epoch: 3 | total time: 177.07m | eta: 2.4m +step 16484/16704 (98.68%) | loss: 2.409642 | lrm: 0.03 | dt: 646.00ms | tok/sec: 811,597 | mfu: 50.73 | epoch: 3 | total time: 177.08m | eta: 2.4m +step 16485/16704 (98.69%) | loss: 2.406740 | lrm: 0.03 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 3 | total time: 177.09m | eta: 2.4m +step 16486/16704 (98.69%) | loss: 2.397963 | lrm: 0.03 | dt: 645.73ms | tok/sec: 811,933 | mfu: 50.75 | epoch: 3 | total time: 177.10m | eta: 2.3m +step 16487/16704 (98.70%) | loss: 2.423945 | lrm: 0.03 | dt: 647.56ms | tok/sec: 809,637 | mfu: 50.60 | epoch: 3 | total time: 177.12m | eta: 2.3m +step 16488/16704 (98.71%) | 
loss: 2.420298 | lrm: 0.03 | dt: 645.57ms | tok/sec: 812,133 | mfu: 50.76 | epoch: 3 | total time: 177.13m | eta: 2.3m +step 16489/16704 (98.71%) | loss: 2.416078 | lrm: 0.03 | dt: 643.58ms | tok/sec: 814,640 | mfu: 50.92 | epoch: 3 | total time: 177.14m | eta: 2.3m +step 16490/16704 (98.72%) | loss: 2.419847 | lrm: 0.03 | dt: 648.83ms | tok/sec: 808,052 | mfu: 50.50 | epoch: 3 | total time: 177.15m | eta: 2.3m +step 16491/16704 (98.72%) | loss: 2.410395 | lrm: 0.03 | dt: 644.67ms | tok/sec: 813,268 | mfu: 50.83 | epoch: 3 | total time: 177.16m | eta: 2.3m +step 16492/16704 (98.73%) | loss: 2.407288 | lrm: 0.03 | dt: 643.18ms | tok/sec: 815,151 | mfu: 50.95 | epoch: 3 | total time: 177.17m | eta: 2.3m +step 16493/16704 (98.74%) | loss: 2.412006 | lrm: 0.03 | dt: 647.27ms | tok/sec: 810,001 | mfu: 50.63 | epoch: 3 | total time: 177.18m | eta: 2.3m +step 16494/16704 (98.74%) | loss: 2.426037 | lrm: 0.03 | dt: 644.25ms | tok/sec: 813,802 | mfu: 50.86 | epoch: 3 | total time: 177.19m | eta: 2.3m +step 16495/16704 (98.75%) | loss: 2.427155 | lrm: 0.03 | dt: 645.78ms | tok/sec: 811,870 | mfu: 50.74 | epoch: 3 | total time: 177.20m | eta: 2.2m +step 16496/16704 (98.75%) | loss: 2.420690 | lrm: 0.02 | dt: 643.59ms | tok/sec: 814,624 | mfu: 50.92 | epoch: 3 | total time: 177.21m | eta: 2.2m +step 16497/16704 (98.76%) | loss: 2.413215 | lrm: 0.02 | dt: 647.35ms | tok/sec: 809,893 | mfu: 50.62 | epoch: 3 | total time: 177.22m | eta: 2.2m +step 16498/16704 (98.77%) | loss: 2.411737 | lrm: 0.02 | dt: 644.64ms | tok/sec: 813,305 | mfu: 50.83 | epoch: 3 | total time: 177.23m | eta: 2.2m +step 16499/16704 (98.77%) | loss: 2.408826 | lrm: 0.02 | dt: 643.97ms | tok/sec: 814,144 | mfu: 50.89 | epoch: 3 | total time: 177.24m | eta: 2.2m +Step 16500 | Validation bpb: 0.753588 +step 16500/16704 (98.78%) | loss: 2.409723 | lrm: 0.02 | dt: 630.50ms | tok/sec: 831,539 | mfu: 51.97 | epoch: 3 | total time: 177.26m | eta: 2.2m +step 16501/16704 (98.78%) | loss: 2.408307 | lrm: 0.02 | dt: 
655.03ms | tok/sec: 800,401 | mfu: 50.03 | epoch: 3 | total time: 177.27m | eta: 2.2m +step 16502/16704 (98.79%) | loss: 2.396683 | lrm: 0.02 | dt: 641.41ms | tok/sec: 817,401 | mfu: 51.09 | epoch: 3 | total time: 177.28m | eta: 2.2m +step 16503/16704 (98.80%) | loss: 2.402542 | lrm: 0.02 | dt: 643.66ms | tok/sec: 814,540 | mfu: 50.91 | epoch: 3 | total time: 177.29m | eta: 2.2m +step 16504/16704 (98.80%) | loss: 2.395466 | lrm: 0.02 | dt: 647.18ms | tok/sec: 810,106 | mfu: 50.63 | epoch: 3 | total time: 177.30m | eta: 2.1m +step 16505/16704 (98.81%) | loss: 2.397619 | lrm: 0.02 | dt: 639.91ms | tok/sec: 819,313 | mfu: 51.21 | epoch: 3 | total time: 177.31m | eta: 2.1m +step 16506/16704 (98.81%) | loss: 2.394673 | lrm: 0.02 | dt: 645.30ms | tok/sec: 812,470 | mfu: 50.78 | epoch: 3 | total time: 177.32m | eta: 2.1m +step 16507/16704 (98.82%) | loss: 2.396941 | lrm: 0.02 | dt: 644.52ms | tok/sec: 813,451 | mfu: 50.84 | epoch: 3 | total time: 177.33m | eta: 2.1m +step 16508/16704 (98.83%) | loss: 2.377725 | lrm: 0.02 | dt: 641.80ms | tok/sec: 816,903 | mfu: 51.06 | epoch: 3 | total time: 177.34m | eta: 2.1m +step 16509/16704 (98.83%) | loss: 2.394574 | lrm: 0.02 | dt: 647.67ms | tok/sec: 809,500 | mfu: 50.59 | epoch: 3 | total time: 177.35m | eta: 2.1m +step 16510/16704 (98.84%) | loss: 2.402650 | lrm: 0.02 | dt: 643.16ms | tok/sec: 815,179 | mfu: 50.95 | epoch: 3 | total time: 177.36m | eta: 2.1m +step 16511/16704 (98.84%) | loss: 2.418365 | lrm: 0.02 | dt: 643.95ms | tok/sec: 814,177 | mfu: 50.89 | epoch: 3 | total time: 177.37m | eta: 2.1m +step 16512/16704 (98.85%) | loss: 2.432568 | lrm: 0.02 | dt: 648.36ms | tok/sec: 808,637 | mfu: 50.54 | epoch: 3 | total time: 177.38m | eta: 2.1m +step 16513/16704 (98.86%) | loss: 2.435041 | lrm: 0.02 | dt: 643.99ms | tok/sec: 814,125 | mfu: 50.88 | epoch: 3 | total time: 177.40m | eta: 2.1m +step 16514/16704 (98.86%) | loss: 2.438145 | lrm: 0.02 | dt: 643.51ms | tok/sec: 814,727 | mfu: 50.92 | epoch: 3 | total time: 177.41m | 
eta: 2.0m +step 16515/16704 (98.87%) | loss: 2.432574 | lrm: 0.02 | dt: 647.57ms | tok/sec: 809,627 | mfu: 50.60 | epoch: 3 | total time: 177.42m | eta: 2.0m +step 16516/16704 (98.87%) | loss: 2.431303 | lrm: 0.02 | dt: 644.33ms | tok/sec: 813,698 | mfu: 50.86 | epoch: 3 | total time: 177.43m | eta: 2.0m +step 16517/16704 (98.88%) | loss: 2.421394 | lrm: 0.02 | dt: 644.88ms | tok/sec: 813,006 | mfu: 50.81 | epoch: 3 | total time: 177.44m | eta: 2.0m +step 16518/16704 (98.89%) | loss: 2.422560 | lrm: 0.02 | dt: 644.73ms | tok/sec: 813,195 | mfu: 50.83 | epoch: 3 | total time: 177.45m | eta: 2.0m +step 16519/16704 (98.89%) | loss: 2.419220 | lrm: 0.02 | dt: 644.30ms | tok/sec: 813,728 | mfu: 50.86 | epoch: 3 | total time: 177.46m | eta: 2.0m +step 16520/16704 (98.90%) | loss: 2.430515 | lrm: 0.02 | dt: 647.74ms | tok/sec: 809,408 | mfu: 50.59 | epoch: 3 | total time: 177.47m | eta: 2.0m +step 16521/16704 (98.90%) | loss: 2.430088 | lrm: 0.02 | dt: 645.21ms | tok/sec: 812,588 | mfu: 50.79 | epoch: 3 | total time: 177.48m | eta: 2.0m +step 16522/16704 (98.91%) | loss: 2.415292 | lrm: 0.02 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 3 | total time: 177.49m | eta: 2.0m +step 16523/16704 (98.92%) | loss: 2.405156 | lrm: 0.02 | dt: 645.49ms | tok/sec: 812,233 | mfu: 50.77 | epoch: 3 | total time: 177.50m | eta: 1.9m +step 16524/16704 (98.92%) | loss: 2.411831 | lrm: 0.02 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 3 | total time: 177.51m | eta: 1.9m +step 16525/16704 (98.93%) | loss: 2.401812 | lrm: 0.02 | dt: 644.13ms | tok/sec: 813,943 | mfu: 50.87 | epoch: 3 | total time: 177.52m | eta: 1.9m +step 16526/16704 (98.93%) | loss: 2.399216 | lrm: 0.02 | dt: 645.15ms | tok/sec: 812,655 | mfu: 50.79 | epoch: 3 | total time: 177.53m | eta: 1.9m +step 16527/16704 (98.94%) | loss: 2.398698 | lrm: 0.02 | dt: 644.21ms | tok/sec: 813,851 | mfu: 50.87 | epoch: 3 | total time: 177.55m | eta: 1.9m +step 16528/16704 (98.95%) | loss: 2.377742 | lrm: 0.02 | dt: 
645.15ms | tok/sec: 812,660 | mfu: 50.79 | epoch: 3 | total time: 177.56m | eta: 1.9m +step 16529/16704 (98.95%) | loss: 2.376223 | lrm: 0.02 | dt: 644.62ms | tok/sec: 813,323 | mfu: 50.83 | epoch: 3 | total time: 177.57m | eta: 1.9m +step 16530/16704 (98.96%) | loss: 2.380009 | lrm: 0.02 | dt: 643.69ms | tok/sec: 814,502 | mfu: 50.91 | epoch: 3 | total time: 177.58m | eta: 1.9m +step 16531/16704 (98.96%) | loss: 2.397018 | lrm: 0.02 | dt: 644.06ms | tok/sec: 814,042 | mfu: 50.88 | epoch: 3 | total time: 177.59m | eta: 1.9m +step 16532/16704 (98.97%) | loss: 2.399346 | lrm: 0.02 | dt: 647.57ms | tok/sec: 809,622 | mfu: 50.60 | epoch: 3 | total time: 177.60m | eta: 1.8m +step 16533/16704 (98.98%) | loss: 2.407357 | lrm: 0.02 | dt: 643.33ms | tok/sec: 814,960 | mfu: 50.94 | epoch: 3 | total time: 177.61m | eta: 1.8m +step 16534/16704 (98.98%) | loss: 2.407076 | lrm: 0.02 | dt: 643.26ms | tok/sec: 815,047 | mfu: 50.94 | epoch: 3 | total time: 177.62m | eta: 1.8m +step 16535/16704 (98.99%) | loss: 2.399469 | lrm: 0.02 | dt: 645.66ms | tok/sec: 812,014 | mfu: 50.75 | epoch: 3 | total time: 177.63m | eta: 1.8m +step 16536/16704 (98.99%) | loss: 2.400413 | lrm: 0.02 | dt: 643.78ms | tok/sec: 814,393 | mfu: 50.90 | epoch: 3 | total time: 177.64m | eta: 1.8m +step 16537/16704 (99.00%) | loss: 2.407293 | lrm: 0.02 | dt: 649.26ms | tok/sec: 807,513 | mfu: 50.47 | epoch: 3 | total time: 177.65m | eta: 1.8m +step 16538/16704 (99.01%) | loss: 2.410789 | lrm: 0.02 | dt: 643.11ms | tok/sec: 815,235 | mfu: 50.95 | epoch: 3 | total time: 177.66m | eta: 1.8m +step 16539/16704 (99.01%) | loss: 2.423735 | lrm: 0.02 | dt: 644.59ms | tok/sec: 813,367 | mfu: 50.84 | epoch: 3 | total time: 177.67m | eta: 1.8m +step 16540/16704 (99.02%) | loss: 2.430630 | lrm: 0.02 | dt: 646.51ms | tok/sec: 810,951 | mfu: 50.69 | epoch: 3 | total time: 177.69m | eta: 1.8m +step 16541/16704 (99.02%) | loss: 2.418280 | lrm: 0.02 | dt: 643.46ms | tok/sec: 814,797 | mfu: 50.93 | epoch: 3 | total time: 177.70m | 
eta: 1.8m +step 16542/16704 (99.03%) | loss: 2.419447 | lrm: 0.02 | dt: 643.15ms | tok/sec: 815,184 | mfu: 50.95 | epoch: 3 | total time: 177.71m | eta: 1.7m +step 16543/16704 (99.04%) | loss: 2.444161 | lrm: 0.02 | dt: 646.20ms | tok/sec: 811,338 | mfu: 50.71 | epoch: 3 | total time: 177.72m | eta: 1.7m +step 16544/16704 (99.04%) | loss: 2.437461 | lrm: 0.02 | dt: 647.21ms | tok/sec: 810,077 | mfu: 50.63 | epoch: 3 | total time: 177.73m | eta: 1.7m +step 16545/16704 (99.05%) | loss: 2.433781 | lrm: 0.02 | dt: 644.46ms | tok/sec: 813,535 | mfu: 50.85 | epoch: 3 | total time: 177.74m | eta: 1.7m +step 16546/16704 (99.05%) | loss: 2.430733 | lrm: 0.02 | dt: 644.81ms | tok/sec: 813,089 | mfu: 50.82 | epoch: 3 | total time: 177.75m | eta: 1.7m +step 16547/16704 (99.06%) | loss: 2.423512 | lrm: 0.02 | dt: 646.08ms | tok/sec: 811,488 | mfu: 50.72 | epoch: 3 | total time: 177.76m | eta: 1.7m +step 16548/16704 (99.07%) | loss: 2.419289 | lrm: 0.02 | dt: 644.62ms | tok/sec: 813,330 | mfu: 50.83 | epoch: 3 | total time: 177.77m | eta: 1.7m +step 16549/16704 (99.07%) | loss: 2.432096 | lrm: 0.02 | dt: 643.33ms | tok/sec: 814,953 | mfu: 50.94 | epoch: 3 | total time: 177.78m | eta: 1.7m +step 16550/16704 (99.08%) | loss: 2.425549 | lrm: 0.02 | dt: 645.53ms | tok/sec: 812,185 | mfu: 50.76 | epoch: 3 | total time: 177.79m | eta: 1.7m +step 16551/16704 (99.08%) | loss: 2.423357 | lrm: 0.02 | dt: 645.64ms | tok/sec: 812,048 | mfu: 50.75 | epoch: 3 | total time: 177.80m | eta: 1.6m +step 16552/16704 (99.09%) | loss: 2.419494 | lrm: 0.02 | dt: 643.46ms | tok/sec: 814,791 | mfu: 50.93 | epoch: 3 | total time: 177.81m | eta: 1.6m +step 16553/16704 (99.10%) | loss: 2.406788 | lrm: 0.02 | dt: 644.02ms | tok/sec: 814,084 | mfu: 50.88 | epoch: 3 | total time: 177.83m | eta: 1.6m +step 16554/16704 (99.10%) | loss: 2.395246 | lrm: 0.02 | dt: 642.56ms | tok/sec: 815,936 | mfu: 51.00 | epoch: 3 | total time: 177.84m | eta: 1.6m +step 16555/16704 (99.11%) | loss: 2.388179 | lrm: 0.02 | dt: 
645.54ms | tok/sec: 812,169 | mfu: 50.76 | epoch: 3 | total time: 177.85m | eta: 1.6m +step 16556/16704 (99.11%) | loss: 2.393538 | lrm: 0.02 | dt: 643.29ms | tok/sec: 815,014 | mfu: 50.94 | epoch: 3 | total time: 177.86m | eta: 1.6m +step 16557/16704 (99.12%) | loss: 2.410377 | lrm: 0.02 | dt: 642.98ms | tok/sec: 815,404 | mfu: 50.96 | epoch: 3 | total time: 177.87m | eta: 1.6m +step 16558/16704 (99.13%) | loss: 2.397016 | lrm: 0.02 | dt: 647.87ms | tok/sec: 809,254 | mfu: 50.58 | epoch: 3 | total time: 177.88m | eta: 1.6m +step 16559/16704 (99.13%) | loss: 2.408571 | lrm: 0.02 | dt: 642.05ms | tok/sec: 816,586 | mfu: 51.04 | epoch: 3 | total time: 177.89m | eta: 1.6m +step 16560/16704 (99.14%) | loss: 2.412027 | lrm: 0.02 | dt: 644.47ms | tok/sec: 813,515 | mfu: 50.85 | epoch: 3 | total time: 177.90m | eta: 1.5m +step 16561/16704 (99.14%) | loss: 2.405957 | lrm: 0.02 | dt: 646.90ms | tok/sec: 810,458 | mfu: 50.65 | epoch: 3 | total time: 177.91m | eta: 1.5m +step 16562/16704 (99.15%) | loss: 2.408617 | lrm: 0.02 | dt: 641.27ms | tok/sec: 817,580 | mfu: 51.10 | epoch: 3 | total time: 177.92m | eta: 1.5m +step 16563/16704 (99.16%) | loss: 2.419834 | lrm: 0.02 | dt: 645.22ms | tok/sec: 812,577 | mfu: 50.79 | epoch: 3 | total time: 177.93m | eta: 1.5m +step 16564/16704 (99.16%) | loss: 2.427475 | lrm: 0.02 | dt: 643.57ms | tok/sec: 814,660 | mfu: 50.92 | epoch: 3 | total time: 177.94m | eta: 1.5m +step 16565/16704 (99.17%) | loss: 2.425142 | lrm: 0.02 | dt: 644.61ms | tok/sec: 813,342 | mfu: 50.84 | epoch: 3 | total time: 177.95m | eta: 1.5m +step 16566/16704 (99.17%) | loss: 2.408662 | lrm: 0.02 | dt: 644.17ms | tok/sec: 813,899 | mfu: 50.87 | epoch: 3 | total time: 177.96m | eta: 1.5m +step 16567/16704 (99.18%) | loss: 2.401231 | lrm: 0.02 | dt: 643.67ms | tok/sec: 814,526 | mfu: 50.91 | epoch: 3 | total time: 177.98m | eta: 1.5m +step 16568/16704 (99.19%) | loss: 2.403068 | lrm: 0.02 | dt: 642.37ms | tok/sec: 816,177 | mfu: 51.01 | epoch: 3 | total time: 177.99m | 
eta: 1.5m +step 16569/16704 (99.19%) | loss: 2.410238 | lrm: 0.02 | dt: 645.70ms | tok/sec: 811,972 | mfu: 50.75 | epoch: 3 | total time: 178.00m | eta: 1.5m +step 16570/16704 (99.20%) | loss: 2.401241 | lrm: 0.02 | dt: 644.98ms | tok/sec: 812,880 | mfu: 50.81 | epoch: 3 | total time: 178.01m | eta: 1.4m +step 16571/16704 (99.20%) | loss: 2.395252 | lrm: 0.02 | dt: 644.90ms | tok/sec: 812,970 | mfu: 50.81 | epoch: 3 | total time: 178.02m | eta: 1.4m +step 16572/16704 (99.21%) | loss: 2.394925 | lrm: 0.02 | dt: 644.39ms | tok/sec: 813,616 | mfu: 50.85 | epoch: 3 | total time: 178.03m | eta: 1.4m +step 16573/16704 (99.22%) | loss: 2.394710 | lrm: 0.02 | dt: 643.18ms | tok/sec: 815,145 | mfu: 50.95 | epoch: 3 | total time: 178.04m | eta: 1.4m +step 16574/16704 (99.22%) | loss: 2.394220 | lrm: 0.02 | dt: 646.45ms | tok/sec: 811,025 | mfu: 50.69 | epoch: 3 | total time: 178.05m | eta: 1.4m +step 16575/16704 (99.23%) | loss: 2.406538 | lrm: 0.02 | dt: 645.56ms | tok/sec: 812,142 | mfu: 50.76 | epoch: 3 | total time: 178.06m | eta: 1.4m +step 16576/16704 (99.23%) | loss: 2.418213 | lrm: 0.02 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 3 | total time: 178.07m | eta: 1.4m +step 16577/16704 (99.24%) | loss: 2.418008 | lrm: 0.02 | dt: 644.71ms | tok/sec: 813,220 | mfu: 50.83 | epoch: 3 | total time: 178.08m | eta: 1.4m +step 16578/16704 (99.25%) | loss: 2.404640 | lrm: 0.02 | dt: 645.54ms | tok/sec: 812,166 | mfu: 50.76 | epoch: 3 | total time: 178.09m | eta: 1.4m +step 16579/16704 (99.25%) | loss: 2.402690 | lrm: 0.01 | dt: 641.36ms | tok/sec: 817,466 | mfu: 51.09 | epoch: 3 | total time: 178.10m | eta: 1.3m +step 16580/16704 (99.26%) | loss: 2.406898 | lrm: 0.01 | dt: 647.43ms | tok/sec: 809,797 | mfu: 50.61 | epoch: 3 | total time: 178.12m | eta: 1.3m +step 16581/16704 (99.26%) | loss: 2.404451 | lrm: 0.01 | dt: 643.36ms | tok/sec: 814,925 | mfu: 50.93 | epoch: 3 | total time: 178.13m | eta: 1.3m +step 16582/16704 (99.27%) | loss: 2.396027 | lrm: 0.01 | dt: 
645.16ms | tok/sec: 812,646 | mfu: 50.79 | epoch: 3 | total time: 178.14m | eta: 1.3m +step 16583/16704 (99.28%) | loss: 2.388157 | lrm: 0.01 | dt: 647.28ms | tok/sec: 809,983 | mfu: 50.63 | epoch: 3 | total time: 178.15m | eta: 1.3m +step 16584/16704 (99.28%) | loss: 2.404610 | lrm: 0.01 | dt: 644.79ms | tok/sec: 813,108 | mfu: 50.82 | epoch: 3 | total time: 178.16m | eta: 1.3m +step 16585/16704 (99.29%) | loss: 2.401278 | lrm: 0.01 | dt: 644.75ms | tok/sec: 813,169 | mfu: 50.82 | epoch: 3 | total time: 178.17m | eta: 1.3m +step 16586/16704 (99.29%) | loss: 2.398406 | lrm: 0.01 | dt: 645.54ms | tok/sec: 812,167 | mfu: 50.76 | epoch: 3 | total time: 178.18m | eta: 1.3m +step 16587/16704 (99.30%) | loss: 2.405436 | lrm: 0.01 | dt: 645.19ms | tok/sec: 812,605 | mfu: 50.79 | epoch: 3 | total time: 178.19m | eta: 1.3m +step 16588/16704 (99.31%) | loss: 2.411220 | lrm: 0.01 | dt: 644.41ms | tok/sec: 813,599 | mfu: 50.85 | epoch: 3 | total time: 178.20m | eta: 1.2m +step 16589/16704 (99.31%) | loss: 2.415013 | lrm: 0.01 | dt: 644.27ms | tok/sec: 813,770 | mfu: 50.86 | epoch: 3 | total time: 178.21m | eta: 1.2m +step 16590/16704 (99.32%) | loss: 2.433480 | lrm: 0.01 | dt: 643.85ms | tok/sec: 814,305 | mfu: 50.90 | epoch: 3 | total time: 178.22m | eta: 1.2m +step 16591/16704 (99.32%) | loss: 2.430607 | lrm: 0.01 | dt: 644.70ms | tok/sec: 813,228 | mfu: 50.83 | epoch: 3 | total time: 178.23m | eta: 1.2m +step 16592/16704 (99.33%) | loss: 2.445833 | lrm: 0.01 | dt: 643.05ms | tok/sec: 815,316 | mfu: 50.96 | epoch: 3 | total time: 178.24m | eta: 1.2m +step 16593/16704 (99.34%) | loss: 2.445367 | lrm: 0.01 | dt: 643.85ms | tok/sec: 814,297 | mfu: 50.89 | epoch: 3 | total time: 178.25m | eta: 1.2m +step 16594/16704 (99.34%) | loss: 2.442377 | lrm: 0.01 | dt: 645.00ms | tok/sec: 812,854 | mfu: 50.80 | epoch: 3 | total time: 178.27m | eta: 1.2m +step 16595/16704 (99.35%) | loss: 2.444561 | lrm: 0.01 | dt: 642.46ms | tok/sec: 816,067 | mfu: 51.01 | epoch: 3 | total time: 178.28m | 
eta: 1.2m +step 16596/16704 (99.35%) | loss: 2.435963 | lrm: 0.01 | dt: 647.17ms | tok/sec: 810,127 | mfu: 50.63 | epoch: 3 | total time: 178.29m | eta: 1.2m +step 16597/16704 (99.36%) | loss: 2.429207 | lrm: 0.01 | dt: 642.60ms | tok/sec: 815,887 | mfu: 50.99 | epoch: 3 | total time: 178.30m | eta: 1.2m +step 16598/16704 (99.37%) | loss: 2.431104 | lrm: 0.01 | dt: 643.90ms | tok/sec: 814,236 | mfu: 50.89 | epoch: 3 | total time: 178.31m | eta: 1.1m +step 16599/16704 (99.37%) | loss: 2.430757 | lrm: 0.01 | dt: 644.42ms | tok/sec: 813,580 | mfu: 50.85 | epoch: 3 | total time: 178.32m | eta: 1.1m +step 16600/16704 (99.38%) | loss: 2.433021 | lrm: 0.01 | dt: 644.48ms | tok/sec: 813,505 | mfu: 50.85 | epoch: 3 | total time: 178.33m | eta: 1.1m +step 16601/16704 (99.38%) | loss: 2.427363 | lrm: 0.01 | dt: 645.27ms | tok/sec: 812,512 | mfu: 50.78 | epoch: 3 | total time: 178.34m | eta: 1.1m +step 16602/16704 (99.39%) | loss: 2.424298 | lrm: 0.01 | dt: 644.57ms | tok/sec: 813,389 | mfu: 50.84 | epoch: 3 | total time: 178.35m | eta: 1.1m +step 16603/16704 (99.40%) | loss: 2.422885 | lrm: 0.01 | dt: 642.94ms | tok/sec: 815,459 | mfu: 50.97 | epoch: 3 | total time: 178.36m | eta: 1.1m +step 16604/16704 (99.40%) | loss: 2.430759 | lrm: 0.01 | dt: 647.00ms | tok/sec: 810,333 | mfu: 50.65 | epoch: 3 | total time: 178.37m | eta: 1.1m +step 16605/16704 (99.41%) | loss: 2.434733 | lrm: 0.01 | dt: 644.33ms | tok/sec: 813,689 | mfu: 50.86 | epoch: 3 | total time: 178.38m | eta: 1.1m +step 16606/16704 (99.41%) | loss: 2.423703 | lrm: 0.01 | dt: 643.48ms | tok/sec: 814,765 | mfu: 50.92 | epoch: 3 | total time: 178.39m | eta: 1.1m +step 16607/16704 (99.42%) | loss: 2.442402 | lrm: 0.01 | dt: 644.89ms | tok/sec: 812,988 | mfu: 50.81 | epoch: 3 | total time: 178.41m | eta: 1.0m +step 16608/16704 (99.43%) | loss: 2.436199 | lrm: 0.01 | dt: 644.13ms | tok/sec: 813,947 | mfu: 50.87 | epoch: 3 | total time: 178.42m | eta: 1.0m +step 16609/16704 (99.43%) | loss: 2.435903 | lrm: 0.01 | dt: 
643.55ms | tok/sec: 814,676 | mfu: 50.92 | epoch: 3 | total time: 178.43m | eta: 1.0m +step 16610/16704 (99.44%) | loss: 2.438850 | lrm: 0.01 | dt: 642.89ms | tok/sec: 815,511 | mfu: 50.97 | epoch: 3 | total time: 178.44m | eta: 1.0m +step 16611/16704 (99.44%) | loss: 2.438374 | lrm: 0.01 | dt: 643.23ms | tok/sec: 815,092 | mfu: 50.94 | epoch: 3 | total time: 178.45m | eta: 1.0m +step 16612/16704 (99.45%) | loss: 2.434846 | lrm: 0.01 | dt: 644.25ms | tok/sec: 813,799 | mfu: 50.86 | epoch: 3 | total time: 178.46m | eta: 1.0m +step 16613/16704 (99.46%) | loss: 2.436605 | lrm: 0.01 | dt: 643.15ms | tok/sec: 815,191 | mfu: 50.95 | epoch: 3 | total time: 178.47m | eta: 1.0m +step 16614/16704 (99.46%) | loss: 2.428962 | lrm: 0.01 | dt: 645.12ms | tok/sec: 812,693 | mfu: 50.79 | epoch: 3 | total time: 178.48m | eta: 1.0m +step 16615/16704 (99.47%) | loss: 2.423825 | lrm: 0.01 | dt: 643.22ms | tok/sec: 815,105 | mfu: 50.95 | epoch: 3 | total time: 178.49m | eta: 1.0m +step 16616/16704 (99.47%) | loss: 2.421287 | lrm: 0.01 | dt: 642.95ms | tok/sec: 815,441 | mfu: 50.97 | epoch: 3 | total time: 178.50m | eta: 0.9m +step 16617/16704 (99.48%) | loss: 2.412304 | lrm: 0.01 | dt: 645.92ms | tok/sec: 811,696 | mfu: 50.73 | epoch: 3 | total time: 178.51m | eta: 0.9m +step 16618/16704 (99.49%) | loss: 2.406192 | lrm: 0.01 | dt: 644.58ms | tok/sec: 813,374 | mfu: 50.84 | epoch: 3 | total time: 178.52m | eta: 0.9m +step 16619/16704 (99.49%) | loss: 2.412779 | lrm: 0.01 | dt: 643.76ms | tok/sec: 814,417 | mfu: 50.90 | epoch: 3 | total time: 178.53m | eta: 0.9m +step 16620/16704 (99.50%) | loss: 2.422698 | lrm: 0.01 | dt: 645.43ms | tok/sec: 812,302 | mfu: 50.77 | epoch: 3 | total time: 178.54m | eta: 0.9m +step 16621/16704 (99.50%) | loss: 2.423762 | lrm: 0.01 | dt: 645.27ms | tok/sec: 812,509 | mfu: 50.78 | epoch: 3 | total time: 178.56m | eta: 0.9m +step 16622/16704 (99.51%) | loss: 2.423722 | lrm: 0.01 | dt: 644.08ms | tok/sec: 814,010 | mfu: 50.88 | epoch: 3 | total time: 178.57m | 
eta: 0.9m +step 16623/16704 (99.52%) | loss: 2.428164 | lrm: 0.01 | dt: 646.01ms | tok/sec: 811,577 | mfu: 50.72 | epoch: 3 | total time: 178.58m | eta: 0.9m +step 16624/16704 (99.52%) | loss: 2.437688 | lrm: 0.01 | dt: 642.50ms | tok/sec: 816,008 | mfu: 51.00 | epoch: 3 | total time: 178.59m | eta: 0.9m +step 16625/16704 (99.53%) | loss: 2.438537 | lrm: 0.01 | dt: 644.05ms | tok/sec: 814,053 | mfu: 50.88 | epoch: 3 | total time: 178.60m | eta: 0.8m +step 16626/16704 (99.53%) | loss: 2.453846 | lrm: 0.01 | dt: 644.32ms | tok/sec: 813,707 | mfu: 50.86 | epoch: 3 | total time: 178.61m | eta: 0.8m +step 16627/16704 (99.54%) | loss: 2.440256 | lrm: 0.01 | dt: 645.12ms | tok/sec: 812,703 | mfu: 50.80 | epoch: 3 | total time: 178.62m | eta: 0.8m +step 16628/16704 (99.55%) | loss: 2.445270 | lrm: 0.01 | dt: 642.78ms | tok/sec: 815,652 | mfu: 50.98 | epoch: 3 | total time: 178.63m | eta: 0.8m +step 16629/16704 (99.55%) | loss: 2.433759 | lrm: 0.01 | dt: 644.55ms | tok/sec: 813,415 | mfu: 50.84 | epoch: 3 | total time: 178.64m | eta: 0.8m +step 16630/16704 (99.56%) | loss: 2.431224 | lrm: 0.01 | dt: 645.98ms | tok/sec: 811,615 | mfu: 50.73 | epoch: 3 | total time: 178.65m | eta: 0.8m +step 16631/16704 (99.56%) | loss: 2.426333 | lrm: 0.01 | dt: 647.00ms | tok/sec: 810,332 | mfu: 50.65 | epoch: 3 | total time: 178.66m | eta: 0.8m +step 16632/16704 (99.57%) | loss: 2.421673 | lrm: 0.01 | dt: 644.97ms | tok/sec: 812,892 | mfu: 50.81 | epoch: 3 | total time: 178.67m | eta: 0.8m +step 16633/16704 (99.57%) | loss: 2.409408 | lrm: 0.01 | dt: 642.71ms | tok/sec: 815,749 | mfu: 50.99 | epoch: 3 | total time: 178.68m | eta: 0.8m +step 16634/16704 (99.58%) | loss: 2.416708 | lrm: 0.01 | dt: 644.83ms | tok/sec: 813,069 | mfu: 50.82 | epoch: 3 | total time: 178.69m | eta: 0.8m +step 16635/16704 (99.59%) | loss: 2.410679 | lrm: 0.01 | dt: 644.00ms | tok/sec: 814,110 | mfu: 50.88 | epoch: 3 | total time: 178.71m | eta: 0.7m +step 16636/16704 (99.59%) | loss: 2.426725 | lrm: 0.01 | dt: 
645.44ms | tok/sec: 812,292 | mfu: 50.77 | epoch: 3 | total time: 178.72m | eta: 0.7m +step 16637/16704 (99.60%) | loss: 2.429710 | lrm: 0.01 | dt: 642.38ms | tok/sec: 816,170 | mfu: 51.01 | epoch: 3 | total time: 178.73m | eta: 0.7m +step 16638/16704 (99.60%) | loss: 2.424056 | lrm: 0.01 | dt: 645.43ms | tok/sec: 812,304 | mfu: 50.77 | epoch: 3 | total time: 178.74m | eta: 0.7m +step 16639/16704 (99.61%) | loss: 2.425330 | lrm: 0.01 | dt: 643.97ms | tok/sec: 814,144 | mfu: 50.89 | epoch: 3 | total time: 178.75m | eta: 0.7m +step 16640/16704 (99.62%) | loss: 2.420040 | lrm: 0.01 | dt: 643.48ms | tok/sec: 814,763 | mfu: 50.92 | epoch: 3 | total time: 178.76m | eta: 0.7m +step 16641/16704 (99.62%) | loss: 2.424904 | lrm: 0.01 | dt: 645.53ms | tok/sec: 812,181 | mfu: 50.76 | epoch: 3 | total time: 178.77m | eta: 0.7m +step 16642/16704 (99.63%) | loss: 2.417530 | lrm: 0.01 | dt: 643.62ms | tok/sec: 814,594 | mfu: 50.91 | epoch: 3 | total time: 178.78m | eta: 0.7m +step 16643/16704 (99.63%) | loss: 2.425238 | lrm: 0.01 | dt: 645.64ms | tok/sec: 812,040 | mfu: 50.75 | epoch: 3 | total time: 178.79m | eta: 0.7m +step 16644/16704 (99.64%) | loss: 2.423522 | lrm: 0.01 | dt: 644.84ms | tok/sec: 813,054 | mfu: 50.82 | epoch: 3 | total time: 178.80m | eta: 0.6m +step 16645/16704 (99.65%) | loss: 2.413929 | lrm: 0.01 | dt: 643.64ms | tok/sec: 814,568 | mfu: 50.91 | epoch: 3 | total time: 178.81m | eta: 0.6m +step 16646/16704 (99.65%) | loss: 2.406753 | lrm: 0.01 | dt: 645.32ms | tok/sec: 812,451 | mfu: 50.78 | epoch: 3 | total time: 178.82m | eta: 0.6m +step 16647/16704 (99.66%) | loss: 2.409560 | lrm: 0.01 | dt: 645.01ms | tok/sec: 812,831 | mfu: 50.80 | epoch: 3 | total time: 178.83m | eta: 0.6m +step 16648/16704 (99.66%) | loss: 2.413136 | lrm: 0.01 | dt: 643.04ms | tok/sec: 815,329 | mfu: 50.96 | epoch: 3 | total time: 178.85m | eta: 0.6m +step 16649/16704 (99.67%) | loss: 2.405109 | lrm: 0.01 | dt: 645.18ms | tok/sec: 812,623 | mfu: 50.79 | epoch: 3 | total time: 178.86m | 
eta: 0.6m +step 16650/16704 (99.68%) | loss: 2.396094 | lrm: 0.01 | dt: 645.75ms | tok/sec: 811,902 | mfu: 50.75 | epoch: 3 | total time: 178.87m | eta: 0.6m +step 16651/16704 (99.68%) | loss: 2.387860 | lrm: 0.01 | dt: 641.68ms | tok/sec: 817,051 | mfu: 51.07 | epoch: 3 | total time: 178.88m | eta: 0.6m +step 16652/16704 (99.69%) | loss: 2.394996 | lrm: 0.01 | dt: 645.48ms | tok/sec: 812,245 | mfu: 50.77 | epoch: 3 | total time: 178.89m | eta: 0.6m +step 16653/16704 (99.69%) | loss: 2.394626 | lrm: 0.01 | dt: 643.41ms | tok/sec: 814,855 | mfu: 50.93 | epoch: 3 | total time: 178.90m | eta: 0.5m +step 16654/16704 (99.70%) | loss: 2.410509 | lrm: 0.01 | dt: 642.97ms | tok/sec: 815,412 | mfu: 50.96 | epoch: 3 | total time: 178.91m | eta: 0.5m +step 16655/16704 (99.71%) | loss: 2.421961 | lrm: 0.01 | dt: 646.47ms | tok/sec: 811,000 | mfu: 50.69 | epoch: 3 | total time: 178.92m | eta: 0.5m +step 16656/16704 (99.71%) | loss: 2.436513 | lrm: 0.01 | dt: 646.26ms | tok/sec: 811,267 | mfu: 50.71 | epoch: 3 | total time: 178.93m | eta: 0.5m +step 16657/16704 (99.72%) | loss: 2.437210 | lrm: 0.01 | dt: 644.49ms | tok/sec: 813,493 | mfu: 50.84 | epoch: 3 | total time: 178.94m | eta: 0.5m +step 16658/16704 (99.72%) | loss: 2.434811 | lrm: 0.01 | dt: 644.82ms | tok/sec: 813,073 | mfu: 50.82 | epoch: 3 | total time: 178.95m | eta: 0.5m +step 16659/16704 (99.73%) | loss: 2.445264 | lrm: 0.01 | dt: 648.52ms | tok/sec: 808,439 | mfu: 50.53 | epoch: 3 | total time: 178.96m | eta: 0.5m +step 16660/16704 (99.74%) | loss: 2.435920 | lrm: 0.01 | dt: 647.96ms | tok/sec: 809,139 | mfu: 50.57 | epoch: 3 | total time: 178.97m | eta: 0.5m +step 16661/16704 (99.74%) | loss: 2.430623 | lrm: 0.01 | dt: 644.23ms | tok/sec: 813,818 | mfu: 50.86 | epoch: 3 | total time: 178.99m | eta: 0.5m +step 16662/16704 (99.75%) | loss: 2.433579 | lrm: 0.01 | dt: 647.55ms | tok/sec: 809,647 | mfu: 50.60 | epoch: 3 | total time: 179.00m | eta: 0.5m +step 16663/16704 (99.75%) | loss: 2.430351 | lrm: 0.00 | dt: 
645.94ms | tok/sec: 811,661 | mfu: 50.73 | epoch: 3 | total time: 179.01m | eta: 0.4m +step 16664/16704 (99.76%) | loss: 2.423253 | lrm: 0.00 | dt: 647.49ms | tok/sec: 809,724 | mfu: 50.61 | epoch: 3 | total time: 179.02m | eta: 0.4m +step 16665/16704 (99.77%) | loss: 2.421617 | lrm: 0.00 | dt: 644.55ms | tok/sec: 813,414 | mfu: 50.84 | epoch: 3 | total time: 179.03m | eta: 0.4m +step 16666/16704 (99.77%) | loss: 2.405802 | lrm: 0.00 | dt: 645.67ms | tok/sec: 812,000 | mfu: 50.75 | epoch: 3 | total time: 179.04m | eta: 0.4m +step 16667/16704 (99.78%) | loss: 2.403598 | lrm: 0.00 | dt: 648.29ms | tok/sec: 808,725 | mfu: 50.55 | epoch: 3 | total time: 179.05m | eta: 0.4m +step 16668/16704 (99.78%) | loss: 2.405941 | lrm: 0.00 | dt: 643.74ms | tok/sec: 814,445 | mfu: 50.90 | epoch: 3 | total time: 179.06m | eta: 0.4m +step 16669/16704 (99.79%) | loss: 2.408676 | lrm: 0.00 | dt: 646.24ms | tok/sec: 811,286 | mfu: 50.71 | epoch: 3 | total time: 179.07m | eta: 0.4m +step 16670/16704 (99.80%) | loss: 2.400817 | lrm: 0.00 | dt: 645.98ms | tok/sec: 811,619 | mfu: 50.73 | epoch: 3 | total time: 179.08m | eta: 0.4m +step 16671/16704 (99.80%) | loss: 2.407833 | lrm: 0.00 | dt: 644.78ms | tok/sec: 813,133 | mfu: 50.82 | epoch: 3 | total time: 179.09m | eta: 0.4m +step 16672/16704 (99.81%) | loss: 2.398257 | lrm: 0.00 | dt: 645.11ms | tok/sec: 812,712 | mfu: 50.80 | epoch: 3 | total time: 179.10m | eta: 0.3m +step 16673/16704 (99.81%) | loss: 2.406614 | lrm: 0.00 | dt: 644.56ms | tok/sec: 813,401 | mfu: 50.84 | epoch: 3 | total time: 179.11m | eta: 0.3m +step 16674/16704 (99.82%) | loss: 2.405582 | lrm: 0.00 | dt: 646.02ms | tok/sec: 811,561 | mfu: 50.72 | epoch: 3 | total time: 179.13m | eta: 0.3m +step 16675/16704 (99.83%) | loss: 2.408216 | lrm: 0.00 | dt: 645.48ms | tok/sec: 812,244 | mfu: 50.77 | epoch: 3 | total time: 179.14m | eta: 0.3m +step 16676/16704 (99.83%) | loss: 2.416175 | lrm: 0.00 | dt: 646.84ms | tok/sec: 810,532 | mfu: 50.66 | epoch: 3 | total time: 179.15m | 
eta: 0.3m +step 16677/16704 (99.84%) | loss: 2.411621 | lrm: 0.00 | dt: 645.92ms | tok/sec: 811,691 | mfu: 50.73 | epoch: 3 | total time: 179.16m | eta: 0.3m +step 16678/16704 (99.84%) | loss: 2.414442 | lrm: 0.00 | dt: 649.32ms | tok/sec: 807,446 | mfu: 50.47 | epoch: 3 | total time: 179.17m | eta: 0.3m +step 16679/16704 (99.85%) | loss: 2.426993 | lrm: 0.00 | dt: 644.73ms | tok/sec: 813,184 | mfu: 50.83 | epoch: 3 | total time: 179.18m | eta: 0.3m +step 16680/16704 (99.86%) | loss: 2.425445 | lrm: 0.00 | dt: 648.30ms | tok/sec: 808,710 | mfu: 50.55 | epoch: 3 | total time: 179.19m | eta: 0.3m +step 16681/16704 (99.86%) | loss: 2.415920 | lrm: 0.00 | dt: 643.75ms | tok/sec: 814,425 | mfu: 50.90 | epoch: 3 | total time: 179.20m | eta: 0.2m +step 16682/16704 (99.87%) | loss: 2.426602 | lrm: 0.00 | dt: 647.45ms | tok/sec: 809,779 | mfu: 50.61 | epoch: 3 | total time: 179.21m | eta: 0.2m +step 16683/16704 (99.87%) | loss: 2.417053 | lrm: 0.00 | dt: 644.90ms | tok/sec: 812,973 | mfu: 50.81 | epoch: 3 | total time: 179.22m | eta: 0.2m +step 16684/16704 (99.88%) | loss: 2.417309 | lrm: 0.00 | dt: 643.68ms | tok/sec: 814,518 | mfu: 50.91 | epoch: 3 | total time: 179.23m | eta: 0.2m +step 16685/16704 (99.89%) | loss: 2.413461 | lrm: 0.00 | dt: 647.20ms | tok/sec: 810,081 | mfu: 50.63 | epoch: 3 | total time: 179.24m | eta: 0.2m +step 16686/16704 (99.89%) | loss: 2.400886 | lrm: 0.00 | dt: 644.72ms | tok/sec: 813,196 | mfu: 50.83 | epoch: 3 | total time: 179.25m | eta: 0.2m +step 16687/16704 (99.90%) | loss: 2.413325 | lrm: 0.00 | dt: 643.81ms | tok/sec: 814,346 | mfu: 50.90 | epoch: 3 | total time: 179.26m | eta: 0.2m +step 16688/16704 (99.90%) | loss: 2.404685 | lrm: 0.00 | dt: 645.05ms | tok/sec: 812,788 | mfu: 50.80 | epoch: 3 | total time: 179.28m | eta: 0.2m +step 16689/16704 (99.91%) | loss: 2.417067 | lrm: 0.00 | dt: 643.13ms | tok/sec: 815,216 | mfu: 50.95 | epoch: 3 | total time: 179.29m | eta: 0.2m +step 16690/16704 (99.92%) | loss: 2.419556 | lrm: 0.00 | dt: 
646.14ms | tok/sec: 811,419 | mfu: 50.71 | epoch: 3 | total time: 179.30m | eta: 0.2m +step 16691/16704 (99.92%) | loss: 2.410644 | lrm: 0.00 | dt: 645.09ms | tok/sec: 812,732 | mfu: 50.80 | epoch: 3 | total time: 179.31m | eta: 0.1m +step 16692/16704 (99.93%) | loss: 2.403238 | lrm: 0.00 | dt: 644.55ms | tok/sec: 813,412 | mfu: 50.84 | epoch: 3 | total time: 179.32m | eta: 0.1m +step 16693/16704 (99.93%) | loss: 2.401346 | lrm: 0.00 | dt: 645.35ms | tok/sec: 812,412 | mfu: 50.78 | epoch: 3 | total time: 179.33m | eta: 0.1m +step 16694/16704 (99.94%) | loss: 2.409432 | lrm: 0.00 | dt: 644.74ms | tok/sec: 813,172 | mfu: 50.82 | epoch: 3 | total time: 179.34m | eta: 0.1m +step 16695/16704 (99.95%) | loss: 2.407421 | lrm: 0.00 | dt: 644.63ms | tok/sec: 813,310 | mfu: 50.83 | epoch: 3 | total time: 179.35m | eta: 0.1m +step 16696/16704 (99.95%) | loss: 2.406380 | lrm: 0.00 | dt: 645.07ms | tok/sec: 812,757 | mfu: 50.80 | epoch: 3 | total time: 179.36m | eta: 0.1m +step 16697/16704 (99.96%) | loss: 2.407167 | lrm: 0.00 | dt: 646.28ms | tok/sec: 811,241 | mfu: 50.70 | epoch: 3 | total time: 179.37m | eta: 0.1m +step 16698/16704 (99.96%) | loss: 2.405408 | lrm: 0.00 | dt: 645.84ms | tok/sec: 811,793 | mfu: 50.74 | epoch: 3 | total time: 179.38m | eta: 0.1m +step 16699/16704 (99.97%) | loss: 2.409648 | lrm: 0.00 | dt: 647.37ms | tok/sec: 809,868 | mfu: 50.62 | epoch: 3 | total time: 179.39m | eta: 0.1m +step 16700/16704 (99.98%) | loss: 2.423465 | lrm: 0.00 | dt: 645.28ms | tok/sec: 812,502 | mfu: 50.78 | epoch: 3 | total time: 179.40m | eta: 0.0m +step 16701/16704 (99.98%) | loss: 2.427105 | lrm: 0.00 | dt: 644.74ms | tok/sec: 813,175 | mfu: 50.82 | epoch: 3 | total time: 179.42m | eta: 0.0m +step 16702/16704 (99.99%) | loss: 2.414285 | lrm: 0.00 | dt: 645.05ms | tok/sec: 812,785 | mfu: 50.80 | epoch: 3 | total time: 179.43m | eta: 0.0m +step 16703/16704 (99.99%) | loss: 2.417931 | lrm: 0.00 | dt: 645.43ms | tok/sec: 812,305 | mfu: 50.77 | epoch: 3 | total time: 179.44m | 
eta: 0.0m +Step 16704 | Validation bpb: 0.753336 +Evaluating: hellaswag_zeroshot (0-shot, type: multiple_choice)... accuracy: 0.5171 | centered: 0.3562 | time: 22.60s +Evaluating: jeopardy (10-shot, type: language_modeling)... accuracy: 0.1927 | centered: 0.1927 | time: 4.84s +Evaluating: bigbench_qa_wikidata (10-shot, type: language_modeling)... accuracy: 0.5106 | centered: 0.5106 | time: 47.45s +Evaluating: arc_easy (10-shot, type: multiple_choice)... accuracy: 0.6772 | centered: 0.5696 | time: 5.90s +Evaluating: arc_challenge (10-shot, type: multiple_choice)... accuracy: 0.3857 | centered: 0.1809 | time: 2.93s +Evaluating: copa (0-shot, type: multiple_choice)... accuracy: 0.6700 | centered: 0.3400 | time: 0.23s +Evaluating: commonsense_qa (10-shot, type: multiple_choice)... accuracy: 0.3178 | centered: 0.1472 | time: 3.11s +Evaluating: piqa (10-shot, type: multiple_choice)... accuracy: 0.7138 | centered: 0.4276 | time: 4.38s +Evaluating: openbook_qa (0-shot, type: multiple_choice)... accuracy: 0.4000 | centered: 0.2000 | time: 1.12s +Evaluating: lambada_openai (0-shot, type: language_modeling)... accuracy: 0.4271 | centered: 0.4271 | time: 11.51s +Evaluating: hellaswag (10-shot, type: multiple_choice)... accuracy: 0.5179 | centered: 0.3572 | time: 35.50s +Evaluating: winograd (0-shot, type: schema)... accuracy: 0.7106 | centered: 0.4212 | time: 0.61s +Evaluating: winogrande (0-shot, type: schema)... accuracy: 0.5667 | centered: 0.1334 | time: 2.78s +Evaluating: bigbench_dyck_languages (10-shot, type: language_modeling)... accuracy: 0.1230 | centered: 0.1230 | time: 2.37s +Evaluating: agi_eval_lsat_ar (3-shot, type: multiple_choice)... accuracy: 0.3043 | centered: 0.1304 | time: 0.79s +Evaluating: bigbench_cs_algorithms (10-shot, type: language_modeling)... accuracy: 0.4167 | centered: 0.4167 | time: 3.06s +Evaluating: bigbench_operators (10-shot, type: language_modeling)... 
accuracy: 0.1714 | centered: 0.1714 | time: 0.49s +Evaluating: bigbench_repeat_copy_logic (10-shot, type: language_modeling)... accuracy: 0.0000 | centered: 0.0000 | time: 0.08s +Evaluating: squad (10-shot, type: language_modeling)... accuracy: 0.3535 | centered: 0.3535 | time: 28.84s +Evaluating: coqa (0-shot, type: language_modeling)... accuracy: 0.2547 | centered: 0.2547 | time: 18.47s +Evaluating: boolq (10-shot, type: multiple_choice)... accuracy: 0.5841 | centered: -0.0945 | time: 10.66s +Evaluating: bigbench_language_identification (10-shot, type: multiple_choice)... accuracy: 0.2494 | centered: 0.1743 | time: 59.70s +Step 16704 | CORE metric: 0.2633 +2026-02-02 00:09:51,866 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank2.pt +2026-02-02 00:09:51,873 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank5.pt +2026-02-02 00:09:51,919 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank3.pt +2026-02-02 00:09:51,970 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank1.pt +2026-02-02 00:09:51,999 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank6.pt +2026-02-02 00:09:52,005 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank4.pt +2026-02-02 00:09:52,081 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank7.pt +2026-02-02 00:09:58,013 - nanochat.checkpoint_manager - INFO - Saved model parameters to: /root/.cache/nanochat/base_checkpoints/d24_feb01/model_016704.pt +2026-02-02 00:09:58,014 - 
nanochat.checkpoint_manager - INFO - Saved metadata to: /root/.cache/nanochat/base_checkpoints/d24_feb01/meta_016704.json +2026-02-02 00:09:59,067 - nanochat.checkpoint_manager - INFO - Saved optimizer state to: /root/.cache/nanochat/base_checkpoints/d24_feb01/optim_016704_rank0.pt +Peak memory usage: 62389.67MiB +Total training time: 179.44m +Minimum validation bpb: 0.753336 diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 208acd14..324ec697 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -49,12 +49,11 @@ def has_ve(layer_idx, n_layer): return layer_idx % 2 == (n_layer - 1) % 2 def apply_rotary_emb(x, cos, sin): - assert x.ndim == 4 # multihead attention - d = x.shape[3] // 2 - x1, x2 = x[..., :d], x[..., d:] # split up last dim into two halves - y1 = x1 * cos + x2 * sin # rotate pairs of dims + assert x.ndim == 4 # (B, T, H, D) multihead attention layout + x1, x2 = x.chunk(2, dim=-1) # split head_dim into two halves + y1 = x1 * cos + x2 * sin y2 = x1 * (-sin) + x2 * cos - return torch.cat([y1, y2], 3) + return torch.cat([y1, y2], dim=-1) class CausalSelfAttention(nn.Module): def __init__(self, config, layer_idx): diff --git a/nanochat/optim.py b/nanochat/optim.py index 42d862b4..9825bfc2 100644 --- a/nanochat/optim.py +++ b/nanochat/optim.py @@ -260,7 +260,12 @@ class MuonAdamW(torch.optim.Optimizer): # Fill all the 0-D tensors with current values self._muon_momentum_t.fill_(group["momentum"]) self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) - self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + # Shape-based LR scaling (flipped from original): + # - Tall matrices (input projections like c_fc): 1x + # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4 + ratio = shape[-2] / shape[-1] + lr_mult = 1.0 if ratio >= 1 else ratio**-0.5 + self._muon_lr_t.fill_(group["lr"] * lr_mult) self._muon_wd_t.fill_(group["weight_decay"]) # Single fused kernel: momentum -> polar_express -> 
variance_reduction -> update @@ -478,7 +483,12 @@ class DistMuonAdamW(torch.optim.Optimizer): # Fill 0-D tensors and run fused kernel self._muon_momentum_t.fill_(group["momentum"]) self._muon_beta2_t.fill_(group["beta2"]) - self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + # Shape-based LR scaling (flipped from original): + # - Tall matrices (input projections like c_fc): 1x + # - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4 + ratio = shape[-2] / shape[-1] + lr_mult = 1.0 if ratio >= 1 else ratio**-0.5 + self._muon_lr_t.fill_(group["lr"] * lr_mult) self._muon_wd_t.fill_(group["weight_decay"]) muon_step_fused( grad_chunk[:num_owned], stacked_owned,