From 5019accc5bc75400c33148253adfa25dc57c153b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 17 Mar 2026 16:55:56 +0000 Subject: [PATCH] fix scaling laws scripts after the bigram embeddings were removed --- dev/scaling_analysis.ipynb | 4 ++-- runs/scaling_laws.sh | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/dev/scaling_analysis.ipynb b/dev/scaling_analysis.ipynb index e7761c5..6a95448 100644 --- a/dev/scaling_analysis.ipynb +++ b/dev/scaling_analysis.ipynb @@ -76,7 +76,6 @@ "\n", "Our CSV now has granular counts:\n", "- `params_wte` - token embedding (lookup table)\n", - "- `params_bigram_embed` - bigram hash embeddings (lookup table)\n", "- `params_value_embeds` - value embeddings (lookup table)\n", "- `params_lm_head` - unembedding projection (matmul)\n", "- `params_transformer` - attention + MLP matrices (matmuls)\n", @@ -116,12 +115,13 @@ "\n", "\n", "# Compute derived columns\n", + "df = df.copy() # avoid SettingWithCopyWarning from earlier filter\n", "df['effective_params'] = df.apply(compute_effective_params, axis=1)\n", "df['param_data_ratio'] = df['tokens_trained'] / df['effective_params']\n", "\n", "# Show parameter breakdown for first few rows\n", "print(\"Parameter breakdown (first row per flops budget):\")\n", - "param_cols = ['depth', 'params_wte', 'params_bigram_embed', 'params_value_embeds',\n", + "param_cols = ['depth', 'params_wte', 'params_value_embeds',\n", " 'params_lm_head', 'params_transformer', 'params_scalars', 'params_total', 'effective_params']\n", "df.groupby('flops_budget').first()[param_cols]" ] diff --git a/runs/scaling_laws.sh b/runs/scaling_laws.sh index f1e2fd4..212e675 100644 --- a/runs/scaling_laws.sh +++ b/runs/scaling_laws.sh @@ -24,7 +24,7 @@ RESULTS_FILE="$RESULTS_DIR/results.csv" # Write CSV header only if file doesn't exist if [ ! -f "$RESULTS_FILE" ]; then - echo "flops_budget,depth,model_dim,params_wte,params_bigram_embed,params_value_embeds,params_lm_head,params_transformer,params_scalars,params_total,num_iterations,tokens_trained,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" + echo "flops_budget,depth,model_dim,params_wte,params_value_embeds,params_lm_head,params_transformer,params_scalars,params_total,num_iterations,tokens_trained,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" fi log() { @@ -86,13 +86,14 @@ for flops in "${FLOPS_BUDGETS[@]}"; do LOG_FILE="$RESULTS_DIR/${TAG}_train.log" # Extract detailed parameter counts (for scaling law analysis with different conventions) - PARAMS_WTE=$(grep "wte:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_BIGRAM=$(grep "bigram_embed:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_VE=$(grep "value_embeds:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_LM=$(grep "lm_head:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_TRANSFORMER=$(grep "transformer_matrices:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_SCALARS=$(grep "scalars:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') - PARAMS_TOTAL=$(grep "total:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + # Note: the log format is padded, e.g. "wte : 25,165,824" + # so we grep for "^key " (key at start of line followed by space) to avoid false matches + PARAMS_WTE=$(grep "^wte " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_VE=$(grep "^value_embeds " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_LM=$(grep "^lm_head " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_TRANSFORMER=$(grep "^transformer_matrices " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_SCALARS=$(grep "^scalars " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_TOTAL=$(grep "^total " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',') # Calculate tokens trained (iterations * batch_size, default 524288) @@ -112,7 +113,7 @@ for flops in "${FLOPS_BUDGETS[@]}"; do log " Params: $PARAMS_TOTAL (transformer: $PARAMS_TRANSFORMER), Iters: $NUM_ITERS, Val BPB: $VAL_BPB, CORE: $CORE_SCORE" # Append to CSV - echo "$flops,$d,$MODEL_DIM,$PARAMS_WTE,$PARAMS_BIGRAM,$PARAMS_VE,$PARAMS_LM,$PARAMS_TRANSFORMER,$PARAMS_SCALARS,$PARAMS_TOTAL,$NUM_ITERS,$TOKENS_TRAINED,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" + echo "$flops,$d,$MODEL_DIM,$PARAMS_WTE,$PARAMS_VE,$PARAMS_LM,$PARAMS_TRANSFORMER,$PARAMS_SCALARS,$PARAMS_TOTAL,$NUM_ITERS,$TOKENS_TRAINED,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" done done