From 938cb31f1ad6264c01a880bda0e55225a633b584 Mon Sep 17 00:00:00 2001
From: obxium
Date: Tue, 14 Oct 2025 14:19:44 -0400
Subject: [PATCH 01/36] Update logo

---
 dev/nanochat.png | Bin 19811 -> 1305 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/dev/nanochat.png b/dev/nanochat.png
index 84e1b5fc9fda7b2739f3db39f23ee19dd58234cb..2313d271f8a7db292f8d5c80307b807e9a055ba3 100644
GIT binary patch
literal 1305
[base85-encoded binary image data omitted]

literal 19811
[base85-encoded binary image data omitted]

From ed519b0f24ea1620a81653012103122a2e367d83 Mon Sep 17 00:00:00 2001
From: Phúc H. Lê Khắc
Date: Fri, 17 Oct 2025 17:21:25 +0700
Subject: [PATCH 02/36] Update engine.py with correct error message on assert

---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index de1253a..eb3fcac 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -83,7 +83,7 @@ class KVCache:
         for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
             if ix in [0, 1, 3, 5]:
                 # num_layers, batch_size, num_heads, head_dim must match
-                assert dim1 == dim2, f"Batch dim mismatch: {dim1} != {dim2}"
+                assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
             elif ix == 2:
                 # batch_size can be expanded
                 assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
From 2b58e2dd2ae134078c610665bf0811196c62830c Mon Sep 17 00:00:00 2001
From: obxium
Date: Sat, 18 Oct 2025 09:31:11 -0400
Subject: [PATCH 03/36] Update logo in code as well

---
 nanochat/common.py | 91 +++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/nanochat/common.py b/nanochat/common.py
index 8b10df9..bb825ff 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -8,43 +8,58 @@ import logging
 import torch
 import torch.distributed as dist
 
+
 class ColoredFormatter(logging.Formatter):
     """Custom formatter that adds colors to log messages."""
+
     # ANSI color codes
     COLORS = {
-        'DEBUG': '\033[36m',     # Cyan
-        'INFO': '\033[32m',      # Green
-        'WARNING': '\033[33m',   # Yellow
-        'ERROR': '\033[31m',     # Red
-        'CRITICAL': '\033[35m',  # Magenta
+        "DEBUG": "\033[36m",  # Cyan
+        "INFO": "\033[32m",  # Green
+        "WARNING": "\033[33m",  # Yellow
+        "ERROR": "\033[31m",  # Red
+        "CRITICAL": "\033[35m",  # Magenta
     }
-    RESET = '\033[0m'
-    BOLD = '\033[1m'
+    RESET = "\033[0m"
+    BOLD = "\033[1m"
+
     def format(self, record):
         # Add color to the level name
         levelname = record.levelname
         if levelname in self.COLORS:
-            record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
+            record.levelname = (
+                f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
+            )
         # Format the message
         message = super().format(record)
         # Add color to specific parts of the message
-        if levelname == 'INFO':
+        if levelname == "INFO":
             # Highlight numbers and percentages
-            message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
-            message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
+            message = re.sub(
+                r"(\d+\.?\d*\s*(?:GB|MB|%|docs))",
+                rf"{self.BOLD}\1{self.RESET}",
+                message,
+            )
+            message = re.sub(
+                r"(Shard \d+)",
+                rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}",
+                message,
+            )
         return message
 
+
 def setup_default_logging():
     handler = logging.StreamHandler()
-    handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
-    logging.basicConfig(
-        level=logging.INFO,
-        handlers=[handler]
+    handler.setFormatter(
+        ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
     )
+    logging.basicConfig(level=logging.INFO, handlers=[handler])
+
 
 setup_default_logging()
 logger = logging.getLogger(__name__)
 
+
 def get_base_dir():
     # co-locate nanochat intermediates with other cached data in ~/.cache (by default)
     if os.environ.get("NANOCHAT_BASE_DIR"):
@@ -56,39 +71,44 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-def print0(s="",**kwargs):
-    ddp_rank = int(os.environ.get('RANK', 0))
+
+def print0(s="", **kwargs):
+    ddp_rank = int(os.environ.get("RANK", 0))
     if ddp_rank == 0:
         print(s, **kwargs)
 
+
 def print_banner():
     # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
     banner = """
- █████                                       █████                  █████
-░░███                                       ░░███                  ░░███
- ████████    ██████   ████████    ██████   ██████  ░███████    ██████   ███████
-░░███░░███  ░░░░░███ ░░███░░███  ███░░███ ███░░███ ░███░░███  ░░░░░███ ░░░███░
- ░███ ░███   ███████  ░███ ░███ ░███ ░███░███ ░░░  ░███ ░███   ███████   ░███
- ░███ ░███  ███░░███  ░███ ░███ ░███ ░███░███  ███ ░███ ░███  ███░░███   ░███ ███
- ████ █████░░████████ ████ █████░░██████ ░░██████  ████ █████░░████████  ░░█████
-░░░░ ░░░░░  ░░░░░░░░ ░░░░ ░░░░░  ░░░░░░   ░░░░░░  ░░░░ ░░░░░  ░░░░░░░░    ░░░░░
-"""
+     █████                                       █████                  █████
+    ░░███                                       ░░███                  ░░███
+     ████████    ██████   ████████    ██████   ██████  ░███████    ██████   ███████
+    ░░███░░███  ░░░░░███ ░░███░░███  ███░░███ ███░░███ ░███░░███  ░░░░░███░░░███░
+     ░███ ░███   ███████  ░███ ░███ ░███ ░███░███ ░░░  ░███ ░███   ███████   ░███
+     ░███ ░███  ███░░███  ░███ ░███ ░███ ░███░███  ███ ░███ ░███  ███░░███   ░███ ███
+     ████ █████░░████████ ████ █████░░██████ ░░██████  ████ █████░░███████  ░░█████
+    ░░░░ ░░░░░  ░░░░░░░░  ░░░░ ░░░░░  ░░░░░░   ░░░░░░  ░░░░ ░░░░░  ░░░░░░░░   ░░░░░
+    """
     print0(banner)
 
+
 def is_ddp():
     # TODO is there a proper way
-    return int(os.environ.get('RANK', -1)) != -1
+    return int(os.environ.get("RANK", -1)) != -1
+
 
 def get_dist_info():
     if is_ddp():
-        assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
-        ddp_rank = int(os.environ['RANK'])
-        ddp_local_rank = int(os.environ['LOCAL_RANK'])
-        ddp_world_size = int(os.environ['WORLD_SIZE'])
+        assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"])
+        ddp_rank = int(os.environ["RANK"])
+        ddp_local_rank = int(os.environ["LOCAL_RANK"])
+        ddp_world_size = int(os.environ["WORLD_SIZE"])
         return True, ddp_rank, ddp_local_rank, ddp_world_size
     else:
         return False, 0, 0, 1
 
+
 def compute_init():
     """Basic initialization that we keep doing over and over, so make common."""
@@ -104,13 +124,13 @@ def compute_init():
     # torch.backends.cudnn.benchmark = False
 
     # Precision
-    torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
+    torch.set_float32_matmul_precision("high")  # uses tf32 instead of fp32 for matmuls
 
     # Distributed setup: Distributed Data Parallel (DDP), optional
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
     if ddp:
         device = torch.device("cuda", ddp_local_rank)
-        torch.cuda.set_device(device) # make "cuda" default to this device
+        torch.cuda.set_device(device)  # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
         dist.barrier()
     else:
@@ -121,16 +141,21 @@ def compute_init():
 
     return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device
 
+
 def compute_cleanup():
     """Companion function to compute_init, to clean things up before script exit"""
     if is_ddp():
         dist.destroy_process_group()
 
+
 class DummyWandb:
     """Useful if we wish to not use wandb but have all the same signatures"""
+
     def __init__(self):
         pass
+
     def log(self, *args, **kwargs):
         pass
+
     def finish(self):
         pass

From fca2b8cd07a0929fb5a0368522f11a061d03e52e Mon Sep 17 00:00:00 2001
From: Marius Wachtler
Date: Fri, 24 Oct 2025 14:29:35 -0500
Subject: [PATCH 04/36] harden eval: prevent the calc tool from accessing
 globals and locals

By passing empty globals() and locals() to eval() we can prevent simple
malicious cases where the user gets the model to output something like
``` or "a".count("a")```, e.g. ```signal.raise_signal(9) or "a".count("a")```,
which would kill the process. Or one could maybe get it to output secrets, etc.

I think to make it 100% secure one would need to parse the AST and only
execute secure nodes, but this should make it much more robust.
---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index fee06a1..77530c5 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -37,7 +37,7 @@ def eval_with_timeout(formula, max_time=3):
         with timeout(max_time, formula):
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", SyntaxWarning)
-                return eval(formula)
+                return eval(formula, {"__builtins__": {}}, {})
     except Exception as e:
         signal.alarm(0)
         # print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
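To see what the change buys, here is a small illustration (standalone, not from the repo) of eval() with and without access to the calling namespace; as the message above says, this narrows the attack surface rather than providing a true sandbox:

    expr = "2 * (3 + 4)"
    print(eval(expr, {"__builtins__": {}}, {}))  # 14: plain arithmetic still works

    try:
        eval('signal.raise_signal(9)', {"__builtins__": {}}, {})
    except NameError as e:
        print(e)  # name 'signal' is not defined: imported modules are no longer reachable

Attribute-walking escapes (e.g. via "".__class__) remain possible in principle, which is why the message points at AST validation for full security.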
From a9de4b103858223646e0e8ba29ed32b8516aad8f Mon Sep 17 00:00:00 2001
From: water-vapor
Date: Sun, 26 Oct 2025 01:43:49 -0500
Subject: [PATCH 05/36] Fix tok/sec metrics for base_train and mid_train when
 gradient accumulation is not 1

---
 scripts/base_train.py | 2 +-
 scripts/mid_train.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 3725805..47ecba4 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -294,7 +294,7 @@ for step in range(num_iterations + 1):
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * step / num_iterations
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
    flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index eedb262..6c2b82f 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
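To sanity-check the fix with concrete (made-up) numbers: dt is measured around a full optimizer step, which spans all gradient-accumulation micro-steps, so the token count in the numerator has to cover the whole step as well:

    device_batch_size = 8    # sequences per GPU per micro-step
    sequence_len = 2048      # tokens per sequence (assumed)
    ddp_world_size = 8       # GPUs
    grad_accum_steps = 4     # micro-steps per optimizer step

    world_tokens_per_fwdbwd = device_batch_size * sequence_len * ddp_world_size  # 131072
    total_batch_size = world_tokens_per_fwdbwd * grad_accum_steps                # 524288

    dt = 2.0  # seconds for one optimizer step, i.e. all 4 micro-steps
    print(int(world_tokens_per_fwdbwd / dt))  # 65536 tok/sec, 4x too low
    print(int(total_batch_size / dt))         # 262144 tok/sec, the true throughput

The MFU line just below the changed one was already correct, because it always used total_batch_size.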
From 5e0987a431553a84ba82d835d1da5daccd70d095 Mon Sep 17 00:00:00 2001
From: Ajeesh Sunil <98960341+Aj-esh@users.noreply.github.com>
Date: Tue, 28 Oct 2025 20:05:38 +0000
Subject: [PATCH 06/36] numpy isn't acting as a dependency for nanochat, so
 isn't it better to remove numpy from the dependencies list

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index da674f4..3d03c4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "datasets>=4.0.0",
     "fastapi>=0.117.1",
     "files-to-prompt>=0.6",
-    "numpy==1.26.4",
     "psutil>=7.1.0",
     "regex>=2025.9.1",
     "setuptools>=80.9.0",

From cbd560a83d93a8de8ebb238608ee571e7952e2ac Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 29 Oct 2025 11:42:56 +0100
Subject: [PATCH 07/36] revert formatting changes to minimize diff and merge
 conflicts

---
 nanochat/common.py | 67 +++++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 45 deletions(-)

diff --git a/nanochat/common.py b/nanochat/common.py
index bb825ff..a0867b0 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -8,58 +8,45 @@ import logging
 import torch
 import torch.distributed as dist
 
-
 class ColoredFormatter(logging.Formatter):
     """Custom formatter that adds colors to log messages."""
-
     # ANSI color codes
     COLORS = {
-        "DEBUG": "\033[36m",  # Cyan
-        "INFO": "\033[32m",  # Green
-        "WARNING": "\033[33m",  # Yellow
-        "ERROR": "\033[31m",  # Red
-        "CRITICAL": "\033[35m",  # Magenta
+        'DEBUG': '\033[36m',      # Cyan
+        'INFO': '\033[32m',       # Green
+        'WARNING': '\033[33m',   # Yellow
+        'ERROR': '\033[31m',      # Red
+        'CRITICAL': '\033[35m',   # Magenta
     }
-    RESET = "\033[0m"
-    BOLD = "\033[1m"
+    RESET = '\033[0m'
+    BOLD = '\033[1m'
 
     def format(self, record):
         # Add color to the level name
         levelname = record.levelname
         if levelname in self.COLORS:
-            record.levelname = (
-                f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
-            )
+            record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
         # Format the message
         message = super().format(record)
         # Add color to specific parts of the message
-        if levelname == "INFO":
+        if levelname == 'INFO':
             # Highlight numbers and percentages
-            message = re.sub(
-                r"(\d+\.?\d*\s*(?:GB|MB|%|docs))",
-                rf"{self.BOLD}\1{self.RESET}",
-                message,
-            )
-            message = re.sub(
-                r"(Shard \d+)",
-                rf"{self.COLORS['INFO']}{self.BOLD}\1{self.RESET}",
-                message,
-            )
+            message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
+            message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
         return message
 
-
 def setup_default_logging():
     handler = logging.StreamHandler()
-    handler.setFormatter(
-        ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+    handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+    logging.basicConfig(
+        level=logging.INFO,
+        handlers=[handler]
     )
-    logging.basicConfig(level=logging.INFO, handlers=[handler])
-
 
 setup_default_logging()
 logger = logging.getLogger(__name__)
 
-
 def get_base_dir():
     # co-locate nanochat intermediates with other cached data in ~/.cache (by default)
     if os.environ.get("NANOCHAT_BASE_DIR"):
@@ -71,44 +58,39 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-
 def print0(s="", **kwargs):
-    ddp_rank = int(os.environ.get("RANK", 0))
+    ddp_rank = int(os.environ.get('RANK', 0))
     if ddp_rank == 0:
         print(s, **kwargs)
 
-
 def print_banner():
     # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
     banner = """
     █████                                       █████                  █████
    ░░███                                       ░░███                  ░░███
     ████████    ██████   ████████    ██████   ██████  ░███████    ██████   ███████
    ░░███░░███  ░░░░░███ ░░███░░███  ███░░███ ███░░███ ░███░░███  ░░░░░███░░░███░
     ░███ ░███   ███████  ░███ ░███ ░███ ░███░███ ░░░  ░███ ░███   ███████   ░███
     ░███ ░███  ███░░███  ░███ ░███ ░███ ░███░███  ███ ░███ ░███  ███░░███   ░███ ███
     ████ █████░░████████ ████ █████░░██████ ░░██████  ████ █████░░███████  ░░█████
    ░░░░ ░░░░░  ░░░░░░░░  ░░░░ ░░░░░  ░░░░░░   ░░░░░░  ░░░░ ░░░░░  ░░░░░░░░   ░░░░░
    """
     print0(banner)
 
-
 def is_ddp():
     # TODO is there a proper way
-    return int(os.environ.get("RANK", -1)) != -1
-
+    return int(os.environ.get('RANK', -1)) != -1
 
 def get_dist_info():
     if is_ddp():
-        assert all(var in os.environ for var in ["RANK", "LOCAL_RANK", "WORLD_SIZE"])
-        ddp_rank = int(os.environ["RANK"])
-        ddp_local_rank = int(os.environ["LOCAL_RANK"])
-        ddp_world_size = int(os.environ["WORLD_SIZE"])
+        assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
+        ddp_rank = int(os.environ['RANK'])
+        ddp_local_rank = int(os.environ['LOCAL_RANK'])
+        ddp_world_size = int(os.environ['WORLD_SIZE'])
         return True, ddp_rank, ddp_local_rank, ddp_world_size
     else:
         return False, 0, 0, 1
 
-
 def compute_init():
     """Basic initialization that we keep doing over and over, so make common."""
@@ -124,13 +106,13 @@ def compute_init():
     # torch.backends.cudnn.benchmark = False
 
     # Precision
-    torch.set_float32_matmul_precision("high")  # uses tf32 instead of fp32 for matmuls
+    torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
 
     # Distributed setup: Distributed Data Parallel (DDP), optional
     ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
     if ddp:
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
         dist.barrier()
     else:
@@ -141,21 +123,16 @@ def compute_init():
 
     return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device
 
-
 def compute_cleanup():
     """Companion function to compute_init, to clean things up before script exit"""
     if is_ddp():
         dist.destroy_process_group()
 
-
 class DummyWandb:
     """Useful if we wish to not use wandb but have all the same signatures"""
-
     def __init__(self):
         pass
-
     def log(self, *args, **kwargs):
         pass
-
     def finish(self):
         pass

From 3fa974f93c1f94c3dc0fe4f57915a0d1aa73feaf Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 29 Oct 2025 11:45:02 +0100
Subject: [PATCH 08/36] few more reverts

---
 nanochat/common.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/nanochat/common.py b/nanochat/common.py
index a0867b0..d80d4ba 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -12,15 +12,14 @@ class ColoredFormatter(logging.Formatter):
     """Custom formatter that adds colors to log messages."""
     # ANSI color codes
     COLORS = {
-        'DEBUG': '\033[36m',      # Cyan
-        'INFO': '\033[32m',       # Green
+        'DEBUG': '\033[36m',     # Cyan
+        'INFO': '\033[32m',      # Green
         'WARNING': '\033[33m',   # Yellow
-        'ERROR': '\033[31m',      # Red
-        'CRITICAL': '\033[35m',   # Magenta
+        'ERROR': '\033[31m',     # Red
+        'CRITICAL': '\033[35m',  # Magenta
     }
     RESET = '\033[0m'
     BOLD = '\033[1m'
-
     def format(self, record):
         # Add color to the level name
         levelname = record.levelname
@@ -35,7 +34,6 @@ class ColoredFormatter(logging.Formatter):
             message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
         return message
 
-
 def setup_default_logging():
     handler = logging.StreamHandler()
     handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
@@ -56,7 +56,7 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-def print0(s="", **kwargs):
+def print0(s="",**kwargs):
     ddp_rank = int(os.environ.get('RANK', 0))
     if ddp_rank == 0:
         print(s, **kwargs)

From 70319851fc960bc472ac7cfe9518c9478ada402e Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Wed, 29 Oct 2025 19:48:34 +0100
Subject: [PATCH 09/36] fix typo

---
 scripts/base_eval.py | 2 +-
 scripts/chat_sft.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 8efde4f..3d403cc 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -1,5 +1,5 @@
 """
-Evlauate the CORE metric for a given model.
+Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
 python base_eval.py
diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py
index e6e4565..bbeb1f9 100644
--- a/scripts/chat_sft.py
+++ b/scripts/chat_sft.py
@@ -192,7 +192,7 @@ for step in range(num_iterations):
         })
         model.train()
 
-    # evlauate accuracy of the multiple choice tasks (which are quick to run)
+    # evaluate accuracy of the multiple choice tasks (which are quick to run)
     if last_step or (step > 0 and step % eval_metrics_every == 0):
         model.eval()
         metrics = {}
""" # Load config and task metadata base_dir = get_base_dir() @@ -43,7 +41,15 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): with open(config_path, 'r') as f: config = yaml.safe_load(f) tasks = config['icl_tasks'] - eval_metadata = pd.read_csv(eval_meta_data) + + # Load random baseline values from eval metadata + random_baselines = {} + with open(eval_meta_data, 'r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + task_name = row['Eval Task'] + random_baseline = row['Random baseline'] + random_baselines[task_name] = float(random_baseline) # Evaluate each task results = {} @@ -75,8 +81,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): accuracy = evaluate_task(model, tokenizer, data, device, task_meta) results[label] = accuracy - row = eval_metadata[eval_metadata["Eval Task"] == label] - random_baseline = row["Random baseline"].values[0] + random_baseline = random_baselines[label] centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline) centered_results[label] = centered_result end_time = time.time() From cf587acb1a51003463c7eda250e95842802b80fd Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 1 Nov 2025 16:04:38 +0000 Subject: [PATCH 12/36] move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts --- dev/runcpu.sh | 7 ------- nanochat/common.py | 14 ++++++++++---- run1000.sh | 7 ------- scripts/base_eval.py | 29 +++++++++++++++++++++++++---- speedrun.sh | 9 --------- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/dev/runcpu.sh b/dev/runcpu.sh index 469e51d..ffacefa 100755 --- a/dev/runcpu.sh +++ b/dev/runcpu.sh @@ -22,13 +22,6 @@ fi curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y source "$HOME/.cargo/env" uv run maturin develop --release --manifest-path rustbpe/Cargo.toml -EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip -if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then - curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL - unzip -q eval_bundle.zip - rm eval_bundle.zip - mv eval_bundle $NANOCHAT_BASE_DIR -fi # wipe the report python -m nanochat.report reset diff --git a/nanochat/common.py b/nanochat/common.py index a5a6d2e..8272378 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -58,7 +58,7 @@ def get_base_dir(): os.makedirs(nanochat_dir, exist_ok=True) return nanochat_dir -def download_file_with_lock(url, filename): +def download_file_with_lock(url, filename, postprocess_fn=None): """ Downloads a file from a URL to a local path in the base directory. Uses a lock file to prevent concurrent downloads among multiple ranks. 
@@ -76,18 +76,24 @@ def download_file_with_lock(url, filename): # All other ranks block until it is released fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + # Recheck after acquiring lock (another process may have downloaded it) if os.path.exists(file_path): return file_path + # Download the content as bytes print(f"Downloading {url}...") with urllib.request.urlopen(url) as response: - content = response.read().decode('utf-8') + content = response.read() # bytes - with open(file_path, 'w') as f: + # Write to local file + with open(file_path, 'wb') as f: f.write(content) - print(f"Downloaded to {file_path}") + # Run the postprocess function if provided + if postprocess_fn is not None: + postprocess_fn(file_path) + # Clean up the lock file after the lock is released try: os.remove(lock_path) diff --git a/run1000.sh b/run1000.sh index 6f454e0..e0bc4c4 100644 --- a/run1000.sh +++ b/run1000.sh @@ -19,13 +19,6 @@ python -m nanochat.report reset curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y source "$HOME/.cargo/env" uv run maturin develop --release --manifest-path rustbpe/Cargo.toml -EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip -if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then - curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL - unzip -q eval_bundle.zip - rm eval_bundle.zip - mv eval_bundle $NANOCHAT_BASE_DIR -fi curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl # train tokenizer on ~4B characters and kick off download of the rest for pretraining diff --git a/scripts/base_eval.py b/scripts/base_eval.py index c488c8a..21f7bac 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -2,10 +2,10 @@ Evaluate the CORE metric for a given model. Run on a single GPU: -python base_eval.py +python -m scripts.base_eval Run with torchrun on e.g. 8 GPUs: -torchrun --nproc_per_node=8 base_eval.py +torchrun --nproc_per_node=8 -m scripts.base_eval The script will print the CORE metric to the console. """ @@ -13,13 +13,16 @@ import os import csv import time import json -import random import yaml +import shutil +import random +import zipfile +import tempfile from contextlib import nullcontext import torch -from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type +from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock from nanochat.tokenizer import HuggingFaceTokenizer from nanochat.checkpoint_manager import load_model from nanochat.core_eval import evaluate_task @@ -27,6 +30,21 @@ from nanochat.core_eval import evaluate_task # ----------------------------------------------------------------------------- # nanochat specific function dealing with I/O etc. 
+# ~162MB of data needed to evaluate the CORE metric +EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + +def place_eval_bundle(file_path): + # here file_path is the path to the eval_bundle.zip file + # we need to unzip it and place it in the base directory + base_dir = get_base_dir() + eval_bundle_dir = os.path.join(base_dir, "eval_bundle") + with tempfile.TemporaryDirectory() as tmpdir: + with zipfile.ZipFile(file_path, 'r') as zip_ref: + zip_ref.extractall(tmpdir) + extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle") + shutil.move(extracted_bundle_dir, eval_bundle_dir) + print0(f"Placed eval_bundle directory at {eval_bundle_dir}") + def evaluate_model(model, tokenizer, device, max_per_task=-1): """ Evaluate a base model on the CORE benchmark. @@ -35,6 +53,9 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): # Load config and task metadata base_dir = get_base_dir() eval_bundle_dir = os.path.join(base_dir, "eval_bundle") + # Download the eval bundle to disk (and unzip if needed) + if not os.path.exists(eval_bundle_dir): + download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle) config_path = os.path.join(eval_bundle_dir, "core.yaml") data_base_path = os.path.join(eval_bundle_dir, "eval_data") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") diff --git a/speedrun.sh b/speedrun.sh index 35dd39e..32c8870 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -73,15 +73,6 @@ python -m scripts.tok_eval # ----------------------------------------------------------------------------- # Base model (pretraining) -# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB) -EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip -if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then - curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL - unzip -q eval_bundle.zip - rm eval_bundle.zip - mv eval_bundle $NANOCHAT_BASE_DIR -fi - # The d20 model is 561M parameters. # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens. # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars. From d54c9cbf8c3ec7e4436bef404d605700f661f12c Mon Sep 17 00:00:00 2001 From: Manuel Saelices Date: Sat, 1 Nov 2025 23:38:50 +0100 Subject: [PATCH 13/36] CPU Support, as bfloat16 params breaks inference --- nanochat/checkpoint_manager.py | 37 +++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index f400d47..26fdb0d 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -1,6 +1,7 @@ """ Utilities for saving and loading model/optim/state checkpoints. 
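The locking scheme that gains postprocess_fn here is worth seeing in one piece. A simplified sketch of the pattern (condensed from the function above; error handling and lock-file cleanup omitted):

    import fcntl, os, urllib.request

    def download_once(url, file_path, lock_path):
        # Fast path: someone already downloaded it
        if os.path.exists(file_path):
            return file_path
        with open(lock_path, "w") as lock_file:
            fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)  # one holder at a time
            if os.path.exists(file_path):  # recheck: a peer may have won the race
                return file_path
            with urllib.request.urlopen(url) as response:
                content = response.read()  # bytes, so binary files like .zip survive
            with open(file_path, "wb") as f:
                f.write(content)
        return file_path

Switching from response.read().decode('utf-8') plus text-mode writing to raw bytes is what makes the zip download in this patch possible at all; the old code only worked for text files.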
""" + import os import re import glob @@ -16,12 +17,15 @@ from nanochat.common import setup_default_logging # Set up logging setup_default_logging() logger = logging.getLogger(__name__) + + def log0(message): - if int(os.environ.get('RANK', 0)) == 0: + if int(os.environ.get("RANK", 0)) == 0: logger.info(message) + def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data): - assert int(os.environ.get('RANK', 0)) == 0 # prevent footguns for now + assert int(os.environ.get("RANK", 0)) == 0 # prevent footguns for now os.makedirs(checkpoint_dir, exist_ok=True) # Save the model state (parameters) model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt") @@ -64,7 +68,15 @@ def build_model(checkpoint_dir, step, device, phase): - meta data saved during base model training """ assert phase in ["train", "eval"], f"Invalid phase: {phase}" - model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False) + model_data, optimizer_data, meta_data = load_checkpoint( + checkpoint_dir, step, device, load_optimizer=False + ) + if device.type == "cpu": + # Convert bfloat16 tensors to float for CPU inference + model_data = { + k: v.float() if v.dtype == torch.bfloat16 else v + for k, v in model_data.items() + } # Hack: fix torch compile issue, which prepends all keys with _orig_mod. model_data = {k.lstrip("_orig_mod."): v for k, v in model_data.items()} model_config_kwargs = meta_data["model_config"] @@ -74,7 +86,7 @@ def build_model(checkpoint_dir, step, device, phase): model = GPT(model_config) # Load the model state model.to_empty(device=device) - model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init + model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. 
TODO: fix model re-init model.load_state_dict(model_data, strict=True, assign=True) # Put the model in the right training phase / mode if phase == "eval": @@ -90,7 +102,11 @@ def build_model(checkpoint_dir, step, device, phase): def find_largest_model(checkpoint_dir): # attempt to guess the model tag: take the biggest model available - model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))] + model_tags = [ + f + for f in os.listdir(checkpoint_dir) + if os.path.isdir(os.path.join(checkpoint_dir, f)) + ] if not model_tags: raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}") # 1) normally all model tags are of the form d, try that first: @@ -104,7 +120,9 @@ def find_largest_model(checkpoint_dir): candidates.sort(key=lambda x: x[0], reverse=True) return candidates[0][1] # 2) if that failed, take the most recently updated model: - model_tags.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True) + model_tags.sort( + key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True + ) return model_tags[0] @@ -113,12 +131,16 @@ def find_last_step(checkpoint_dir): checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt")) if not checkpoint_files: raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}") - last_step = int(max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files)) + last_step = int( + max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files) + ) return last_step + # ----------------------------------------------------------------------------- # convenience functions that take into account nanochat's directory structure + def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None): if model_tag is None: # guess the model tag by defaulting to the largest model @@ -134,6 +156,7 @@ def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=Non model, tokenizer, meta_data = build_model(checkpoint_dir, step, device, phase) return model, tokenizer, meta_data + def load_model(source, *args, **kwargs): model_dir = { "base": "base_checkpoints", From ba4f40bf588a83ed3ee4d3c02cb7581edfb105ba Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Sat, 1 Nov 2025 21:27:00 -0700 Subject: [PATCH 14/36] Update run1000.sh to add missing --run=$WANDB_RUN --- run1000.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run1000.sh b/run1000.sh index e0bc4c4..46325d9 100644 --- a/run1000.sh +++ b/run1000.sh @@ -70,7 +70,7 @@ python -m scripts.tok_eval # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd # start to overfit hard. # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script. 
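The conversion in build_model is easy to demonstrate in isolation. A small sketch with a made-up state dict (not a real checkpoint):

    import torch

    state = {
        "wte.weight": torch.randn(4, 4).to(torch.bfloat16),
        "step": torch.tensor(100),  # non-bfloat16 entries pass through untouched
    }

    device = torch.device("cpu")
    if device.type == "cpu":
        # .float() upcasts to float32; CPU kernels for bfloat16 are missing or
        # slow for some ops, which is the breakage the subject line refers to
        state = {k: v.float() if v.dtype == torch.bfloat16 else v for k, v in state.items()}

    print({k: v.dtype for k, v in state.items()})
    # {'wte.weight': torch.float32, 'step': torch.int64}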
From ba4f40bf588a83ed3ee4d3c02cb7581edfb105ba Mon Sep 17 00:00:00 2001
From: Jing Zhang
Date: Sat, 1 Nov 2025 21:27:00 -0700
Subject: [PATCH 14/36] Update run1000.sh to add missing --run=$WANDB_RUN

---
 run1000.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run1000.sh b/run1000.sh
index e0bc4c4..46325d9 100644
--- a/run1000.sh
+++ b/run1000.sh
@@ -70,7 +70,7 @@
 # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd
 # start to overfit hard.
 # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script.
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8
+torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN
 torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
 torchrun --standalone --nproc_per_node=8 -m scripts.base_eval

From 036a3c5881c7e6430d5565bf8f1224fef54cdc82 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 2 Nov 2025 14:16:43 +0100
Subject: [PATCH 15/36] revert formatting changes to facilitate review

---
 nanochat/checkpoint_manager.py | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index 26fdb0d..a1120cb 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -1,7 +1,6 @@
 """
 Utilities for saving and loading model/optim/state checkpoints.
 """
-
 import os
 import re
 import glob
@@ -17,15 +16,13 @@ from nanochat.common import setup_default_logging
 # Set up logging
 setup_default_logging()
 logger = logging.getLogger(__name__)
-
-
 def log0(message):
-    if int(os.environ.get("RANK", 0)) == 0:
+    if int(os.environ.get('RANK', 0)) == 0:
         logger.info(message)
 
 
 def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data):
-    assert int(os.environ.get("RANK", 0)) == 0  # prevent footguns for now
+    assert int(os.environ.get('RANK', 0)) == 0  # prevent footguns for now
     os.makedirs(checkpoint_dir, exist_ok=True)
     # Save the model state (parameters)
     model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
@@ -68,9 +65,7 @@ def build_model(checkpoint_dir, step, device, phase):
     - meta data saved during base model training
     """
     assert phase in ["train", "eval"], f"Invalid phase: {phase}"
-    model_data, optimizer_data, meta_data = load_checkpoint(
-        checkpoint_dir, step, device, load_optimizer=False
-    )
+    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
     if device.type == "cpu":
         # Convert bfloat16 tensors to float for CPU inference
         model_data = {
@@ -86,7 +81,7 @@ def build_model(checkpoint_dir, step, device, phase):
     model = GPT(model_config)
     # Load the model state
     model.to_empty(device=device)
-    model.init_weights()  # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init
+    model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init
     model.load_state_dict(model_data, strict=True, assign=True)
     # Put the model in the right training phase / mode
     if phase == "eval":
@@ -102,11 +97,7 @@ def build_model(checkpoint_dir, step, device, phase):
 
 def find_largest_model(checkpoint_dir):
     # attempt to guess the model tag: take the biggest model available
-    model_tags = [
-        f
-        for f in os.listdir(checkpoint_dir)
-        if os.path.isdir(os.path.join(checkpoint_dir, f))
-    ]
+    model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))]
     if not model_tags:
         raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
     # 1) normally all model tags are of the form d<depth>, try that first:
@@ -120,9 +111,7 @@ def find_largest_model(checkpoint_dir):
     candidates.sort(key=lambda x: x[0], reverse=True)
     return candidates[0][1]
     # 2) if that failed, take the most recently updated model:
-    model_tags.sort(
-        key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True
-    )
+    model_tags.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True)
     return model_tags[0]
 
 
@@ -131,16 +120,12 @@ def find_last_step(checkpoint_dir):
     checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt"))
     if not checkpoint_files:
         raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
-    last_step = int(
-        max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files)
-    )
+    last_step = int(max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files))
     return last_step
 
-
 # -----------------------------------------------------------------------------
 # convenience functions that take into account nanochat's directory structure
 
-
 def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None):
     if model_tag is None:
         # guess the model tag by defaulting to the largest model
@@ -156,7 +141,6 @@ def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None):
     model, tokenizer, meta_data = build_model(checkpoint_dir, step, device, phase)
     return model, tokenizer, meta_data
 
-
 def load_model(source, *args, **kwargs):
     model_dir = {
         "base": "base_checkpoints",

From 5bfcd31b7311036a647b0677d2638046ef05f252 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Sun, 2 Nov 2025 14:17:10 +0100
Subject: [PATCH 16/36] revert more formatting changes

---
 nanochat/checkpoint_manager.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index a1120cb..262ff97 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -20,9 +20,8 @@ def log0(message):
     if int(os.environ.get('RANK', 0)) == 0:
         logger.info(message)
 
-
 def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data):
-    assert int(os.environ.get('RANK', 0)) == 0  # prevent footguns for now
+    assert int(os.environ.get('RANK', 0)) == 0 # prevent footguns for now
     os.makedirs(checkpoint_dir, exist_ok=True)
     # Save the model state (parameters)
     model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")

From f1e15f5f4df2842f09f7dd2756fbfbad00b975ec Mon Sep 17 00:00:00 2001
From: Josh Odom
Date: Sun, 2 Nov 2025 23:40:37 -0600
Subject: [PATCH 17/36] Fixing subtle bug: lstrip removes all matching
 characters, including potentially required ones. Use removeprefix instead.

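The difference is easy to demonstrate: lstrip() treats its argument as a set of characters to strip, not as a prefix. A quick illustration with a hypothetical checkpoint key:

    key = "_orig_mod.model.transformer.h.0.mlp"
    print(key.lstrip("_orig_mod."))        # 'el.transformer.h.0.mlp', also ate most of 'model.'
    print(key.removeprefix("_orig_mod."))  # 'model.transformer.h.0.mlp', exactly the prefix

Because lstrip keeps consuming while the next character is in {'_', 'o', 'r', 'i', 'g', 'm', 'd', '.'}, the bug only fires when the remaining key happens to start with those characters, which is why it stayed subtle. Note that str.removeprefix requires Python 3.9+.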
---
 nanochat/checkpoint_manager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index 262ff97..a9327c4 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -72,7 +72,7 @@ def build_model(checkpoint_dir, step, device, phase):
             for k, v in model_data.items()
         }
     # Hack: fix torch compile issue, which prepends all keys with _orig_mod.
-    model_data = {k.lstrip("_orig_mod."): v for k, v in model_data.items()}
+    model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
     model_config_kwargs = meta_data["model_config"]
     log0(f"Building model with config: {model_config_kwargs}")
     model_config = GPTConfig(**model_config_kwargs)

From 226953b841f322bf88cb0f2af460a897f00393a2 Mon Sep 17 00:00:00 2001
From: Dipesh Babu
Date: Mon, 3 Nov 2025 01:20:56 -0500
Subject: [PATCH 18/36] fix: open JSONL and results CSV with UTF-8 encoding
 for portability

---
 scripts/base_eval.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 21f7bac..a987049 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -88,7 +88,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 
         # Load data for this task
         data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
-        with open(data_path, 'r') as f:
+        with open(data_path, 'r', encoding='utf-8') as f:
             data = [json.loads(line.strip()) for line in f]
 
         # shuffle the data because in many cases it appears ordered but we want
@@ -184,7 +184,7 @@ def main():
     results = out["results"]
     centered_results = out["centered_results"]
     core_metric = out["core_metric"]
-    with open(output_csv_path, 'w') as f:
+    with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
         f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
         for label in results:
             f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
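For context on this patch and the two that follow: without an explicit encoding, open() uses the platform's locale default (commonly cp1252 on Windows), so files written as UTF-8 on Linux can fail to read back elsewhere. A small sketch of the failure mode being guarded against (file name invented):

    import json

    record = {"text": "naïve café"}  # eval data contains non-ASCII like this

    with open("sample.jsonl", "w", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

    # Omitting encoding= on the read would decode with the locale default and can
    # raise UnicodeDecodeError (or silently mangle text) on non-UTF-8 platforms:
    with open("sample.jsonl", "r", encoding="utf-8") as f:
        print(json.loads(f.readline())["text"])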
From c72b8b230966fa072f777c59e1b78eb83e39b3b0 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 3 Nov 2025 21:27:12 +0100
Subject: [PATCH 19/36] add explicit UTF-8 encoding

---
 nanochat/checkpoint_manager.py |  4 ++--
 nanochat/common.py             |  2 +-
 nanochat/report.py             | 12 ++++++------
 scripts/base_eval.py           |  4 ++--
 scripts/chat_web.py            |  2 +-
 tasks/customjson.py            |  2 +-
 tasks/spellingbee.py           |  4 ++--
 tests/test_rustbpe.py          |  4 ++--
 8 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index a9327c4..e1a7d91 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data):
     log0(f"Saved optimizer file to: {optimizer_path}")
     # Save the metadata dict as json
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "w") as f:
+    with open(meta_path, "w", encoding='utf-8') as f:
         json.dump(meta_data, f, indent=2)
     log0(f"Saved metadata file to: {meta_path}")
 
@@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False):
         optimizer_data = torch.load(optimizer_path, map_location=device)
     # Load the metadata
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "r") as f:
+    with open(meta_path, "r", encoding='utf-8') as f:
         meta_data = json.load(f)
     return model_data, optimizer_data, meta_data
 
diff --git a/nanochat/common.py b/nanochat/common.py
index 4e5fc06..ee02a6e 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -70,7 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None):
     if os.path.exists(file_path):
         return file_path
 
-    with open(lock_path, 'w') as lock_file:
+    with open(lock_path, 'w', encoding='utf-8') as lock_file:
 
         # Only a single rank can acquire this lock
         # All other ranks block until it is released
diff --git a/nanochat/report.py b/nanochat/report.py
index d0a65e0..2f65e9d 100644
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -170,7 +170,7 @@ Generated: {timestamp}
     # count dependencies via uv.lock
     uv_lock_lines = 0
     if os.path.exists('uv.lock'):
-        with open('uv.lock', 'r') as f:
+        with open('uv.lock', 'r', encoding='utf-8') as f:
             uv_lock_lines = len(f.readlines())
 
     header += f"""
@@ -241,7 +241,7 @@ class Report:
         slug = slugify(section)
         file_name = f"{slug}.md"
         file_path = os.path.join(self.report_dir, file_name)
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding='utf-8') as f:
             f.write(f"## {section}\n")
             f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             for item in data:
@@ -272,11 +272,11 @@ class Report:
         final_metrics = {} # the most important final metrics we'll add as table at the end
         start_time = None
         end_time = None
-        with open(report_file, "w") as out_file:
+        with open(report_file, "w", encoding='utf-8') as out_file:
             # write the header first
             header_file = os.path.join(report_dir, "header.md")
             if os.path.exists(header_file):
-                with open(header_file, "r") as f:
+                with open(header_file, "r", encoding='utf-8') as f:
                     header_content = f.read()
                     out_file.write(header_content)
                     start_time = extract_timestamp(header_content, "Run started:")
@@ -293,7 +293,7 @@ class Report:
             if not os.path.exists(section_file):
                 print(f"Warning: {section_file} does not exist, skipping")
                 continue
-            with open(section_file, "r") as in_file:
+            with open(section_file, "r", encoding='utf-8') as in_file:
                 section = in_file.read()
                 # Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
                 if "rl" not in file_name:
@@ -373,7 +373,7 @@ class Report:
         header_file = os.path.join(self.report_dir, "header.md")
         header = generate_header()
         start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        with open(header_file, "w") as f:
+        with open(header_file, "w", encoding='utf-8') as f:
             f.write(header)
             f.write(f"Run started: {start_time}\n\n---\n\n")
         print(f"Reset report and wrote header to {header_file}")
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index a987049..3663538 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -59,7 +59,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
     eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
-    with open(config_path, 'r') as f:
+    with open(config_path, 'r', encoding='utf-8') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
 
@@ -193,7 +193,7 @@ def main():
     print0("="*80)
     print0(f"Model: {model_name}")
     print0("="*80)
     with open(output_csv_path, 'r') as f:
-    with open(output_csv_path, 'r') as f:
+    with open(output_csv_path, 'r', encoding='utf-8') as f:
         print0(f.read())
 
     # Log to report
diff --git a/scripts/chat_web.py b/scripts/chat_web.py
index d7479c7..5d0b44a 100644
--- a/scripts/chat_web.py
+++ b/scripts/chat_web.py
@@ -243,7 +243,7 @@ app.add_middleware(
 async def root():
     """Serve the chat UI."""
     ui_html_path = os.path.join("nanochat", "ui.html")
-    with open(ui_html_path, "r") as f:
+    with open(ui_html_path, "r", encoding='utf-8') as f:
         html_content = f.read()
     # Replace the API_URL to use the same origin
     html_content = html_content.replace(
diff --git a/tasks/customjson.py b/tasks/customjson.py
index f4683c8..e1b5f0b 100644
--- a/tasks/customjson.py
+++ b/tasks/customjson.py
@@ -32,7 +32,7 @@ class CustomJSON(Task):
                 print("-" * 80)
 
         else:
-            with open(filepath, 'r') as f:
+            with open(filepath, 'r', encoding='utf-8') as f:
                 for line in f:
                     line = line.strip()
                     if not line: # skip empty lines
diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py
index c051fe7..3b45305 100644
--- a/tasks/spellingbee.py
+++ b/tasks/spellingbee.py
@@ -119,7 +119,7 @@ class SpellingBee(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         self.words = words
 
@@ -238,7 +238,7 @@ class SimpleSpelling(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         rng = random.Random(42)
         rng.shuffle(words) # use a different word order than the SpellingBee task
diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py
index 5f95721..bad3c92 100644
--- a/tests/test_rustbpe.py
+++ b/tests/test_rustbpe.py
@@ -455,13 +455,13 @@ def enwik8_path():
 @pytest.fixture(scope="module")
 def enwik8_small(enwik8_path):
     """Fixture providing 100KB of enwik8 for quick tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding='utf-8') as f:
         return f.read(100_000)
 
 @pytest.fixture(scope="module")
 def enwik8_large(enwik8_path):
     """Fixture providing 10MB of enwik8 for performance tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding='utf-8') as f:
         return f.read(10**7)
 
 def time_function(func, *args, **kwargs):

From e22fc6f2fac0c3d5f3ecd3ba6b09f7d694014b64 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 3 Nov 2025 21:46:39 +0100
Subject: [PATCH 20/36] few more explicit UTF-8 encodings

---
 dev/gen_synthetic_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 13e5f55..73f4ac9 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from nanochat.common import get_base_dir
 
-api_key = open("openroutertoken.txt").read().strip()
+api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip()
 
 url = "https://openrouter.ai/api/v1/chat/completions"
 headers = {
@@ -45,7 +45,7 @@ headers = {
     "Content-Type": "application/json"
 }
 
-readme = open("README.md").read().strip()
+readme = open("README.md", 'r', encoding='utf-8').read().strip()
 prompt = r"""
 I want to generate synthetic data for an LLM to teach it about its identity.
 Here is the identity I want:

From 2ce62ec07693a30d25264514fbaae0b918bfb200 Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 3 Nov 2025 21:52:02 +0100
Subject: [PATCH 21/36] ensure consistency of quotes within each statement

---
 dev/gen_synthetic_data.py      |  4 ++--
 nanochat/checkpoint_manager.py |  4 ++--
 nanochat/report.py             | 10 +++++-----
 scripts/chat_web.py            |  2 +-
 tests/test_rustbpe.py          |  4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 73f4ac9..068824f 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from nanochat.common import get_base_dir
 
-api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip()
+api_key = open("openroutertoken.txt", "r", encoding="utf-8").read().strip()
 
 url = "https://openrouter.ai/api/v1/chat/completions"
 headers = {
@@ -45,7 +45,7 @@ headers = {
     "Content-Type": "application/json"
 }
 
-readme = open("README.md", 'r', encoding='utf-8').read().strip()
+readme = open("README.md", "r", encoding="utf-8").read().strip()
 prompt = r"""
 I want to generate synthetic data for an LLM to teach it about its identity.
 Here is the identity I want:
diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index e1a7d91..378b0ed 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data):
     log0(f"Saved optimizer file to: {optimizer_path}")
     # Save the metadata dict as json
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "w", encoding='utf-8') as f:
+    with open(meta_path, "w", encoding="utf-8") as f:
         json.dump(meta_data, f, indent=2)
     log0(f"Saved metadata file to: {meta_path}")
 
@@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False):
     optimizer_data = torch.load(optimizer_path, map_location=device)
     # Load the metadata
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "r", encoding='utf-8') as f:
+    with open(meta_path, "r", encoding="utf-8") as f:
         meta_data = json.load(f)
     return model_data, optimizer_data, meta_data
 
diff --git a/nanochat/report.py b/nanochat/report.py
index 2f65e9d..0b0ebd7 100644
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -241,7 +241,7 @@ class Report:
         slug = slugify(section)
         file_name = f"{slug}.md"
         file_path = os.path.join(self.report_dir, file_name)
-        with open(file_path, "w", encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write(f"## {section}\n")
             f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             for item in data:
@@ -272,11 +272,11 @@ class Report:
         final_metrics = {} # the most important final metrics we'll add as table at the end
         start_time = None
         end_time = None
-        with open(report_file, "w", encoding='utf-8') as out_file:
+        with open(report_file, "w", encoding="utf-8") as out_file:
             # write the header first
             header_file = os.path.join(report_dir, "header.md")
             if os.path.exists(header_file):
-                with open(header_file, "r", encoding='utf-8') as f:
+                with open(header_file, "r", encoding="utf-8") as f:
                     header_content = f.read()
                     out_file.write(header_content)
                     start_time = extract_timestamp(header_content, "Run started:")
@@ -293,7 +293,7 @@ class Report:
             if not os.path.exists(section_file):
                 print(f"Warning: {section_file} does not exist, skipping")
                 continue
- with open(section_file, "r", encoding='utf-8') as in_file: + with open(section_file, "r", encoding="utf-8") as in_file: section = in_file.read() # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) if "rl" not in file_name: @@ -373,7 +373,7 @@ class Report: header_file = os.path.join(self.report_dir, "header.md") header = generate_header() start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w", encoding='utf-8') as f: + with open(header_file, "w", encoding="utf-8") as f: f.write(header) f.write(f"Run started: {start_time}\n\n---\n\n") print(f"Reset report and wrote header to {header_file}") diff --git a/scripts/chat_web.py b/scripts/chat_web.py index 5d0b44a..4b67b62 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -243,7 +243,7 @@ app.add_middleware( async def root(): """Serve the chat UI.""" ui_html_path = os.path.join("nanochat", "ui.html") - with open(ui_html_path, "r", encoding='utf-8') as f: + with open(ui_html_path, "r", encoding="utf-8") as f: html_content = f.read() # Replace the API_URL to use the same origin html_content = html_content.replace( diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index bad3c92..aca67fc 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -455,13 +455,13 @@ def enwik8_path(): @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(10**7) def time_function(func, *args, **kwargs): From 7a40ee77b4695ccb7350a679230eb6a7f8a6ae29 Mon Sep 17 00:00:00 2001 From: Dipesh Babu Date: Mon, 3 Nov 2025 16:00:56 -0500 Subject: [PATCH 22/36] fix: cast bf16 to fp32 on MPS (like CPU) to avoid dtype issues --- nanochat/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index a9327c4..2fcb01b 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -65,7 +65,7 @@ def build_model(checkpoint_dir, step, device, phase): """ assert phase in ["train", "eval"], f"Invalid phase: {phase}" model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False) - if device.type == "cpu": + if device.type in {"cpu", "mps"}: # Convert bfloat16 tensors to float for CPU inference model_data = { k: v.float() if v.dtype == torch.bfloat16 else v From 1e89af986263e96b9c594a2dfb80d9c8237f2d4e Mon Sep 17 00:00:00 2001 From: Yasser Makram Date: Tue, 4 Nov 2025 07:22:34 +0000 Subject: [PATCH 23/36] Replace fcntl with filelock for Windows compatibility --- nanochat/common.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/nanochat/common.py b/nanochat/common.py index 4e5fc06..2195a9f 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -5,10 +5,10 @@ Common utilities for nanochat. 
import os import re import logging -import fcntl import urllib.request import torch import torch.distributed as dist +from filelock import FileLock class ColoredFormatter(logging.Formatter): """Custom formatter that adds colors to log messages.""" @@ -70,11 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None): if os.path.exists(file_path): return file_path - with open(lock_path, 'w') as lock_file: - - # Only a single rank can acquire this lock - # All other ranks block until it is released - fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX) + with FileLock(lock_path): # Recheck after acquiring lock (another process may have downloaded it) if os.path.exists(file_path): @@ -94,12 +90,6 @@ def download_file_with_lock(url, filename, postprocess_fn=None): if postprocess_fn is not None: postprocess_fn(file_path) - # Clean up the lock file after the lock is released - try: - os.remove(lock_path) - except OSError: - pass # Ignore if already removed by another process - return file_path def print0(s="",**kwargs): From f1683c5b1643c255d59903870eec91e17d5bf801 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 4 Nov 2025 21:36:10 +0100 Subject: [PATCH 24/36] set nproc_per_node as var in speedrun and run1000 scripts --- run1000.sh | 18 +++++++++++------- speedrun.sh | 21 ++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/run1000.sh b/run1000.sh index 46325d9..58ee3bc 100644 --- a/run1000.sh +++ b/run1000.sh @@ -70,18 +70,22 @@ python -m scripts.tok_eval # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd # start to overfit hard. # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script. -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss -torchrun --standalone --nproc_per_node=8 -m scripts.base_eval + +# Number of processes/GPUs to use +NPROC_PER_NODE=8 + +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # midtrain # NOTE: ensure that we use the same device_batch_size here as the base training script. -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid # sft -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft # generate final report python -m nanochat.report generate diff --git a/speedrun.sh b/speedrun.sh index 32c8870..7955ec5 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -82,12 +82,15 @@ python -m scripts.tok_eval echo "Waiting for dataset download to complete..." 
wait $DATASET_DOWNLOAD_PID +# Number of processes/GPUs to use +NPROC_PER_NODE=8 + # pretrain the d20 model -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=20 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN # evaluate the model on a larger chunk of train/val data and draw some samples -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss # evaluate the model on CORE tasks -torchrun --standalone --nproc_per_node=8 -m scripts.base_eval +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # ----------------------------------------------------------------------------- # Midtraining (teach the model conversation special tokens, tool use, multiple choice) @@ -97,15 +100,15 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl # run midtraining and eval the model -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid # ----------------------------------------------------------------------------- # Supervised Finetuning (domain adaptation to each sequence all by itself per row) # train sft and re-eval right away (should see a small bump) -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft # chat with the model over CLI! Leave out the -p to chat interactively # python -m scripts.chat_cli -p "Why is the sky blue?" 
@@ -118,9 +121,9 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft # (optional) # run reinforcement learning -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN +# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN # eval the RL model only on GSM8K -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K +# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K # ----------------------------------------------------------------------------- # Generate the full report by putting together all the sections From c6b7ab744055d5915e6ccb61088de80c10cbaff9 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 5 Nov 2025 21:08:30 +0000 Subject: [PATCH 25/36] grad clip logging and printing and cosmetics --- scripts/base_train.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index ddd2c98..594c709 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -271,9 +271,11 @@ for step in range(num_iterations + 1): loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here loss.backward() x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward - # gradient clipping (TODO possibly experiment with) - if grad_clip > 0.0: - torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip) + # gradient clipping + grad_clip_enabled = grad_clip > 0.0 + if grad_clip_enabled: + grad_norm_tensor = torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip) + grad_norm = grad_norm_tensor.item() # GPU tensor -> CPU float (note: cpu-gpu sync point) # step the optimizers lrm = get_lr_multiplier(step) for opt in optimizers: @@ -300,9 +302,10 @@ for step in range(num_iterations + 1): mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % if step > 10: total_training_time += dt # only count the time after the first 10 steps - print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m") + print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled else "" + print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} |{print_grad_norm} lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m") if step % 100 == 0: - wandb_run.log({ + log_data = { "step": step, "total_training_flops": flops_so_far, "total_training_time": total_training_time, @@ -311,7 +314,10 @@ for step in range(num_iterations + 1): "train/dt": dt, "train/tok_per_sec": tok_per_sec, "train/mfu": mfu, - }) + } + if grad_clip_enabled: + log_data["train/grad_norm"] = grad_norm + wandb_run.log(log_data) # print a few more stats print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") From b399e431681d61dcced768c062b13a9089c0c21c Mon Sep 17 00:00:00 2001 From: "howardgao@outlook.com" Date: Thu, 6 Nov 2025 08:56:45 +0800 Subject: [PATCH 26/36] fix engine test bug --- nanochat/engine.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/nanochat/engine.py b/nanochat/engine.py index 916a9cf..da85085 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -17,8 +17,9 @@ import signal 
import warnings from contextlib import contextmanager from collections import deque -from nanochat.common import compute_init +from nanochat.common import compute_init, autodetect_device_type from nanochat.checkpoint_manager import load_model +from contextlib import nullcontext # ----------------------------------------------------------------------------- # Calculator tool helpers @@ -327,8 +328,11 @@ if __name__ == "__main__": import time # init compute ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init() + device_type = autodetect_device_type() + autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() + # load the model and tokenizer - model, tokenizer, meta = load_model("base", device, phase="eval") + model, tokenizer, meta = load_model("sft", device, phase="eval") bos_token_id = tokenizer.get_bos_token_id() # common hyperparameters kwargs = dict(max_tokens=64, temperature=0.0) @@ -339,10 +343,11 @@ if __name__ == "__main__": torch.cuda.synchronize() t0 = time.time() stream = model.generate(prompt_tokens, **kwargs) - for token in stream: - generated_tokens.append(token) - chunk = tokenizer.decode([token]) - print(chunk, end="", flush=True) + with autocast_ctx: + for token in stream: + generated_tokens.append(token) + chunk = tokenizer.decode([token]) + print(chunk, end="", flush=True) print() torch.cuda.synchronize() t1 = time.time() @@ -354,11 +359,12 @@ if __name__ == "__main__": stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32 torch.cuda.synchronize() t0 = time.time() - for token_column, token_masks in stream: - token = token_column[0] # only print out the first row - generated_tokens.append(token) - chunk = tokenizer.decode([token]) - print(chunk, end="", flush=True) + with autocast_ctx: + for token_column, token_masks in stream: + token = token_column[0] # only print out the first row + generated_tokens.append(token) + chunk = tokenizer.decode([token]) + print(chunk, end="", flush=True) print() torch.cuda.synchronize() t1 = time.time() From adb5d4a16c0a8dd9d50e05176a2cac08931562bc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Nov 2025 15:16:27 +0000 Subject: [PATCH 27/36] uv lock has to change when we removed numpy the other commit --- uv.lock | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/uv.lock b/uv.lock index f01bba3..4e9b0bd 100644 --- a/uv.lock +++ b/uv.lock @@ -311,7 +311,7 @@ name = "exceptiongroup" version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", marker = "python_full_version < '3.12' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" } wheels = [ @@ -777,7 +777,6 @@ dependencies = [ { name = "datasets" }, { name = "fastapi" }, { name = "files-to-prompt" }, - { name = "numpy" }, { name = "psutil" }, { name = "regex" }, { name = "setuptools" }, @@ -811,7 +810,6 @@ requires-dist = [ { name = "datasets", specifier = ">=4.0.0" }, { name = "fastapi", specifier = ">=0.117.1" }, 
{ name = "files-to-prompt", specifier = ">=0.6" }, - { name = "numpy", specifier = "==1.26.4" }, { name = "psutil", specifier = ">=7.1.0" }, { name = "regex", specifier = ">=2025.9.1" }, { name = "setuptools", specifier = ">=80.9.0" }, @@ -951,7 +949,7 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, @@ -964,7 +962,7 @@ name = "nvidia-cufft-cu12" version = "11.3.3.83" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, @@ -996,9 +994,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, - { name = "nvidia-cusparse-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, - { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-cusparse-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, @@ -1011,7 +1009,7 @@ name = "nvidia-cusparse-cu12" version = "12.5.8.93" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = 
"https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, @@ -1955,7 +1953,7 @@ name = "triton" version = "3.4.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "setuptools", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/62/ee/0ee5f64a87eeda19bbad9bc54ae5ca5b98186ed00055281fd40fb4beb10e/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128", size = 155430069, upload-time = "2025-07-30T19:58:21.715Z" }, From 91f09ccd0d48daf89eee6ef7fcec05977fd87068 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Nov 2025 15:28:18 +0000 Subject: [PATCH 28/36] minor fix comment in engine --- nanochat/engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nanochat/engine.py b/nanochat/engine.py index 916a9cf..1d541c7 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -107,8 +107,9 @@ class KVCache: assert self.kv_cache is None, "Cannot prefill a non-empty KV cache" assert other.kv_cache is not None, "Cannot prefill with a None KV cache" for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)): + # ix 0: num_layers, 1: k/v, 2: batch_size, 3: num_heads, 4: seq_len, 5: head_dim if ix in [0, 1, 3, 5]: - # num_layers, batch_size, num_heads, head_dim must match + # num_layers, k/v, num_heads, head_dim must match assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}" elif ix == 2: # batch_size can be expanded From c6abcdfe3a23f3cc3656e4132a606e8753415fca Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Nov 2025 15:34:40 +0000 Subject: [PATCH 29/36] big change: add pretraining resumption logic so that checkpoints can now be approximately resumed and training can continue. this is useful for very long runs when you don't want the anxiety of your run crashing for some reason. alternatively, it's a way to recover training in the event of loss spikes. i mean, this should have been there in v0 but it's ok. the resumption is approximate to control complexity and bloat, but it's possible we want to change that in the future. to use, set --save_every to a step interval to write checkpoints with, and then use --resume_from_step to resume optimization from a given step. only base model training (pretraining) supports this atm, but it's ok because midtraining is comparably quite a bit faster. 
--- nanochat/checkpoint_manager.py | 35 +++++++------ nanochat/common.py | 2 + nanochat/dataloader.py | 82 ++++++++++++++++++++++--------- scripts/base_train.py | 89 +++++++++++++++++++++++++--------- 4 files changed, 145 insertions(+), 63 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index b7d2191..63f257f 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -20,33 +20,32 @@ def log0(message): if int(os.environ.get('RANK', 0)) == 0: logger.info(message) -def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data): - assert int(os.environ.get('RANK', 0)) == 0 # prevent footguns for now - os.makedirs(checkpoint_dir, exist_ok=True) - # Save the model state (parameters) - model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt") - torch.save(model_data, model_path) - log0(f"Saved model file to: {model_path}") - # Save the optimizer state (useful for SFT or any other fine-tuning) +def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0): + if rank == 0: + os.makedirs(checkpoint_dir, exist_ok=True) + # Save the model state parameters + model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt") + torch.save(model_data, model_path) + logger.info(f"Saved model parameters to: {model_path}") + # Save the metadata dict as json + meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") + with open(meta_path, "w", encoding="utf-8") as f: + json.dump(meta_data, f, indent=2) + logger.info(f"Saved metadata to: {meta_path}") + # Note that optimizer state is sharded across ranks, so each rank must save its own. if optimizer_data is not None: - optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}.pt") + optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt") torch.save(optimizer_data, optimizer_path) - log0(f"Saved optimizer file to: {optimizer_path}") - # Save the metadata dict as json - meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "w", encoding="utf-8") as f: - json.dump(meta_data, f, indent=2) - log0(f"Saved metadata file to: {meta_path}") + logger.info(f"Saved optimizer state to: {optimizer_path}") - -def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False): +def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0): # Load the model state model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt") model_data = torch.load(model_path, map_location=device) # Load the optimizer state if requested optimizer_data = None if load_optimizer: - optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}.pt") + optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt") optimizer_data = torch.load(optimizer_path, map_location=device) # Load the metadata meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") diff --git a/nanochat/common.py b/nanochat/common.py index d4a9828..8f36f94 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -148,6 +148,8 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'" # Reproducibility + # Note that we set the global seeds here, but most of the code uses explicit rng objects. + # The only place where global rng might be used is nn.Module initialization of the model weights. 
torch.manual_seed(42) if device_type == "cuda": torch.cuda.manual_seed(42) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 6c864d3..3271298 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -1,49 +1,87 @@ from collections import deque import torch +import pyarrow.parquet as pq from nanochat.common import get_dist_info -from nanochat.dataset import parquets_iter_batched +from nanochat.dataset import list_parquet_files from nanochat.tokenizer import get_tokenizer -def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda"): - """Stream pretraining text from parquet files, tokenize, yield training batches.""" +def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None): + """ + Stream pretraining text from parquet files, tokenize, yield training batches. + + This implementation became a bit more complex because we wish to support approximate resume training. + Instead of turning this into a Class, we opt to return the state_dict with every batch, + and then the caller can pass in a state_dict to resume training from a desired point. + Note that this resumption is atm only *approximate* for simplicity. + We won't repeat the same documents but we might skip a few. + The state_dict that is returned can be later passed into this function via `resume_state_dict` to approximately resume. + + Perfect state resumption is possible but would be a lot more bloated, probably not worth it atm. + """ assert split in ["train", "val"], "split must be 'train' or 'val'" + + # infinite iterator over document batches (list of text strings) ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + def document_batches(): + parquet_paths = list_parquet_files() + parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:] + resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0 + resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None + pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0) + while True: # iterate infinitely (multi-epoch) + while pq_idx < len(parquet_paths): # iterate over all parquet files + filepath = parquet_paths[pq_idx] + pf = pq.ParquetFile(filepath) + # Start from resume point if resuming on same file, otherwise from DDP rank + # I know this state resumption is a little bit tricky and a little bit hacky... sigh. + if resume_rg_idx is not None: + base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size + base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming + rg_idx = base_idx * ddp_world_size + ddp_rank + resume_rg_idx = None # set to None as we only want to do this a single time + else: + rg_idx = ddp_rank + while rg_idx < pf.num_row_groups: + rg = pf.read_row_group(rg_idx) + batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows + # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows + for i in range(0, len(batch), tokenizer_batch_size): + yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx) + rg_idx += ddp_world_size # advance to the next row group (in DDP) + pq_idx += 1 # advance to the next parquet file + batches = document_batches() + + # Now emit batches of tokens. 
needed_tokens = B * T + 1 # +1 is because we also need the target at the last token # get the tokenizer and the bos token tokenizer = get_tokenizer() bos_token = tokenizer.get_bos_token_id() # scratch buffer holds the tokens for one iteration token_buffer = deque() # we stream tokens on the right and pop from the left - - # infinite iterator over document batches - def document_batches(): - while True: - # batch will iterate in group size of the parquet files, usually e.g. 1024 rows - for batch in parquets_iter_batched(split=split, start=ddp_rank, step=ddp_world_size): - # for the tokenizer we might want to go in usually smaller batches, e.g. 128 rows - for i in range(0, len(batch), tokenizer_batch_size): - yield batch[i:i+tokenizer_batch_size] - batches = document_batches() - - batch_index = 0 while True: # Accumulate enough tokens for one iteration before yielding. while len(token_buffer) < needed_tokens: - doc_batch = next(batches) + doc_batch, (pq_idx, rg_idx) = next(batches) token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) for tokens in token_lists: token_buffer.extend(tokens) - batch_index += 1 # Move tokens from the deque into the scratch buffer tokens = [token_buffer.popleft() for _ in range(needed_tokens)] - # CUDA supports memory pinning for faster transfers between CPU and GPU: - scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda")) + # CUDA supports memory pinning for asynchronous transfers between CPU and GPU + use_cuda_optimizations = device == "cuda" + scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64 # Create the inputs/targets as 1D tensors - inputs_cpu = scratch[:-1].to(dtype=torch.int32) + inputs_cpu = scratch[:-1] targets_cpu = scratch[1:] # Reshape to 2D and move to GPU async - inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True) - targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True) + inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations) + targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations) + state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training + yield inputs, targets, state_dict + +def tokenizing_distributed_data_loader(*args, **kwargs): + # helper function that only emits the inputs/targets and not the state_dict + for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs): yield inputs, targets diff --git a/scripts/base_train.py b/scripts/base_train.py index 594c709..c9ea6c9 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -20,10 +20,10 @@ import wandb import torch from nanochat.gpt import GPT, GPTConfig -from nanochat.dataloader import tokenizing_distributed_data_loader +from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type from nanochat.tokenizer import get_tokenizer, get_token_bytes -from nanochat.checkpoint_manager import save_checkpoint +from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint from nanochat.loss_eval import evaluate_bpb from nanochat.engine import Engine from scripts.base_eval import evaluate_model @@ -52,12 +52,14 @@ grad_clip = 1.0 # gradient clipping value (0.0 
= disabled) warmup_ratio = 0.0 # ratio of iterations for LR warmup warmdown_ratio = 0.2 # ratio of iterations for LR warmdown final_lr_frac = 0.0 # final LR is this fraction of the initial LR +resume_from_step = -1 # resume training from this step of the optimization (-1 = disable) # Evaluation eval_every = 250 # every how many steps to evaluate the model for val bpb eval_tokens = 20*524288 # number of tokens to evaluate val loss on core_metric_every = 2000 # every how many steps to evaluate the core metric (-1 = disable) core_metric_max_per_task = 500 # examples per task in estimating the core metric sample_every = 2000 # every how many steps to sample from the model +save_every = -1 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run) # Output model_tag = "" # optionally override the model tag for the output checkpoint directory name # now allow CLI to override the settings via the configurator lol @@ -103,16 +105,31 @@ grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}") print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") + # ----------------------------------------------------------------------------- # Initialize the Model + +# Create a new model with random weights model_config_kwargs = dict(sequence_len=max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim) with torch.device("meta"): model_config = GPTConfig(**model_config_kwargs) model = GPT(model_config) model.to_empty(device=device) model.init_weights() -orig_model = model # original, uncompiled model, for saving raw model state_dict -model = torch.compile(model, dynamic=False) # TODO: dynamic True/False think through + +# If we are resuming, overwrite the model parameters with those of the checkpoint +base_dir = get_base_dir() +output_dirname = model_tag if model_tag else f"d{depth}" # e.g. 
d12
+checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
+resuming = resume_from_step != -1
+if resuming:
+    print0(f"Resuming optimization from step {resume_from_step}")
+    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, resume_from_step, device, load_optimizer=True, rank=ddp_rank)
+    model.load_state_dict(model_data, strict=True, assign=True)
+    del model_data # free up this memory after the copy
+
+orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the inputs may change shape)
+model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
 num_params = sum(p.numel() for p in model.parameters())
 print0(f"Number of parameters: {num_params:,}")
 num_flops_per_token = model.estimate_flops()
@@ -143,12 +160,18 @@ print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
 optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
 adamw_optimizer, muon_optimizer = optimizers
+if resuming:
+    for opt, dat in zip(optimizers, optimizer_data):
+        opt.load_state_dict(dat)
+    del optimizer_data # free up the memory
+
+# -----------------------------------------------------------------------------
 # Initialize the DataLoaders for train/val
-base_dir = get_base_dir()
 tokens_dir = os.path.join(base_dir, "tokenized_data")
-train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train", device=device)
+dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
+train_loader = tokenizing_distributed_data_loader_with_state(device_batch_size, max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
 build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="val", device=device)
-x, y = next(train_loader) # kick off load of the very first batch of data
+x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data

 # -----------------------------------------------------------------------------
 # Set up hyperparameter schedulers
@@ -171,15 +194,25 @@ def get_muon_momentum(it):
     momentum = (1 - frac) * 0.85 + frac * 0.95
     return momentum

+# -----------------------------------------------------------------------------
+# Loop state (variables updated by the training loop)
+
+if not resuming:
+    step = 0
+    min_val_bpb = float("inf")
+    smooth_train_loss = 0 # EMA of training loss
+    total_training_time = 0 # total wall-clock time of training
+else:
+    step = meta_data["step"]
+    loop_state = meta_data["loop_state"]
+    min_val_bpb = loop_state["min_val_bpb"]
+    smooth_train_loss = loop_state["smooth_train_loss"]
+    total_training_time = loop_state["total_training_time"]
+
 # -----------------------------------------------------------------------------
 # Training loop
-min_val_bpb = float("inf")
-smooth_train_loss = 0 # EMA of training loss
-ema_beta = 0.9 # EMA decay factor
-total_training_time = 0 # total wall-clock time of training
-# note that we run +1 steps only so that we can eval and save at the end
-for step in range(num_iterations + 1):
-    last_step = step == num_iterations
+while True:
+    last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
     flops_so_far = num_flops_per_token * total_batch_size * step

     # once in a while: evaluate the
val bpb (all ranks participate) @@ -237,25 +270,31 @@ for step in range(num_iterations + 1): print0(tokenizer.decode(sample[0])) model.train() - # save checkpoint at the end of the run (only on master process) - if master_process and last_step: - output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12 - checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname) + # save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step + if last_step or (step > 0 and step != resume_from_step and save_every > 0 and step % save_every == 0): save_checkpoint( checkpoint_dir, step, - orig_model.state_dict(), - [opt.state_dict() for opt in optimizers], # TODO: make sure saving across ranks is done correctly - { + orig_model.state_dict(), # model parameters + [opt.state_dict() for opt in optimizers], # optimizer states + { # metadata saved as json "step": step, "val_bpb": val_bpb, # loss at last step "model_config": model_config_kwargs, "user_config": user_config, # inputs to the training script "device_batch_size": device_batch_size, "max_seq_len": max_seq_len, - } + "dataloader_state_dict": dataloader_state_dict, + "loop_state": { # all loop state (other than step) so that we can resume training + "min_val_bpb": min_val_bpb, + "smooth_train_loss": smooth_train_loss, + "total_training_time": total_training_time, + }, + }, + rank=ddp_rank, ) + # termination conditions (TODO: possibly also add loss explosions etc.) if last_step: break @@ -270,7 +309,7 @@ for step in range(num_iterations + 1): train_loss = loss.detach() # for logging loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here loss.backward() - x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward + x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward # gradient clipping grad_clip_enabled = grad_clip > 0.0 if grad_clip_enabled: @@ -293,6 +332,7 @@ for step in range(num_iterations + 1): # ------------------------------------------------------------------------- # logging + ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA pct_done = 100 * step / num_iterations @@ -319,6 +359,9 @@ for step in range(num_iterations + 1): log_data["train/grad_norm"] = grad_norm wandb_run.log(log_data) + # state update + step += 1 + # print a few more stats print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") print0(f"Total training time: {total_training_time/60:.2f}m") From 7b7fd0fe71cf496304d0b8d4e3571c2fc412356b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Nov 2025 16:07:54 +0000 Subject: [PATCH 30/36] thank you Sophie for your help with nanochat --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 18ea5ce..faee896 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,7 @@ Current LLM policy: disclosure. When submitting a PR, please declare any parts t - Thank you to [HuggingFace](https://huggingface.co/) for fineweb and smoltalk. - Thank you [Lambda](https://lambda.ai/service/gpu-cloud) for the compute used in developing this project. - Thank you to chief LLM whisperer 🧙‍♂️ Alec Radford for advice/guidance. 
+- Thank you to the repo czar Sophie [@svlandeg](https://github.com/svlandeg) for help with managing issues, pull requests and discussions of nanochat. ## Cite From 9a71d1368899b7bfbb8e1fad966b683ec80a5760 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 13 Nov 2025 16:08:30 +0000 Subject: [PATCH 31/36] typo oops --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index faee896..c96ac23 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ Current LLM policy: disclosure. When submitting a PR, please declare any parts t - Thank you to [HuggingFace](https://huggingface.co/) for fineweb and smoltalk. - Thank you [Lambda](https://lambda.ai/service/gpu-cloud) for the compute used in developing this project. - Thank you to chief LLM whisperer 🧙‍♂️ Alec Radford for advice/guidance. -- Thank you to the repo czar Sophie [@svlandeg](https://github.com/svlandeg) for help with managing issues, pull requests and discussions of nanochat. +- Thank you to the repo czar Sofie [@svlandeg](https://github.com/svlandeg) for help with managing issues, pull requests and discussions of nanochat. ## Cite From e5efb4b471cd708a5aa816462e8fce78cb2b4431 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Nov 2025 11:13:42 +0100 Subject: [PATCH 32/36] add test_engine.py to file structure --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 18ea5ce..4b50d69 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ python -m pytest tests/test_rustbpe.py -v -s │ ├── smoltalk.py # Conglomerate dataset of SmolTalk from HF │ └── spellingbee.py # Task teaching model to spell/count letters ├── tests +│ └── test_engine.py │ └── test_rustbpe.py └── uv.lock ``` From a2fb3c83a66dd4199e7aa0fcaddda28e3fe85bbf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Nov 2025 11:20:25 +0100 Subject: [PATCH 33/36] fix typos --- nanochat/loss_eval.py | 4 ++-- scripts/chat_eval.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nanochat/loss_eval.py b/nanochat/loss_eval.py index 6fcbea3..5a556e6 100644 --- a/nanochat/loss_eval.py +++ b/nanochat/loss_eval.py @@ -9,9 +9,9 @@ import torch.distributed as dist def evaluate_bpb(model, batches, steps, token_bytes): """ Instead of the naive 'mean loss', this function returns the bits per byte (bpb), - which is a tokenization vocab size-indepedent metric, meaning you are still comparing + which is a tokenization vocab size-independent metric, meaning you are still comparing apples:apples if you change the vocab size. The way this works is that instead of just - calculating the average loss as usual, you calculate the sum loss, and indepependently + calculating the average loss as usual, you calculate the sum loss, and independently also the sum bytes (of all the target tokens), and divide. This normalizes the loss by the number of bytes that the target tokens represent. diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py index 616411d..cae2f0f 100644 --- a/scripts/chat_eval.py +++ b/scripts/chat_eval.py @@ -1,6 +1,6 @@ """ Evaluate the Chat model. -All the generic code lives here, and all the evlauation-specific +All the generic code lives here, and all the evaluation-specific code lives in nanochat directory and is imported from here. 
Example runs: From c6f5bd67db78982f02d19d86005524819aa410fc Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 14 Nov 2025 12:20:03 +0100 Subject: [PATCH 34/36] revert change of base to sft for quick inline test --- nanochat/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/engine.py b/nanochat/engine.py index da85085..295d889 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -332,7 +332,7 @@ if __name__ == "__main__": autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() # load the model and tokenizer - model, tokenizer, meta = load_model("sft", device, phase="eval") + model, tokenizer, meta = load_model("base", device, phase="eval") bos_token_id = tokenizer.get_bos_token_id() # common hyperparameters kwargs = dict(max_tokens=64, temperature=0.0) From bc1fca39f33074fec4319ef46d96e09b8024c824 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 15 Nov 2025 15:43:37 +0000 Subject: [PATCH 35/36] mqa -> gqa to reduce confusion --- nanochat/gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index b640f1e..8b220c3 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -8,7 +8,7 @@ Notable features: - norm after token embedding - no learnable params in rmsnorm - no bias in linear layers -- Multi-Query Attention (MQA) support for more efficient inference +- Group-Query Attention (GQA) support for more efficient inference """ import math @@ -29,7 +29,7 @@ class GPTConfig: vocab_size: int = 50304 n_layer: int = 12 n_head: int = 6 # number of query heads - n_kv_head: int = 6 # number of key/value heads (MQA) + n_kv_head: int = 6 # number of key/value heads (GQA) n_embd: int = 768 From 11e68bf4427aef8748a8c0c3978c9c03838a9466 Mon Sep 17 00:00:00 2001 From: Sam Abrahams Date: Mon, 17 Nov 2025 11:32:56 -0500 Subject: [PATCH 36/36] Fix comment: rotary embeddings final dimension size --- nanochat/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 8b220c3..216343c 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -244,7 +244,7 @@ class GPT(nn.Module): def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'): B, T = idx.size() - # Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim)) + # Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim/2)) assert T <= self.cos.size(1), f"Sequence length grew beyond the rotary embeddings cache: {T} > {self.cos.size(1)}" assert idx.device == self.cos.device, f"Rotary embeddings and idx are on different devices: {idx.device} != {self.cos.device}" assert self.cos.dtype == torch.bfloat16, "Rotary embeddings must be in bfloat16"
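
As a coda to the last two patches, here is a minimal, self-contained sketch (illustrative shapes and a from-scratch rotary helper, not nanochat's actual gpt.py code) of why the cached cos/sin tensors end in head_dim/2 and how GQA lets several query heads share one key/value head:

import torch

# illustrative sizes, not nanochat's defaults
B, T, n_head, n_kv_head, head_dim = 1, 8, 6, 2, 16

# RoPE rotates channel *pairs*, so the cache stores one angle per pair:
# cos/sin have shape (1, T, 1, head_dim/2), matching the fixed comment above.
pos = torch.arange(T, dtype=torch.float32)
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim))
theta = pos[:, None] * inv_freq[None, :]   # (T, head_dim/2)
cos = theta.cos()[None, :, None, :]        # (1, T, 1, head_dim/2)
sin = theta.sin()[None, :, None, :]

def apply_rotary(x):
    x1, x2 = x.chunk(2, dim=-1)            # pair up channels as two halves
    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)

# queries get n_head heads, keys/values only n_kv_head heads (here 6 vs 2)
q = apply_rotary(torch.randn(B, T, n_head, head_dim))
k = apply_rotary(torch.randn(B, T, n_kv_head, head_dim))
v = torch.randn(B, T, n_kv_head, head_dim)

# GQA: each group of n_head // n_kv_head query heads attends to one shared K/V head
y = torch.nn.functional.scaled_dot_product_attention(
    q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
    is_causal=True, enable_gqa=True)       # enable_gqa needs PyTorch >= 2.5
print(y.shape)                             # torch.Size([1, 6, 8, 16])

The pair-wise rotation is where the halving comes from: each (x1, x2) channel pair is rotated by a single angle, so only head_dim/2 angles need to be cached per position.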