This commit is contained in:
MrPOS666 2026-03-05 15:25:57 -05:00
parent c7ba252142
commit af9f842b83
6 changed files with 1027 additions and 4 deletions

20
a3.aux Normal file
View File

@ -0,0 +1,20 @@
\relax
\providecommand\hyper@newdestlabel[2]{}
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
\global\let\oldnewlabel\newlabel
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
\AtEndDocument{\ifx\hyper@anchor\@undefined
\let\newlabel\oldnewlabel
\fi}
\fi}
\global\let\hyper@last\relax
\gdef\HyperFirstAtBeginDocument#1{#1}
\providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three literature-based changes relative to the Feb 2026 nanochat architecture. Changes 1--2 were evaluated in Part 2.}}{3}{table.1}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Part 2 ablations (BPB lower is better).}}{3}{table.2}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Context extension results.}}{4}{table.3}\protected@file@percent }
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Final model vs pico baseline (BPB lower is better).}}{4}{table.4}\protected@file@percent }
\gdef \@abspage@last{5}

818
a3.log Normal file
View File

@ -0,0 +1,818 @@
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2024.1.1) 5 MAR 2026 14:12
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
**a3.tex
(./a3.tex
LaTeX2e <2022-11-01> patch level 1
L3 programming layer <2023-02-22>
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/article.cls
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/size11.clo
File: size11.clo 2022/07/02 v1.4n Standard LaTeX file (size option)
)
\c@part=\count185
\c@section=\count186
\c@subsection=\count187
\c@subsubsection=\count188
\c@paragraph=\count189
\c@subparagraph=\count190
\c@figure=\count191
\c@table=\count192
\abovecaptionskip=\skip48
\belowcaptionskip=\skip49
\bibindent=\dimen140
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
Package: geometry 2020/01/02 v5.9 Page Geometry
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty
Package: keyval 2022/05/29 v1.15 key=value parser (DPC)
\KV@toks@=\toks16
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty
Package: iftex 2022/02/03 v1.0f TeX engine tests
))
\Gm@cnth=\count193
\Gm@cntv=\count194
\c@Gm@tempcnt=\count195
\Gm@bindingoffset=\dimen141
\Gm@wd@mp=\dimen142
\Gm@odd@mp=\dimen143
\Gm@even@mp=\dimen144
\Gm@layoutwidth=\dimen145
\Gm@layoutheight=\dimen146
\Gm@layouthoffset=\dimen147
\Gm@layoutvoffset=\dimen148
\Gm@dimlist=\toks17
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/booktabs/booktabs.sty
Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
\heavyrulewidth=\dimen149
\lightrulewidth=\dimen150
\cmidrulewidth=\dimen151
\belowrulesep=\dimen152
\belowbottomsep=\dimen153
\aboverulesep=\dimen154
\abovetopsep=\dimen155
\cmidrulesep=\dimen156
\cmidrulekern=\dimen157
\defaultaddspace=\dimen158
\@cmidla=\count196
\@cmidlb=\count197
\@aboverulesep=\dimen159
\@belowrulesep=\dimen160
\@thisruleclass=\count198
\@lastruleclass=\count199
\@thisrulewidth=\dimen161
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
Package: amsmath 2022/04/08 v2.17n AMS math features
\@mathmargin=\skip50
For additional information on amsmath, use the `?' option.
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
Package: amstext 2021/08/26 v2.01 AMS text
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty
File: amsgen.sty 1999/11/30 v2.0 generic functions
\@emptytoks=\toks18
\ex@=\dimen162
))
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
\pmbraise@=\dimen163
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty
Package: amsopn 2022/04/08 v2.04 operator names
)
\inf@bad=\count266
LaTeX Info: Redefining \frac on input line 234.
\uproot@=\count267
\leftroot@=\count268
LaTeX Info: Redefining \overline on input line 399.
LaTeX Info: Redefining \colon on input line 410.
\classnum@=\count269
\DOTSCASE@=\count270
LaTeX Info: Redefining \ldots on input line 496.
LaTeX Info: Redefining \dots on input line 499.
LaTeX Info: Redefining \cdots on input line 620.
\Mathstrutbox@=\box51
\strutbox@=\box52
LaTeX Info: Redefining \big on input line 722.
LaTeX Info: Redefining \Big on input line 723.
LaTeX Info: Redefining \bigg on input line 724.
LaTeX Info: Redefining \Bigg on input line 725.
\big@size=\dimen164
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
\macc@depth=\count271
LaTeX Info: Redefining \bmod on input line 905.
LaTeX Info: Redefining \pmod on input line 910.
LaTeX Info: Redefining \smash on input line 940.
LaTeX Info: Redefining \relbar on input line 970.
LaTeX Info: Redefining \Relbar on input line 971.
\c@MaxMatrixCols=\count272
\dotsspace@=\muskip16
\c@parentequation=\count273
\dspbrk@lvl=\count274
\tag@help=\toks19
\row@=\count275
\column@=\count276
\maxfields@=\count277
\andhelp@=\toks20
\eqnshift@=\dimen165
\alignsep@=\dimen166
\tagshift@=\dimen167
\tagwidth@=\dimen168
\totwidth@=\dimen169
\lineht@=\dimen170
\@envbody=\toks21
\multlinegap=\skip51
\multlinetaggap=\skip52
\mathdisplay@stack=\toks22
LaTeX Info: Redefining \[ on input line 2953.
LaTeX Info: Redefining \] on input line 2954.
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/graphicx.sty
Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/graphics.sty
Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR)
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/trig.sty
Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
)
Package graphics Info: Driver file: pdftex.def on input line 107.
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-def/pdftex.def
File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex
))
\Gin@req@height=\dimen171
\Gin@req@width=\dimen172
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.te
x
\pgfutil@everybye=\toks23
\pgfutil@tempdima=\dimen173
\pgfutil@tempdimb=\dimen174
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def
\pgfutil@abb=\box53
) (/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/pgf.revision.tex)
Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10)
))
Package: pgf 2023-01-15 v3.1.10 (3.1.10)
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex
Package: pgfsys 2023-01-15 v3.1.10 (3.1.10)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex
\pgfkeys@pathtoks=\toks24
\pgfkeys@temptoks=\toks25
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfil
tered.code.tex
\pgfkeys@tmptoks=\toks26
))
\pgf@x=\dimen175
\pgf@y=\dimen176
\pgf@xa=\dimen177
\pgf@ya=\dimen178
\pgf@xb=\dimen179
\pgf@yb=\dimen180
\pgf@xc=\dimen181
\pgf@yc=\dimen182
\pgf@xd=\dimen183
\pgf@yd=\dimen184
\w@pgf@writea=\write3
\r@pgf@reada=\read2
\c@pgf@counta=\count278
\c@pgf@countb=\count279
\c@pgf@countc=\count280
\c@pgf@countd=\count281
\t@pgf@toka=\toks27
\t@pgf@tokb=\toks28
\t@pgf@tokc=\toks29
\pgf@sys@id@count=\count282
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg
File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10)
)
Driver file for pgf: pgfsys-pdftex.def
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.d
ef
File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-p
df.def
File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10)
)))
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.
code.tex
File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfsyssoftpath@smallbuffer@items=\count283
\pgfsyssoftpath@bigbuffer@items=\count284
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.
code.tex
File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10)
)) (/usr/local/texlive/2023/texmf-dist/tex/latex/xcolor/xcolor.sty
Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK)
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-cfg/color.cfg
File: color.cfg 2016/01/02 v1.6 sample color configuration
)
Package xcolor Info: Driver file: pdftex.def on input line 227.
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/mathcolor.ltx)
Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353.
Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1357.
Package xcolor Info: Model `RGB' extended on input line 1369.
Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371.
Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372.
Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373.
Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374.
Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375.
Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376.
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex
Package: pgfcore 2023-01-15 v3.1.10 (3.1.10)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex
\pgfmath@dimen=\dimen185
\pgfmath@count=\count285
\pgfmath@box=\box54
\pgfmath@toks=\toks30
\pgfmath@stack@operand=\toks31
\pgfmath@stack@operation=\toks32
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.
tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic
.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigo
nometric.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.rando
m.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.compa
rison.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.
code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round
.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.
code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integ
erarithmetics.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex
\c@pgfmathroundto@lastzeros=\count286
)) (/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfint.code.tex)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.co
de.tex
File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@picminx=\dimen186
\pgf@picmaxx=\dimen187
\pgf@picminy=\dimen188
\pgf@picmaxy=\dimen189
\pgf@pathminx=\dimen190
\pgf@pathmaxx=\dimen191
\pgf@pathminy=\dimen192
\pgf@pathmaxy=\dimen193
\pgf@xx=\dimen194
\pgf@xy=\dimen195
\pgf@yx=\dimen196
\pgf@yy=\dimen197
\pgf@zx=\dimen198
\pgf@zy=\dimen199
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconst
ruct.code.tex
File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@path@lastx=\dimen256
\pgf@path@lasty=\dimen257
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage
.code.tex
File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@shorten@end@additional=\dimen258
\pgf@shorten@start@additional=\dimen259
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.co
de.tex
File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfpic=\box55
\pgf@hbox=\box56
\pgf@layerbox@main=\box57
\pgf@picture@serial@count=\count287
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicst
ate.code.tex
File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgflinewidth=\dimen260
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransform
ations.code.tex
File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@pt@x=\dimen261
\pgf@pt@y=\dimen262
\pgf@pt@temp=\dimen263
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.cod
e.tex
File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.c
ode.tex
File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathproce
ssing.code.tex
File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.co
de.tex
File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfarrowsep=\dimen264
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.cod
e.tex
File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@max=\dimen265
\pgf@sys@shading@range@num=\count288
\pgf@shadingcount=\count289
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.cod
e.tex
File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.
code.tex
File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfexternal@startupbox=\box58
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.co
de.tex
File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretranspare
ncy.code.tex
File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.
code.tex
File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.
tex
File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10)
)))
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.cod
e.tex
File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfnodeparttextbox=\box59
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.
tex
File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version
-0-65.sty
Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10)
\pgf@nodesepstart=\dimen266
\pgf@nodesepend=\dimen267
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version
-1-18.sty
Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10)
))
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgffor.sty
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)
) (/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/math/pgfmath.sty
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex))
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex
Package: pgffor 2023-01-15 v3.1.10 (3.1.10)
\pgffor@iter=\dimen268
\pgffor@skip=\dimen269
\pgffor@stack=\toks33
\pgffor@toks=\toks34
))
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.cod
e.tex
Package: tikz 2023-01-15 v3.1.10 (3.1.10)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothan
dlers.code.tex
File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgf@plot@mark@count=\count290
\pgfplotmarksize=\dimen270
)
\tikz@lastx=\dimen271
\tikz@lasty=\dimen272
\tikz@lastxsaved=\dimen273
\tikz@lastysaved=\dimen274
\tikz@lastmovetox=\dimen275
\tikz@lastmovetoy=\dimen276
\tikzleveldistance=\dimen277
\tikzsiblingdistance=\dimen278
\tikz@figbox=\box60
\tikz@figbox@bg=\box61
\tikz@tempbox=\box62
\tikz@tempbox@bg=\box63
\tikztreelevel=\count291
\tikznumberofchildren=\count292
\tikznumberofcurrentchild=\count293
\tikz@fig@count=\count294
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.cod
e.tex
File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10)
\pgfmatrixcurrentrow=\count295
\pgfmatrixcurrentcolumn=\count296
\pgf@matrix@numberofcolumns=\count297
)
\tikz@expandcount=\count298
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/librarie
s/tikzlibrarytopaths.code.tex
File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10)
)))
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/librarie
s/tikzlibrarypositioning.code.tex
File: tikzlibrarypositioning.code.tex 2023-01-15 v3.1.10 (3.1.10)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/hyperref.sty
Package: hyperref 2023-02-07 v7.00v Hypertext links for LaTeX
(/usr/local/texlive/2023/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/infwarerr/infwarerr.sty
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
)
Package pdftexcmds Info: \pdf@primitive is available.
Package pdftexcmds Info: \pdf@ifprimitive is available.
Package pdftexcmds Info: \pdfdraftmode found.
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty
Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/pdfescape/pdfescape.sty
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/hycolor/hycolor.sty
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/auxhook/auxhook.sty
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/nameref.sty
Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section
(/usr/local/texlive/2023/texmf-dist/tex/latex/refcount/refcount.sty
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/gettitlestring/gettitlestring.s
ty
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
(/usr/local/texlive/2023/texmf-dist/tex/latex/kvoptions/kvoptions.sty
Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO)
))
\c@section@level=\count299
)
\@linkdim=\dimen279
\Hy@linkcounter=\count300
\Hy@pagecounter=\count301
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/pd1enc.def
File: pd1enc.def 2023-02-07 v7.00v Hyperref: PDFDocEncoding definition (HO)
Now handling font encoding PD1 ...
... no UTF-8 mapping file for font encoding PD1
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/intcalc/intcalc.sty
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
)
(/usr/local/texlive/2023/texmf-dist/tex/generic/etexcmds/etexcmds.sty
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
)
\Hy@SavedSpaceFactor=\count302
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/puenc.def
File: puenc.def 2023-02-07 v7.00v Hyperref: PDF Unicode definition (HO)
Now handling font encoding PU ...
... no UTF-8 mapping file for font encoding PU
)
Package hyperref Info: Hyper figures OFF on input line 4177.
Package hyperref Info: Link nesting OFF on input line 4182.
Package hyperref Info: Hyper index ON on input line 4185.
Package hyperref Info: Plain pages OFF on input line 4192.
Package hyperref Info: Backreferencing OFF on input line 4197.
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
Package hyperref Info: Bookmarks ON on input line 4425.
\c@Hy@tempcnt=\count303
(/usr/local/texlive/2023/texmf-dist/tex/latex/url/url.sty
\Urlmuskip=\muskip17
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
)
LaTeX Info: Redefining \url on input line 4763.
\XeTeXLinkMargin=\dimen280
(/usr/local/texlive/2023/texmf-dist/tex/generic/bitset/bitset.sty
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
(/usr/local/texlive/2023/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO
)
))
\Fld@menulength=\count304
\Field@Width=\dimen281
\Fld@charsize=\dimen282
Package hyperref Info: Hyper figures OFF on input line 6042.
Package hyperref Info: Link nesting OFF on input line 6047.
Package hyperref Info: Hyper index ON on input line 6050.
Package hyperref Info: backreferencing OFF on input line 6057.
Package hyperref Info: Link coloring OFF on input line 6062.
Package hyperref Info: Link coloring with OCG OFF on input line 6067.
Package hyperref Info: PDF/A mode OFF on input line 6072.
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/atbegshi-ltx.sty
Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi
package with kernel methods
)
\Hy@abspage=\count305
\c@Item=\count306
\c@Hfootnote=\count307
)
Package hyperref Info: Driver (autodetected): hpdftex.
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/hpdftex.def
File: hpdftex.def 2023-02-07 v7.00v Hyperref driver for pdfTeX
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/atveryend-ltx.sty
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend pac
kage
with kernel methods
)
\Fld@listcount=\count308
\c@bookmark@seq@number=\count309
(/usr/local/texlive/2023/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO)
(/usr/local/texlive/2023/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
)
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2
85.
)
\Hy@SectionHShift=\skip53
) (/usr/local/texlive/2023/texmf-dist/tex/latex/enumitem/enumitem.sty
Package: enumitem 2019/06/20 v3.9 Customized lists
\labelindent=\skip54
\enit@outerparindent=\dimen283
\enit@toks=\toks35
\enit@inbox=\box64
\enit@count@id=\count310
\enitdp@description=\count311
)
(/usr/local/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def
File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX)
\l__color_backend_stack_int=\count312
\l__pdf_internal_box=\box65
)
(./a3.aux)
\openout1 = `a3.aux'.
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 21.
LaTeX Font Info: ... okay on input line 21.
*geometry* driver: auto-detecting
*geometry* detected driver: pdftex
*geometry* verbose mode - [ preamble ] result:
* driver: pdftex
* paper: <default>
* layout: <same size as paper>
* layoutoffset:(h,v)=(0.0pt,0.0pt)
* modes:
* h-part:(L,W,R)=(72.26999pt, 469.75502pt, 72.26999pt)
* v-part:(T,H,B)=(72.26999pt, 650.43001pt, 72.26999pt)
* \paperwidth=614.295pt
* \paperheight=794.96999pt
* \textwidth=469.75502pt
* \textheight=650.43001pt
* \oddsidemargin=0.0pt
* \evensidemargin=0.0pt
* \topmargin=-37.0pt
* \headheight=12.0pt
* \headsep=25.0pt
* \topskip=11.0pt
* \footskip=30.0pt
* \marginparwidth=59.0pt
* \marginparsep=10.0pt
* \columnsep=10.0pt
* \skip\footins=10.0pt plus 4.0pt minus 2.0pt
* \hoffset=0.0pt
* \voffset=0.0pt
* \mag=1000
* \@twocolumnfalse
* \@twosidefalse
* \@mparswitchfalse
* \@reversemarginfalse
* (1in=72.27pt=25.4mm, 1cm=28.453pt)
(/usr/local/texlive/2023/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
[Loading MPS to PDF converter (version 2006.09.02).]
\scratchcounter=\count313
\scratchdimen=\dimen284
\scratchbox=\box66
\nofMPsegments=\count314
\nofMParguments=\count315
\everyMPshowfont=\toks36
\MPscratchCnt=\count316
\MPscratchDim=\dimen285
\MPnumerator=\count317
\makeMPintoPDFobject=\count318
\everyMPtoPDFconversion=\toks37
) (/usr/local/texlive/2023/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty
Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf
Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4
85.
(/usr/local/texlive/2023/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg
File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv
e
))
Package hyperref Info: Link coloring OFF on input line 21.
(./a3.out) (./a3.out)
\@outlinefile=\write4
\openout4 = `a3.out'.
[1
{/usr/local/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}]
Underfull \hbox (badness 10000) in paragraph at lines 66--67
[]\OT1/cmr/m/n/10.95 Often im-
[]
Underfull \hbox (badness 10000) in paragraph at lines 66--67
\OT1/cmr/m/n/10.95 proves con-ver-
[]
Underfull \hbox (badness 10000) in paragraph at lines 66--67
\OT1/cmr/m/n/10.95 gence/accuracy
[]
Underfull \hbox (badness 10000) in paragraph at lines 66--67
\OT1/cmr/m/n/10.95 but can in-crease
[]
Underfull \hbox (badness 10000) in paragraph at lines 68--68
[]\OT1/cmr/m/n/10.95 Standard Trans-former
[]
Underfull \hbox (badness 10000) in paragraph at lines 68--68
[]\OT1/cmr/m/n/10.95 Remove \OT1/cmtt/m/n/10.95 softcap *
[]
Underfull \hbox (badness 10000) in paragraph at lines 68--68
\OT1/cmtt/m/n/10.95 tanh(logits/softcap)
[]
Underfull \hbox (badness 3271) in paragraph at lines 68--69
\OT1/cmr/m/n/10.95 i-bra-tion or learn-
[]
Underfull \hbox (badness 2359) in paragraph at lines 70--70
[]\OT1/cmr/m/n/10.95 Gated lin-ear units im-
[]
Underfull \hbox (badness 7504) in paragraph at lines 70--70
\OT1/cmr/m/n/10.95 prove ex-pres-siv-ity and
[]
Underfull \hbox (badness 4378) in paragraph at lines 70--70
[]\OT1/cmr/m/n/10.95 Replace 2-layer MLP with
[]
Underfull \hbox (badness 4266) in paragraph at lines 70--70
\OT1/cmr/m/n/10.95 SwiGLU (three pro-jec-tions,
[]
Underfull \hbox (badness 3525) in paragraph at lines 70--71
\OT1/cmr/m/n/10.95 plex-ity at sim-i-lar
[]
Overfull \hbox (30.64403pt too wide) in paragraph at lines 62--73
[][]
[]
LaTeX Warning: `h' float specifier changed to `ht'.
[2] [3] [4]
Overfull \hbox (61.77394pt too wide) in paragraph at lines 171--172
[]\OT1/cmr/m/n/10.95 Part 2 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
val --eval bpb --model-tag <model> --device-batch-size
[]
Overfull \hbox (27.28181pt too wide) in paragraph at lines 172--173
[]\OT1/cmr/m/n/10.95 Part 3 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
val --eval bpb --model-tag pico-d8-ctx512 --step
[]
Overfull \hbox (15.78444pt too wide) in paragraph at lines 173--174
[]\OT1/cmr/m/n/10.95 Part 4 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
val --eval bpb --model-tag final-d12-nosoftcap
[]
[5{/usr/local/texlive/2023/texmf-dist/fonts/enc/dvips/cm-super/cm-super-ts1.enc
}] (./a3.aux)
Package rerunfilecheck Info: File `a3.out' has not changed.
(rerunfilecheck) Checksum: D41D8CD98F00B204E9800998ECF8427E;0.
)
Here is how much of TeX's memory you used:
20351 strings out of 476025
383151 string characters out of 5790018
1862388 words of memory out of 5000000
40442 multiletter control sequences out of 15000+600000
518865 words of font info for 55 fonts, out of 8000000 for 9000
1141 hyphenation exceptions out of 8191
84i,11n,89p,555b,771s stack positions out of 10000i,1000n,20000p,200000b,200000s
</usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx10.pfb
></usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx12.pfb>
</usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb><
/usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi8.pfb></u
sr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10.pfb></usr
/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr12.pfb></usr/l
ocal/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb></usr/loc
al/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr6.pfb></usr/local/
texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr8.pfb></usr/local/tex
live/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/local/texl
ive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy8.pfb></usr/local/texliv
e/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmtt10.pfb></usr/local/texlive
/2023/texmf-dist/fonts/type1/public/cm-super/sfrm1095.pfb>
Output written on a3.pdf (5 pages, 174204 bytes).
PDF statistics:
143 PDF objects out of 1000 (max. 8388607)
108 compressed objects within 2 object streams
41 named destinations out of 1000 (max. 500000)
13 words of extra memory for PDF output out of 10000 (max. 10000000)

0
a3.out Normal file
View File

BIN
a3.pdf Normal file

Binary file not shown.

187
a3.tex Normal file
View File

@ -0,0 +1,187 @@
\documentclass[11pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usepackage{hyperref}
\usepackage{enumitem}
\setlist{nosep}
\title{CSC490 A3: Pre-training Nanochat}
\author{
Chris Cao (Student \#1009840460)\\
Yanzhen Chen (Student \#1010317630)\\
Clarina Ong (Student \#1008820180)\\
Martin Zou (Student \#1009992885)
}
\date{March 5, 2026}
\begin{document}
\maketitle
\section*{Part 1: Architecture Review}
\subsection*{Baseline (Oct 13) vs. Feb 2026 Nanochat}
The current nanochat implementation (Feb 2026) is a compact GPT-style decoder stack with several modernized architectural choices. From \texttt{nanochat/gpt.py}, notable features include rotary position embeddings (RoPE), RMSNorm without learnable parameters, QK normalization, untied token embedding and LM head, ReLU$^2$ MLP activation, sliding-window attention, value embeddings with gating, per-layer residual scalars, and GQA support. These changes collectively target stability, efficiency, and scaling behavior.
\subsection*{Model Diagram (Feb 2026 Nanochat)}
\begin{center}
\begin{tikzpicture}[node distance=1.2cm,>=stealth,scale=0.95, every node/.style={scale=0.95}]
\node (tok) [draw, rounded corners] {Token IDs};
\node (wte) [draw, rounded corners, below=of tok] {Token Embedding (wte)};
\node (nrm0) [draw, rounded corners, below=of wte] {RMSNorm};
\node (x0) [draw, rounded corners, below=of nrm0] {$x_0$ residual buffer};
\node (blk) [draw, rounded corners, below=of x0, minimum width=6cm, align=left] {
\textbf{Transformer Block} (repeated $L$ times)\\
\quad RMSNorm $\rightarrow$ Self-Attn (RoPE, QK norm, GQA, window)\\
\quad + Value Embedding (gated)\\
\quad Residual Scalars ($\lambda_\text{resid}$, $\lambda_{x_0}$)\\
\quad RMSNorm $\rightarrow$ MLP (ReLU$^2$)
};
\node (nrm1) [draw, rounded corners, below=of blk] {RMSNorm};
\node (lmh) [draw, rounded corners, below=of nrm1] {LM Head (untied)};
\node (log) [draw, rounded corners, below=of lmh] {Logits};
\draw[->] (tok) -- (wte);
\draw[->] (wte) -- (nrm0);
\draw[->] (nrm0) -- (x0);
\draw[->] (x0) -- (blk);
\draw[->] (blk) -- (nrm1);
\draw[->] (nrm1) -- (lmh);
\draw[->] (lmh) -- (log);
\end{tikzpicture}
\end{center}
\subsection*{Three Literature-Based Changes (Compared to Feb 2026 Nanochat)}
I selected three changes from the literature; two of them were implemented and evaluated in Part 2.
\begin{table}[h]
\centering
\begin{tabular}{p{3.2cm} p{4.3cm} p{5.1cm} p{3.3cm}}
\toprule
\textbf{Change} & \textbf{Motivation} & \textbf{Technical Details} & \textbf{Potential Impact} \\
\midrule
ReLU$^2$ $\rightarrow$ GELU & GELU is a smooth activation shown to improve optimization in Transformer models. & Replace \texttt{F.relu(x).square()} with \texttt{F.gelu(x)} in the MLP. & Often improves convergence/accuracy but can increase compute slightly. \\
\addlinespace
Remove logit softcapping & Standard Transformer training uses raw logits before softmax; softcapping can reduce gradient signal for confident predictions. & Remove \texttt{softcap * tanh(logits/softcap)} and use raw FP32 logits. & Could improve calibration or learning of rare tokens; may risk instability if logits explode. \\
\addlinespace
SwiGLU MLP (not tested) & Gated linear units improve expressivity and scaling behavior in large LMs. & Replace 2-layer MLP with SwiGLU (three projections, gated activation). & Often improves perplexity at similar compute. \\
\bottomrule
\end{tabular}
\caption{Three literature-based changes relative to the Feb 2026 nanochat architecture. Changes 1--2 were evaluated in Part 2.}
\end{table}
\textbf{References:} GELU (Hendrycks \& Gimpel, 2016); standard Transformer logits (Vaswani et al., 2017); SwiGLU/GLU (Shazeer, 2020).
\section*{Part 2: Ablations on Picochat}
\subsection*{Setup}
I used a small pico configuration: depth 8, seq length 2048, vocab size 32768. This scale fits on a single A5000 and provides rapid iteration while preserving architecture structure. Two changes from Part 1 were ablated: GELU and removing logit softcapping. All runs used the same tokenizer and dataset.
\subsection*{Results}
\begin{table}[h]
\centering
\begin{tabular}{l l r r}
\toprule
\textbf{Model} & \textbf{Change} & \textbf{Train BPB} & \textbf{Val BPB} \\
\midrule
pico-d8-baseline-v32768 & baseline (ReLU$^2$ + softcap) & 0.9932 & 0.9996 \\
pico-d8-gelu & ReLU$^2$ $\rightarrow$ GELU & 1.5799 & 2.2303 \\
pico-d8-nosoftcap & remove logit softcap & 0.6310 & 1.4397 \\
\bottomrule
\end{tabular}
\caption{Part 2 ablations (BPB lower is better).}
\end{table}
\subsection*{Commentary}
Both modifications degraded validation BPB relative to the baseline (0.9996). GELU performed worst in this low-budget regime (2.2303). Removing softcap was less harmful (1.4397) but still worse than the baseline; its much lower train BPB (0.6310) indicates a large train--validation gap rather than a genuine improvement. This suggests the current nanochat choices are tuned for this scale, and naive swaps may not transfer well. For larger runs, GELU and no-softcap could behave differently, but the local evidence suggests caution.
\subsection*{Tracking and Cost}
Training was tracked with W\&B in offline mode. GPU cost is not included because a price-per-hour value was not available at runtime. I report total training time from logs for transparency.
\section*{Part 3: Extending the Context Window}
\subsection*{Procedure}
I trained a depth-8 pico model at sequence length 512 on a small subset (2 shards), then resumed training from that checkpoint at sequence length 2048. This mirrors standard practice where shorter context warm-up can stabilize optimization, then longer context extends capability.
\subsection*{Results}
\begin{table}[h]
\centering
\begin{tabular}{l r r r}
\toprule
\textbf{Checkpoint} & \textbf{Seq Len} & \textbf{Train BPB} & \textbf{Val BPB} \\
\midrule
pico-d8-ctx512 step 2000 & 512 & 0.3887 & 1.9778 \\
pico-d8-ctx512 step 3000 & 2048 & 0.7741 & 1.5435 \\
\bottomrule
\end{tabular}
\caption{Context extension results.}
\end{table}
\subsection*{Commentary}
Moving to 2048 reduced validation BPB substantially (1.98 $\rightarrow$ 1.54), indicating better generalization and longer-context modeling. The 512 model overfits the short context: low train BPB but poor validation. This supports the idea that longer context provides stronger supervision and improves robustness.
\section*{Part 4: Final Nanochat}
\subsection*{Final Configuration and Justification}
The final model uses depth 12, seq length 2048, vocab size 32768, and the no-softcap change (ReLU$^2$ retained). This is a minimal, controlled modification with manageable compute on an A5000 while still representing a larger ``nanochat''-scale model.
\subsection*{Training on Full Dataset}
The full FineWeb-Edu 100B shuffled dataset was used (1823 shards, 160 GB on disk).
\subsection*{Results}
\begin{table}[h]
\centering
\begin{tabular}{l r r r r}
\toprule
\textbf{Model} & \textbf{Depth} & \textbf{Params} & \textbf{Train BPB} & \textbf{Val BPB} \\
\midrule
pico-d8-baseline-v32768 & 8 & 125{,}829{,}648 & 0.9932 & 0.9996 \\
final-d12-nosoftcap & 12 & 286{,}262{,}424 & 0.9155 & 0.9195 \\
\bottomrule
\end{tabular}
\caption{Final model vs pico baseline (BPB lower is better).}
\end{table}
\subsection*{Scaling Law Estimate}
Using $L = k N^{-\alpha}$ with validation BPB as loss $L$ and parameter count $N$, the observed scaling exponent is:
\[
\alpha = \frac{\ln(L_\text{pico} / L_\text{nano})}{\ln(N_\text{nano} / N_\text{pico})} \approx 0.102
\]
Because $\alpha$ is fit to exactly two points, the predicted loss at nano scale matches the observed value by construction. The small exponent reflects limited scale and training horizon.
\subsection*{Emergent Ability Questions (Nano $>$ Pico)}
\begin{enumerate}
\item Summarize a two-paragraph article into three bullet points and extract two dates.
\item Solve a multi-step travel-time word problem with unit conversion.
\item Explain the output of a short Python function for a given input.
\item Compare two short passages and list three differences.
\item Write an 80-word polite email declining an invitation.
\item Compute mean and median of a list of numbers.
\item Solve a two-step arithmetic word problem.
\item Translate a short English paragraph into Chinese.
\item Follow a hidden instruction embedded at the top of a long prompt.
\item Explain supervised vs self-supervised learning in three sentences.
\end{enumerate}
\section*{Appendix: Commands Used (Summary)}
\begin{itemize}
\item Part 2 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag <model> --device-batch-size 8 --split-tokens 524288}
\item Part 3 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag pico-d8-ctx512 --step 2000/3000 ...}
\item Part 4 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag final-d12-nosoftcap ...}
\end{itemize}
\section*{References}
\begin{itemize}
\item Devlin et al., 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.
\item Warner et al., 2025. ModernBERT.
\item Hendrycks \& Gimpel, 2016. Gaussian Error Linear Units (GELUs).
\item Vaswani et al., 2017. Attention Is All You Need.
\item Shazeer, 2020. GLU Variants Improve Transformer (SwiGLU).
\item Su et al., 2021. RoFormer: Enhanced Transformer with Rotary Position Embedding (RoPE).
\item Zhang \& Sennrich, 2019. Root Mean Square Layer Normalization (RMSNorm).
\end{itemize}
\end{document}

View File

@ -126,7 +126,7 @@ class MLP(nn.Module):
def forward(self, x):
x = self.c_fc(x)
x = F.relu(x).square()
x = F.gelu(x)
x = self.c_proj(x)
return x
@ -407,11 +407,9 @@ class GPT(nn.Module):
x = norm(x)
# Forward the lm_head (compute logits)
softcap = 15 # smoothly cap the logits to the range [-softcap, softcap]
logits = self.lm_head(x) # (B, T, padded_vocab_size) <- very big tensor, large amount of memory
logits = logits[..., :self.config.vocab_size] # slice to remove padding
logits = logits.float() # switch to fp32 for logit softcap and loss computation
logits = softcap * torch.tanh(logits / softcap) # squash the logits
logits = logits.float() # switch to fp32 for loss computation
if targets is not None:
# training: given the targets, compute and return the loss