mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-19 11:23:19 +00:00
A3
This commit is contained in:
parent
c7ba252142
commit
af9f842b83
20
a3.aux
Normal file
20
a3.aux
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
\relax
|
||||
\providecommand\hyper@newdestlabel[2]{}
|
||||
\providecommand\HyperFirstAtBeginDocument{\AtBeginDocument}
|
||||
\HyperFirstAtBeginDocument{\ifx\hyper@anchor\@undefined
|
||||
\global\let\oldnewlabel\newlabel
|
||||
\gdef\newlabel#1#2{\newlabelxx{#1}#2}
|
||||
\gdef\newlabelxx#1#2#3#4#5#6{\oldnewlabel{#1}{{#2}{#3}}}
|
||||
\AtEndDocument{\ifx\hyper@anchor\@undefined
|
||||
\let\newlabel\oldnewlabel
|
||||
\fi}
|
||||
\fi}
|
||||
\global\let\hyper@last\relax
|
||||
\gdef\HyperFirstAtBeginDocument#1{#1}
|
||||
\providecommand\HyField@AuxAddToFields[1]{}
|
||||
\providecommand\HyField@AuxAddToCoFields[2]{}
|
||||
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three literature-based changes relative to the Feb 2026 nanochat architecture. Changes 1--2 were evaluated in Part 2.}}{3}{table.1}\protected@file@percent }
|
||||
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces Part 2 ablations (BPB lower is better).}}{3}{table.2}\protected@file@percent }
|
||||
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Context extension results.}}{4}{table.3}\protected@file@percent }
|
||||
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Final model vs pico baseline (BPB lower is better).}}{4}{table.4}\protected@file@percent }
|
||||
\gdef \@abspage@last{5}
|
||||
818
a3.log
Normal file
818
a3.log
Normal file
|
|
@ -0,0 +1,818 @@
|
|||
This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) (preloaded format=pdflatex 2024.1.1) 5 MAR 2026 14:12
|
||||
entering extended mode
|
||||
restricted \write18 enabled.
|
||||
%&-line parsing enabled.
|
||||
**a3.tex
|
||||
(./a3.tex
|
||||
LaTeX2e <2022-11-01> patch level 1
|
||||
L3 programming layer <2023-02-22>
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/article.cls
|
||||
Document Class: article 2022/07/02 v1.4n Standard LaTeX document class
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/size11.clo
|
||||
File: size11.clo 2022/07/02 v1.4n Standard LaTeX file (size option)
|
||||
)
|
||||
\c@part=\count185
|
||||
\c@section=\count186
|
||||
\c@subsection=\count187
|
||||
\c@subsubsection=\count188
|
||||
\c@paragraph=\count189
|
||||
\c@subparagraph=\count190
|
||||
\c@figure=\count191
|
||||
\c@table=\count192
|
||||
\abovecaptionskip=\skip48
|
||||
\belowcaptionskip=\skip49
|
||||
\bibindent=\dimen140
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/geometry/geometry.sty
|
||||
Package: geometry 2020/01/02 v5.9 Page Geometry
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/keyval.sty
|
||||
Package: keyval 2022/05/29 v1.15 key=value parser (DPC)
|
||||
\KV@toks@=\toks16
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/ifvtex.sty
|
||||
Package: ifvtex 2019/10/25 v1.7 ifvtex legacy package. Use iftex instead.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/iftex/iftex.sty
|
||||
Package: iftex 2022/02/03 v1.0f TeX engine tests
|
||||
))
|
||||
\Gm@cnth=\count193
|
||||
\Gm@cntv=\count194
|
||||
\c@Gm@tempcnt=\count195
|
||||
\Gm@bindingoffset=\dimen141
|
||||
\Gm@wd@mp=\dimen142
|
||||
\Gm@odd@mp=\dimen143
|
||||
\Gm@even@mp=\dimen144
|
||||
\Gm@layoutwidth=\dimen145
|
||||
\Gm@layoutheight=\dimen146
|
||||
\Gm@layouthoffset=\dimen147
|
||||
\Gm@layoutvoffset=\dimen148
|
||||
\Gm@dimlist=\toks17
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/booktabs/booktabs.sty
|
||||
Package: booktabs 2020/01/12 v1.61803398 Publication quality tables
|
||||
\heavyrulewidth=\dimen149
|
||||
\lightrulewidth=\dimen150
|
||||
\cmidrulewidth=\dimen151
|
||||
\belowrulesep=\dimen152
|
||||
\belowbottomsep=\dimen153
|
||||
\aboverulesep=\dimen154
|
||||
\abovetopsep=\dimen155
|
||||
\cmidrulesep=\dimen156
|
||||
\cmidrulekern=\dimen157
|
||||
\defaultaddspace=\dimen158
|
||||
\@cmidla=\count196
|
||||
\@cmidlb=\count197
|
||||
\@aboverulesep=\dimen159
|
||||
\@belowrulesep=\dimen160
|
||||
\@thisruleclass=\count198
|
||||
\@lastruleclass=\count199
|
||||
\@thisrulewidth=\dimen161
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsmath.sty
|
||||
Package: amsmath 2022/04/08 v2.17n AMS math features
|
||||
\@mathmargin=\skip50
|
||||
|
||||
For additional information on amsmath, use the `?' option.
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amstext.sty
|
||||
Package: amstext 2021/08/26 v2.01 AMS text
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsgen.sty
|
||||
File: amsgen.sty 1999/11/30 v2.0 generic functions
|
||||
\@emptytoks=\toks18
|
||||
\ex@=\dimen162
|
||||
))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsbsy.sty
|
||||
Package: amsbsy 1999/11/29 v1.2d Bold Symbols
|
||||
\pmbraise@=\dimen163
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/amsmath/amsopn.sty
|
||||
Package: amsopn 2022/04/08 v2.04 operator names
|
||||
)
|
||||
\inf@bad=\count266
|
||||
LaTeX Info: Redefining \frac on input line 234.
|
||||
\uproot@=\count267
|
||||
\leftroot@=\count268
|
||||
LaTeX Info: Redefining \overline on input line 399.
|
||||
LaTeX Info: Redefining \colon on input line 410.
|
||||
\classnum@=\count269
|
||||
\DOTSCASE@=\count270
|
||||
LaTeX Info: Redefining \ldots on input line 496.
|
||||
LaTeX Info: Redefining \dots on input line 499.
|
||||
LaTeX Info: Redefining \cdots on input line 620.
|
||||
\Mathstrutbox@=\box51
|
||||
\strutbox@=\box52
|
||||
LaTeX Info: Redefining \big on input line 722.
|
||||
LaTeX Info: Redefining \Big on input line 723.
|
||||
LaTeX Info: Redefining \bigg on input line 724.
|
||||
LaTeX Info: Redefining \Bigg on input line 725.
|
||||
\big@size=\dimen164
|
||||
LaTeX Font Info: Redeclaring font encoding OML on input line 743.
|
||||
LaTeX Font Info: Redeclaring font encoding OMS on input line 744.
|
||||
\macc@depth=\count271
|
||||
LaTeX Info: Redefining \bmod on input line 905.
|
||||
LaTeX Info: Redefining \pmod on input line 910.
|
||||
LaTeX Info: Redefining \smash on input line 940.
|
||||
LaTeX Info: Redefining \relbar on input line 970.
|
||||
LaTeX Info: Redefining \Relbar on input line 971.
|
||||
\c@MaxMatrixCols=\count272
|
||||
\dotsspace@=\muskip16
|
||||
\c@parentequation=\count273
|
||||
\dspbrk@lvl=\count274
|
||||
\tag@help=\toks19
|
||||
\row@=\count275
|
||||
\column@=\count276
|
||||
\maxfields@=\count277
|
||||
\andhelp@=\toks20
|
||||
\eqnshift@=\dimen165
|
||||
\alignsep@=\dimen166
|
||||
\tagshift@=\dimen167
|
||||
\tagwidth@=\dimen168
|
||||
\totwidth@=\dimen169
|
||||
\lineht@=\dimen170
|
||||
\@envbody=\toks21
|
||||
\multlinegap=\skip51
|
||||
\multlinetaggap=\skip52
|
||||
\mathdisplay@stack=\toks22
|
||||
LaTeX Info: Redefining \[ on input line 2953.
|
||||
LaTeX Info: Redefining \] on input line 2954.
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/graphicx.sty
|
||||
Package: graphicx 2021/09/16 v1.2d Enhanced LaTeX Graphics (DPC,SPQR)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/graphics.sty
|
||||
Package: graphics 2022/03/10 v1.4e Standard LaTeX Graphics (DPC,SPQR)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/trig.sty
|
||||
Package: trig 2021/08/11 v1.11 sin cos tan (DPC)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-cfg/graphics.cfg
|
||||
File: graphics.cfg 2016/06/04 v1.11 sample graphics configuration
|
||||
)
|
||||
Package graphics Info: Driver file: pdftex.def on input line 107.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-def/pdftex.def
|
||||
File: pdftex.def 2022/09/22 v1.2b Graphics/color driver for pdftex
|
||||
))
|
||||
\Gin@req@height=\dimen171
|
||||
\Gin@req@width=\dimen172
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/frontendlayer/tikz.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgf.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfrcs.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-common.te
|
||||
x
|
||||
\pgfutil@everybye=\toks23
|
||||
\pgfutil@tempdima=\dimen173
|
||||
\pgfutil@tempdimb=\dimen174
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfutil-latex.def
|
||||
\pgfutil@abb=\box53
|
||||
) (/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfrcs.code.tex
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/pgf.revision.tex)
|
||||
Package: pgfrcs 2023-01-15 v3.1.10 (3.1.10)
|
||||
))
|
||||
Package: pgf 2023-01-15 v3.1.10 (3.1.10)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/basiclayer/pgfcore.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/systemlayer/pgfsys.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys.code.tex
|
||||
Package: pgfsys 2023-01-15 v3.1.10 (3.1.10)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex
|
||||
\pgfkeys@pathtoks=\toks24
|
||||
\pgfkeys@temptoks=\toks25
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeyslibraryfil
|
||||
tered.code.tex
|
||||
\pgfkeys@tmptoks=\toks26
|
||||
))
|
||||
\pgf@x=\dimen175
|
||||
\pgf@y=\dimen176
|
||||
\pgf@xa=\dimen177
|
||||
\pgf@ya=\dimen178
|
||||
\pgf@xb=\dimen179
|
||||
\pgf@yb=\dimen180
|
||||
\pgf@xc=\dimen181
|
||||
\pgf@yc=\dimen182
|
||||
\pgf@xd=\dimen183
|
||||
\pgf@yd=\dimen184
|
||||
\w@pgf@writea=\write3
|
||||
\r@pgf@reada=\read2
|
||||
\c@pgf@counta=\count278
|
||||
\c@pgf@countb=\count279
|
||||
\c@pgf@countc=\count280
|
||||
\c@pgf@countd=\count281
|
||||
\t@pgf@toka=\toks27
|
||||
\t@pgf@tokb=\toks28
|
||||
\t@pgf@tokc=\toks29
|
||||
\pgf@sys@id@count=\count282
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgf.cfg
|
||||
File: pgf.cfg 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
Driver file for pgf: pgfsys-pdftex.def
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-pdftex.d
|
||||
ef
|
||||
File: pgfsys-pdftex.def 2023-01-15 v3.1.10 (3.1.10)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsys-common-p
|
||||
df.def
|
||||
File: pgfsys-common-pdf.def 2023-01-15 v3.1.10 (3.1.10)
|
||||
)))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsyssoftpath.
|
||||
code.tex
|
||||
File: pgfsyssoftpath.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfsyssoftpath@smallbuffer@items=\count283
|
||||
\pgfsyssoftpath@bigbuffer@items=\count284
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/systemlayer/pgfsysprotocol.
|
||||
code.tex
|
||||
File: pgfsysprotocol.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)) (/usr/local/texlive/2023/texmf-dist/tex/latex/xcolor/xcolor.sty
|
||||
Package: xcolor 2022/06/12 v2.14 LaTeX color extensions (UK)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics-cfg/color.cfg
|
||||
File: color.cfg 2016/01/02 v1.6 sample color configuration
|
||||
)
|
||||
Package xcolor Info: Driver file: pdftex.def on input line 227.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/graphics/mathcolor.ltx)
|
||||
Package xcolor Info: Model `cmy' substituted by `cmy0' on input line 1353.
|
||||
Package xcolor Info: Model `hsb' substituted by `rgb' on input line 1357.
|
||||
Package xcolor Info: Model `RGB' extended on input line 1369.
|
||||
Package xcolor Info: Model `HTML' substituted by `rgb' on input line 1371.
|
||||
Package xcolor Info: Model `Hsb' substituted by `hsb' on input line 1372.
|
||||
Package xcolor Info: Model `tHsb' substituted by `hsb' on input line 1373.
|
||||
Package xcolor Info: Model `HSB' substituted by `hsb' on input line 1374.
|
||||
Package xcolor Info: Model `Gray' substituted by `gray' on input line 1375.
|
||||
Package xcolor Info: Model `wave' substituted by `hsb' on input line 1376.
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcore.code.tex
|
||||
Package: pgfcore 2023-01-15 v3.1.10 (3.1.10)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathutil.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathparser.code.tex
|
||||
\pgfmath@dimen=\dimen185
|
||||
\pgfmath@count=\count285
|
||||
\pgfmath@box=\box54
|
||||
\pgfmath@toks=\toks30
|
||||
\pgfmath@stack@operand=\toks31
|
||||
\pgfmath@stack@operation=\toks32
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.code.
|
||||
tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.basic
|
||||
.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.trigo
|
||||
nometric.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.rando
|
||||
m.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.compa
|
||||
rison.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.base.
|
||||
code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.round
|
||||
.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.misc.
|
||||
code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfunctions.integ
|
||||
erarithmetics.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathcalc.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmathfloat.code.tex
|
||||
\c@pgfmathroundto@lastzeros=\count286
|
||||
)) (/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfint.code.tex)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepoints.co
|
||||
de.tex
|
||||
File: pgfcorepoints.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@picminx=\dimen186
|
||||
\pgf@picmaxx=\dimen187
|
||||
\pgf@picminy=\dimen188
|
||||
\pgf@picmaxy=\dimen189
|
||||
\pgf@pathminx=\dimen190
|
||||
\pgf@pathmaxx=\dimen191
|
||||
\pgf@pathminy=\dimen192
|
||||
\pgf@pathmaxy=\dimen193
|
||||
\pgf@xx=\dimen194
|
||||
\pgf@xy=\dimen195
|
||||
\pgf@yx=\dimen196
|
||||
\pgf@yy=\dimen197
|
||||
\pgf@zx=\dimen198
|
||||
\pgf@zy=\dimen199
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathconst
|
||||
ruct.code.tex
|
||||
File: pgfcorepathconstruct.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@path@lastx=\dimen256
|
||||
\pgf@path@lasty=\dimen257
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathusage
|
||||
.code.tex
|
||||
File: pgfcorepathusage.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@shorten@end@additional=\dimen258
|
||||
\pgf@shorten@start@additional=\dimen259
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorescopes.co
|
||||
de.tex
|
||||
File: pgfcorescopes.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfpic=\box55
|
||||
\pgf@hbox=\box56
|
||||
\pgf@layerbox@main=\box57
|
||||
\pgf@picture@serial@count=\count287
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoregraphicst
|
||||
ate.code.tex
|
||||
File: pgfcoregraphicstate.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgflinewidth=\dimen260
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretransform
|
||||
ations.code.tex
|
||||
File: pgfcoretransformations.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@pt@x=\dimen261
|
||||
\pgf@pt@y=\dimen262
|
||||
\pgf@pt@temp=\dimen263
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorequick.cod
|
||||
e.tex
|
||||
File: pgfcorequick.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreobjects.c
|
||||
ode.tex
|
||||
File: pgfcoreobjects.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepathproce
|
||||
ssing.code.tex
|
||||
File: pgfcorepathprocessing.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorearrows.co
|
||||
de.tex
|
||||
File: pgfcorearrows.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfarrowsep=\dimen264
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreshade.cod
|
||||
e.tex
|
||||
File: pgfcoreshade.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@max=\dimen265
|
||||
\pgf@sys@shading@range@num=\count288
|
||||
\pgf@shadingcount=\count289
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreimage.cod
|
||||
e.tex
|
||||
File: pgfcoreimage.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoreexternal.
|
||||
code.tex
|
||||
File: pgfcoreexternal.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfexternal@startupbox=\box58
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorelayers.co
|
||||
de.tex
|
||||
File: pgfcorelayers.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcoretranspare
|
||||
ncy.code.tex
|
||||
File: pgfcoretransparency.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorepatterns.
|
||||
code.tex
|
||||
File: pgfcorepatterns.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/basiclayer/pgfcorerdf.code.
|
||||
tex
|
||||
File: pgfcorerdf.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleshapes.cod
|
||||
e.tex
|
||||
File: pgfmoduleshapes.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfnodeparttextbox=\box59
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmoduleplot.code.
|
||||
tex
|
||||
File: pgfmoduleplot.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version
|
||||
-0-65.sty
|
||||
Package: pgfcomp-version-0-65 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@nodesepstart=\dimen266
|
||||
\pgf@nodesepend=\dimen267
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/compatibility/pgfcomp-version
|
||||
-1-18.sty
|
||||
Package: pgfcomp-version-1-18 2023-01-15 v3.1.10 (3.1.10)
|
||||
))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgffor.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/utilities/pgfkeys.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgfkeys.code.tex)
|
||||
) (/usr/local/texlive/2023/texmf-dist/tex/latex/pgf/math/pgfmath.sty
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/math/pgfmath.code.tex))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/utilities/pgffor.code.tex
|
||||
Package: pgffor 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgffor@iter=\dimen268
|
||||
\pgffor@skip=\dimen269
|
||||
\pgffor@stack=\toks33
|
||||
\pgffor@toks=\toks34
|
||||
))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/tikz.cod
|
||||
e.tex
|
||||
Package: tikz 2023-01-15 v3.1.10 (3.1.10)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/libraries/pgflibraryplothan
|
||||
dlers.code.tex
|
||||
File: pgflibraryplothandlers.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgf@plot@mark@count=\count290
|
||||
\pgfplotmarksize=\dimen270
|
||||
)
|
||||
\tikz@lastx=\dimen271
|
||||
\tikz@lasty=\dimen272
|
||||
\tikz@lastxsaved=\dimen273
|
||||
\tikz@lastysaved=\dimen274
|
||||
\tikz@lastmovetox=\dimen275
|
||||
\tikz@lastmovetoy=\dimen276
|
||||
\tikzleveldistance=\dimen277
|
||||
\tikzsiblingdistance=\dimen278
|
||||
\tikz@figbox=\box60
|
||||
\tikz@figbox@bg=\box61
|
||||
\tikz@tempbox=\box62
|
||||
\tikz@tempbox@bg=\box63
|
||||
\tikztreelevel=\count291
|
||||
\tikznumberofchildren=\count292
|
||||
\tikznumberofcurrentchild=\count293
|
||||
\tikz@fig@count=\count294
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/modules/pgfmodulematrix.cod
|
||||
e.tex
|
||||
File: pgfmodulematrix.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
\pgfmatrixcurrentrow=\count295
|
||||
\pgfmatrixcurrentcolumn=\count296
|
||||
\pgf@matrix@numberofcolumns=\count297
|
||||
)
|
||||
\tikz@expandcount=\count298
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/librarie
|
||||
s/tikzlibrarytopaths.code.tex
|
||||
File: tikzlibrarytopaths.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)))
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pgf/frontendlayer/tikz/librarie
|
||||
s/tikzlibrarypositioning.code.tex
|
||||
File: tikzlibrarypositioning.code.tex 2023-01-15 v3.1.10 (3.1.10)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/hyperref.sty
|
||||
Package: hyperref 2023-02-07 v7.00v Hypertext links for LaTeX
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/ltxcmds/ltxcmds.sty
|
||||
Package: ltxcmds 2020-05-10 v1.25 LaTeX kernel commands for general use (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pdftexcmds/pdftexcmds.sty
|
||||
Package: pdftexcmds 2020-06-27 v0.33 Utility functions of pdfTeX for LuaTeX (HO
|
||||
)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/infwarerr/infwarerr.sty
|
||||
Package: infwarerr 2019/12/03 v1.5 Providing info/warning/error messages (HO)
|
||||
)
|
||||
Package pdftexcmds Info: \pdf@primitive is available.
|
||||
Package pdftexcmds Info: \pdf@ifprimitive is available.
|
||||
Package pdftexcmds Info: \pdfdraftmode found.
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/kvsetkeys/kvsetkeys.sty
|
||||
Package: kvsetkeys 2022-10-05 v1.19 Key value parser (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/kvdefinekeys/kvdefinekeys.sty
|
||||
Package: kvdefinekeys 2019-12-19 v1.6 Define keys (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/pdfescape/pdfescape.sty
|
||||
Package: pdfescape 2019/12/09 v1.15 Implements pdfTeX's escape features (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hycolor/hycolor.sty
|
||||
Package: hycolor 2020-01-27 v1.10 Color options for hyperref/bookmark (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/letltxmacro/letltxmacro.sty
|
||||
Package: letltxmacro 2019/12/03 v1.6 Let assignment for LaTeX macros (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/auxhook/auxhook.sty
|
||||
Package: auxhook 2019-12-17 v1.6 Hooks for auxiliary files (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/nameref.sty
|
||||
Package: nameref 2022-05-17 v2.50 Cross-referencing by name of section
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/refcount/refcount.sty
|
||||
Package: refcount 2019/12/15 v3.6 Data extraction from label references (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/gettitlestring/gettitlestring.s
|
||||
ty
|
||||
Package: gettitlestring 2019/12/15 v1.6 Cleanup title references (HO)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/kvoptions/kvoptions.sty
|
||||
Package: kvoptions 2022-06-15 v3.15 Key value format for package options (HO)
|
||||
))
|
||||
\c@section@level=\count299
|
||||
)
|
||||
\@linkdim=\dimen279
|
||||
\Hy@linkcounter=\count300
|
||||
\Hy@pagecounter=\count301
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/pd1enc.def
|
||||
File: pd1enc.def 2023-02-07 v7.00v Hyperref: PDFDocEncoding definition (HO)
|
||||
Now handling font encoding PD1 ...
|
||||
... no UTF-8 mapping file for font encoding PD1
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/intcalc/intcalc.sty
|
||||
Package: intcalc 2019/12/15 v1.3 Expandable calculations with integers (HO)
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/etexcmds/etexcmds.sty
|
||||
Package: etexcmds 2019/12/15 v1.7 Avoid name clashes with e-TeX commands (HO)
|
||||
)
|
||||
\Hy@SavedSpaceFactor=\count302
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/puenc.def
|
||||
File: puenc.def 2023-02-07 v7.00v Hyperref: PDF Unicode definition (HO)
|
||||
Now handling font encoding PU ...
|
||||
... no UTF-8 mapping file for font encoding PU
|
||||
)
|
||||
Package hyperref Info: Hyper figures OFF on input line 4177.
|
||||
Package hyperref Info: Link nesting OFF on input line 4182.
|
||||
Package hyperref Info: Hyper index ON on input line 4185.
|
||||
Package hyperref Info: Plain pages OFF on input line 4192.
|
||||
Package hyperref Info: Backreferencing OFF on input line 4197.
|
||||
Package hyperref Info: Implicit mode ON; LaTeX internals redefined.
|
||||
Package hyperref Info: Bookmarks ON on input line 4425.
|
||||
\c@Hy@tempcnt=\count303
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/url/url.sty
|
||||
\Urlmuskip=\muskip17
|
||||
Package: url 2013/09/16 ver 3.4 Verb mode for urls, etc.
|
||||
)
|
||||
LaTeX Info: Redefining \url on input line 4763.
|
||||
\XeTeXLinkMargin=\dimen280
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/bitset/bitset.sty
|
||||
Package: bitset 2019/12/09 v1.3 Handle bit-vector datatype (HO)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/bigintcalc/bigintcalc.sty
|
||||
Package: bigintcalc 2019/12/15 v1.5 Expandable calculations on big integers (HO
|
||||
)
|
||||
))
|
||||
\Fld@menulength=\count304
|
||||
\Field@Width=\dimen281
|
||||
\Fld@charsize=\dimen282
|
||||
Package hyperref Info: Hyper figures OFF on input line 6042.
|
||||
Package hyperref Info: Link nesting OFF on input line 6047.
|
||||
Package hyperref Info: Hyper index ON on input line 6050.
|
||||
Package hyperref Info: backreferencing OFF on input line 6057.
|
||||
Package hyperref Info: Link coloring OFF on input line 6062.
|
||||
Package hyperref Info: Link coloring with OCG OFF on input line 6067.
|
||||
Package hyperref Info: PDF/A mode OFF on input line 6072.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/atbegshi-ltx.sty
|
||||
Package: atbegshi-ltx 2021/01/10 v1.0c Emulation of the original atbegshi
|
||||
package with kernel methods
|
||||
)
|
||||
\Hy@abspage=\count305
|
||||
\c@Item=\count306
|
||||
\c@Hfootnote=\count307
|
||||
)
|
||||
Package hyperref Info: Driver (autodetected): hpdftex.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/hyperref/hpdftex.def
|
||||
File: hpdftex.def 2023-02-07 v7.00v Hyperref driver for pdfTeX
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/base/atveryend-ltx.sty
|
||||
Package: atveryend-ltx 2020/08/19 v1.0a Emulation of the original atveryend pac
|
||||
kage
|
||||
with kernel methods
|
||||
)
|
||||
\Fld@listcount=\count308
|
||||
\c@bookmark@seq@number=\count309
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/rerunfilecheck/rerunfilecheck.sty
|
||||
Package: rerunfilecheck 2022-07-10 v1.10 Rerun checks for auxiliary files (HO)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/generic/uniquecounter/uniquecounter.sty
|
||||
Package: uniquecounter 2019/12/15 v1.4 Provide unlimited unique counter (HO)
|
||||
)
|
||||
Package uniquecounter Info: New unique counter `rerunfilecheck' on input line 2
|
||||
85.
|
||||
)
|
||||
\Hy@SectionHShift=\skip53
|
||||
) (/usr/local/texlive/2023/texmf-dist/tex/latex/enumitem/enumitem.sty
|
||||
Package: enumitem 2019/06/20 v3.9 Customized lists
|
||||
\labelindent=\skip54
|
||||
\enit@outerparindent=\dimen283
|
||||
\enit@toks=\toks35
|
||||
\enit@inbox=\box64
|
||||
\enit@count@id=\count310
|
||||
\enitdp@description=\count311
|
||||
)
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/l3backend/l3backend-pdftex.def
|
||||
File: l3backend-pdftex.def 2023-01-16 L3 backend support: PDF output (pdfTeX)
|
||||
\l__color_backend_stack_int=\count312
|
||||
\l__pdf_internal_box=\box65
|
||||
)
|
||||
(./a3.aux)
|
||||
\openout1 = `a3.aux'.
|
||||
|
||||
LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for TS1/cmr/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for PD1/pdf/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
LaTeX Font Info: Checking defaults for PU/pdf/m/n on input line 21.
|
||||
LaTeX Font Info: ... okay on input line 21.
|
||||
|
||||
*geometry* driver: auto-detecting
|
||||
*geometry* detected driver: pdftex
|
||||
*geometry* verbose mode - [ preamble ] result:
|
||||
* driver: pdftex
|
||||
* paper: <default>
|
||||
* layout: <same size as paper>
|
||||
* layoutoffset:(h,v)=(0.0pt,0.0pt)
|
||||
* modes:
|
||||
* h-part:(L,W,R)=(72.26999pt, 469.75502pt, 72.26999pt)
|
||||
* v-part:(T,H,B)=(72.26999pt, 650.43001pt, 72.26999pt)
|
||||
* \paperwidth=614.295pt
|
||||
* \paperheight=794.96999pt
|
||||
* \textwidth=469.75502pt
|
||||
* \textheight=650.43001pt
|
||||
* \oddsidemargin=0.0pt
|
||||
* \evensidemargin=0.0pt
|
||||
* \topmargin=-37.0pt
|
||||
* \headheight=12.0pt
|
||||
* \headsep=25.0pt
|
||||
* \topskip=11.0pt
|
||||
* \footskip=30.0pt
|
||||
* \marginparwidth=59.0pt
|
||||
* \marginparsep=10.0pt
|
||||
* \columnsep=10.0pt
|
||||
* \skip\footins=10.0pt plus 4.0pt minus 2.0pt
|
||||
* \hoffset=0.0pt
|
||||
* \voffset=0.0pt
|
||||
* \mag=1000
|
||||
* \@twocolumnfalse
|
||||
* \@twosidefalse
|
||||
* \@mparswitchfalse
|
||||
* \@reversemarginfalse
|
||||
* (1in=72.27pt=25.4mm, 1cm=28.453pt)
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/context/base/mkii/supp-pdf.mkii
|
||||
[Loading MPS to PDF converter (version 2006.09.02).]
|
||||
\scratchcounter=\count313
|
||||
\scratchdimen=\dimen284
|
||||
\scratchbox=\box66
|
||||
\nofMPsegments=\count314
|
||||
\nofMParguments=\count315
|
||||
\everyMPshowfont=\toks36
|
||||
\MPscratchCnt=\count316
|
||||
\MPscratchDim=\dimen285
|
||||
\MPnumerator=\count317
|
||||
\makeMPintoPDFobject=\count318
|
||||
\everyMPtoPDFconversion=\toks37
|
||||
) (/usr/local/texlive/2023/texmf-dist/tex/latex/epstopdf-pkg/epstopdf-base.sty
|
||||
Package: epstopdf-base 2020-01-24 v2.11 Base part for package epstopdf
|
||||
Package epstopdf-base Info: Redefining graphics rule for `.eps' on input line 4
|
||||
85.
|
||||
|
||||
(/usr/local/texlive/2023/texmf-dist/tex/latex/latexconfig/epstopdf-sys.cfg
|
||||
File: epstopdf-sys.cfg 2010/07/13 v1.3 Configuration of (r)epstopdf for TeX Liv
|
||||
e
|
||||
))
|
||||
Package hyperref Info: Link coloring OFF on input line 21.
|
||||
|
||||
(./a3.out) (./a3.out)
|
||||
\@outlinefile=\write4
|
||||
\openout4 = `a3.out'.
|
||||
|
||||
[1
|
||||
|
||||
{/usr/local/texlive/2023/texmf-var/fonts/map/pdftex/updmap/pdftex.map}]
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 66--67
|
||||
[]\OT1/cmr/m/n/10.95 Often im-
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 66--67
|
||||
\OT1/cmr/m/n/10.95 proves con-ver-
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 66--67
|
||||
\OT1/cmr/m/n/10.95 gence/accuracy
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 66--67
|
||||
\OT1/cmr/m/n/10.95 but can in-crease
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 68--68
|
||||
[]\OT1/cmr/m/n/10.95 Standard Trans-former
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 68--68
|
||||
[]\OT1/cmr/m/n/10.95 Remove \OT1/cmtt/m/n/10.95 softcap *
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 10000) in paragraph at lines 68--68
|
||||
\OT1/cmtt/m/n/10.95 tanh(logits/softcap)
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 3271) in paragraph at lines 68--69
|
||||
\OT1/cmr/m/n/10.95 i-bra-tion or learn-
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 2359) in paragraph at lines 70--70
|
||||
[]\OT1/cmr/m/n/10.95 Gated lin-ear units im-
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 7504) in paragraph at lines 70--70
|
||||
\OT1/cmr/m/n/10.95 prove ex-pres-siv-ity and
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 4378) in paragraph at lines 70--70
|
||||
[]\OT1/cmr/m/n/10.95 Replace 2-layer MLP with
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 4266) in paragraph at lines 70--70
|
||||
\OT1/cmr/m/n/10.95 SwiGLU (three pro-jec-tions,
|
||||
[]
|
||||
|
||||
|
||||
Underfull \hbox (badness 3525) in paragraph at lines 70--71
|
||||
\OT1/cmr/m/n/10.95 plex-ity at sim-i-lar
|
||||
[]
|
||||
|
||||
|
||||
Overfull \hbox (30.64403pt too wide) in paragraph at lines 62--73
|
||||
[][]
|
||||
[]
|
||||
|
||||
|
||||
LaTeX Warning: `h' float specifier changed to `ht'.
|
||||
|
||||
[2] [3] [4]
|
||||
Overfull \hbox (61.77394pt too wide) in paragraph at lines 171--172
|
||||
[]\OT1/cmr/m/n/10.95 Part 2 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
|
||||
val --eval bpb --model-tag <model> --device-batch-size
|
||||
[]
|
||||
|
||||
|
||||
Overfull \hbox (27.28181pt too wide) in paragraph at lines 172--173
|
||||
[]\OT1/cmr/m/n/10.95 Part 3 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
|
||||
val --eval bpb --model-tag pico-d8-ctx512 --step
|
||||
[]
|
||||
|
||||
|
||||
Overfull \hbox (15.78444pt too wide) in paragraph at lines 173--174
|
||||
[]\OT1/cmr/m/n/10.95 Part 4 eval: \OT1/cmtt/m/n/10.95 python -m scripts.base[]e
|
||||
val --eval bpb --model-tag final-d12-nosoftcap
|
||||
[]
|
||||
|
||||
[5{/usr/local/texlive/2023/texmf-dist/fonts/enc/dvips/cm-super/cm-super-ts1.enc
|
||||
}] (./a3.aux)
|
||||
Package rerunfilecheck Info: File `a3.out' has not changed.
|
||||
(rerunfilecheck) Checksum: D41D8CD98F00B204E9800998ECF8427E;0.
|
||||
)
|
||||
Here is how much of TeX's memory you used:
|
||||
20351 strings out of 476025
|
||||
383151 string characters out of 5790018
|
||||
1862388 words of memory out of 5000000
|
||||
40442 multiletter control sequences out of 15000+600000
|
||||
518865 words of font info for 55 fonts, out of 8000000 for 9000
|
||||
1141 hyphenation exceptions out of 8191
|
||||
84i,11n,89p,555b,771s stack positions out of 10000i,1000n,20000p,200000b,200000s
|
||||
</usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx10.pfb
|
||||
></usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmbx12.pfb>
|
||||
</usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi10.pfb><
|
||||
/usr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmmi8.pfb></u
|
||||
sr/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr10.pfb></usr
|
||||
/local/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr12.pfb></usr/l
|
||||
ocal/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr17.pfb></usr/loc
|
||||
al/texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr6.pfb></usr/local/
|
||||
texlive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmr8.pfb></usr/local/tex
|
||||
live/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb></usr/local/texl
|
||||
ive/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy8.pfb></usr/local/texliv
|
||||
e/2023/texmf-dist/fonts/type1/public/amsfonts/cm/cmtt10.pfb></usr/local/texlive
|
||||
/2023/texmf-dist/fonts/type1/public/cm-super/sfrm1095.pfb>
|
||||
Output written on a3.pdf (5 pages, 174204 bytes).
|
||||
PDF statistics:
|
||||
143 PDF objects out of 1000 (max. 8388607)
|
||||
108 compressed objects within 2 object streams
|
||||
41 named destinations out of 1000 (max. 500000)
|
||||
13 words of extra memory for PDF output out of 10000 (max. 10000000)
|
||||
|
||||
187
a3.tex
Normal file
187
a3.tex
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
\documentclass[11pt]{article}
|
||||
\usepackage[margin=1in]{geometry}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{amsmath}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{positioning}
|
||||
\usepackage{hyperref}
|
||||
\usepackage{enumitem}
|
||||
\setlist{nosep}
|
||||
|
||||
\title{CSC490 A3: Pre-training Nanochat}
|
||||
\author{
|
||||
Chris Cao (Student \#1009840460)\\
|
||||
Yanzhen Chen (Student \#1010317630)\\
|
||||
Clarina Ong (Student \#1008820180)\\
|
||||
Martin Zou (Student \#1009992885)
|
||||
}
|
||||
\date{March 5, 2026}
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
\section*{Part 1: Architecture Review}
|
||||
|
||||
\subsection*{Baseline (Oct 13) vs. Feb 2026 Nanochat}
|
||||
The current nanochat implementation (Feb 2026) is a compact GPT-style decoder stack with several modernized architectural choices. From \texttt{nanochat/gpt.py}, notable features include rotary position embeddings (RoPE), RMSNorm without learnable parameters, QK normalization, untied token embedding and LM head, ReLU$^2$ MLP activation, sliding-window attention, value embeddings with gating, per-layer residual scalars, and GQA support. These changes collectively target stability, efficiency, and scaling behavior.
|
||||
|
||||
\subsection*{Model Diagram (Feb 2026 Nanochat)}
|
||||
\begin{center}
|
||||
\begin{tikzpicture}[node distance=1.2cm,>=stealth,scale=0.95, every node/.style={scale=0.95}]
|
||||
\node (tok) [draw, rounded corners] {Token IDs};
|
||||
\node (wte) [draw, rounded corners, below=of tok] {Token Embedding (wte)};
|
||||
\node (nrm0) [draw, rounded corners, below=of wte] {RMSNorm};
|
||||
\node (x0) [draw, rounded corners, below=of nrm0] {$x_0$ residual buffer};
|
||||
\node (blk) [draw, rounded corners, below=of x0, minimum width=6cm, align=left] {
|
||||
\textbf{Transformer Block} (repeated $L$ times)\\
|
||||
\quad RMSNorm $\rightarrow$ Self-Attn (RoPE, QK norm, GQA, window)\\
|
||||
\quad + Value Embedding (gated)\\
|
||||
\quad Residual Scalars ($\lambda_\text{resid}$, $\lambda_{x_0}$)\\
|
||||
\quad RMSNorm $\rightarrow$ MLP (ReLU$^2$)
|
||||
};
|
||||
\node (nrm1) [draw, rounded corners, below=of blk] {RMSNorm};
|
||||
\node (lmh) [draw, rounded corners, below=of nrm1] {LM Head (untied)};
|
||||
\node (log) [draw, rounded corners, below=of lmh] {Logits};
|
||||
|
||||
\draw[->] (tok) -- (wte);
|
||||
\draw[->] (wte) -- (nrm0);
|
||||
\draw[->] (nrm0) -- (x0);
|
||||
\draw[->] (x0) -- (blk);
|
||||
\draw[->] (blk) -- (nrm1);
|
||||
\draw[->] (nrm1) -- (lmh);
|
||||
\draw[->] (lmh) -- (log);
|
||||
\end{tikzpicture}
|
||||
\end{center}
|
||||
|
||||
\subsection*{Three Literature-Based Changes (Compared to Feb 2026 Nanochat)}
|
||||
I selected three changes from the literature; two of them were implemented and evaluated in Part 2.
|
||||
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{@{}p{3.0cm} p{4.2cm} p{4.8cm} p{3.1cm}@{}}
|
||||
\toprule
|
||||
\textbf{Change} & \textbf{Motivation} & \textbf{Technical Details} & \textbf{Potential Impact} \\
|
||||
\midrule
|
||||
ReLU$^2$ $\rightarrow$ GELU & GELU is a smooth activation shown to improve optimization in Transformer models. & Replace \texttt{F.relu(x).square()} with \texttt{F.gelu(x)} in the MLP. & Often improves convergence/accuracy but can increase compute slightly. \\
|
||||
\addlinespace
|
||||
Remove logit softcapping & Standard Transformer training uses raw logits before softmax; softcapping can reduce gradient signal for confident predictions. & Remove \texttt{softcap * tanh(logits/softcap)} and use raw FP32 logits. & Could improve calibration or learning of rare tokens; may risk instability if logits explode. \\
|
||||
\addlinespace
|
||||
SwiGLU MLP (not tested) & Gated linear units improve expressivity and scaling behavior in large LMs. & Replace 2-layer MLP with SwiGLU (three projections, gated activation). & Often improves perplexity at similar compute. \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Three literature-based changes relative to the Feb 2026 nanochat architecture. Changes 1--2 were evaluated in Part 2.}
|
||||
\end{table}
|
||||
|
||||
\textbf{References:} GELU (Hendrycks \& Gimpel, 2016); standard Transformer logits (Vaswani et al., 2017); SwiGLU/GLU (Shazeer, 2020).
|
||||
|
||||
\section*{Part 2: Ablations on Picochat}
|
||||
|
||||
\subsection*{Setup}
|
||||
I used a small pico configuration: depth 8, seq length 2048, vocab size 32768. This scale fits on a single A5000 and allows rapid iteration while preserving the structure of the architecture. Two changes from Part 1 were ablated: GELU and removing logit softcapping. All runs used the same tokenizer and dataset.
|
||||
|
||||
\subsection*{Results}
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{l l r r}
|
||||
\toprule
|
||||
\textbf{Model} & \textbf{Change} & \textbf{Train BPB} & \textbf{Val BPB} \\
|
||||
\midrule
|
||||
pico-d8-baseline-v32768 & baseline (ReLU$^2$ + softcap) & 0.9932 & 0.9996 \\
|
||||
pico-d8-gelu & ReLU$^2$ $\rightarrow$ GELU & 1.5799 & 2.2303 \\
|
||||
pico-d8-nosoftcap & remove logit softcap & 0.6310 & 1.4397 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Part 2 ablations (BPB lower is better).}
|
||||
\end{table}
|
||||
|
||||
\subsection*{Commentary}
|
||||
Both modifications degraded validation BPB relative to the baseline. GELU performed worst in this low-budget regime. Removing softcap was less harmful but still worse than baseline. This suggests the current nanochat choices are tuned for this scale, and naive swaps may not transfer well. For larger runs, GELU and no-softcap could behave differently, but the local evidence suggests caution.
|
||||
|
||||
\subsection*{Tracking and Cost}
|
||||
Training was tracked with W\&B in offline mode. GPU cost is not included because a price-per-hour value was not available at runtime. I report total training time from logs for transparency.
|
||||
|
||||
\section*{Part 3: Extending the Context Window}
|
||||
|
||||
\subsection*{Procedure}
|
||||
I trained a depth-8 pico model at sequence length 512 on a small subset (2 shards), then resumed training from that checkpoint at sequence length 2048. This mirrors standard practice where shorter context warm-up can stabilize optimization, then longer context extends capability.
|
||||
|
||||
\subsection*{Results}
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{l r r r}
|
||||
\toprule
|
||||
\textbf{Checkpoint} & \textbf{Seq Len} & \textbf{Train BPB} & \textbf{Val BPB} \\
|
||||
\midrule
|
||||
pico-d8-ctx512 step 2000 & 512 & 0.3887 & 1.9778 \\
|
||||
pico-d8-ctx512 step 3000 & 2048 & 0.7741 & 1.5435 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Context extension results.}
|
||||
\end{table}
|
||||
|
||||
\subsection*{Commentary}
|
||||
Moving to 2048 reduced validation BPB substantially (1.98 $\rightarrow$ 1.54), indicating better generalization and longer-context modeling. The 512 model overfits the short context: low train BPB but poor validation. This supports the idea that longer context provides stronger supervision and improves robustness.
|
||||
|
||||
\section*{Part 4: Final Nanochat}
|
||||
|
||||
\subsection*{Final Configuration and Justification}
|
||||
The final model uses depth 12, seq length 2048, vocab size 32768, and the no-softcap change (ReLU$^2$ retained). This is a minimal, controlled modification with manageable compute on an A5000 while still representing a larger ``nanochat''-scale model.
|
||||
|
||||
\subsection*{Training on Full Dataset}
|
||||
The full FineWeb-Edu 100B shuffled dataset was used (1823 shards, 160 GB on disk).
|
||||
|
||||
\subsection*{Results}
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{l r r r r}
|
||||
\toprule
|
||||
\textbf{Model} & \textbf{Depth} & \textbf{Params} & \textbf{Train BPB} & \textbf{Val BPB} \\
|
||||
\midrule
|
||||
pico-d8-baseline-v32768 & 8 & 125{,}829{,}648 & 0.9932 & 0.9996 \\
|
||||
final-d12-nosoftcap & 12 & 286{,}262{,}424 & 0.9155 & 0.9195 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\caption{Final model vs pico baseline (BPB lower is better).}
|
||||
\end{table}
|
||||
|
||||
\subsection*{Scaling Law Estimate}
|
||||
Using $L = k N^{-\alpha}$ with validation BPB as loss $L$ and parameter count $N$, the observed scaling exponent is:
|
||||
\[
|
||||
\alpha = \frac{\ln(L_\text{pico} / L_\text{nano})}{\ln(N_\text{nano} / N_\text{pico})} \approx 0.102
|
||||
\]
|
||||
Because the exponent is fitted from exactly these two points, the power law reproduces the observed nano-scale loss by construction, so the match is not an independent prediction. The small exponent reflects limited scale and training horizon.
|
||||
|
||||
\subsection*{Emergent Ability Questions (Nano $>$ Pico)}
|
||||
\begin{enumerate}
|
||||
\item Summarize a two-paragraph article into three bullet points and extract two dates.
|
||||
\item Solve a multi-step travel-time word problem with unit conversion.
|
||||
\item Explain the output of a short Python function for a given input.
|
||||
\item Compare two short passages and list three differences.
|
||||
\item Write an 80-word polite email declining an invitation.
|
||||
\item Compute mean and median of a list of numbers.
|
||||
\item Solve a two-step arithmetic word problem.
|
||||
\item Translate a short English paragraph into Chinese.
|
||||
\item Follow a hidden instruction embedded at the top of a long prompt.
|
||||
\item Explain supervised vs self-supervised learning in three sentences.
|
||||
\end{enumerate}
|
||||
|
||||
\section*{Appendix: Commands Used (Summary)}
|
||||
\begin{itemize}
|
||||
\item Part 2 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag <model> --device-batch-size 8 --split-tokens 524288}
|
||||
\item Part 3 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag pico-d8-ctx512 --step 2000/3000 ...}
|
||||
\item Part 4 eval: \texttt{python -m scripts.base\_eval --eval bpb --model-tag final-d12-nosoftcap ...}
|
||||
\end{itemize}
|
||||
|
||||
\section*{References}
|
||||
\begin{itemize}
|
||||
\item Devlin et al., 2018. BERT: Pre-training of Deep Bidirectional Transformers.
|
||||
\item Warner et al., 2025. ModernBERT.
|
||||
\item Hendrycks \& Gimpel, 2016. GELU.
|
||||
\item Vaswani et al., 2017. Attention Is All You Need.
|
||||
\item Shazeer, 2020. GLU Variants Improve Transformer (SwiGLU).
|
||||
\item Su et al., 2021. RoPE.
|
||||
\item Zhang \& Sennrich, 2019. RMSNorm.
|
||||
\end{itemize}
|
||||
|
||||
\end{document}
|
||||
|
|
@ -126,7 +126,7 @@ class MLP(nn.Module):
|
|||
|
||||
def forward(self, x):
|
||||
x = self.c_fc(x)
|
||||
x = F.relu(x).square()
|
||||
x = F.gelu(x)
|
||||
x = self.c_proj(x)
|
||||
return x
|
||||
|
||||
|
|
@ -407,11 +407,9 @@ class GPT(nn.Module):
|
|||
x = norm(x)
|
||||
|
||||
# Forward the lm_head (compute logits)
|
||||
softcap = 15 # smoothly cap the logits to the range [-softcap, softcap]
|
||||
logits = self.lm_head(x) # (B, T, padded_vocab_size) <- very big tensor, large amount of memory
|
||||
logits = logits[..., :self.config.vocab_size] # slice to remove padding
|
||||
logits = logits.float() # switch to fp32 for logit softcap and loss computation
|
||||
logits = softcap * torch.tanh(logits / softcap) # squash the logits
|
||||
logits = logits.float() # switch to fp32 for loss computation
|
||||
|
||||
if targets is not None:
|
||||
# training: given the targets, compute and return the loss
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user