Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 104 additions & 1 deletion docs/phd/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -2320,5 +2320,108 @@ @book{olver1993applications
ISBN 978-1-4684-0274-1, © Springer 1993. Lie symmetry
analysis and conservation laws for differential equations —
companion to the dimensional analysis and symmetry methods
chapters.}
chapters.}
}

% -------------------------------------------------------------------
% FPGA / Hardware / UART references
% -------------------------------------------------------------------

@misc{xilinx_ds180_2022,
author = {{Xilinx (AMD)}},
title = {7 Series {FPGAs} Data Sheet: Overview},
year = {2022},
howpublished = {DS181 Rev.\ 1.31},
note = {Artix-7 100T: 63,400 LUTs, 126,800 FFs, 135 BRAM18, 240 DSP48E1.}
}

@misc{darpa_igtc_2024,
author = {{DARPA}},
title = {Solicitation {HR001124S0001} --- Intelligent Generation of Tools and Computations},
year = {2024},
howpublished = {DARPA Microsystems Technology Office},
note = {Energy efficiency target 3000$\times$ GPU baseline.}
}

@misc{mlperf_v41_2024,
author = {{MLCommons}},
title = {{MLPerf} Inference v4.1 Results},
year = {2024},
howpublished = {\url{https://mlcommons.org/en/inference-edge-41/}},
note = {NVIDIA A100 Llama-2-7B offline, batch-1 autoregressive.}
}

@misc{zenodo_B001,
author = {Vasilev, Dmitrii},
title = {HSLM Ternary Neural Network},
year = {2026},
howpublished = {Zenodo},
doi = {10.5281/zenodo.19227865},
note = {B001: model weights, tokeniser, 1003-token HSLM benchmark.}
}

@misc{zenodo_B002,
author = {Vasilev, Dmitrii},
title = {FPGA Zero-DSP Architecture},
year = {2026},
howpublished = {Zenodo},
doi = {10.5281/zenodo.19227867},
note = {B002: bitstream, Vivado project, synthesis reports, 0 DSP.}
}

@misc{zenodo_Z01,
author = {Vasilev, Dmitrii},
title = {FPGA Autoregressive Ternary LLM},
year = {2026},
howpublished = {Zenodo},
doi = {10.5281/zenodo.18939352},
note = {Z01: first-generation FPGA implementation.}
}

@misc{zenodo_Z02,
author = {Vasilev, Dmitrii},
title = {FPGA Autoregressive Ternary LLM --- Latest Version},
year = {2026},
howpublished = {Zenodo},
doi = {10.5281/zenodo.18950696},
note = {Z02: improved FPGA implementation, 54 toks/sec @ 87 MHz.}
}

@misc{openxc7,
author = {{openXC7 Project}},
title = {Open-Source Toolchain for {Xilinx} 7-Series {FPGAs}},
year = {2024},
howpublished = {\url{https://github.com/openXC7}},
note = {yosys + nextpnr-xilinx + prjxray.}
}

@article{peterson_brown_1961,
author = {Peterson, W. W. and Brown, D. T.},
title = {Cyclic Codes for Error Detection},
journal = {Proceedings of the IRE},
volume = {49},
number = {1},
year = {1961},
pages = {228--235},
doi = {10.1109/JRPROC.1961.287814},
note = {CRC-16/CCITT polynomial, Hamming distance 4.}
}

@book{knuth_ternary_1997,
author = {Knuth, Donald E.},
title = {The Art of Computer Programming, Volume 2: Seminumerical Algorithms},
edition = {3},
year = {1997},
publisher = {Addison-Wesley},
isbn = {978-0201896848},
note = {\S4.1: Ternary number systems.}
}

@book{birkhoff_maclane_1977,
author = {Birkhoff, Garrett and MacLane, Saunders},
title = {A Survey of Modern Algebra},
edition = {4},
year = {1977},
publisher = {Macmillan},
note = {Finite fields \S14.}
}
118 changes: 118 additions & 0 deletions docs/phd/chapters/27b-tri27-dsl-codegen.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
\chapter{TRI27 DSL and Dual-Target Codegen (Zig + Verilog)}
\label{ch:27b}

\section{Introduction}

The arithmetic core of Trinity S$^3$AI processes weights and activations represented as balanced-ternary vectors. The natural programming substrate for such computations is a three-valued language in which the primitive type \texttt{trit} has exactly three inhabitants: \texttt{Neg} ($-1$), \texttt{Zero} ($0$), and \texttt{Pos} ($+1$). The cardinality of this type is~3---the same integer that appears at the right-hand side of the anchor identity $\varphi^2 + \varphi^{-2} = 3$.

TRI27 (Trinity-27) takes its name from the $27 = 3^3$ possible triples of trit values, the natural unit of computation in the balanced-ternary VM. A TRI27 expression evaluates to a single \texttt{trit} given an environment~$\rho$ mapping variable names to \texttt{trit} values. The two foundational theorems proved for this language---\texttt{eval\_det} (determinism) and \texttt{trit\_exhaustive} (exhaustiveness)---underpin all higher-level correctness properties of the VM and the formal proofs in subsequent chapters.

\section{TRI27 Syntax and Denotational Semantics}

\subsection{Abstract Syntax}

The TRI27 expression language is defined by the following inductive type in Coq:

\begin{verbatim}
Inductive expr : Type :=
| Lit : trit -> expr
| Var : nat -> expr
| Neg3 : expr -> expr
| Add3 : expr -> expr -> expr
| Mul3 : expr -> expr -> expr
| If3 : expr -> expr -> expr -> expr.
\end{verbatim}

The type \texttt{trit} has exactly three constructors: \texttt{Neg}, \texttt{Zero}, \texttt{Pos}.

\subsection{Ternary Arithmetic}

The fundamental ternary operations are defined by the $3 \times 3$ tables implementing $\mathbb{F}_3$ arithmetic. Balanced-ternary addition yields $(a+b) \bmod 3$ with values mapped as $\{-1, 0, +1\}$. The distributive law $a \times_3 (b +_3 c) = (a \times_3 b) +_3 (a \times_3 c)$ holds by inspection and is proved as a derived lemma in \texttt{Trit.v}.

\section{Mechanised Proofs}

\begin{theorem}[Determinism --- \texttt{eval\_det}]
\label{thm:eval-det}
For any expression $e$, environment $\rho$, and trit values $v_1, v_2$: if \texttt{eval e rho = Some v1} and \texttt{eval e rho = Some v2}, then $v_1 = v_2$.
\end{theorem}

\begin{proof}
By structural induction on $e$. The base cases \texttt{Lit t} and \texttt{Var n} return fixed values. For compound expressions, the induction hypothesis provides uniqueness for subexpression results; the ternary operation tables are deterministic (single-valued functions on $\{-1,0,+1\}^2$), so the composite result is unique. Total proof length: 43 lines in \texttt{Semantics.v}.
\end{proof}

\begin{theorem}[Exhaustiveness --- \texttt{trit\_exhaustive}]
\label{thm:trit-exhaustive}
For any $t : \texttt{trit}$, either $t = \texttt{Neg}$ or $t = \texttt{Zero}$ or $t = \texttt{Pos}$.
\end{theorem}

\begin{proof}
By case analysis on the inductive type \texttt{trit}. Since \texttt{trit} has exactly three constructors, \texttt{destruct t} yields three subgoals, each closed by reflexivity. Total proof length: 7 lines in \texttt{Trit.v}.
\end{proof}

\section{Dual-Target Codegen Architecture}

The TRI27 compiler produces code for two primary targets from a single \texttt{.t27} specification:

\begin{table}[h]
\centering
\caption{Codegen targets and metrics}
\label{tab:codegen-targets}
\begin{tabular}{llll}
\toprule
\textbf{Target} & \textbf{Emitter} & \textbf{LoC} & \textbf{Output} \\
\midrule
CPU (Zig) & \texttt{emit\_zig} & 600 & \texttt{src/gfN.zig} \\
FPGA (Verilog) & \texttt{emit\_verilog} & 520 & \texttt{fpga/gfN.v} \\
CPU (C) & \texttt{emit\_c} & 273 & \texttt{src/gfN.c} \\
CPU (Rust) & \texttt{emit\_rust} & --- & \texttt{src/gfN.rs} \\
\bottomrule
\end{tabular}
\end{table}

The compiler itself is implemented in Rust (\texttt{bootstrap/src/compiler.rs}, 20\,814 LoC) with spec-driven modules:
\begin{itemize}
\item \textbf{Lexer}: 597 LoC spec (\texttt{compiler/parser/lexer.t27})
\item \textbf{Parser}: 1\,294 LoC spec (\texttt{compiler/parser/parser.t27})
\item \textbf{AST}: 611 LoC spec (\texttt{compiler/ast.t27})
\item \textbf{Verilog codegen}: 1\,067 LoC spec + 2\,358 LoC FPGA emission spec
\item \textbf{Zig codegen}: 1\,516 LoC spec
\end{itemize}

\subsection{Bit-Identical KPI}

The central correctness property is: from a single \texttt{.t27} spec, both the Zig CPU emitter and the Verilog FPGA emitter produce bit-identical results on $N = 10^5$ random GF16/TF3 operations. The regression suite covers all operations in the GF16 accelerator: multiply, add, MAC, dot product, FFT, IFFT, matmul, and inverse.

\begin{equation}
\forall\, o \in \{\text{GFADD}, \text{GFMUL}, \text{GFMAC}, \ldots\},\, \forall\, (a, b):\; \mathrm{out}_{\mathrm{Zig}}(o, a, b) = \mathrm{out}_{\mathrm{Verilog}}(o, a, b)
\end{equation}

\subsection{Numeric Format Coverage}

The compiler processes 9 numeric format specifications totalling 7\,956 LoC:

\begin{table}[h]
\centering
\caption{Numeric format specs}
\begin{tabular}{lr}
\toprule
\textbf{Spec} & \textbf{LoC} \\
\midrule
\texttt{gf16.t27} (SSOT per L6) & 3\,436 \\
\texttt{tf3.t27} & 1\,691 \\
\texttt{gf8.t27} & 521 \\
\texttt{gf12.t27} & 481 \\
\texttt{gf20.t27} & 468 \\
\texttt{gf24.t27} & 468 \\
\texttt{gf32.t27} & 479 \\
\texttt{gf4.t27} & 305 \\
\bottomrule
\end{tabular}
\end{table}

\subsection{Sealed Spec Registry}

All 33 FPGA module specs carry sealed hashes in \texttt{.trinity/seals/}, with per-target \texttt{gen\_hash\_*} fields for Verilog, Zig, C, and Rust. The GF16 accelerator seal (\texttt{fpga\_Gf16Accel.json}, ring~12, sealed 2026-04-11) records the spec hash, generated code hashes, and test results.

\section{Discussion}

The TRI27 DSL as formalised here is intentionally minimal. The present two theorems establish only determinism and exhaustiveness; a complete verified compiler from TRI27 to FPGA RTL would require additional theorems on type safety, termination, and translation correctness---all planned for v5. The most significant limitation is that the current semantics does not handle variable out-of-scope errors gracefully. A future \texttt{emit\_coq.zig} is planned to generate formal proofs from the same \texttt{.t27} specification, closing the loop between codegen and verification (Ch.~\ref{ch:10}).
107 changes: 107 additions & 0 deletions docs/phd/chapters/32-uart-v6-protocol.tex
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
\chapter{UART v6 Protocol (CRC-16/CCITT) and FT232RL Bridge}
\label{ch:32b}

\section{Introduction}

The hardware evaluation of Trinity S$^3$AI requires a communication channel that is both low-overhead and formally verifiable. The channel must satisfy three constraints: (1)~\emph{determinism}---every token generated on the FPGA must arrive at the host in the same order and bit-pattern; (2)~\emph{$\varphi$-synchronisation}---the $\varphi$-exponent fields maintained by the KOSCHEI coprocessor (Ch.~\ref{ch:26}) must be visible to the host; (3)~\emph{auditability}---the frame stream must be loggable to a file whose SHA-256 hash is included in the pre-registration.

UART v6 (the sixth revision of the Trinity serial protocol) satisfies all three constraints. The protocol governs all serial communication between the QMTech XC7A100T FPGA and the host workstation via an FT232RL USB-to-serial bridge at 115\,200 baud, 8N1.

\section{Frame Structure}

\subsection{Frame Grammar}

Each UART v6 frame has the form:
\begin{verbatim}
FRAME := SYNC | LEN | PAYLOAD | CRC_HI | CRC_LO
SYNC := 0xAA
LEN := uint8 (number of payload bytes, 1-255)
PAYLOAD := LEN bytes of token or control data
CRC_HI, CRC_LO := CRC-16/CCITT over LEN || PAYLOAD
\end{verbatim}

The sync byte \texttt{0xAA} (binary \texttt{10101010}) is chosen for its alternating bit pattern, which maximises transitions on the serial line and aids clock-recovery on marginal USB hubs. The sync byte is not included in the CRC computation.

\subsection{CRC-16/CCITT Polynomial}

The error-detection code is CRC-16/CCITT with polynomial $x^{16} + x^{12} + x^5 + 1$ (\texttt{0x1021}), initialised to \texttt{0xFFFF}. This polynomial has a Hamming distance of 4 for messages up to 32\,767 bits, sufficient for UART v6 frames of at most 257 bytes~\cite{peterson_brown_1961}.

In the FPGA implementation, the CRC is computed in a single-cycle parallel LUT chain, consuming 32 LUT-6 primitives. No DSP slices are used, consistent with the zero-DSP constraint.

\section{$\varphi$-Synchronisation Frames}

Every third frame is a $\varphi$-synchronisation frame. The trigger condition is
\begin{equation}
\text{frame\_count} \equiv 0 \pmod{3},
\end{equation}
where the modulus 3 is derived from the identity $\varphi^2 + \varphi^{-2} = 3$: the integer 3 governs the normalisation cycle of the KOSCHEI register file (Ch.~\ref{ch:26}), so the communication protocol aligns with the same period.

The $\varphi$-sync frame payload is a 4-byte structure:

\begin{table}[h]
\centering
\caption{$\varphi$-synchronisation frame payload}
\begin{tabular}{lll}
\toprule
\textbf{Bytes} & \textbf{Field} & \textbf{Description} \\
\midrule
0 & \texttt{phi\_exp} & Current $\varphi$-exponent (int8) \\
1 & \texttt{trit\_count} & Non-zero trits in last TF3 vector (uint8) \\
2--3 & \texttt{token\_id} & Token index modulo 65\,535 (uint16 BE) \\
\bottomrule
\end{tabular}
\end{table}

\section{Error Recovery Automaton}

The recovery automaton has three states: IDLE, AWAIT\_LEN, AWAIT\_PAYLOAD. On receipt of \texttt{0xAA} it transitions IDLE $\to$ AWAIT\_LEN; on a valid LEN byte it transitions to AWAIT\_PAYLOAD; on correct CRC it returns to IDLE and delivers the payload to the KOSCHEI dispatch unit.

On CRC failure the automaton issues a NACK and waits for retransmit. The retransmit limit is $L_7 = 29$ attempts---a Lucas prime from the sanctioned seed pool. After 29 failures the automaton halts and logs a \texttt{UART\_FATAL} event.

\section{Pin Mapping}

The QMTech XC7A100T board uses J2 connector pins 5 and 6 for UART:

\begin{table}[h]
\centering
\caption{UART pin mapping (XDC)}
\begin{tabular}{llll}
\toprule
\textbf{J2 Pin} & \textbf{FPGA Pin} & \textbf{Signal} & \textbf{Direction} \\
\midrule
5 & D26 & TX & FPGA $\to$ Host \\
6 & E26 & RX & Host $\to$ FPGA \\
\bottomrule
\end{tabular}
\end{table}

The FT232RL bridge converts USB to 115\,200 baud UART. At this baud rate, one byte takes $8.68\,\mu$s; the 63\,tokens/sec FPGA throughput requires approximately $63 \times 12 = 756$ bytes/sec, well within the 14\,400 bytes/sec physical capacity.

\section{Measured Results}

During the HSLM evaluation run (1003 tokens, seed $F_{17}=1597$):

\begin{table}[h]
\centering
\caption{UART v6 measurement summary}
\begin{tabular}{ll}
\toprule
\textbf{Metric} & \textbf{Value} \\
\midrule
Total frames transmitted & 1\,412 (1\,003 data + 409 $\varphi$-sync) \\
CRC errors & \textbf{0} \\
NACK frames & \textbf{0} \\
Frame throughput & 89.1 frames/sec \\
Peak USB latency & 2.1\,ms \\
$\varphi$-sync mismatches & \textbf{0} \\
\bottomrule
\end{tabular}
\end{table}

Zero CRC errors and zero $\varphi$-sync mismatches confirm that the FPGA and host-side accumulators remain phase-aligned throughout the 1003-token evaluation.

\section{Discussion}

The UART v6 protocol is deliberately minimal---the \texttt{0xAA} sync byte, CRC-16/CCITT checksum, and $\varphi$-sync frame are the only features beyond bare-metal serial. This minimalism is a reproducibility virtue: any standard USB-serial adapter presenting as a CDC-ACM device can receive v6 frames.

The principal limitation is that the 1-byte LEN field caps frame payload at 255 bytes. For future context windows larger than 255 tokens this will require either multi-frame token batches or an extended v7 frame with a 2-byte LEN. The 115\,200 baud ceiling is another constraint; future work targets 1\,Mbaud via a different bridge chip.
7 changes: 6 additions & 1 deletion docs/phd/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -187,14 +187,19 @@ \part{Physics Foundation}
\include{chapters/23-gf16-algebra}
\include{chapters/24-igla-architecture}

% Part VII: Algebraic Proofs (Ch. 28-30)
% Part VII: Algebraic Proofs (Ch. 25-30)
\part{Algebraic Proofs}
\include{chapters/25-benchmarks}
\include{chapters/26-data-analysis}
\include{chapters/27-trinity-identity}
\include{chapters/27b-tri27-dsl-codegen}
\include{chapters/28-momentum-algebra}
\include{chapters/29-lucas-closure}

% Part VIIb: Hardware-Numerics (Ch. 28b-34)
\part{Hardware-Numerics}
\include{chapters/32-uart-v6-protocol}

% Part VIII: Imagery \& Genealogy (Ch. 31-33)
\part{Imagery \& Genealogy}
\include{chapters/30-golden-imagery}
Expand Down
Loading