\documentclass[11pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{amsmath,amssymb,amsthm}
\usepackage{graphicx}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{algorithm}
\usepackage{algpseudocode}

\graphicspath{{figures/}}

\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{remark}[theorem]{Remark}

\newcommand{\starG}{\star_G}
\newcommand{\calA}{\mathcal{A}}
\newcommand{\calB}{\mathcal{B}}
\newcommand{\calC}{\mathcal{C}}
\newcommand{\calI}{\mathcal{I}}
\newcommand{\calS}{\mathcal{S}}
\newcommand{\calT}{\mathcal{T}}
\newcommand{\calU}{\mathcal{U}}
\newcommand{\calV}{\mathcal{V}}
\newcommand{\calX}{\mathcal{X}}
\newcommand{\calZ}{\mathcal{Z}}
\newcommand{\bbC}{\mathbb{C}}
\newcommand{\bbK}{\mathbb{K}}
\newcommand{\bbR}{\mathbb{R}}
\newcommand{\bbZ}{\mathbb{Z}}
\newcommand{\Rsq}{R\textsuperscript{2}}

\hypersetup{colorlinks=true,linkcolor=blue,citecolor=blue}

\begin{document}

\begin{center}
{\LARGE \textbf{Supplementary Information}}\\[10pt]
{\Large Group-Algebraic Tensors: Provably-optimal Equivariant Learning and Physical Symmetry Discovery}\\[10pt]
{\large Hoyos, Ubaru, Huh, Kalantzis, Clarkson, Kilmer, Avron, Horesh}
\end{center}

\tableofcontents
\newpage

%% ========================================================================
\section{Mathematical Foundations}
%% ========================================================================

\subsection{Notation}

\begin{itemize}
\item $G$: finite group of order $n = |G|$ with identity $e$.
\item $\hat{G}$: set of equivalence classes of irreducible unitary representations.
\item $d_\rho = \dim(\rho)$ for $\rho \in \hat{G}$.
\item $\calA(:,:,g)$: frontal slice at group element $g$. $\calA_{ij} = \calA(i,j,:)$: tube at indices $i,j$.
\end{itemize}

\subsection{The Group Algebra}

\begin{definition}[Group Algebra]
$\bbR[G]$ is the vector space of formal sums $\sum_{g \in G} a_g g$ with convolution product:
\begin{equation}
\left(\sum_g a_g g\right) \cdot \left(\sum_h b_h h\right) = \sum_{c \in G} \left(\sum_{g \in G} a_g b_{g^{-1}c}\right) c,
\end{equation}
 where $a_g$ and $b_h$ are scalar coefficients in $\bbR$.
\end{definition}

%% ========================================================================
\section{The Convolution Tensor, Spectral Decomposition, and Generalized Fourier Matrix}
%% ========================================================================

\begin{definition}[Convolution Tensor]
$\calT \in \bbR^{n \times n \times n}$ is defined by $\calT(a,b,c) = \delta(g_a g_b = g_c)$ (also known as the structure constants of the group algebra).
\end{definition}

\begin{proposition}[Properties of $\calT$]\label{prop:T_props}
\begin{enumerate}
\item[(i)] Associativity:
\[
\sum_d \calT(a,b,d)\,\calT(d,c,e)
        \;=\; \sum_d \calT(a,d,e)\,\calT(b,c,d).
\]
\item[(ii)] Identity: $\calT(e,b,c) = \delta_{bc}$.
\item[(iii)] Each slice $\calT(a,:,:)$ is a permutation matrix.
\end{enumerate}
\end{proposition}

\begin{proof}
(i) follows from associativity of group multiplication; both sides equal $\delta(g_a g_b g_c = g_e)$. (ii)--(iii) follow from the group axioms.
\end{proof}

\begin{definition}[Generalized Fourier Transform Matrix]
$F_G \in \bbC^{n\times n}$ is defined row-wise: the row $F_G(g, :)$ is given by the concatenation
$[\mathrm{rvec}(\rho_1(g)), \dots, \mathrm{rvec}(\rho_\ell(g))]$,
where $\mathrm{rvec}(\rho_i(g))$ denotes the row-vectorization of the matrix $\rho_i(g)$ for each $\rho_i \in \hat{G}$. For abelian groups, $F_G$ is a generalized Fourier matrix and for cyclic groups, it reduces to the standard DFT matrix. For non-abelian groups $F_G$ is invertible but in general neither unitary nor block-unitary.
\end{definition}

\begin{theorem}[Peter--Weyl Spectral Decomposition]\label{thm:pw_spectral}
\begin{equation}
\calT(a,b,c) = \sum_{i,j,k} \calC(i,j,k) \, F_G(a,i) \, F_G(b,j) \, F_G^{-1}(c,k)
\end{equation}
where $\calC$ is a core tensor  that is typically sparse. For abelian groups, $\calC$ is diagonal.
\end{theorem}

\begin{proof}
By the Peter--Weyl theorem, $\{\sqrt{d_\rho}\rho_{ij}(g) : \rho \in \hat{G}\}$ is an orthonormal basis for $L^2(G)$. Convolution becomes block multiplication in the Fourier domain:
$\widehat{(f*h)}(\rho) = \hat{f}(\rho) \cdot \hat{h}(\rho)$.
Writing this in index notation using the row-vectorization construction of $F_G$ yields the spectral decomposition with core $\calC$ determined by the Fourier-domain multiplication structure.
\end{proof}

\begin{corollary}
For $G = \bbZ_n$: $F_G = \mathrm{DFT}_n$ and $\calC$ is diagonal, recovering the circular convolution theorem  (i.e., the $t-$product).
\end{corollary}

%% ========================================================================
\section{The Group Fourier Transform of a Tensor}
%% ========================================================================

\begin{definition}[Group Fourier Transform of a Tensor]\label{def:tensor_ft}
For $\calA \in \bbR^{\ell \times m \times n}$, the Group Fourier transform $\mathcal{F}_G$ assigns to each irrep $\rho \in \hat{G}$ the $\ell d_\rho \times m d_\rho$ block matrix
\begin{equation}
    \hat{\calA}(:,:,\rho) = \sum_{g \in G} \calA(i,j,g)\,\rho(g), \qquad \text{with }(i,j)\text{ indexing the }\ell \times m\text{ blocks.}
\end{equation}
The full Fourier representation is the block-diagonal matrix $\oplus_{\rho \in \hat{G}} \hat{\calA}(:,:,\rho)$.
\end{definition}

\begin{proposition}[Group Fourier Inversion Theorem]
Given $\hat{\calA}(:,:,\rho)$ for each $\rho \in \hat{G}$, the inverse Group Fourier transform recovers
\begin{equation}
    \calA(i,j,g) = \sum_{\rho \in \hat{G}} \frac{d_\rho}{n}\,\mathrm{Tr}\!\left[\hat{\calA}(i,j,\rho)\,\rho(g)^H \right].
\end{equation}
\end{proposition}
\begin{proof}
Follows by applying the standard Peter--Weyl inversion theorem for each fixed $i$ and $j$.
\end{proof}

\begin{remark}
The two notions of ``Fourier transform'' used in the paper are related as follows. The contraction of $\calA$ with $F_G$ along its group dimension (as in SI Section~2) and the block-diagonal form $\oplus_\rho \hat{\calA}(:,:,\rho)$ (Definition~\ref{def:tensor_ft}) are equivalent via the row-vectorization reshaping $\mathrm{rvec}$ used in the construction of $F_G$. The block-diagonal form is used for the SVD computation (one standard matrix SVD per irrep block); the $F_G$-contraction form is used for the product computation and for establishing the spectral decomposition of $\calT$.
\end{remark}

\begin{algorithm}[h]
\caption{$\starG$ product (used by every $\starG$-based method)}
\label{alg:starg_product}
\begin{algorithmic}[1]
\Require $\calA \in \bbR^{\ell \times m \times n}$, $\calB \in \bbR^{m \times p \times n}$, generalized Fourier matrix $F_G \in \bbC^{n \times n}$, irrep block sizes $\{d_\rho\}_{\rho \in \hat{G}}$
\Ensure $\calC = \calA \starG \calB \in \bbR^{\ell \times p \times n}$
\State $\hat{\calA} \gets \calA \times_3 F_G$ \Comment{Generalized Fourier transform along group dimension}
\State $\hat{\calB} \gets \calB \times_3 F_G$
\For{$\rho \in \hat{G}$ \textbf{in parallel}}
    \State extract $\hat{\calA}_\rho \in \bbC^{\ell d_\rho \times m d_\rho}$, $\hat{\calB}_\rho \in \bbC^{m d_\rho \times p d_\rho}$ from block-diagonal form
    \State $\hat{\calC}_\rho \gets \hat{\calA}_\rho \cdot \hat{\calB}_\rho$ \Comment{ordinary matrix product}
\EndFor
\State assemble $\hat{\calC}$ from $\{\hat{\calC}_\rho\}_\rho$ in block-diagonal form
\State $\calC \gets \mathrm{Re}(\hat{\calC} \times_3 F_G^{-1})$ \Comment{inverse Generalized Fourier transform; imaginary part is zero up to roundoff for real inputs}
\State \Return $\calC$
\end{algorithmic}
\end{algorithm}

%% ========================================================================
\section{The $\starG$ Algebra: Properties and Proofs}
%% ========================================================================

\begin{definition}[$\starG$ Product]
For $\calA \in \bbR^{\ell \times m \times n}$, $\calB \in \bbR^{m \times p \times n}$:
\begin{equation}
(\calA \starG \calB)_{ij}(c) = \sum_k \sum_{a \in G} \calA_{ik}(a) \calB_{kj}(a^{-1}c).
\end{equation}
Equivalently: $\widehat{(\calA \starG \calB)}(:,:,\rho) = \hat{\calA}(:,:,\rho) \cdot \hat{\calB}(:,:,\rho)$ for each irrep $\rho \in \hat{G}$.
\end{definition}

\begin{proposition}[Algebraic Properties]
(i)~Associativity. (ii)~Distributivity. (iii)~Identity: $\calI(:,:,e)=I$, $\calI(:,:,g\neq e)=0$.
(iv)~$(\calA \starG \calB)^H = \calB^H \starG \calA^H$.
\end{proposition}

\begin{proof}
All follow from the corresponding matrix properties applied per-irrep in the Fourier domain (Definition~\ref{def:tensor_ft}), plus linearity and invertibility of the Fourier transform.
\end{proof}

%% ========================================================================
\section{The $\starG$-SVD: Full Proof of Optimality}
%% ========================================================================

\begin{theorem}[$\starG$-SVD Existence]\label{thm:svd_exist}
Every $\calA \in \bbR^{\ell \times m \times n}$ admits $\calA = \calU \starG \calS \starG \calV^H$ where $\calU,\calV$ are $\starG$-unitary and $\calS$ is f-diagonal.
\end{theorem}

\begin{proof}
For each irrep $\rho \in \hat{G}$, $\hat{\calA}(:,:,\rho)$ is a standard matrix admitting SVD: $\hat{\calA}(:,:,\rho) = U_\rho \Sigma_\rho V_\rho^H$. Setting $\hat{\calU}(:,:,\rho) = U_\rho$, $\hat{\calS}(:,:,\rho) = \Sigma_\rho$, $\hat{\calV}(:,:,\rho) = V_\rho$ and applying the inverse Fourier transform yields the $\starG$-SVD. Unitarity: $\widehat{\calU^H \starG \calU}(:,:,\rho) = U_\rho^H U_\rho = I$ for all $\rho$, so $\calU^H \starG \calU = \calI$.
\end{proof}

\begin{algorithm}[h]
\caption{$\starG$-SVD}
\label{alg:svd}
\begin{algorithmic}[1]
\Require $\calA \in \bbR^{\ell \times m \times n}$
\Ensure $\calU, \calS, \calV$
\State Compute $\hat{\calA}(:,:,\rho)$ for all $\rho \in \hat{G}$ using $\mathcal{F}_G$ (Definition~\ref{def:tensor_ft})
\For{$\rho \in \hat{G}$}
    \State $[U_\rho, \Sigma_\rho, V_\rho] \gets \mathrm{SVD}(\hat{\calA}(:,:,\rho))$
\EndFor
\State Apply $\mathcal{F}_G^{-1}$ to $\{U_\rho\}$, $\{\Sigma_\rho\}$, $\{V_\rho\}$ to obtain $\calU, \calS, \calV$
\end{algorithmic}
\end{algorithm}

\begin{theorem}[Eckart--Young for $\starG$]\label{thm:ey_full}
Let $\calA = \calU \starG \calS \starG \calV^H$ with singular tubes ordered by $\|\mathbf{s}_1\|_F \geq \cdots \geq \|\mathbf{s}_r\|_F$. The rank-$k$ truncation $\calA_k$ satisfies:
\begin{equation}
\|\calA - \calA_k\|_F^2 = \sum_{i=k+1}^r \|\mathbf{s}_i\|_F^2 \leq \|\calA - \calB\|_F^2
\end{equation}
for any $\calB$ with $\starG$-rank $\leq k$.
\end{theorem}

\begin{proof}
\textbf{Step 1 (Parseval).} By the generalized Fourier transform's isometry (Peter--Weyl):
\begin{equation}
\|\calA - \calB\|_F^2 = \sum_{\rho \in \hat{G}} \frac{d_\rho}{n}\|\hat{\calA}(:,:,\rho) - \hat{\calB}(:,:,\rho)\|_F^2.
\end{equation}

\textbf{Step 2 (Per-irrep Eckart--Young).} If $\starG$-rank$(\calB) \leq k$, then $\mathrm{rank}(\hat{\calB}(:,:,\rho)) \leq k$ for each $\rho$. By the classical Eckart--Young theorem:
\begin{equation}
\|\hat{\calA}(:,:,\rho) - \hat{\calA}_k(:,:,\rho)\|_F^2 \leq \|\hat{\calA}(:,:,\rho) - \hat{\calB}(:,:,\rho)\|_F^2.
\end{equation}

\textbf{Step 3 (Summation).} Summing over $\rho \in \hat{G}$:
\begin{equation}
\|\calA - \calA_k\|_F^2 = \sum_{\rho \in \hat{G}} \frac{d_\rho}{n}\sum_{i=k+1}^r \sigma_i(\rho)^2
= \sum_{i=k+1}^r \underbrace{\sum_\rho \frac{d_\rho}{n}\sigma_i(\rho)^2}_{= \|\mathbf{s}_i\|_F^2} \leq \|\calA - \calB\|_F^2.
\end{equation}
\end{proof}

\begin{remark}
This is \emph{exact} optimality. By contrast: CP decomposition is NP-hard to compute optimally; Tucker/HOSVD provides only quasi-optimal guarantees with factor $\sqrt{d}$ \cite{desilva2008tensor}; tensor-train has no global optimality. The $\starG$-SVD achieves both polynomial-time computation and exact optimality by leveraging group structure.
\end{remark}

%% ========================================================================
\section{Product Groups: Full Proof}
%% ========================================================================

\begin{theorem}[Product Group Ring Isomorphism]\label{thm:prod_full}
For $G = G_1 \times \cdots \times G_d$:
(i)~$\bbK_G \cong \bbK_{G_1} \otimes \cdots \otimes \bbK_{G_d}$.
(ii)~$\calT_G = \calT_{G_1} \otimes \cdots \otimes \calT_{G_d}$.
(iii)~$F_G = F_{G_1} \otimes \cdots \otimes F_{G_d}$.
\end{theorem}

\begin{proof}
\textbf{(i)} Standard: $\bbR[G_1 \times \cdots \times G_d] \cong \bbR[G_1] \otimes \cdots \otimes \bbR[G_d]$ \cite{serre1977linear}.

\textbf{(ii)} The product group multiplication $(a_1,\ldots,a_d)(b_1,\ldots,b_d) = (a_1 b_1,\ldots,a_d b_d)$ gives
\begin{equation}
\calT_G(\mathbf{a},\mathbf{b},\mathbf{c}) = \prod_{i=1}^d \calT_{G_i}(a_i,b_i,c_i) = (\calT_{G_1} \otimes \cdots \otimes \calT_{G_d})(\mathbf{a},\mathbf{b},\mathbf{c}).
\end{equation}

\textbf{(iii)} Irreps of $G_1 \times \cdots \times G_d$ are tensor products $\rho_1 \otimes \cdots \otimes \rho_d$. Matrix elements factor: $(\rho_1 \otimes \cdots \otimes \rho_d)(g_1,\ldots,g_d) = \rho_1(g_1) \otimes \cdots \otimes \rho_d(g_d)$. By the $\mathrm{rvec}$ construction of $F_G$ and the mixed-product property of Kronecker products, $F_G = F_{G_1} \otimes \cdots \otimes F_{G_d}$.
\end{proof}

\begin{corollary}[2D Frequency Resolution]
For $G = \bbZ_{n_1} \times \bbZ_{n_2}$, the Fourier transform $F_G = \mathrm{DFT}_{n_1} \otimes \mathrm{DFT}_{n_2}$ computes a 2D DFT, resolving coupled frequencies $(f_1, f_2)$ that are invisible to either factor alone.
\end{corollary}

%% ========================================================================
\section{Invariant Feature Extraction Algorithm}
\label{sec:feat_alg}
%% ========================================================================

The $\starG$-feature extractor used in every linear and Neural-$\starG$
experiment of this paper is given in Algorithm~\ref{alg:features}. The seven
feature blocks (a)--(g) below correspond to the seven concatenated columns
of the output. Block (a) is the DC component; (b) the AC standard
deviation; (c) the global per-frequency power; (d) the per-row Fourier power
restricted to the first $K$ equivariant rows; (e) the singular tube norms
from the $\starG$-SVD; (f) the rows of $\calX$ that are constant under the
group action (recovered by row-variance thresholding); and (g) four spectral
statistics of the unfolded matrix $\calX_{(1)}$. Every feature is exactly
$G$-invariant; proofs are in SI Section~7.

\begin{algorithm}[!tb]
\caption{$\starG$ invariant feature extraction (\texttt{extractStarGFeatures.m})}
\label{alg:features}
\begin{algorithmic}[1]
\Require batch $\calX \in \bbR^{N \times n_f \times n}$, group algebra $G$, optional normalization parameters $\Theta$ from training
\Ensure feature matrix $\Phi \in \bbR^{N \times d_{\mathrm{feat}} + 1}$ ($+1$ for the unregularized intercept) and updated $\Theta$
\If{$\Theta$ is not provided}
    \State $(p, q) \gets \arg\max_{p \cdot q \leq n_f} \min(p, q)$ \Comment{best near-square reshape for $\starG$-SVD}
    \State $n_{\mathrm{svd}} \gets \min(p, q)$
    \State \(\sigma_{\mathrm{row}} \gets \mathrm{var}_g(\calX(1, :, :))\); $\mathrm{inv\_mask} \gets \sigma_{\mathrm{row}} < 10^{-8} \cdot \max(\sigma_{\mathrm{row}})$ \Comment{rows constant under $G$}
    \State $\mathrm{eq\_idx} \gets \{j : \mathrm{inv\_mask}_j = \text{false}\}$;\ $K \gets \min(14, |\mathrm{eq\_idx}|)$
\EndIf
\State $\bar{\calX}_{ij} \gets \tfrac{1}{n} \sum_g \calX(i, j, g)$ \Comment{(a) DC component, $N \times n_f$}
\State $\sigma_{ij} \gets \mathrm{std}_g \calX(i, j, g)$ \Comment{(b) AC energy, $N \times n_f$}
\State $\hat{\calX} \gets \calX \times_3 F_G$ \Comment{Generalized Fourier transform along group dim}
\State $P^{\mathrm{col}}_{ik} \gets \sum_j |\hat{\calX}(i, j, k)|^2$ \Comment{(c) per-frequency power, $N \times n$}
\State $P^{\mathrm{row}}_{i,(r-1)n + k} \gets |\hat{\calX}(i, \mathrm{eq\_idx}_r, k)|^2$ \Comment{(d) per-row Fourier power, $N \times K n$}
\For{$i = 1, \ldots, N$}
    \State $X_i \gets$ pad $\calX(i, :, :)$ to $p \times q \times n$
    \State $[\,\cdot\,, \calS_i, \,\cdot\,] \gets \mathrm{starG\_SVD}(X_i)$ \Comment{(e) singular tubes}
    \State $T_{i, k} \gets \|\calS_i(k, k, :)\|_F$ for $k = 1, \ldots, n_{\mathrm{svd}}$; sort $T_{i, :}$ descending
    \State $V_{ij} \gets \calX(i, j, 1)$ for $j$ with $\mathrm{inv\_mask}_j$ \Comment{(f) direct invariants}
    \State $\Sigma_i \gets \mathrm{svd}(\calX_{(1)}(i, :, :))$ \Comment{(g) spectral statistics}
    \State $S_i \gets [\sum \Sigma_i,\ \Sigma_{i,1},\ \Sigma_{i,1} / \Sigma_{i,\mathrm{end}},\ -\sum_k \tilde{\Sigma}_{i,k} \log \tilde{\Sigma}_{i,k}]$ where $\tilde{\Sigma}_i = \Sigma_i / \sum \Sigma_i$
\EndFor
\State $\Phi \gets [\bar{\calX}\ |\ \sigma\ |\ P^{\mathrm{col}}\ |\ P^{\mathrm{row}}\ |\ T\ |\ V\ |\ S]$
\State replace NaN/Inf with 0
\If{$\Theta$ not provided}
    \State $\mathrm{keep} \gets \{j : \mathrm{std}(\Phi_{:, j}) \geq 10^{-8}\}$
    \State $\mu \gets \mathrm{mean}(\Phi_{:, \mathrm{keep}})$;\ $s \gets \mathrm{std}(\Phi_{:, \mathrm{keep}})$;\ $s_j \gets \max(s_j, 1)$
    \State store $\Theta = (p, q, n_{\mathrm{svd}}, \mathrm{inv\_mask}, \mathrm{eq\_idx}, K, \mathrm{keep}, \mu, s)$
\EndIf
\State $\Phi \gets (\Phi_{:, \mathrm{keep}} - \mu) / s$
\State $\Phi \gets [\,\mathbf{1}_N\ |\ \Phi\,]$ \Comment{prepend unregularized intercept}
\State \Return $(\Phi, \Theta)$
\end{algorithmic}
\end{algorithm}

%% ========================================================================
\section{Equivariance Proofs}
%% ========================================================================

\begin{proposition}[Equivariance of $\starG$]
$(g \cdot \calA) \starG \calB = g \cdot (\calA \starG \calB) = \calA \starG (g \cdot \calB)$.
\end{proposition}

\begin{proof}
By definition of the group action and the $\starG$ product,
\begin{align*}
\bigl((g\cdot\calA)\starG\calB\bigr)_{ij}(h)
   &= \sum_k \sum_{a\in G} \calA_{ik}(g^{-1}a)\,\calB_{kj}(a^{-1}h).
\end{align*}
Substituting $a'=g^{-1}a$ (so $a=ga'$ and $a^{-1}=a'^{-1}g^{-1}$),
\begin{align*}
   &= \sum_k \sum_{a'\in G} \calA_{ik}(a')\,\calB_{kj}(a'^{-1}g^{-1}h) \\
   &= (\calA\starG\calB)_{ij}(g^{-1}h)
    = \bigl(g\cdot(\calA\starG\calB)\bigr)_{ij}(h).
\end{align*}
The right-equivariance identity follows by the symmetric argument applied
to the second factor.
\end{proof}

\begin{corollary}[Invariance of Features]
The following are invariant under $g \cdot X$:
(i)~$\|\mathbf{s}_i\|_F$ (singular tube norms);
(ii)~$\|\hat{X}(:,:,\rho)\|_F^2$ (per-irrep Fourier power);
(iii)~$\bar{X}_j = \frac{1}{n}\sum_g X(j,g)$ (DC component).
\end{corollary}

\begin{proof}
(i) The group action permutes frontal slices; the SVD is computed per-irrep where the action multiplies each block by a unitary, preserving singular values. (ii) The group action shifts $g \to g'g$ inside the sum defining $\hat{X}(:,:,\rho)$, multiplying the block by $\rho(g')$, which is unitary, leaving the Frobenius norm unchanged. (iii) $\frac{1}{n}\sum_g X(j,g'g) = \frac{1}{n}\sum_h X(j,h) = \bar{X}_j$.
\end{proof}

%% ========================================================================
\section{Extended Data}
%% ========================================================================

\begin{figure}[!ht]
\centering
\includegraphics[width=0.7\textwidth]{fig_ablation.pdf}
\caption{\textbf{Extended Data Figure~1: Ablation of symmetry components.} Removing group structure from the product group experiment causes systematic degradation.}
\label{fig:ext_ablation}
\end{figure}

\begin{figure}[!ht]
\centering
\includegraphics[width=0.85\textwidth]{fig_scatter.pdf}
\caption{\textbf{Extended Data Figure~2: Predicted vs.\ true (synthetic).} (a)~$\starG$-SVD: perfect diagonal. (b)~Standard MLP: scattered.}
\label{fig:ext_scatter}
\end{figure}

\begin{table}[h]
\centering
\caption{Per-method hyperparameter settings used in all experiments. Hidden
widths $[64, 32]$, ReLU activations, He initialization, and an unregularized
bias per layer are common to all neural baselines. ``Native'' = original
training set of size $n$; ``$|G|$-aug'' = original training set augmented by
applying every $g \in G$ to each input, yielding $|G| \cdot n$ samples.}
\label{tab:hyperparams_full}
\begin{tabular}{@{}p{2.4cm}p{3.4cm}p{2.2cm}p{1.6cm}p{1.0cm}p{1.0cm}p{1.4cm}@{}}
\toprule
\textbf{Method} & \textbf{Input} & \textbf{Architecture} &
\textbf{Train set} & \textbf{lr} & \textbf{Epochs} & \textbf{Batch} \\
\midrule
$\starG$-SVD + Ridge & $\starG$-features & Linear, ridge $\lambda \in \{10^{-3},\ldots,10^3\}$ (val.) & native & --  & --  & full \\
Standard MLP        & raw $\calX(:, e)$ ($z$-norm)             & $[n_f, 64, 32, 1]$  & native     & 0.003 & 300\textsuperscript{a} & 256\textsuperscript{b} \\
Invariant MLP       & $[\mathrm{mean}, \mathrm{std}, \min, \max]_g$ $\calX$ ($z$-norm) & $[4 n_f, 64, 32, 1]$ & native & 0.003 & 300 & 256 \\
Augmented MLP       & raw $\calX(:, e)$ ($z$-norm on aug.\ set) & $[n_f, 64, 32, 1]$  & $|G|$-aug & 0.003\textsuperscript{c} & 80--300\textsuperscript{d} & 256 \\
Neural $\starG$     & $\starG$-features                        & $\starG$ layers $[\cdot, 64, 32, 1]$ & native & 0.003 & 300 & 32 \\
\bottomrule
\end{tabular}
\medskip

\noindent\footnotesize
\textsuperscript{a}~80 epochs in the synthetic $\bbZ_{12}$ experiment;
300 elsewhere.\quad
\textsuperscript{b}~32 in the synthetic experiment.\quad
\textsuperscript{c}~0.005 in the synthetic experiment.\quad
\textsuperscript{d}~80 in the synthetic experiment, 200 in the
product-group experiment, 300 elsewhere; the smaller epoch budget for
heavily augmented training reflects the $|G|$-fold larger gradient budget per
epoch. Optimizer is Adam ($\beta_1 = 0.9, \beta_2 = 0.999, \varepsilon =
10^{-8}$) with early-stopping patience 20 on validation MSE for every neural
baseline.
\end{table}

\begin{table}[h]
\centering
\caption{Computational cost (wall-clock, single seed, 1{,}000 molecules).}
\begin{tabular}{@{}lccc@{}}
\toprule
\textbf{Method} & \textbf{Feature (s)} & \textbf{Train (s)} & \textbf{Total (s)} \\
\midrule
$\starG$-SVD + Ridge & 0.6 & $<$0.1 & 0.7 \\
Neural $\starG$ & 0.6 & 0.5 & 1.1 \\
Standard MLP & -- & 0.4 & 0.4 \\
Invariant MLP & -- & 0.5 & 0.5 \\
Augmented MLP & -- & 4.0 & 4.0 \\
\bottomrule
\end{tabular}
\end{table}

%% ========================================================================
\section{Wigner--Eckart Discovery: Extended Data}
%% ========================================================================

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{irrep_bars.pdf}
\caption{\textbf{Extended Data Figure~4: Per-irrep predictive power.} Grouped bar chart showing $R^2$ from each irrep's features alone, for all 9 quantum properties. The qualitative pattern shift between scalar properties (A$_1$-dominated) and dipole vector components (T$_1$-dominated) is the data-driven signature of the Wigner--Eckart selection rules.}
\end{figure}

\begin{figure}[!ht]
\centering
\includegraphics[width=\textwidth]{irrep_heatmap.pdf}
\caption{\textbf{Extended Data Figure~5: Irrep decomposition heatmap.} $R^2$ for each (property, irrep) pair, sorted by tensor rank. The block structure separating rank-0 from rank-1 properties is visible as a qualitative change in the A$_1$ and T$_1$ columns across the horizontal separator.}
\end{figure}

The octahedral group $O$ was constructed programmatically from its 24 rotation matrices (6 face, 8 vertex, 6 edge rotations plus identity). The multiplication table was verified to satisfy the group axioms. The five irreducible representations were constructed as: A$_1$ (trivial), A$_2$ (determinant), T$_1$ (the rotation matrices themselves, 3D), and E + T$_2$ (from the rank-2 symmetric traceless tensor representation, decomposed into the 2D and 3D invariant subspaces). All representations were verified to be closed under the group multiplication.

\begin{algorithm}[h]
\caption{Per-irrep Fourier decomposition for Wigner--Eckart analysis}
\label{alg:per_irrep}
\begin{algorithmic}[1]
\Require feature batch $\calX \in \bbR^{N \times n_f \times |G|}$, irreps $\hat{G} = \{\rho_1, \ldots, \rho_M\}$ with dimensions $d_\rho$ and matrices $\rho(g)$, target vector $y \in \bbR^N$, ridge grid $\Lambda$
\Ensure per-irrep predictive scores $\{R^2_\rho\}_{\rho \in \hat{G}}$
\For{$\rho \in \hat{G}$}
    \For{$i = 1, \ldots, N$ and $j = 1, \ldots, n_f$ \textbf{in parallel}}
        \State $\hat{X}^\rho_{ij} \gets \sqrt{d_\rho / |G|} \sum_{g} \calX(i, j, g) \, \rho(g) \in \bbC^{d_\rho \times d_\rho}$
        \State $P^\rho_{ij} \gets \|\hat{X}^\rho_{ij}\|_F^2$ \Comment{$G$-invariant power}
    \EndFor
    \State assemble $\Phi^\rho \in \bbR^{N \times n_f}$ from $\{P^\rho_{ij}\}$
    \State split $\Phi^\rho, y$ into train/val/test (70/15/15); standardize $\Phi^\rho$
    \State $\lambda^\star \gets \arg\min_{\lambda \in \Lambda} \mathrm{MSE}_{\mathrm{val}}(\Phi^\rho, y; \lambda)$
    \State $w_\rho \gets (\Phi^{\rho \top}_{\mathrm{train}} \Phi^\rho_{\mathrm{train}} + \lambda^\star I)^{-1} \Phi^{\rho \top}_{\mathrm{train}} y_{\mathrm{train}}$
    \State $R^2_\rho \gets 1 - \mathrm{SS}_{\mathrm{res}}(\Phi^\rho_{\mathrm{test}} w_\rho, y_{\mathrm{test}}) / \mathrm{SS}_{\mathrm{tot}}(y_{\mathrm{test}})$
\EndFor
\State \Return $\{R^2_\rho\}$, $\{T_1 / A_1\text{ ratio}\}$, irrep heatmap data
\end{algorithmic}
\end{algorithm}

%% ========================================================================
\section{Symmetry and Factorization Discovery Algorithm}
\label{sec:discovery_alg}
%% ========================================================================

The symmetry-discovery experiment of Section~2.4 of the main text scans a
candidate library of finite groups, fits the $\starG$ pipeline with each
candidate, and selects the group that maximizes a combined accuracy /
invariance score. Algorithm~\ref{alg:discovery} states the procedure
precisely. The factorization-discovery experiment uses the same algorithm
restricted to candidate groups of the form $\bbZ_a \times \bbZ_b$ with
$ab = n_{\mathrm{group}}$.

\begin{algorithm}[h]
\caption{Symmetry / factorization discovery via $\starG$ score scan}
\label{alg:discovery}
\begin{algorithmic}[1]
\Require dataset $(\calX, y)$, candidate library $\mathcal{G} = \{G_1, \ldots, G_K\}$, mixing weight $\alpha \in [0, 1]$ (default 0.7)
\Ensure ranked list $\{(G_k, \mathrm{score}_k)\}_{k=1}^K$
\For{$G_k \in \mathcal{G}$}
    \State construct $G_k$, $\calT_{G_k}$, $F_{G_k}$ (cached)
    \State $\Phi_k \gets$ Algorithm~\ref{alg:features}$(\calX, G_k)$
    \State split into train/val/test; standardize
    \State $\lambda^\star \gets \arg\min_{\lambda} \mathrm{MSE}_{\mathrm{val}}(\Phi_k, y; \lambda)$
    \State $R^2_k \gets$ ridge-regression $R^2$ on the validation fold
    \State $\nu_k \gets \mathrm{Var}_{g \in G_k} \hat{y}_k(g \cdot \calX_{\mathrm{val}})$ \Comment{rotation variance under $G_k$}
\EndFor
\State $\nu_{\max} \gets \max_k \nu_k$
\State $\mathrm{score}_k \gets \alpha \cdot R^2_k + (1 - \alpha) \cdot (1 - \nu_k / \nu_{\max})$
\State \Return $\{(G_k, \mathrm{score}_k)\}$ sorted descending by score
\end{algorithmic}
\end{algorithm}

%% ========================================================================
\section{Baseline Implementation Algorithms}
\label{sec:baseline_alg}
%% ========================================================================

This section gives explicit pseudocode for the two non-trivial baselines used
in the main paper: the Augmented MLP (the strongest non-equivariant
baseline) and the Neural~$\starG$ network (the equivariant
non-linear baseline). The Standard MLP and Invariant MLP differ from the
Augmented MLP only by their input representation (raw frontal slice and
$[\mathrm{mean}, \mathrm{std}, \min, \max]_g \calX$ respectively) and by the
absence of orbit augmentation; both follow the standard MLP training loop
that wraps Algorithm~\ref{alg:augmented_mlp} once augmentation is removed.

\begin{algorithm}[h]
\caption{Augmented MLP training}
\label{alg:augmented_mlp}
\begin{algorithmic}[1]
\Require training tensor $\calX^{\mathrm{tr}} \in \bbR^{n \times n_f \times |G|}$, target $y^{\mathrm{tr}} \in \bbR^n$, group $G$, validation $(\calX^{\mathrm{va}}, y^{\mathrm{va}})$, hidden widths $h = [64, 32]$, learning rate $\eta$, max epochs $E$, batch size $B$, patience $P$
\Ensure trained weights $W = \{W^{(1)}, W^{(2)}, W^{(3)}\}$, biases $b$
\State $\tilde{X} \gets \mathrm{reshape}(\mathrm{permute}(\calX^{\mathrm{tr}}, [1, 3, 2]), [\,n |G|, n_f\,])$ \Comment{stack all $|G|$ orbit copies}
\State $\tilde{y} \gets \mathrm{repmat}(y^{\mathrm{tr}}, |G|, 1)$ \Comment{labels are $G$-invariant; replicate}
\State $(\mu, s) \gets (\mathrm{mean}(\tilde{X}), \mathrm{std}(\tilde{X}) + 10^{-8})$ \Comment{$z$-norm on augmented set}
\State $\tilde{X} \gets (\tilde{X} - \mu) / s$
\State initialize $W^{(\ell)} \sim \mathcal{N}(0, 2 / \mathrm{fan\_in}_\ell)$ (He init); $b^{(\ell)} \gets 0$
\State Adam state: $m^{(\ell)}, v^{(\ell)} \gets 0$, step counter $t \gets 0$
\State best-$W \gets W$; $\mathrm{wait} \gets 0$; $\mathrm{best\_val} \gets +\infty$
\For{$\mathrm{epoch} = 1, \ldots, E$}
    \State shuffle $\tilde{X}$
    \For{each minibatch of size $B$}
        \State $t \gets t + 1$
        \State forward: $A^{(0)} \gets X_{\mathrm{batch}}$;\ $A^{(\ell)} \gets \mathrm{ReLU}(W^{(\ell)} A^{(\ell-1)} + b^{(\ell)})$ for $\ell < L$;\ $A^{(L)} \gets W^{(L)} A^{(L-1)} + b^{(L)}$
        \State $\mathcal{L} \gets \tfrac{1}{B} \|A^{(L)} - y_{\mathrm{batch}}\|^2$
        \State backprop $\nabla_W \mathcal{L}, \nabla_b \mathcal{L}$
        \State Adam update: $m, v$ exponential moving averages with $\beta_1 = 0.9, \beta_2 = 0.999, \varepsilon = 10^{-8}$
        \State $W^{(\ell)} \gets W^{(\ell)} - \eta \cdot \hat{m}^{(\ell)} / (\sqrt{\hat{v}^{(\ell)}} + \varepsilon)$
    \EndFor
    \State $\mathcal{L}_{\mathrm{val}} \gets$ MSE on $((\calX^{\mathrm{va}}(:, :, e) - \mu) / s, y^{\mathrm{va}})$
    \If{$\mathcal{L}_{\mathrm{val}} < \mathrm{best\_val}$}
        \State $\mathrm{best\_val} \gets \mathcal{L}_{\mathrm{val}}$;\ best-$W \gets W$;\ $\mathrm{wait} \gets 0$
    \Else
        \State $\mathrm{wait} \gets \mathrm{wait} + 1$;\ \textbf{if} $\mathrm{wait} \geq P$ \textbf{break}
    \EndIf
\EndFor
\State \Return best-$W$, best-$b$
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[h]
\caption{Neural $\starG$ forward pass and gradient}
\label{alg:neural_starg}
\begin{algorithmic}[1]
\Require batch $\calX \in \bbR^{N \times n_f \times |G|}$, $\starG$-weights $\{W^{(\ell)}\}_{\ell=1}^{L}$ with $W^{(\ell)} \in \bbR^{n_{\ell+1} \times n_\ell \times |G|}$, biases $\{b^{(\ell)}\} \in \bbR^{n_{\ell+1} \times 1 \times |G|}$, group $G$
\Ensure scalar predictions $\hat{y} \in \bbR^N$
\State $\calA^{(0)} \gets \calX$
\For{$\ell = 1, \ldots, L$}
    \State $\calZ^{(\ell)} \gets W^{(\ell)} \starG \calA^{(\ell-1)} + b^{(\ell)}$ \Comment{Algorithm~\ref{alg:starg_product}}
    \If{$\ell < L$}
        \State $\calA^{(\ell)} \gets \mathrm{ReLU}(\calZ^{(\ell)})$
    \Else
        \State $\calA^{(\ell)} \gets \calZ^{(\ell)}$ \Comment{linear output}
    \EndIf
\EndFor
\State $\hat{y}_i \gets \tfrac{1}{n_L |G|} \sum_{j, g} \calA^{(L)}(i, j, g)$ \Comment{$G$-invariant pooling}
\State \Return $\hat{y}$
\State \textbf{Training} (300 epochs, Adam $\eta = 0.003$, batch 32, patience 20): backprop through Algorithm~\ref{alg:starg_product} layer-by-layer; equivariance is preserved exactly to floating-point precision because each step factors through the per-irrep block multiplication.
\end{algorithmic}
\end{algorithm}

\subsection{ENN baselines: SchNet, e3nn, MACE}
\label{sec:enn_baselines}

The three ENN baselines used in Section~2.7 are not reimplemented from
scratch: each is a published reference implementation, executed on the
same train/val/test splits and the same seeds as the $\starG$ and MLP
baselines. The configuration we used is recorded explicitly so that the
comparison is reproducible.

\begin{itemize}
\item \textbf{SchNet}~\cite{schutt2017schnet}.
  Reference implementation: \texttt{schnetpack} v2.0.4 (pinned in the
  repository's \texttt{requirements.txt}).
  Configuration: $128$ atom-basis features, $6$ interaction blocks,
  $20$-Gaussian radial basis, cosine cutoff at $5.0$\,\AA, MSE loss with
  L1 monitoring, Adam $\eta = 5{\times}10^{-4}$, batch $64$, max $200$
  epochs, early-stop patience $20$. We run the standard
  \texttt{spk.datasets.QM9} loader with \texttt{remove\_uncharacterized
  =True} to match PyG's $130{,}831$-molecule subset, the
  \texttt{ASENeighborList(cutoff=5.0)} transform, and a per-target
  $z$-norm via \texttt{RemoveOffsets}/\texttt{AddOffsets}.
  Scalar targets only ($\mu$, $\alpha$, gap, ZPVE).

\item \textbf{e3nn-based SE(3)-equivariant model}~\cite{thomas2018tensor,geiger2022e3nn}.
  A compact equivariant message-passing network built directly from
  \texttt{e3nn} v0.5.4 primitives. Three layers, hidden irreps
  \texttt{32x0e+16x1o+8x2e}, edge spherical harmonics
  \texttt{1x0e+1x1o+1x2e}, RBF $16$ Gaussians on the $0\!-\!5$\,\AA{}
  cutoff, gated equivariant non-linearities, sum-pool over atoms,
  $\texttt{FullyConnectedTensorProduct}$ head with output irreps matched
  to target rank (\texttt{1x0e} for scalars, \texttt{1x1o} for
  $\boldsymbol{\mu}$ vector, \texttt{1x2e+1x0e} for $\alpha$ tensor).
  Adam $\eta = 5{\times}10^{-4}$, batch $32$, $200$ epochs, patience
  $20$. Used for tensor-rank-matched comparison rather than as the SOTA
  ENN target.

\item \textbf{MACE}~\cite{batatia2022mace}.
  Reference implementation: \texttt{mace-torch} v0.3.15 (pinned in
  \texttt{requirements.txt}). Configuration:
  \texttt{ScaleShiftMACE} with $r_{\max} = 5.0$\,\AA, $8$ Bessel radial
  features, $5$-th order polynomial cutoff, $\ell_{\max}=3$,
  correlation $3$, two interaction blocks
  (\texttt{RealAgnosticInteractionBlock} first;
  \texttt{RealAgnosticResidualInteractionBlock} second), hidden irreps
  \texttt{128x0e+128x1o}, MLP irreps \texttt{16x0e}, $5$ elements
  (H/C/N/O/F), per-target shift/scale $(\bar y, \mathrm{std}\,y)$.
  Optimizer: Adam (\texttt{amsgrad}), $\eta=10^{-3}$, batch $32$,
  \texttt{ReduceLROnPlateau} (factor $0.5$, patience $15$), max $200$
  epochs, early-stop patience $25$ on validation MSE. Total parameter
  count: $945{,}168$. Scalar targets only.
\end{itemize}

\noindent
\emph{Implementation notes.} Three non-trivial integration adjustments
were required that may be useful to anyone reproducing the comparison.
(i)~In \texttt{schnetpack}'s \texttt{ModelOutput}, every metric must be
an \texttt{nn.Module} (lambda functions are silently rejected by
\texttt{nn.ModuleDict}); we use only \texttt{L1Loss} as the metric and
recompute RMSE/$R^2$ ourselves from saved test predictions.
(ii)~In \texttt{mace-torch} ${\geq}\,0.3.10$, \texttt{interaction\_cls}
and \texttt{interaction\_cls\_first} are mandatory, and
\texttt{hidden\_irreps}/\texttt{MLP\_irreps} must be \texttt{o3.Irreps}
objects, not strings; per-molecule \texttt{AtomicData} construction
requires a single shared \texttt{z\_table} for the full element set
(H/C/N/O/F) so that the per-graph \texttt{node\_attrs} have uniform
width when batched. (iii)~Loading \texttt{e3nn 0.4.4}'s pickled
\texttt{constants.pt} (transitively imported by \texttt{mace-torch})
fails under PyTorch ${\geq}\,2.6$'s default
\texttt{weights\_only=True}; we set
\texttt{TORCH\_FORCE\_NO\_WEIGHTS\_ONLY\_LOAD=1} for the comparison
runs only.

%% ========================================================================
\section{End-to-End Workflow}
\label{sec:workflow}
%% ========================================================================

The complete pipeline used to produce every numerical result in this paper
is summarized in Algorithm~\ref{alg:pipeline}. The pipeline is identical
across the synthetic, QM9, product-group, symmetry-discovery, and
Wigner--Eckart experiments; only the group $G$, the featurization
$\phi: \mathrm{molecule} \mapsto \calX$, and the regression head differ.

\begin{algorithm}[h]
\caption{End-to-end $\starG$-SVD~+~ridge pipeline}
\label{alg:pipeline}
\begin{algorithmic}[1]
\Require dataset $\{(\mathrm{mol}_i, y_i)\}_{i=1}^{N}$, group $G$, featurizer $\phi$, ridge grid $\Lambda$
\Ensure trained predictor $\hat{f}$, test scores
\State precompute $\calT_G$, $F_G$, irrep dimensions $\{d_\rho\}$
\For{$i = 1, \ldots, N$}
    \State $\calX_i \gets \phi(\mathrm{mol}_i; G)$ \Comment{tensorial featurization}
\EndFor
\State assemble $\calX \in \bbR^{N \times n_f \times |G|}$
\State split $\calX, y$ into train/val/test (70/15/15)
\State $(\Phi_{\mathrm{tr}}, \Theta) \gets$ Algorithm~\ref{alg:features}$(\calX_{\mathrm{tr}}, G)$
\State $\Phi_{\mathrm{va}}, \Phi_{\mathrm{te}} \gets$ Algorithm~\ref{alg:features}$(\cdot, G; \Theta)$
\State $\lambda^\star \gets \arg\min_{\lambda \in \Lambda} \mathrm{MSE}_{\mathrm{val}}(\Phi_{\mathrm{va}}, y_{\mathrm{va}}; \lambda)$
\State $w \gets (\Phi_{\mathrm{tr}}^\top \Phi_{\mathrm{tr}} + \lambda^\star I)^{-1} \Phi_{\mathrm{tr}}^\top y_{\mathrm{tr}}$
\State $R^2_{\mathrm{te}} \gets 1 - \mathrm{SS_{res}}(\Phi_{\mathrm{te}} w, y_{\mathrm{te}}) / \mathrm{SS_{tot}}(y_{\mathrm{te}})$
\State $\nu \gets \mathrm{Var}_{g \in G} \hat{y}(g \cdot \calX_{\mathrm{te}})$ \Comment{rotation-variance audit}
\State \Return $\hat{f}: \calX \mapsto \mathrm{Algorithm~\ref{alg:features}}(\calX; \Theta) \cdot w$, $R^2_{\mathrm{te}}$, $\nu$
\end{algorithmic}
\end{algorithm}

%% ========================================================================
\section{Formal Verification in Lean~4}
\label{sec:lean}
%% ========================================================================

All core algebraic results in this paper have been machine-verified in the
Lean~4 proof assistant~\cite{demoura2021lean4} using the Mathlib
library~\cite{mathlib2020}.  The formalization comprises 600 lines of
Lean~4, with zero unresolved proof obligations
 (\texttt{sorry}), providing a certificate of correctness for every
theorem, lemma, and corollary in the main text and supplementary
information.

\subsection{Architecture}

The formalization is organized into six modules mirroring the paper's
logical structure:

\begin{center}
\begin{tabular}{@{}llrl@{}}
\toprule
\textbf{Module} & \textbf{Paper section} & \textbf{Lines} & \textbf{Content} \\
\midrule
\texttt{Basic.lean}         & SI~\S\S1--4   & 78  & Convolution tensor, $\starG$ product, transpose \\
\texttt{Algebra.lean}       & SI~\S4        & 107 & Associativity, distributivity, identity laws \\
\texttt{ProductGroup.lean}  & Theorem~2     & 70  & Product group factorization, Kronecker irreps \\
\texttt{Equivariance.lean}  & SI~\S7        & 86  & Equivariance, Frobenius/Fourier invariance \\
\texttt{WignerEckart.lean}  & \S2.5         & 85  & Octahedral irreps, selection rules \\
\texttt{SVD.lean}           & Theorem~1     & 174 & $\starG$-SVD, Eckart--Young optimality \\
\bottomrule
\end{tabular}
\end{center}

\subsection{Axiom Budget}

Five standard results from linear algebra and finite-group harmonic analysis
are axiomatized because they are not yet available in Mathlib.  Every other
statement is derived from first principles.

\begin{center}
\begin{tabular}{@{}lll@{}}
\toprule
\textbf{Axiom} & \textbf{Content} & \textbf{Reference} \\
\midrule
\texttt{matrix\_best\_rank\_k\_approx} & Classical Eckart--Young & \cite{eckart1936approximation} \\
\texttt{parseval\_group} & Plancherel identity, finite groups & \cite{serre1977linear}, \S2.4 \\
\texttt{fourier\_multiplicative} & $\starG$ maps to block products & \cite{serre1977linear}, Ch.~7 \\
\texttt{fourier\_surjective} & Generalized Fourier inversion & \cite{peterweil1927} \\
\texttt{fourier\_injective} & Peter--Weyl completeness (declared, currently unused) & \cite{peterweil1927} \\
\bottomrule
\end{tabular}
\end{center}

\subsection{Key Proof Techniques}

\paragraph{Associativity of $\starG$ (Proposition~4.2(i)).}
Rather than fragile nested sum-exchange calls (\texttt{Finset.sum\_comm}),
we define an explicit \texttt{Equiv} on the 4-tuple product type
$\mathrm{Fin}\,p \times (G \times (\mathrm{Fin}\,m \times G))$ that
simultaneously permutes components and applies the bijection
$b \mapsto a^{-1}b$.  A single call to \texttt{Fintype.sum\_equiv} then
completes the proof, with the group-element arithmetic handled by the
\texttt{group} tactic.

\paragraph{Kronecker product of irreps (Theorem~2(iii)).}
The tensor-product representation $\rho_1 \otimes \rho_2$ is defined
entry-wise via \texttt{finProdFinEquiv.symm}, mapping
$\mathrm{Fin}(d_1 d_2)$ indices to pairs $\mathrm{Fin}\,d_1 \times
\mathrm{Fin}\,d_2$.  A \texttt{sum\_split} helper converts sums over
$\mathrm{Fin}(d_1 d_2)$ into double sums, after which the
\texttt{is\_hom} and \texttt{unitary} proofs factor naturally into
products of single sums via $\rho_i.\text{is\_hom}$ and
$\rho_i.\text{unitary}$.

\paragraph{Fourier power invariance (Corollary~7.2(ii)).}
The \texttt{fourierBlock\_leftAction} lemma shows that the group action
multiplies each Fourier block by $(I_\ell \otimes \rho(g))$.  An
\texttt{orthogonal\_preserves\_sum\_sq} lemma proves
$\sum_s (\sum_{s'} R_{s,s'} v_{s'})^2 = \sum_s v_s^2$ when
$R^\top R = I$, by expanding squares, exchanging sums, and applying
orthogonality.

\paragraph{Eckart--Young for $\starG$ (Theorem~1).}
The optimal rank-$k$ approximation is defined in the Fourier domain via
\texttt{fourier\_surjective}: its Fourier block at each irrep $\rho$ is,
by construction, the best rank-$k$ matrix approximation of
$\hat{\calA}(:,:,\rho)$ (obtained from \texttt{matrix\_best\_rank\_k\_approx}).
Per-irrep optimality is then a direct consequence of the classical matrix
Eckart--Young theorem.  The global bound follows by applying
\texttt{parseval\_group} to decompose the Frobenius error into per-irrep
terms, multiplying each per-irrep inequality by the positive Parseval
weight $d_\rho / |G|$, and summing via \texttt{Finset.sum\_le\_sum}.

\paragraph{Wigner--Eckart selection rules (\S2.5).}
The octahedral group's five irreps are encoded as an inductive type
\texttt{OctIrrep} with decidable equality.  Clebsch--Gordan multiplicities
are hardcoded from the standard character table and verified by \texttt{rfl}
(definitional equality).  The three selection rules are proved as concrete
multiplicity computations: (i)~\texttt{A$_1$ $\otimes$ $\rho$ = $\rho$}
for all $\rho$; (ii)~\texttt{T$_1$ $\otimes$ T$_1$} contains A$_1$;
(iii)~\texttt{Sym$^2$(T$_1$)} has zero T$_1$ multiplicity.  The dimension
formula $\sum d_\rho^2 = 24$ is verified by \texttt{native\_decide}.

\subsection{Verification Status}

The formalization achieves:
\begin{itemize}
\item Zero \texttt{sorry} (unresolved proof obligations) across all six modules.
\item Five declared axioms, all corresponding to standard textbook results
  not yet available in Mathlib. Of these, four are transitively used in
  proofs of theorems in the paper:
  \begin{itemize}
    \item \texttt{matrix\_best\_rank\_k\_approx}, \texttt{parseval\_group},
      and \texttt{fourier\_surjective}, used in
      Theorem~\ref{thm:ey_full};
    \item \texttt{fourier\_multiplicative}, used in
      Theorem~\ref{thm:prod_full}.
  \end{itemize}
  The fifth axiom, \texttt{fourier\_injective}, is declared but not
  currently invoked; it is retained for completeness, since closing it
  would simultaneously close \texttt{fourier\_surjective} via
  Peter--Weyl.
\item Complete coverage of Theorems~\ref{thm:ey_full}
  and~\ref{thm:prod_full}, the algebraic identities of
  Proposition~\ref{prop:T_props} and the $\starG$-product properties,
  equivariance and Frobenius/Fourier-power invariance, the Kronecker
  product construction for product-group irreps, and the Wigner--Eckart
  selection rules from \S2.5.
\item All algebraic theorems (associativity, identity, distributivity,
  transpose, equivariance, Frobenius and per-irrep Fourier-power invariance)
  depend solely on Lean's three core axioms (\texttt{propext},
  \texttt{Classical.choice}, \texttt{Quot.sound}); they introduce no
  project-level axioms. The \texttt{StarG/Audit.lean} module exhibits the
  full \texttt{\#print axioms} certificate for each theorem.
\end{itemize}

\noindent
To our knowledge, this is the first machine-verified proof of an
Eckart--Young-type optimality theorem for symmetry-preserving tensor
approximation.

\bibliographystyle{unsrt}
\bibliography{references}

\end{document}