"""Molecule → tensor featurization for the ★_G pipeline.
Featurizers produce a `(n_feat, |G|)` molecule-level summary, see
CONTRIBUTING.md for the input-contract discipline. Adding more
features means appending rows; it never means adding a per-atom
dimension.
* cyclic_angular_features: 14 rows of angular projections under z-axis
rotations at n equally-spaced angles. Used with G = Z_n.
* octahedral_features: 14 rows of features under the 24 rotations of
the chiral octahedral group O. Used in the Wigner-Eckart experiment.
* coulomb_eig_extended_features: 14 angular rows + 29 Coulomb-matrix
sorted eigenvalues replicated as invariant rows = (43, |G|) tensor.
The Coulomb-matrix eigenvalues are a classical Rupp-2012 descriptor
of inter-atomic distances and atomic numbers; they encode bond-
topology information that pure angular projections lack, while
remaining a molecule-level summary (no atom dim, no learnable
parameters). Illustrative, shows the same algebra scales with
input richness.
* raw_atomic_features: per-atom Z, position, and Mulliken charge,
consumed only by SchNet / e3nn / MACE baselines (not by ★_G).
"""
from __future__ import annotations
from typing import List
import numpy as np
import torch
from .qm9 import QM9Sample
# Absolute import works when running scripts from the large_scale/ dir
# (which adds it to sys.path). Avoids the "beyond top-level package" error
# raised when this file is imported as part of the data subpackage.
from starg_torch.octahedral import octahedral_rotations
def cyclic_angular_features(
sample: QM9Sample,
n_rot: int,
n_feat: int = 14,
) -> np.ndarray:
"""Compute (n_feat, n_rot) tensor of angular features.
Identical to the MATLAB angular_features() used in the QM9 experiment:
inner products of atomic positions with a rotating measurement basis at
angles 2π g / n. Rows include 5 invariant rows (mean Z, distance moments)
and 9 equivariant rows (sin/cos of angular projection at varying scales).
"""
pos = sample.coords - sample.coords.mean(axis=0, keepdims=True)
Z = sample.Z
angles = 2.0 * np.pi * np.arange(n_rot) / n_rot
out = np.zeros((n_feat, n_rot))
for g, theta in enumerate(angles):
c, s = np.cos(theta), np.sin(theta)
R = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
rotated = pos @ R.T
out[0, g] = float(np.mean(Z))
out[1, g] = float(np.mean(np.linalg.norm(rotated, axis=1)))
out[2, g] = float(np.std(np.linalg.norm(rotated, axis=1)))
out[3, g] = float(np.mean(rotated[:, 2] ** 2))
out[4, g] = float(np.mean(np.abs(rotated[:, 2])))
# 9 equivariant rows: sin/cos at scales 1, 2, 3 and integrated dipoles
for scale in range(1, 4):
out[5 + (scale - 1) * 3, g] = float(np.sum(Z * np.cos(scale * np.arctan2(rotated[:, 1], rotated[:, 0]))))
out[6 + (scale - 1) * 3, g] = float(np.sum(Z * np.sin(scale * np.arctan2(rotated[:, 1], rotated[:, 0]))))
out[7 + (scale - 1) * 3, g] = float(np.sum(Z * rotated[:, 0] ** scale))
return out # shape (n_feat, n_rot)
def octahedral_features(
sample: QM9Sample,
n_feat: int = 14,
) -> np.ndarray:
"""Compute (n_feat, 24) tensor of features under the 24 octahedral rotations."""
rotations = octahedral_rotations()
pos = sample.coords - sample.coords.mean(axis=0, keepdims=True)
Z = sample.Z
q = sample.charges
n_rot = 24
out = np.zeros((n_feat, n_rot))
for g, R in enumerate(rotations):
rp = pos @ R.T
out[0, g] = float(np.mean(Z))
out[1, g] = float(np.mean(np.linalg.norm(rp, axis=1)))
out[2, g] = float(np.std(np.linalg.norm(rp, axis=1)))
# Vector projections (transform as l=1)
out[3, g] = float(np.sum(Z * rp[:, 0]))
out[4, g] = float(np.sum(Z * rp[:, 1]))
out[5, g] = float(np.sum(Z * rp[:, 2]))
# Mulliken-charge dipole (true rank-1 tensor)
out[6, g] = float(np.sum(q * rp[:, 0]))
out[7, g] = float(np.sum(q * rp[:, 1]))
out[8, g] = float(np.sum(q * rp[:, 2]))
# Quadrupole-like rank-2 features (xx-yy, 2zz-xx-yy, xy, xz, yz components)
out[9, g] = float(np.sum(Z * (rp[:, 0] ** 2 - rp[:, 1] ** 2)))
out[10, g] = float(np.sum(Z * (2 * rp[:, 2] ** 2 - rp[:, 0] ** 2 - rp[:, 1] ** 2)))
out[11, g] = float(np.sum(Z * rp[:, 0] * rp[:, 1]))
out[12, g] = float(np.sum(Z * rp[:, 0] * rp[:, 2]))
out[13, g] = float(np.sum(Z * rp[:, 1] * rp[:, 2]))
return out
def _coulomb_matrix(coords: np.ndarray, Z: np.ndarray) -> np.ndarray:
"""Coulomb matrix (Rupp et al. 2012). M[i,j] = Z_i Z_j / |r_i - r_j|
for i != j and 0.5 * Z_i^2.4 on the diagonal."""
n = len(Z)
M = np.zeros((n, n))
for i in range(n):
M[i, i] = 0.5 * float(Z[i]) ** 2.4
for j in range(i + 1, n):
d = float(np.linalg.norm(coords[i] - coords[j]))
v = float(Z[i]) * float(Z[j]) / max(d, 1e-8)
M[i, j] = v
M[j, i] = v
return M
def coulomb_eig_extended_features(
sample: QM9Sample,
n_rot: int = 12,
n_eig: int = 29,
) -> np.ndarray:
"""Combine the 14-row angular features with `n_eig` sorted Coulomb-
matrix eigenvalues replicated across the group dimension as invariant
rows. Returns shape (14 + n_eig, n_rot).
The Coulomb-matrix eigenvalues are a classical molecule-level descriptor
that encodes inter-atomic distances and atomic numbers in a permutation-
and rotation-invariant way. We sort them in descending order, pad to
length `n_eig` (29 = max QM9 atom count), and tile across the group
dimension. Because they are invariant, every group element sees the
same value, which is the right semantic for a property of the molecule
itself.
Discipline: the output is still `(n_feat, n_rot)` per molecule, no atom
dimension, no learnable parameters. See CONTRIBUTING.md.
"""
angular = cyclic_angular_features(sample, n_rot=n_rot, n_feat=14)
cm = _coulomb_matrix(sample.coords, sample.Z)
eig = np.sort(np.linalg.eigvalsh(cm))[::-1]
eig = np.pad(eig, (0, max(0, n_eig - len(eig))))[:n_eig]
inv_rows = np.tile(eig[:, None], (1, n_rot))
return np.vstack([angular, inv_rows])
def raw_atomic_features(sample: QM9Sample) -> dict:
"""Return a dict with keys (z, pos, charges) for ENN baselines."""
return {
"z": torch.tensor(sample.Z, dtype=torch.long),
"pos": torch.tensor(sample.coords, dtype=torch.float32),
"charges": torch.tensor(sample.charges, dtype=torch.float32),
}
def stack_batch(samples: List[QM9Sample], featurizer, **kwargs) -> torch.Tensor:
"""Apply a featurizer to a list of samples and stack into (N, n_feat, n_g)."""
feats = [featurizer(s, **kwargs) for s in samples]
return torch.tensor(np.stack(feats, axis=0), dtype=torch.float32)