python/large_scale/data/featurizers.py · tensor-group-sym

"""Molecule → tensor featurization for the ★_G pipeline.

Featurizers produce a `(n_feat, |G|)` molecule-level summary, see
CONTRIBUTING.md for the input-contract discipline. Adding more
features means appending rows; it never means adding a per-atom
dimension.

  * cyclic_angular_features: 14 rows of angular projections under z-axis
    rotations at n equally-spaced angles. Used with G = Z_n.

  * octahedral_features: 14 rows of features under the 24 rotations of
    the chiral octahedral group O. Used in the Wigner-Eckart experiment.

  * coulomb_eig_extended_features: 14 angular rows + 29 Coulomb-matrix
    sorted eigenvalues replicated as invariant rows = (43, |G|) tensor.
    The Coulomb-matrix eigenvalues are a classical Rupp-2012 descriptor
    of inter-atomic distances and atomic numbers; they encode bond-
    topology information that pure angular projections lack, while
    remaining a molecule-level summary (no atom dim, no learnable
    parameters). Illustrative, shows the same algebra scales with
    input richness.

  * raw_atomic_features: per-atom Z, position, and Mulliken charge,
    consumed only by SchNet / e3nn / MACE baselines (not by ★_G).
"""

from __future__ import annotations

from typing import List

import numpy as np
import torch

from .qm9 import QM9Sample
# Absolute import works when running scripts from the large_scale/ dir
# (which adds it to sys.path). Avoids the "beyond top-level package" error
# raised when this file is imported as part of the data subpackage.
from starg_torch.octahedral import octahedral_rotations


def cyclic_angular_features(
    sample: QM9Sample,
    n_rot: int,
    n_feat: int = 14,
) -> np.ndarray:
    """Compute (n_feat, n_rot) tensor of angular features.

    Identical to the MATLAB angular_features() used in the QM9 experiment:
    inner products of atomic positions with a rotating measurement basis at
    angles 2π g / n. Rows include 5 invariant rows (mean Z, distance moments)
    and 9 equivariant rows (sin/cos of angular projection at varying scales).
    """
    pos = sample.coords - sample.coords.mean(axis=0, keepdims=True)
    Z = sample.Z
    angles = 2.0 * np.pi * np.arange(n_rot) / n_rot
    out = np.zeros((n_feat, n_rot))
    for g, theta in enumerate(angles):
        c, s = np.cos(theta), np.sin(theta)
        R = np.array([[c, -s, 0], [s, c, 0], [0, 0, 1]])
        rotated = pos @ R.T
        out[0, g] = float(np.mean(Z))
        out[1, g] = float(np.mean(np.linalg.norm(rotated, axis=1)))
        out[2, g] = float(np.std(np.linalg.norm(rotated, axis=1)))
        out[3, g] = float(np.mean(rotated[:, 2] ** 2))
        out[4, g] = float(np.mean(np.abs(rotated[:, 2])))
        # 9 equivariant rows: sin/cos at scales 1, 2, 3 and integrated dipoles
        for scale in range(1, 4):
            out[5 + (scale - 1) * 3, g] = float(np.sum(Z * np.cos(scale * np.arctan2(rotated[:, 1], rotated[:, 0]))))
            out[6 + (scale - 1) * 3, g] = float(np.sum(Z * np.sin(scale * np.arctan2(rotated[:, 1], rotated[:, 0]))))
            out[7 + (scale - 1) * 3, g] = float(np.sum(Z * rotated[:, 0] ** scale))
    return out  # shape (n_feat, n_rot)


def octahedral_features(
    sample: QM9Sample,
    n_feat: int = 14,
) -> np.ndarray:
    """Compute (n_feat, 24) tensor of features under the 24 octahedral rotations."""
    rotations = octahedral_rotations()
    pos = sample.coords - sample.coords.mean(axis=0, keepdims=True)
    Z = sample.Z
    q = sample.charges
    n_rot = 24
    out = np.zeros((n_feat, n_rot))
    for g, R in enumerate(rotations):
        rp = pos @ R.T
        out[0, g] = float(np.mean(Z))
        out[1, g] = float(np.mean(np.linalg.norm(rp, axis=1)))
        out[2, g] = float(np.std(np.linalg.norm(rp, axis=1)))
        # Vector projections (transform as l=1)
        out[3, g] = float(np.sum(Z * rp[:, 0]))
        out[4, g] = float(np.sum(Z * rp[:, 1]))
        out[5, g] = float(np.sum(Z * rp[:, 2]))
        # Mulliken-charge dipole (true rank-1 tensor)
        out[6, g] = float(np.sum(q * rp[:, 0]))
        out[7, g] = float(np.sum(q * rp[:, 1]))
        out[8, g] = float(np.sum(q * rp[:, 2]))
        # Quadrupole-like rank-2 features (xx-yy, 2zz-xx-yy, xy, xz, yz components)
        out[9, g] = float(np.sum(Z * (rp[:, 0] ** 2 - rp[:, 1] ** 2)))
        out[10, g] = float(np.sum(Z * (2 * rp[:, 2] ** 2 - rp[:, 0] ** 2 - rp[:, 1] ** 2)))
        out[11, g] = float(np.sum(Z * rp[:, 0] * rp[:, 1]))
        out[12, g] = float(np.sum(Z * rp[:, 0] * rp[:, 2]))
        out[13, g] = float(np.sum(Z * rp[:, 1] * rp[:, 2]))
    return out


def _coulomb_matrix(coords: np.ndarray, Z: np.ndarray) -> np.ndarray:
    """Coulomb matrix (Rupp et al. 2012). M[i,j] = Z_i Z_j / |r_i - r_j|
    for i != j and 0.5 * Z_i^2.4 on the diagonal."""
    n = len(Z)
    M = np.zeros((n, n))
    for i in range(n):
        M[i, i] = 0.5 * float(Z[i]) ** 2.4
        for j in range(i + 1, n):
            d = float(np.linalg.norm(coords[i] - coords[j]))
            v = float(Z[i]) * float(Z[j]) / max(d, 1e-8)
            M[i, j] = v
            M[j, i] = v
    return M


def coulomb_eig_extended_features(
    sample: QM9Sample,
    n_rot: int = 12,
    n_eig: int = 29,
) -> np.ndarray:
    """Combine the 14-row angular features with `n_eig` sorted Coulomb-
    matrix eigenvalues replicated across the group dimension as invariant
    rows. Returns shape (14 + n_eig, n_rot).

    The Coulomb-matrix eigenvalues are a classical molecule-level descriptor
    that encodes inter-atomic distances and atomic numbers in a permutation-
    and rotation-invariant way. We sort them in descending order, pad to
    length `n_eig` (29 = max QM9 atom count), and tile across the group
    dimension. Because they are invariant, every group element sees the
    same value, which is the right semantic for a property of the molecule
    itself.

    Discipline: the output is still `(n_feat, n_rot)` per molecule, no atom
    dimension, no learnable parameters. See CONTRIBUTING.md.
    """
    angular = cyclic_angular_features(sample, n_rot=n_rot, n_feat=14)
    cm = _coulomb_matrix(sample.coords, sample.Z)
    eig = np.sort(np.linalg.eigvalsh(cm))[::-1]
    eig = np.pad(eig, (0, max(0, n_eig - len(eig))))[:n_eig]
    inv_rows = np.tile(eig[:, None], (1, n_rot))
    return np.vstack([angular, inv_rows])


def raw_atomic_features(sample: QM9Sample) -> dict:
    """Return a dict with keys (z, pos, charges) for ENN baselines."""
    return {
        "z": torch.tensor(sample.Z, dtype=torch.long),
        "pos": torch.tensor(sample.coords, dtype=torch.float32),
        "charges": torch.tensor(sample.charges, dtype=torch.float32),
    }


def stack_batch(samples: List[QM9Sample], featurizer, **kwargs) -> torch.Tensor:
    """Apply a featurizer to a list of samples and stack into (N, n_feat, n_g)."""
    feats = [featurizer(s, **kwargs) for s in samples]
    return torch.tensor(np.stack(feats, axis=0), dtype=torch.float32)