python/large_scale/data/make_synthetic_qm9.py · tensor-group-sym

"""Generate a small synthetic QM9-format dataset for smoke-testing the pipeline.

This produces .xyz files in the *exact* format expected by QM9Dataset
(C:/lrc/starg/python/large_scale/data/qm9.py), filled with random small
H/C/N/O/F molecules and reasonable property values. It is NOT a substitute
for real QM9 data and should not be used for any reported result, only
to verify that imports work, the loader parses, featurisation runs, and
the training loop runs end-to-end without errors (the property values are
random draws, so meaningful convergence is not expected).

Run:
    python make_synthetic_qm9.py --out_dir <dir> --n_molecules 2000
"""

from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np


ATOM_SYMBOLS = ["H", "C", "N", "O", "F"]
ATOM_Z = {"H": 1, "C": 6, "N": 7, "O": 8, "F": 9}

# Sampling probability for each entry of ATOM_SYMBOLS, in the same order.
# Deliberately H-rich; must sum to 1 and stay aligned with ATOM_SYMBOLS.
_ATOM_PROBS = [0.5, 0.3, 0.08, 0.1, 0.02]


def make_one(idx: int, rng: np.random.Generator) -> str:
    """Build the text of a single synthetic QM9-layout .xyz file.

    Layout of the returned string (one item per line, trailing newline):
      1. number of atoms
      2. ``gdb <idx>`` followed by 12 space-separated property values
      3. one tab-separated row per atom: symbol, x, y, z, partial charge
      4. a row of placeholder ``0.0`` frequencies
      5. a placeholder SMILES pair, tab-separated
      6. a placeholder InChI string

    Args:
        idx: Molecule index written as the second token of the header line.
        rng: NumPy random generator; all randomness is drawn from it, so the
            output is reproducible for a given generator state.

    Returns:
        The complete file content as a string.
    """
    # Draw order below is kept fixed so a given seed always yields the same
    # molecules: atom count, symbols, coordinates, charges, properties.
    n_atoms = int(rng.integers(3, 20))  # upper bound exclusive -> 3..19 atoms
    symbols = rng.choice(ATOM_SYMBOLS, size=n_atoms, p=_ATOM_PROBS)
    coords = rng.normal(scale=2.0, size=(n_atoms, 3))
    charges = rng.normal(scale=0.3, size=n_atoms)

    # 12 QM9 properties (mu, alpha, homo, lumo, gap, R^2, zpve, U0, U, H, G, Cv)
    # NOTE(review): published QM9 files also carry rotational constants A, B, C
    # before mu -- confirm QM9Dataset really expects 12 values here.
    props = rng.normal(size=12)
    props[1] = abs(props[1]) * 80 + 30           # alpha (positive)
    props[2] = -0.3 + 0.1 * props[2]              # homo
    props[3] = -0.05 + 0.1 * props[3]             # lumo
    props[4] = props[3] - props[2]                # gap = lumo - homo
    props[6] = 0.05 + 0.02 * abs(props[6])        # zpve (positive)

    # The index must be a single token; joining its characters with spaces
    # (e.g. "1 2 3" for 123) would shift every property column.
    header = f"gdb {idx} " + " ".join(f"{p:.6f}" for p in props)

    lines = [str(n_atoms), header]
    lines.extend(
        f"{atom}\t{xyz[0]:.6f}\t{xyz[1]:.6f}\t{xyz[2]:.6f}\t{q:.6f}"
        for atom, xyz, q in zip(symbols, coords, charges)
    )
    # Vibrational frequencies row (parsed but not used) and SMILES
    lines.append(" ".join("0.0" for _ in range(max(n_atoms - 6, 1))))
    smiles = "C" * (n_atoms // 2)  # placeholder
    lines.append(f"{smiles}\t{smiles}")
    lines.append("InChI=1S/synthetic")
    return "\n".join(lines) + "\n"


def main(argv: list[str] | None = None) -> None:
    """Parse command-line options and write the synthetic .xyz files.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour, so the
            script entry point is unchanged.

    Raises:
        SystemExit: On invalid arguments, including a negative
            ``--n_molecules``.
    """
    ap = argparse.ArgumentParser(
        description="Generate a synthetic QM9-format dataset for smoke tests."
    )
    ap.add_argument("--out_dir", required=True,
                    help="directory to write the .xyz files into")
    ap.add_argument("--n_molecules", type=int, default=2000,
                    help="number of files to generate")
    ap.add_argument("--seed", type=int, default=0,
                    help="seed for the NumPy random generator")
    args = ap.parse_args(argv)

    # range() would silently produce nothing for a negative count while the
    # final message still reported it; reject it up front instead.
    if args.n_molecules < 0:
        ap.error("--n_molecules must be non-negative")

    out = Path(args.out_dir)
    out.mkdir(parents=True, exist_ok=True)
    # One generator shared across all molecules so a seed fixes the whole set.
    rng = np.random.default_rng(args.seed)

    # File indices are 1-based and zero-padded to six digits.
    for idx in range(1, args.n_molecules + 1):
        content = make_one(idx, rng)
        # Explicit encoding so output does not depend on the platform default.
        (out / f"dsgdb9nsd_{idx:06d}.xyz").write_text(content, encoding="utf-8")
    print(f"[ok] wrote {args.n_molecules} synthetic .xyz files to {out}")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()