"""Generate a small synthetic QM9-format dataset for smoke-testing the pipeline.
This produces .xyz files in the *exact* format expected by QM9Dataset
(C:/lrc/starg/python/large_scale/data/qm9.py), filled with random small
H/C/N/O/F molecules and reasonable property values. It is NOT a substitute
for real QM9 data and should not be used for any reported result, only
to verify that imports work, the loader parses, featurisation runs, and
the training loop converges without errors.
Run:
    python make_synthetic_qm9.py --out_dir OUT_DIR --n_molecules 2000
"""
from __future__ import annotations
import argparse
from pathlib import Path
import numpy as np
# Element symbol -> atomic number for the elements this generator emits.
ATOM_Z = {
    "H": 1,
    "C": 6,
    "N": 7,
    "O": 8,
    "F": 9,
}
# Symbols in the mapping's insertion order (H, C, N, O, F); code that pairs
# this list with per-element weights relies on that order.
ATOM_SYMBOLS = list(ATOM_Z)
def make_one(idx: int, rng: np.random.Generator) -> str:
    """Build the text of one synthetic ``.xyz`` file.

    The generated text has one item per line, in this order:

    1. the atom count;
    2. ``gdb <idx>`` followed by 12 space-separated property values;
    3. one tab-separated row per atom: symbol, x, y, z, partial charge;
    4. a row of placeholder vibrational frequencies (all ``0.0``);
    5. two tab-separated placeholder SMILES strings;
    6. a placeholder InChI string.

    Args:
        idx: Molecule index written after the ``gdb`` tag.
        rng: NumPy random generator. Every random draw comes from it, so a
            seeded generator yields reproducible content.

    Returns:
        The file content as a newline-terminated string.
    """
    # ``integers`` excludes the upper bound, so molecules have 3..19 atoms.
    n_atoms = int(rng.integers(3, 20))
    # Weights follow the order of ATOM_SYMBOLS and favour hydrogen.
    symbols = rng.choice(
        ATOM_SYMBOLS,
        size=n_atoms,
        p=[0.5, 0.3, 0.08, 0.1, 0.02],
    )
    coords = rng.normal(scale=2.0, size=(n_atoms, 3))
    charges = rng.normal(scale=0.3, size=n_atoms)

    # 12 properties (mu, alpha, homo, lumo, gap, R^2, zpve, U0, U, H, G, Cv).
    # Start from standard-normal draws, then reshape the ones that need a
    # particular sign or relationship.
    props = rng.normal(size=12)
    props[1] = abs(props[1]) * 80 + 30  # alpha (positive)
    props[2] = -0.3 + 0.1 * props[2]  # homo
    props[3] = -0.05 + 0.1 * props[3]  # lumo
    props[4] = props[3] - props[2]  # gap = lumo - homo
    props[6] = 0.05 + 0.02 * abs(props[6])  # zpve (positive)

    # The index must be written as a single token. Joining the characters of
    # its string form would split a multi-digit index into several fields
    # ("12" -> "1 2") and shift every property column that follows.
    prop_values = " ".join(f"{value:.6f}" for value in props)
    header = f"gdb {idx} {prop_values}"

    lines = [str(n_atoms), header]
    for atom, (x, y, z), q in zip(symbols, coords, charges):
        lines.append(f"{atom}\t{x:.6f}\t{y:.6f}\t{z:.6f}\t{q:.6f}")

    # Vibrational frequencies row (parsed but not used) and SMILES.
    n_freqs = max(n_atoms - 6, 1)
    lines.append(" ".join(["0.0"] * n_freqs))
    smiles = "C" * (n_atoms // 2)  # placeholder
    lines.append(f"{smiles}\t{smiles}")
    lines.append("InChI=1S/synthetic")
    return "\n".join(lines) + "\n"
def main():
    """Parse the command line and write the synthetic ``.xyz`` files.

    Options:
        --out_dir: Destination directory (required); created along with any
            missing parents if it does not exist.
        --n_molecules: Number of files to generate (default 2000).
        --seed: Seed for the NumPy random generator (default 0).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out_dir", required=True)
    parser.add_argument("--n_molecules", type=int, default=2000)
    parser.add_argument("--seed", type=int, default=0)
    opts = parser.parse_args()

    target_dir = Path(opts.out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # One generator is shared by all molecules, so the seed fixes the whole
    # dataset rather than each file independently.
    rng = np.random.default_rng(opts.seed)

    # Molecule ids are 1-based and appear both in the file name and inside
    # the file content.
    for mol_id in range(1, opts.n_molecules + 1):
        xyz_path = target_dir / f"dsgdb9nsd_{mol_id:06d}.xyz"
        xyz_path.write_text(make_one(mol_id, rng))

    print(f"[ok] wrote {opts.n_molecules} synthetic .xyz files to {target_dir}")


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()