"""Generate a small synthetic QM9-format dataset for smoke-testing the pipeline. This produces .xyz files in the *exact* format expected by QM9Dataset (C:/lrc/starg/python/large_scale/data/qm9.py), filled with random small H/C/N/O/F molecules and reasonable property values. It is NOT a substitute for real QM9 data and should not be used for any reported result, only to verify that imports work, the loader parses, featurisation runs, and the training loop converges without errors. Run: python make_synthetic_qm9.py --out_dir --n_molecules 2000 """ from __future__ import annotations import argparse from pathlib import Path import numpy as np ATOM_SYMBOLS = ["H", "C", "N", "O", "F"] ATOM_Z = {"H": 1, "C": 6, "N": 7, "O": 8, "F": 9} def make_one(idx: int, rng: np.random.Generator) -> str: """Build a single .xyz file content matching QM9 layout.""" n_atoms = int(rng.integers(3, 20)) Z = rng.choice(ATOM_SYMBOLS, size=n_atoms, p=[0.5, 0.3, 0.08, 0.1, 0.02]) # H-rich coords = rng.normal(scale=2.0, size=(n_atoms, 3)) charges = rng.normal(scale=0.3, size=n_atoms) # 12 QM9 properties (mu, alpha, homo, lumo, gap, R^2, zpve, U0, U, H, G, Cv) props = rng.normal(size=12) props[1] = abs(props[1]) * 80 + 30 # alpha (positive) props[2] = -0.3 + 0.1 * props[2] # homo props[3] = -0.05 + 0.1 * props[3] # lumo props[4] = props[3] - props[2] # gap = lumo - homo props[6] = 0.05 + 0.02 * abs(props[6]) # zpve (positive) lines = [str(n_atoms)] prop_str = "gdb " + " ".join(f"{idx}") + " " + " ".join(f"{p:.6f}" for p in props) lines.append(prop_str) for atom, xyz, q in zip(Z, coords, charges): lines.append(f"{atom}\t{xyz[0]:.6f}\t{xyz[1]:.6f}\t{xyz[2]:.6f}\t{q:.6f}") # Vibrational frequencies row (parsed but not used) and SMILES lines.append(" ".join("0.0" for _ in range(max(n_atoms - 6, 1)))) smiles = "C" * (n_atoms // 2) # placeholder lines.append(f"{smiles}\t{smiles}") lines.append("InChI=1S/synthetic") return "\n".join(lines) + "\n" def main(): ap = argparse.ArgumentParser() ap.add_argument("--out_dir", required=True) ap.add_argument("--n_molecules", type=int, default=2000) ap.add_argument("--seed", type=int, default=0) args = ap.parse_args() out = Path(args.out_dir) out.mkdir(parents=True, exist_ok=True) rng = np.random.default_rng(args.seed) for i in range(args.n_molecules): content = make_one(i + 1, rng) (out / f"dsgdb9nsd_{i+1:06d}.xyz").write_text(content) print(f"[ok] wrote {args.n_molecules} synthetic .xyz files to {out}") if __name__ == "__main__": main()