python/large_scale/bsub/prep_schnet_dbs.bsub · tensor-group-sym

#!/bin/bash
# prep_schnet_dbs.bsub -- build the three schnetpack QM9 DBs serially.
#
# Multiple SchNet jobs sharing the same per-seed .db file race each other
# during initial extraction; the partial .db then breaks every subsequent
# DataLoader fetch with KeyError 'no match'. This single job builds all
# three .db files one after another so the actual training array can
# safely reuse them in parallel.

#BSUB -J starg_schnet_prep
#BSUB -o logs/schnet_prep_%J.out
#BSUB -e logs/schnet_prep_%J.err
#BSUB -q normal
#BSUB -n 4
#BSUB -W 02:00
#BSUB -M 32GB

# Strict mode. -e matters here: without it a failed build would still fall
# through to the final "done" line and the script would exit 0, hiding the
# failure from the scheduler and from anything waiting on this job.
set -euo pipefail
mkdir -p logs

WORKDIR=$HOME/starg/python/large_scale
# Directory that receives the per-seed .db files; can be overridden from the
# submission environment.
QM9_DIR=${QM9_DIR:-/u/$USER/data/qm9/dsgdb9nsd}

cd "$WORKDIR" || {
  printf 'cannot cd to %s\n' "$WORKDIR" >&2
  exit 1
}
export PYTHONPATH=".:${PYTHONPATH:-}"

echo "[$(date)] host=$(hostname) prep schnetpack QM9 dbs"

# The heredoc delimiter is quoted, so the Python source is passed verbatim
# with no shell expansion. QM9_DIR is handed over through the environment of
# this one command rather than pasted into the source, which keeps paths
# containing quotes, backslashes or '$' from corrupting the Python code.
QM9_DIR="$QM9_DIR" python3 - <<'PY'
import os
from pathlib import Path

import schnetpack as spk
import schnetpack.transform as trn

qm9_dir = os.environ["QM9_DIR"]

# One .db per seed, built strictly one after another.
for seed in (0, 1, 2):
    db = Path(qm9_dir) / f"qm9_seed{seed}.db"
    print(f"[prep] seed={seed} -> {db}")
    qm9 = spk.datasets.QM9(
        datapath=str(db),
        batch_size=64,
        num_train=110000,
        num_val=10000,
        remove_uncharacterized=True,
        transforms=[trn.ASENeighborList(cutoff=5.0), trn.CastTo32()],
        num_workers=0,
        pin_memory=False,
    )
    qm9.prepare_data()
    qm9.setup()
    print(f"[prep] seed={seed}: done, db size on disk:")
    print(f"        {os.path.getsize(db):,} bytes")
PY

# Only reached when every step above succeeded (set -e).
echo "[$(date)] done"