#!/bin/bash
# prep_schnet_dbs.bsub - build the three schnetpack QM9 DBs serially.
#
# Multiple SchNet jobs sharing the same per-seed .db file race each other
# during initial extraction; the partial .db then breaks every subsequent
# DataLoader fetch with KeyError 'no match'. This single job builds all
# three .db files one after another so the actual training array can
# safely reuse them in parallel.
#BSUB -J starg_schnet_prep
#BSUB -o logs/schnet_prep_%J.out
#BSUB -e logs/schnet_prep_%J.err
#BSUB -q normal
#BSUB -n 4
#BSUB -W 02:00
#BSUB -M 32GB
# Strict mode: stop on the first failing command (-e), on use of an unset
# variable (-u) and on a failure in any pipeline stage (pipefail), so a broken
# prep run ends with a non-zero exit status instead of reporting success.
set -euo pipefail

# NOTE(review): the #BSUB -o/-e paths above point into logs/; this mkdir only
# runs once the job has started, relative to the job's working directory.
# Confirm that logs/ already exists at submission time.
mkdir -p logs

WORKDIR="$HOME/starg/python/large_scale"
# Directory that receives the qm9_seed*.db files; can be overridden by
# setting QM9_DIR in the submitting environment.
QM9_DIR="${QM9_DIR:-/u/$USER/data/qm9/dsgdb9nsd}"

# Everything below relies on running from WORKDIR (PYTHONPATH starts with ".").
cd "$WORKDIR" || {
  echo "cannot cd to $WORKDIR" >&2
  exit 1
}
export PYTHONPATH=".:${PYTHONPATH:-}"

echo "[$(date)] host=$(hostname) prep schnetpack QM9 dbs"
# Build qm9_seed{0,1,2}.db one after another with an inline Python program.
#
# The heredoc delimiter is quoted so the shell performs no expansion inside
# the Python source; the target directory is handed over through the
# environment instead of being pasted into a Python string literal.
# The exit status of python3 is checked so a failed build is reported as a
# failed job rather than being followed by "done".
if ! QM9_DIR="$QM9_DIR" python3 - <<'PY'
import os
from pathlib import Path

import schnetpack as spk
import schnetpack.transform as trn

qm9_dir = Path(os.environ["QM9_DIR"])
# Make sure the target directory exists before the first .db is created.
qm9_dir.mkdir(parents=True, exist_ok=True)

# Strictly sequential: one database per seed, never two at the same time.
for seed in (0, 1, 2):
    db = qm9_dir / f"qm9_seed{seed}.db"
    print(f"[prep] seed={seed} -> {db}")
    qm9 = spk.datasets.QM9(
        datapath=str(db),
        batch_size=64,
        num_train=110000,
        num_val=10000,
        remove_uncharacterized=True,
        transforms=[trn.ASENeighborList(cutoff=5.0), trn.CastTo32()],
        num_workers=0,
        pin_memory=False,
    )
    # prepare_data() followed by setup() on the data module for this .db path.
    qm9.prepare_data()
    qm9.setup()
    print(f"[prep] seed={seed}: done, db size on disk:")
    print(f" {os.path.getsize(db):,} bytes")
PY
then
  echo "[$(date)] schnetpack QM9 db prep FAILED" >&2
  exit 1
fi
echo "[$(date)] done"