code/src/server/fedavg.py · csc8114

import copy
import os
import time
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import threading
from typing import Any

import torch

from proto import fsl_pb2
from src.shared.config_artifacts import build_config_ref, build_config_snapshot
from src.shared.serialization import tensor_to_bytes
from src.shared.common import cfg


@dataclass
class PendingUpdate:
    """One client's submitted weights, buffered until the next aggregation."""

    # Integer id of the submitting client; the coordinator also uses it as the buffer key.
    client_id: int
    # Validated state-dict-like mapping of parameter name -> tensor.
    weights: dict[str, torch.Tensor]
    # Server round the client reports having trained from; compared with the
    # coordinator's current round to derive staleness.
    base_round: int
    # Local epoch count reported by the client; numerator of its aggregation weight.
    local_epochs: int
    # time.time() timestamp taken when the update was placed in the buffer.
    arrived_at: float


class FedAvgCoordinator:
    def __init__(
        self,
        *,
        num_clients: int,
        hidden_size: int,
        session_id: str,
        session_dir: str,
        periodic_dir: str,
        ckpt_interval: int,
        min_clients_per_round: int = 2,
        round_timeout_sec: float = 15.0,
        grace_period_sec: float = 0.0,
        max_staleness: int = 0,
    ):
        self.num_clients = num_clients
        self.hidden_size = hidden_size
        self.session_id = session_id
        self.session_dir = session_dir
        self.periodic_dir = periodic_dir
        self.ckpt_interval = ckpt_interval
        self.min_clients_per_round = max(1, int(min_clients_per_round))
        self.round_timeout_sec = max(0.1, float(round_timeout_sec))
        self.grace_period_sec = max(0.0, float(grace_period_sec))
        # max_staleness=0: strict synchronous FedAvg (legacy default).
        # max_staleness=N: accept updates up to N rounds stale; weight by
        # local_epochs / (1 + staleness) to discount older contributions.
        self.max_staleness = max(0, int(max_staleness))
        self._quorum_reached_at: float | None = None
        self._startup_deadline: float | None = None

        self.client_weights_buffer: dict[int, PendingUpdate] = {}
        self.global_weights = None
        self.current_round = 0
        self.lock = threading.Lock()
        self.round_cond = threading.Condition(self.lock)
        self.round_start_time: float | None = None
        self.round_error: str | None = None
        self._expected_schema: dict[str, tuple[tuple[int, ...], torch.dtype]] | None = None
        self._active_clients: set[int] = set()
        self._completed_clients: set[int] = set()

    @staticmethod
    def _validate_weights_object(local_weights: Any) -> dict[str, torch.Tensor]:
        if not isinstance(local_weights, dict) or not local_weights:
            raise ValueError("Client weights must be a non-empty state_dict-like dict.")
        normalized: dict[str, torch.Tensor] = {}
        for key, value in local_weights.items():
            if not isinstance(key, str):
                raise ValueError("State dict keys must be strings.")
            if not isinstance(value, torch.Tensor):
                raise ValueError(f"State dict value for key '{key}' is not a torch.Tensor.")
            normalized[key] = value
        return normalized

    @staticmethod
    def _build_schema(weights: dict[str, torch.Tensor]) -> dict[str, tuple[tuple[int, ...], torch.dtype]]:
        return {key: (tuple(tensor.shape), tensor.dtype) for key, tensor in weights.items()}

    def _validate_against_schema(self, weights: dict[str, torch.Tensor]) -> None:
        schema = self._expected_schema
        if schema is None:
            self._expected_schema = self._build_schema(weights)
            return

        current_keys = set(weights.keys())
        expected_keys = set(schema.keys())
        if current_keys != expected_keys:
            missing = sorted(expected_keys - current_keys)
            extra = sorted(current_keys - expected_keys)
            raise ValueError(
                f"State dict key mismatch. missing={missing[:5]} extra={extra[:5]}"
            )

        for key, tensor in weights.items():
            expected_shape, expected_dtype = schema[key]
            if tuple(tensor.shape) != expected_shape:
                raise ValueError(
                    f"Tensor shape mismatch for '{key}': got {tuple(tensor.shape)} expected {expected_shape}"
                )
            if tensor.dtype != expected_dtype:
                raise ValueError(
                    f"Tensor dtype mismatch for '{key}': got {tensor.dtype} expected {expected_dtype}"
                )

    def register_client(self, client_id: int) -> None:
        with self.round_cond:
            if client_id not in self._completed_clients:
                self._active_clients.add(int(client_id))
            self.round_cond.notify_all()

    def mark_client_completed(self, client_id: int, *, server_model, optimizer) -> None:
        with self.round_cond:
            client_id = int(client_id)
            self._completed_clients.add(client_id)
            self._active_clients.discard(client_id)
            print(
                f"[FED AVG] Client {client_id} marked complete. "
                f"active_clients={len(self._active_clients)}"
            )
            if self.client_weights_buffer and self._has_quorum_locked():
                self._aggregate_locked(
                    server_model=server_model,
                    optimizer=optimizer,
                    reason="active-set-updated",
                )
            self.round_cond.notify_all()

    def synchronize(self, request, *, local_weights, server_model, optimizer) -> fsl_pb2.SyncResponse:
        """Submit one client's weights and block until the round they join has been aggregated.

        The call runs under the coordinator lock and proceeds in stages:

        1. Register the client as active (unless it was already marked complete)
           and wake other waiters so they can re-check their conditions.
        2. Reject without buffering when the configured number of rounds has been
           reached, when the update is staler than ``max_staleness``, or when the
           client claims a round newer than the server's.
        3. Before the first round only, wait (bounded by ``round_timeout_sec``
           from the first client's arrival) for ``min_clients_per_round`` clients.
        4. Validate and buffer the update, then wait until the round advances,
           triggering aggregation on quorum or when the round window expires.

        Args:
            request: Sync request; ``client_id`` is required, ``base_round`` and
                ``local_epochs`` are read with ``getattr`` fallbacks.
            local_weights: State-dict-like mapping of parameter name to tensor.
            server_model: Passed through to the aggregation step.
            optimizer: Passed through to the aggregation step.

        Returns:
            A ``SyncResponse``. ``accepted`` is False for the rejections listed
            in stage 2; otherwise it carries the round the update was applied in.

        Raises:
            ValueError: If ``local_weights`` fails structural or schema validation.
            RuntimeError: If ``self.round_error`` is set when the wait ends.
            TimeoutError: If the round did not advance or no global weights exist.
        """
        client_id = int(request.client_id)
        local_epochs = int(getattr(request, "local_epochs", 0))

        with self.round_cond:
            # The fallback reads shared state, so evaluate it under the lock.
            base_round = int(getattr(request, "base_round", self.current_round))

            if client_id not in self._completed_clients and client_id not in self._active_clients:
                self._active_clients.add(client_id)
                # Same as register_client(): the active set grew, so wake threads
                # parked in the startup wait instead of leaving them to their
                # next poll interval.
                self.round_cond.notify_all()

            # --- Safety Catch: Prevent rounds exceeding config ---
            num_rounds = cfg.get("training", {}).get("num_rounds", 30)
            if self.current_round >= num_rounds:
                print(f"[FED AVG] Target rounds ({num_rounds}) reached. Rejecting update from Client:{client_id}")
                return self._build_sync_response_locked(
                    accepted=False,
                    applied_round=self.current_round,
                    status_message="FINISHED",
                    refresh_only=True
                )

            if base_round < self.current_round:
                staleness = self.current_round - base_round
                if staleness > self.max_staleness:
                    print(
                        f"[FED AVG] Rejecting overly stale update from Client:{client_id} "
                        f"(staleness={staleness} > max_staleness={self.max_staleness})"
                    )
                    return self._build_sync_response_locked(
                        accepted=False,
                        applied_round=0,
                        refresh_only=True,
                        status_message=(
                            f"Stale update rejected: staleness={staleness} "
                            f"exceeds max_staleness={self.max_staleness}."
                        ),
                    )
                print(
                    f"[FED AVG] Accepting stale update from Client:{client_id} "
                    f"(staleness={staleness}, max_staleness={self.max_staleness})"
                )

            if base_round > self.current_round:
                print(
                    f"[FED AVG] Client:{client_id} is ahead of server state "
                    f"(base_round={base_round}, current_round={self.current_round})"
                )
                # Only ask the client to refresh if there is a global model to hand back.
                return self._build_sync_response_locked(
                    accepted=False,
                    applied_round=0,
                    refresh_only=self.global_weights is not None,
                    status_message=(
                        f"Client is ahead of server state: client base_round={base_round}, "
                        f"server current_round={self.current_round}."
                    ),
                )

            # Before the first round, wait until enough clients have connected.
            # Deadline is measured from when the FIRST client enters this wait,
            # not from server startup — the server may have been up for a long time
            # before clients are deployed via Ansible.
            if self.current_round == 0:
                if self._startup_deadline is None:
                    self._startup_deadline = time.time() + self.round_timeout_sec
                    print(
                        f"[FED AVG] Startup wait begun: waiting up to {self.round_timeout_sec:.0f}s "
                        f"for {self.min_clients_per_round} clients "
                        f"(currently {len(self._active_clients)} registered)."
                    )
                while len(self._active_clients) < self.min_clients_per_round:
                    remaining = self._startup_deadline - time.time()
                    if remaining <= 0:
                        print(
                            f"[FED AVG] Startup wait timed out: only "
                            f"{len(self._active_clients)}/{self.min_clients_per_round} "
                            f"clients connected. Proceeding with current active set."
                        )
                        break
                    # Cap each wait at 5 s so the deadline is re-checked regularly.
                    self.round_cond.wait(timeout=min(remaining, 5.0))

            # The round this update is expected to be folded into.
            target_round = self.current_round + 1
            now = time.time()
            if not self.client_weights_buffer:
                # First update of a new round: start the round window here.
                self.round_start_time = now
                self.round_error = None

            validated_weights = self._validate_weights_object(local_weights)
            self._validate_against_schema(validated_weights)
            # Keyed by client id, so a resubmission replaces the earlier update.
            self.client_weights_buffer[client_id] = PendingUpdate(
                client_id=client_id,
                weights=validated_weights,
                base_round=base_round,
                local_epochs=local_epochs,
                arrived_at=now,
            )
            barrier_elapsed_s = (now - self.round_start_time) if self.round_start_time is not None else 0.0
            required = self._required_clients_locked()
            print(
                f"[FED AVG] Received weights from Client:{client_id}. "
                f"Buffer size: {len(self.client_weights_buffer)}/{required} "
                f"| active_clients={len(self._active_clients)} "
                f"| base_round={base_round} local_epochs={local_epochs} "
                f"| barrier_elapsed={barrier_elapsed_s:.2f}s"
            )

            self._maybe_aggregate_locked(
                server_model=server_model,
                optimizer=optimizer,
                reason="quorum-reached",
            )

            while self.current_round < target_round and not self.round_error:
                remaining = self._remaining_window_locked()
                if remaining <= 0:
                    # Window (or grace period) is over: aggregate whatever is buffered.
                    if self.client_weights_buffer and self.current_round < target_round:
                        self._aggregate_locked(
                            server_model=server_model,
                            optimizer=optimizer,
                            reason="timeout",
                        )
                    break
                self.round_cond.wait(timeout=remaining)
                if self.current_round < target_round and not self.round_error:
                    self._maybe_aggregate_locked(
                        server_model=server_model,
                        optimizer=optimizer,
                        reason="quorum-reached",
                    )

            # NOTE(review): nothing in this class assigns a non-None round_error,
            # so this branch only fires if code outside the class sets it.
            if self.round_error:
                raise RuntimeError(self.round_error)

            if self.current_round < target_round or self.global_weights is None:
                raise TimeoutError("Timeout waiting for global model aggregation.")

            return self._build_sync_response_locked(
                accepted=True,
                applied_round=self.current_round,
                refresh_only=False,
                status_message=(
                    f"Accepted into aggregation round {self.current_round} "
                    f"(active_clients={len(self._active_clients)})."
                ),
            )

    def _active_client_count_locked(self) -> int:
        return len(self._active_clients)

    def _required_clients_locked(self) -> int:
        active_count = self._active_client_count_locked()
        if active_count <= 0:
            return 1
        return max(1, min(self.min_clients_per_round, active_count))

    def _remaining_window_locked(self) -> float:
        if self.round_start_time is None:
            return self.round_timeout_sec
        elapsed = time.time() - self.round_start_time
        timeout_remaining = max(0.0, self.round_timeout_sec - elapsed)
        # If quorum is reached and grace period is active, wake up sooner to check it.
        if self._quorum_reached_at is not None and self.grace_period_sec > 0.0:
            grace_remaining = max(0.0, self.grace_period_sec - (time.time() - self._quorum_reached_at))
            return min(timeout_remaining, grace_remaining)
        return timeout_remaining

    def _has_quorum_locked(self) -> bool:
        return len(self.client_weights_buffer) >= self._required_clients_locked()

    def _grace_period_elapsed_locked(self) -> bool:
        """True if grace period has passed since quorum was first reached."""
        if self.grace_period_sec <= 0.0:
            return True
        if self._quorum_reached_at is None:
            return False
        return (time.time() - self._quorum_reached_at) >= self.grace_period_sec

    def _maybe_aggregate_locked(self, *, server_model, optimizer, reason: str) -> None:
        if not (self.client_weights_buffer and self._has_quorum_locked()):
            return
        if self._quorum_reached_at is None:
            self._quorum_reached_at = time.time()
            if self.grace_period_sec > 0.0:
                print(
                    f"[FED AVG] Quorum reached ({len(self.client_weights_buffer)}/"
                    f"{self._required_clients_locked()}), "
                    f"waiting grace period {self.grace_period_sec:.0f}s for stragglers..."
                )
        if self._grace_period_elapsed_locked():
            self._aggregate_locked(server_model=server_model, optimizer=optimizer, reason=reason)

    def _build_sync_response_locked(
        self,
        *,
        accepted: bool,
        applied_round: int,
        refresh_only: bool,
        status_message: str,
    ) -> fsl_pb2.SyncResponse:
        """Assemble a ``SyncResponse`` from the current global state and the given outcome.

        The serialized global weights are included when they exist; otherwise
        the payload is empty bytes. ``round_number`` is the coordinator's
        current round.
        """
        payload = b""
        if self.global_weights is not None:
            payload = tensor_to_bytes(self.global_weights)

        fields = {
            "global_weights": payload,
            "round_number": int(self.current_round),
            "accepted": bool(accepted),
            "applied_round": int(applied_round),
            "refresh_only": bool(refresh_only),
            "status_message": status_message,
        }
        return fsl_pb2.SyncResponse(**fields)

    def _aggregate_locked(self, *, server_model, optimizer, reason: str) -> None:
        """Fold every buffered update into new global weights and advance the round.

        The caller must hold ``self.lock``. Steps:

        1. Weight each update by ``local_epochs / (1 + staleness)`` and store the
           weighted average as ``self.global_weights``.
        2. Clear the buffer and round timers and increment ``current_round``.
        3. Save a server checkpoint into ``session_dir`` (keeping only the newest)
           and, on multiples of ``ckpt_interval``, a copy into ``periodic_dir``.
        4. Wake all threads waiting on ``round_cond``.

        Does nothing when the buffer is empty.

        Args:
            server_model: Object whose ``state_dict()`` is written to the checkpoint.
            optimizer: Object whose ``state_dict()`` is written to the checkpoint.
            reason: Short tag recorded in the log line and in the checkpoint.

        Raises:
            Exception: Checkpoint I/O errors propagate to the caller. The round
                has already advanced by then and waiters are still notified.
        """
        pending_updates = list(self.client_weights_buffer.values())
        if not pending_updates:
            # Nothing to average; leave the round untouched rather than index an empty list.
            return

        aggregate_start = time.time()
        client_ids = [update.client_id for update in pending_updates]
        barrier_elapsed_s = (time.time() - self.round_start_time) if self.round_start_time is not None else 0.0
        print(
            f"[FED AVG] Round {self.current_round + 1}: Aggregating {len(pending_updates)} models "
            f"from clients={client_ids} (active={len(self._active_clients)} "
            f"quorum={self._required_clients_locked()} reason={reason} "
            f"waited={barrier_elapsed_s:.2f}s)"
        )

        # Staleness-discounted weighting: w_i = local_epochs_i / (1 + staleness_i)
        # When max_staleness=0 all staleness values are 0, reducing to plain
        # local-epoch-weighted FedAvg.
        stalenesses = [max(0, self.current_round - u.base_round) for u in pending_updates]
        raw_weights = [
            u.local_epochs / (1.0 + s)
            for u, s in zip(pending_updates, stalenesses)
        ]
        total_weight = sum(raw_weights)
        if total_weight <= 0:
            # No usable epoch information (e.g. every client reported 0 epochs).
            # Fall back to a uniform average; keeping the zero weights would
            # multiply every tensor by 0 and wipe the global model.
            raw_weights = [1.0] * len(pending_updates)
            total_weight = float(len(pending_updates))
        print(
            f"[FED AVG] Staleness per client: "
            + ", ".join(f"C{u.client_id}={s}" for u, s in zip(pending_updates, stalenesses))
        )

        # Every key is rebuilt from scratch, so no copy of an existing state dict
        # is needed; sum() starts from 0 and yields fresh tensors.
        # NOTE(review): multiplying by a Python float promotes integer tensors to
        # floating point — confirm clients tolerate that for integer buffers.
        self.global_weights = {
            key: sum(
                u.weights[key] * (w / total_weight)
                for u, w in zip(pending_updates, raw_weights)
            )
            for key in pending_updates[0].weights
        }

        # Commit the new round before any I/O so a checkpoint failure cannot
        # leave stale timers or a half-advanced round behind.
        self.client_weights_buffer = {}
        self._quorum_reached_at = None
        self.current_round += 1
        self.round_start_time = None
        self.round_error = None

        try:
            stamp = datetime.now().strftime("%Y%m%d%H%M%S")
            config_snapshot, snapshot_policy = build_config_snapshot()
            server_ckpt = {
                "round": self.current_round,
                "model_state_dict": server_model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "config": {
                    "hidden_size": self.hidden_size,
                },
                "config_snapshot_policy": snapshot_policy,
                "config_ref": build_config_ref(),
                "session_id": self.session_id,
                "aggregated_client_ids": client_ids,
                "aggregation_reason": reason,
            }
            if config_snapshot is not None:
                server_ckpt["config_snapshot"] = config_snapshot
            server_model_path = os.path.join(
                self.session_dir,
                f"server_head_round_{self.current_round}_{stamp}.pth",
            )
            torch.save(server_ckpt, server_model_path)

            # Keep only the newest per-round checkpoint; removal is best effort.
            old_checkpoints = sorted(
                Path(self.session_dir).glob("server_head_round_*.pth"),
                key=self._checkpoint_sort_key,
            )
            for old_ckpt in old_checkpoints[:-1]:
                try:
                    os.remove(old_ckpt)
                except OSError as exc:
                    print(f"[SERVER] Could not remove old checkpoint {old_ckpt.name}: {exc}")

            # A non-positive interval disables periodic checkpoints instead of
            # raising ZeroDivisionError on the modulo.
            if self.ckpt_interval > 0 and self.current_round % self.ckpt_interval == 0:
                periodic_path = os.path.join(
                    self.periodic_dir,
                    f"server_round_{self.current_round:04d}.pth",
                )
                torch.save(server_ckpt, periodic_path)
                print(f"[SERVER] Periodic ckpt saved: round {self.current_round:04d}")

            aggregate_elapsed_s = time.time() - aggregate_start
            print(
                f"[FED AVG] Successfully updated global model to Round {self.current_round} "
                f"(aggregate_time={aggregate_elapsed_s:.2f}s)"
            )
            print(f"[SERVER] Best ckpt: {self.session_id}/{Path(server_model_path).name}")
        finally:
            # The round has advanced either way; waiters must not sit out their
            # full timeout because checkpointing failed.
            self.round_cond.notify_all()

    @staticmethod
    def _checkpoint_sort_key(path: Path) -> tuple[int, str]:
        stem_parts = path.stem.split("_")
        try:
            round_number = int(stem_parts[3])
        except (IndexError, ValueError):
            round_number = -1
        return round_number, path.name