FiWiControl/scripts/system/pcie_hotswap_harness.py

217 lines
8.6 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2026 Umber
#
# Licensed under the Apache License, Version 2.0; see LICENSE.
#
# Lab fronthaul (PCIe) hot-swap harness — not part of pytest; run from repo root with an editable install.
# Examples:
# python3 scripts/system/pcie_hotswap_harness.py --dry-run
# python3 scripts/system/pcie_hotswap_harness.py --fabric-json configs/my-fabric.json --lab-ini configs/default.ini --dry-run
# FIWI_REMOTE_IP=192.168.1.39 python3 scripts/system/pcie_hotswap_harness.py --dry-run --paths 2
#
# This script is meant to reveal design gaps. See DESIGN_GAPS below.
"""PCIe fronthaul hot-swap exercise (async, optional SSH + future Power binding).
DESIGN_GAPS (fill as the stack grows)
--------------------------------------
1. **Port routing** — With ``--fabric-json``, Acroname port per ``radio_id`` is in the file and
copied to ``Fabric.rrh_power_ports``; ``one_cycle`` still does not call ``Power.on/off``. Placeholder
mode (``--paths`` only) has no port map.
2. **Shared ``Power`` + lock** — Fields exist on ``Fabric``; hot-swap harness must still call
``power`` only under ``power_lock`` (not wired in ``one_cycle`` yet).
3. **Enumeration truth** — We only shell out placeholders (``lspci``, ``true``). Real
checks need agreed sysfs / ``pciutils`` / driver contracts and pass/fail criteria.
4. **Telemetry ingest** — ``FrontHaulTelemetry`` has no parser from Adnacom JSON/CLI yet.
5. **INI → targets** — With ``--fabric-json``, lab INI is merged **after** JSON load (``[fabric]``,
``[fabric.rrh.*]``) unless ``--no-lab-ini``. INI-only JSON without interactive bind:
``scripts/system/fabric_realize.py`` or ``python3 -m fiwicontrol.fabric build`` / ``bind``.
6. **SPC hook** — After KPI extraction, ``fiwicontrol.spc`` charts belong in a separate reporting
step, not inside this harness.
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import sys
from pathlib import Path
# Repo root on path when run as ``python3 scripts/system/pcie_hotswap_harness.py``
_REPO_ROOT = __file__.rsplit("/scripts/system/", 1)[0]
if _REPO_ROOT not in sys.path:
sys.path.insert(0, _REPO_ROOT + "/src")
from fiwicontrol.commands import ssh_node
from fiwicontrol.fabric import Fabric
from fiwicontrol.fabric.fabric import FabricBindingStatus, FabricDefinition
from fiwicontrol.lab.inventory_config import default_lab_ini_path
from fiwicontrol.fronthaul import FrontHaul
from fiwicontrol.radio import RadioHead
async def _remote_smoke(node: ssh_node, *, label: str) -> str:
"""Minimal SSH proof; extend with lspci/sysfs once contracts exist."""
session = await node.rexec(cmd="set -e; uname -n; lspci -nn 2>/dev/null | head -n 3 || true")
out = ""
if session.results is not None:
out = session.results.decode("utf-8", errors="replace").strip()
logging.info("[%s] remote smoke:\n%s", label, out or "(no stdout)")
return out
async def one_cycle(
    *,
    label: str,
    rrh: RadioHead,
    node: ssh_node | None,
    dry_run: bool,
    settle_s: float,
    acroname_port: int | None = None,
) -> None:
    """One conceptual remove → wait → restore → verify (placeholders)."""
    # Start banner; include the Acroname port only when the fabric mapped one.
    if acroname_port is None:
        logging.info("[%s] cycle start radio_id=%s", label, rrh.radio_id)
    else:
        logging.info("[%s] cycle start radio_id=%s acroname_port=%s", label, rrh.radio_id, acroname_port)
    # Both branches settle; dry-run additionally narrates the power phases it would do.
    if not dry_run:
        await asyncio.sleep(settle_s)
    else:
        logging.info("[%s] DRY-RUN: would drop VBUS / assert link down", label)
        await asyncio.sleep(settle_s)
        logging.info("[%s] DRY-RUN: would restore power / wait for training", label)
    # Remote verification only when an SSH node is available.
    if node is None:
        logging.warning("[%s] no SSH node; skip remote checks", label)
    else:
        await _remote_smoke(node, label=label)
    logging.info("[%s] cycle end", label)
async def run_campaign(
    *,
    fabric: Fabric,
    dry_run: bool,
    iterations: int,
    settle_s: float,
) -> None:
    """Run sequential outer iterations; within each, all RRHs cycle concurrently.

    A TaskGroup fans out ``one_cycle`` per RRH, so one failing cycle cancels
    its siblings and surfaces as an ExceptionGroup to the caller.
    """
    node = fabric.concentrator
    for round_no in range(1, iterations + 1):
        logging.info("=== iteration %s / %s (fabric_id=%s) ===", round_no, iterations, fabric.fabric_id)
        async with asyncio.TaskGroup() as tg:
            for head in fabric.rrhs:
                tg.create_task(
                    one_cycle(
                        # Labels stay zero-based to match the original scheme.
                        label="{}#{}".format(head.radio_id, round_no - 1),
                        rrh=head,
                        node=node,
                        dry_run=dry_run,
                        settle_s=settle_s,
                        # None when the fabric has no Acroname port for this radio.
                        acroname_port=fabric.rrh_power_ports.get(head.radio_id),
                    )
                )
def _build_placeholder_rrhs(n: int) -> list[RadioHead]:
    """Fabricate *n* placeholder RRHs (no real bindings; ids rrh-01.., ports 100..)."""
    return [
        RadioHead(
            radio_id="rrh-{:02d}".format(idx + 1),
            patch_panel_port=100 + idx,
            # Fresh FrontHaul per head: PCIe medium, no IDs/link states known yet.
            fronthaul=FrontHaul(medium="pcie", vendor_id=None, device_id=None, link_states=()),
        )
        for idx in range(n)
    ]
def main() -> int:
    """CLI entry point: assemble a Fabric (JSON+INI or placeholders), run the campaign.

    Exit codes: 0 on success, 1 if the campaign raised, 2 if
    ``--strict-fabric-ready`` failed its readiness check.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--dry-run", action="store_true", help="log only; no Acroname calls (none wired yet)")
    p.add_argument("--rig-ip", default=os.environ.get("FIWI_REMOTE_IP"), help="SSH target (default: $FIWI_REMOTE_IP)")
    p.add_argument(
        "--fabric-json",
        metavar="PATH",
        help="Load RRH bindings + Acroname port map from JSON (see docs/fabric-builder.md; python -m fiwicontrol.fabric build -o …)",
    )
    p.add_argument(
        "--strict-fabric-ready",
        action="store_true",
        help="Exit 2 unless Fabric.binding_cache_status is READY (needs live Acroname discovery)",
    )
    p.add_argument(
        "--lab-ini",
        metavar="PATH",
        default=None,
        help="Lab inventory INI merged over fabric JSON (default: FIWI_LAB_INI or configs/default.ini)",
    )
    p.add_argument(
        "--no-lab-ini",
        action="store_true",
        help="Do not merge [fabric] / [fabric.rrh.*] from lab INI (JSON only)",
    )
    p.add_argument("--paths", type=int, default=1, metavar="N", help="placeholder RRH count (ignored with --fabric-json)")
    p.add_argument("--iterations", type=int, default=1, help="sequential outer iterations")
    p.add_argument("--settle", type=float, default=0.5, help="seconds between placeholder phases")
    args = p.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    if args.fabric_json:
        # JSON-backed fabric: inspect the cached binding status before loading,
        # so --strict-fabric-ready can bail out without touching the definition.
        st = Fabric.binding_cache_status(args.fabric_json)
        logging.info("Fabric JSON binding_cache_status=%s", st.value)
        if args.strict_fabric_ready and st is not FabricBindingStatus.READY:
            logging.error("Strict fabric check failed (status is not READY)")
            return 2
        if args.no_lab_ini:
            definition = FabricDefinition.load(args.fabric_json)
        else:
            # Lab INI (explicit --lab-ini or the default lab path) is merged
            # OVER the JSON; a missing file degrades to JSON-only with a warning.
            ini_p = Path(args.lab_ini) if args.lab_ini else default_lab_ini_path()
            if ini_p.is_file():
                logging.info("Merging lab INI %s over fabric JSON", ini_p)
                definition = FabricDefinition.load_json_merged_with_ini(args.fabric_json, ini_p)
            else:
                logging.warning("Lab INI not found (%s); loading fabric JSON only", ini_p)
                definition = FabricDefinition.load(args.fabric_json)
        fabric = Fabric.from_definition(definition, power_lock=asyncio.Lock())
        if args.rig_ip:
            # --rig-ip / $FIWI_REMOTE_IP takes precedence over any concentrator
            # that came from the JSON/INI merge.
            fabric = fabric.with_concentrator_override(name="rig", ipaddr=args.rig_ip)
        if fabric.concentrator is None and not args.rig_ip:
            logging.warning("No concentrator after JSON/INI merge and no --rig-ip; SSH checks skipped")
    else:
        # Placeholder mode: synthesize N RRHs (no Acroname port map available).
        rrhs = _build_placeholder_rrhs(max(1, args.paths))
        if not args.rig_ip:
            logging.warning("No --rig-ip or FIWI_REMOTE_IP: SSH checks skipped")
        concentrator = ssh_node(name="rig", ipaddr=args.rig_ip) if args.rig_ip else None
        fabric = Fabric(
            fabric_id="pcie-hotswap-harness",
            rrhs=tuple(rrhs),
            concentrator=concentrator,
            power_lock=asyncio.Lock(),
        )
    exit_code = 0
    try:
        asyncio.run(
            run_campaign(
                fabric=fabric,
                dry_run=args.dry_run,
                # Clamp to at least one pass; same guard as --paths above.
                iterations=max(1, args.iterations),
                settle_s=args.settle,
            )
        )
    except* Exception as eg:  # TaskGroup failures surface as an ExceptionGroup
        for e in eg.exceptions:
            logging.error("campaign failed: %s", e)
        exit_code = 1
    return exit_code
if __name__ == "__main__":
    # SystemExit carries main()'s integer exit code back to the shell.
    raise SystemExit(main())