FiWiControl/scripts/system/pcie_hotswap_harness.py

217 lines
8.6 KiB
Python

#!/usr/bin/env python3
# Copyright (c) 2026 Umber
#
# Licensed under the Apache License, Version 2.0; see LICENSE.
#
# Lab fronthaul (PCIe) hot-swap harness — not part of pytest; run from repo root with an editable install.
# Examples:
# python3 scripts/system/pcie_hotswap_harness.py --dry-run
# python3 scripts/system/pcie_hotswap_harness.py --fabric-json configs/my-fabric.json --lab-ini configs/default.ini --dry-run
# FIWI_REMOTE_IP=192.168.1.39 python3 scripts/system/pcie_hotswap_harness.py --dry-run --paths 2
#
# This script is meant to reveal design gaps. See DESIGN_GAPS below.
"""PCIe fronthaul hot-swap exercise (async, optional SSH + future Power binding).
DESIGN_GAPS (fill as the stack grows)
--------------------------------------
1. **Port routing** — With ``--fabric-json``, Acroname port per ``radio_id`` is in the file and
copied to ``Fabric.rrh_power_ports``; ``one_cycle`` still does not call ``Power.on/off``. Placeholder
mode (``--paths`` only) has no port map.
2. **Shared ``Power`` + lock** — Fields exist on ``Fabric``; hot-swap harness must still call
``power`` only under ``power_lock`` (not wired in ``one_cycle`` yet).
3. **Enumeration truth** — We only shell out placeholders (``lspci``, ``true``). Real
checks need agreed sysfs / ``pciutils`` / driver contracts and pass/fail criteria.
4. **Telemetry ingest** — ``FrontHaulTelemetry`` has no parser from Adnacom JSON/CLI yet.
5. **INI → targets** — With ``--fabric-json``, lab INI is merged **after** JSON load (``[fabric]``,
``[fabric.rrh.*]``) unless ``--no-lab-ini``. INI-only JSON without interactive bind:
``scripts/system/fabric_realize.py`` or ``python3 -m fiwicontrol.fabric build`` / ``bind``.
6. **SPC hook** — After KPI extraction, ``fiwicontrol.spc`` charts belong in a separate reporting
step, not inside this harness.
"""
from __future__ import annotations
import argparse
import asyncio
import logging
import os
import sys
from pathlib import Path
# Repo root on path when run as ``python3 scripts/system/pcie_hotswap_harness.py``
_REPO_ROOT = __file__.rsplit("/scripts/system/", 1)[0]
if _REPO_ROOT not in sys.path:
sys.path.insert(0, _REPO_ROOT + "/src")
from fiwicontrol.commands import ssh_node
from fiwicontrol.fabric import Fabric
from fiwicontrol.fabric.fabric import FabricBindingStatus, FabricDefinition
from fiwicontrol.lab.inventory_config import default_lab_ini_path
from fiwicontrol.fronthaul import FrontHaul
from fiwicontrol.radio import RadioHead
async def _remote_smoke(node: ssh_node, *, label: str) -> str:
"""Minimal SSH proof; extend with lspci/sysfs once contracts exist."""
session = await node.rexec(cmd="set -e; uname -n; lspci -nn 2>/dev/null | head -n 3 || true")
out = ""
if session.results is not None:
out = session.results.decode("utf-8", errors="replace").strip()
logging.info("[%s] remote smoke:\n%s", label, out or "(no stdout)")
return out
async def one_cycle(
    *,
    label: str,
    rrh: RadioHead,
    node: ssh_node | None,
    dry_run: bool,
    settle_s: float,
    acroname_port: int | None = None,
) -> None:
    """One conceptual remove → wait → restore → verify (placeholders)."""
    # Start banner; include the Acroname port only when the fabric mapped one.
    if acroname_port is None:
        logging.info("[%s] cycle start radio_id=%s", label, rrh.radio_id)
    else:
        logging.info("[%s] cycle start radio_id=%s acroname_port=%s", label, rrh.radio_id, acroname_port)
    # Both branches settle; dry-run additionally narrates the power phases it would do.
    if not dry_run:
        await asyncio.sleep(settle_s)
    else:
        logging.info("[%s] DRY-RUN: would drop VBUS / assert link down", label)
        await asyncio.sleep(settle_s)
        logging.info("[%s] DRY-RUN: would restore power / wait for training", label)
    # Remote verification only when an SSH node is available.
    if node is None:
        logging.warning("[%s] no SSH node; skip remote checks", label)
    else:
        await _remote_smoke(node, label=label)
    logging.info("[%s] cycle end", label)
async def run_campaign(
    *,
    fabric: Fabric,
    dry_run: bool,
    iterations: int,
    settle_s: float,
) -> None:
    """Run sequential outer iterations; within each, all RRHs cycle concurrently.

    A TaskGroup fans out ``one_cycle`` per RRH, so one failing cycle cancels
    its siblings and surfaces as an ExceptionGroup to the caller.
    """
    node = fabric.concentrator
    for round_no in range(1, iterations + 1):
        logging.info("=== iteration %s / %s (fabric_id=%s) ===", round_no, iterations, fabric.fabric_id)
        async with asyncio.TaskGroup() as tg:
            for head in fabric.rrhs:
                tg.create_task(
                    one_cycle(
                        # Labels stay zero-based to match the original scheme.
                        label="{}#{}".format(head.radio_id, round_no - 1),
                        rrh=head,
                        node=node,
                        dry_run=dry_run,
                        settle_s=settle_s,
                        # None when the fabric has no Acroname port for this radio.
                        acroname_port=fabric.rrh_power_ports.get(head.radio_id),
                    )
                )
def _build_placeholder_rrhs(n: int) -> list[RadioHead]:
    """Fabricate *n* placeholder RRHs (no real bindings; ids rrh-01.., ports 100..)."""
    return [
        RadioHead(
            radio_id="rrh-{:02d}".format(idx + 1),
            patch_panel_port=100 + idx,
            # Fresh FrontHaul per head: PCIe medium, no IDs/link states known yet.
            fronthaul=FrontHaul(medium="pcie", vendor_id=None, device_id=None, link_states=()),
        )
        for idx in range(n)
    ]
def main() -> int:
    """CLI entry point: assemble a Fabric (JSON+INI or placeholders), run the campaign.

    Exit codes: 0 on success, 1 if the campaign raised, 2 if
    ``--strict-fabric-ready`` failed its readiness check.
    """
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--dry-run", action="store_true", help="log only; no Acroname calls (none wired yet)")
    p.add_argument("--rig-ip", default=os.environ.get("FIWI_REMOTE_IP"), help="SSH target (default: $FIWI_REMOTE_IP)")
    p.add_argument(
        "--fabric-json",
        metavar="PATH",
        help="Load RRH bindings + Acroname port map from JSON (see docs/fabric-builder.md; python -m fiwicontrol.fabric build -o …)",
    )
    p.add_argument(
        "--strict-fabric-ready",
        action="store_true",
        help="Exit 2 unless Fabric.binding_cache_status is READY (needs live Acroname discovery)",
    )
    p.add_argument(
        "--lab-ini",
        metavar="PATH",
        default=None,
        help="Lab inventory INI merged over fabric JSON (default: FIWI_LAB_INI or configs/default.ini)",
    )
    p.add_argument(
        "--no-lab-ini",
        action="store_true",
        help="Do not merge [fabric] / [fabric.rrh.*] from lab INI (JSON only)",
    )
    p.add_argument("--paths", type=int, default=1, metavar="N", help="placeholder RRH count (ignored with --fabric-json)")
    p.add_argument("--iterations", type=int, default=1, help="sequential outer iterations")
    p.add_argument("--settle", type=float, default=0.5, help="seconds between placeholder phases")
    args = p.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    if args.fabric_json:
        # JSON-backed fabric: inspect the cached binding status before loading,
        # so --strict-fabric-ready can bail out without touching the definition.
        st = Fabric.binding_cache_status(args.fabric_json)
        logging.info("Fabric JSON binding_cache_status=%s", st.value)
        if args.strict_fabric_ready and st is not FabricBindingStatus.READY:
            logging.error("Strict fabric check failed (status is not READY)")
            return 2
        if args.no_lab_ini:
            definition = FabricDefinition.load(args.fabric_json)
        else:
            # Lab INI (explicit --lab-ini or the default lab path) is merged
            # OVER the JSON; a missing file degrades to JSON-only with a warning.
            ini_p = Path(args.lab_ini) if args.lab_ini else default_lab_ini_path()
            if ini_p.is_file():
                logging.info("Merging lab INI %s over fabric JSON", ini_p)
                definition = FabricDefinition.load_json_merged_with_ini(args.fabric_json, ini_p)
            else:
                logging.warning("Lab INI not found (%s); loading fabric JSON only", ini_p)
                definition = FabricDefinition.load(args.fabric_json)
        fabric = Fabric.from_definition(definition, power_lock=asyncio.Lock())
        if args.rig_ip:
            # --rig-ip / $FIWI_REMOTE_IP takes precedence over any concentrator
            # that came from the JSON/INI merge.
            fabric = fabric.with_concentrator_override(name="rig", ipaddr=args.rig_ip)
        if fabric.concentrator is None and not args.rig_ip:
            logging.warning("No concentrator after JSON/INI merge and no --rig-ip; SSH checks skipped")
    else:
        # Placeholder mode: synthesize N RRHs (no Acroname port map available).
        rrhs = _build_placeholder_rrhs(max(1, args.paths))
        if not args.rig_ip:
            logging.warning("No --rig-ip or FIWI_REMOTE_IP: SSH checks skipped")
        concentrator = ssh_node(name="rig", ipaddr=args.rig_ip) if args.rig_ip else None
        fabric = Fabric(
            fabric_id="pcie-hotswap-harness",
            rrhs=tuple(rrhs),
            concentrator=concentrator,
            power_lock=asyncio.Lock(),
        )
    exit_code = 0
    try:
        asyncio.run(
            run_campaign(
                fabric=fabric,
                dry_run=args.dry_run,
                # Clamp to at least one pass; same guard as --paths above.
                iterations=max(1, args.iterations),
                settle_s=args.settle,
            )
        )
    except* Exception as eg:  # TaskGroup failures surface as an ExceptionGroup
        for e in eg.exceptions:
            logging.error("campaign failed: %s", e)
        exit_code = 1
    return exit_code
if __name__ == "__main__":
    # SystemExit carries main()'s integer exit code back to the shell.
    raise SystemExit(main())