#!/usr/bin/env python3 from __future__ import annotations import argparse import json import os import time from typing import Dict, Optional, Tuple import urllib.error import urllib.parse import urllib.request def request_json(api_url: str, method: str, path: str, payload: Optional[Dict] = None) -> Tuple[int, Dict]: data = None headers = {"Accept": "application/json"} if payload is not None: data = json.dumps(payload).encode("utf-8") headers["Content-Type"] = "application/json" req = urllib.request.Request( f"{api_url.rstrip('/')}{path}", data=data, method=method.upper(), headers=headers, ) try: with urllib.request.urlopen(req, timeout=30.0) as resp: raw = resp.read().decode("utf-8", errors="replace") status = int(resp.getcode() or 200) except urllib.error.HTTPError as e: raw = e.read().decode("utf-8", errors="replace") status = int(e.code or 500) except Exception as e: return 0, {"ok": False, "message": str(e), "code": "HTTP_CLIENT_ERROR"} try: parsed = json.loads(raw) if raw else {} except Exception: parsed = {"raw": raw} if not isinstance(parsed, dict): parsed = {"raw": parsed} return status, parsed def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Recovery runbook for /api/v1/transport/clients/{id} lifecycle/health" ) parser.add_argument("--api-url", default=os.environ.get("API_URL", "http://127.0.0.1:8080")) parser.add_argument("--client-id", required=True) parser.add_argument("--max-restarts", type=int, default=2) parser.add_argument("--retry-delay-sec", type=float, default=1.0) parser.add_argument("--provision-if-needed", dest="provision_if_needed", action="store_true") parser.add_argument("--no-provision-if-needed", dest="provision_if_needed", action="store_false") parser.set_defaults(provision_if_needed=True) parser.add_argument("--diagnostics-json", default="") return parser.parse_args() def summarize(resp: Dict) -> str: status = str(resp.get("status") or "").strip().lower() code = str(resp.get("code") or "").strip() last_err = str(resp.get("last_error") or "").strip() if not last_err: health = resp.get("health") or {} if isinstance(health, dict): last_err = str(health.get("last_error") or "").strip() return f"status={status or 'unknown'} code={code or '-'} last_error={last_err or '-'}" def is_healthy_up(health: Dict) -> bool: status = str(health.get("status") or "").strip().lower() if status != "up": return False code = str(health.get("code") or "").strip() if code and code != "TRANSPORT_CLIENT_DEGRADED": return False last_err = str(health.get("last_error") or "").strip() if not last_err: h = health.get("health") or {} if isinstance(h, dict): last_err = str(h.get("last_error") or "").strip() return last_err == "" def action(api_url: str, client_id: str, name: str) -> Tuple[int, Dict]: method = "POST" path = f"/api/v1/transport/clients/{urllib.parse.quote(client_id)}/{name}" if name in ("health", "metrics"): method = "GET" return request_json(api_url, method, path) def client_card(api_url: str, client_id: str) -> Tuple[int, Dict]: return request_json(api_url, "GET", f"/api/v1/transport/clients/{urllib.parse.quote(client_id)}") def write_diagnostics(path: str, diag: Dict) -> None: if not path.strip(): return with open(path, "w", encoding="utf-8") as f: json.dump(diag, f, ensure_ascii=False, indent=2) def main() -> int: args = parse_args() api_url = args.api_url.strip() client_id = args.client_id.strip() if not api_url: print("[transport_recovery] ERROR: empty --api-url") return 1 if not client_id: print("[transport_recovery] ERROR: empty --client-id") return 1 if args.max_restarts < 0: print("[transport_recovery] ERROR: --max-restarts must be >= 0") return 1 print( f"[transport_recovery] API_URL={api_url} client_id={client_id} " f"max_restarts={args.max_restarts} provision_if_needed={args.provision_if_needed}" ) diagnostics: Dict = { "client_id": client_id, "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), "steps": [], } c_status, c_data = client_card(api_url, client_id) diagnostics["client_card_before"] = {"http": c_status, "payload": c_data} if c_status != 200 or not bool(c_data.get("ok", False)): print(f"[transport_recovery] ERROR: client card unavailable http={c_status} payload={c_data}") write_diagnostics(args.diagnostics_json, diagnostics) return 1 h_status, health = action(api_url, client_id, "health") diagnostics["steps"].append({"action": "health", "http": h_status, "payload": health}) if h_status != 200: print(f"[transport_recovery] ERROR: health request failed http={h_status}") write_diagnostics(args.diagnostics_json, diagnostics) return 1 print(f"[transport_recovery] initial {summarize(health)}") if is_healthy_up(health): print("[transport_recovery] already healthy") write_diagnostics(args.diagnostics_json, diagnostics) return 0 recovered = False provision_tried = False for attempt in range(1, args.max_restarts + 1): r_status, restart = action(api_url, client_id, "restart") diagnostics["steps"].append({"action": "restart", "attempt": attempt, "http": r_status, "payload": restart}) print( f"[transport_recovery] restart attempt={attempt} " f"http={r_status} ok={restart.get('ok')} code={restart.get('code')}" ) if args.retry_delay_sec > 0: time.sleep(args.retry_delay_sec) h_status, health = action(api_url, client_id, "health") diagnostics["steps"].append( {"action": "health_after_restart", "attempt": attempt, "http": h_status, "payload": health} ) if h_status == 200 and is_healthy_up(health): recovered = True print(f"[transport_recovery] recovered after restart attempt={attempt}") break if args.provision_if_needed and not provision_tried: p_status, provision = action(api_url, client_id, "provision") diagnostics["steps"].append({"action": "provision", "http": p_status, "payload": provision}) print( f"[transport_recovery] provision " f"http={p_status} ok={provision.get('ok')} code={provision.get('code')}" ) provision_tried = True s_status, start = action(api_url, client_id, "start") diagnostics["steps"].append({"action": "start", "http": s_status, "payload": start}) print(f"[transport_recovery] start http={s_status} ok={start.get('ok')} code={start.get('code')}") if args.retry_delay_sec > 0: time.sleep(args.retry_delay_sec) h_status, health = action(api_url, client_id, "health") diagnostics["steps"].append({"action": "health_after_start", "http": h_status, "payload": health}) if h_status == 200 and is_healthy_up(health): recovered = True print("[transport_recovery] recovered after provision/start") break m_status, metrics = action(api_url, client_id, "metrics") diagnostics["metrics"] = {"http": m_status, "payload": metrics} diagnostics["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()) if recovered: print("[transport_recovery] RESULT: recovered") write_diagnostics(args.diagnostics_json, diagnostics) return 0 c2_status, c2_data = client_card(api_url, client_id) diagnostics["client_card_after"] = {"http": c2_status, "payload": c2_data} h2_status, h2 = action(api_url, client_id, "health") diagnostics["health_after"] = {"http": h2_status, "payload": h2} print(f"[transport_recovery] RESULT: unrecovered ({summarize(h2 if isinstance(h2, dict) else {})})") write_diagnostics(args.diagnostics_json, diagnostics) return 2 if __name__ == "__main__": raise SystemExit(main())