platform: modularize api/gui, add docs-tests-web foundation, and refresh root config
This commit is contained in:
213
scripts/transport_recovery_runbook.py
Executable file
213
scripts/transport_recovery_runbook.py
Executable file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from typing import Dict, Optional, Tuple
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
|
||||
def request_json(
    api_url: str,
    method: str,
    path: str,
    payload: Optional[Dict] = None,
    timeout: float = 30.0,
) -> Tuple[int, Dict]:
    """Send one HTTP request to the API and return ``(http_status, body)``.

    Args:
        api_url: Base URL of the API; a trailing ``/`` is stripped before
            ``path`` is appended.
        method: HTTP verb; upper-cased before sending.
        path: Request path appended verbatim to ``api_url``.
        payload: Optional object sent as a UTF-8 JSON body. When given, a
            ``Content-Type: application/json`` header is added.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        A ``(status, body)`` tuple. ``status`` is the HTTP status code, or 0
        when the request could not be performed at all (malformed URL,
        connection failure, ...); in that case ``body`` carries
        ``ok=False``, the error text in ``message`` and
        ``code="HTTP_CLIENT_ERROR"``. ``body`` is always a dict: a response
        that is not valid JSON, or is JSON but not an object, is wrapped as
        ``{"raw": ...}``; an empty response becomes ``{}``.
    """
    data = None
    headers = {"Accept": "application/json"}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"

    try:
        # Request() raises ValueError for a URL without a scheme, so it has to
        # live inside the try for that case to be reported as status 0 too.
        req = urllib.request.Request(
            f"{api_url.rstrip('/')}{path}",
            data=data,
            method=method.upper(),
            headers=headers,
        )
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
            status = int(resp.getcode() or 200)
    except urllib.error.HTTPError as e:
        # Non-2xx answers still carry a body worth reporting to the caller.
        status = int(e.code or 500)
        try:
            raw = e.read().decode("utf-8", errors="replace")
        except Exception:
            # The error body is best-effort; keep the status code regardless.
            raw = ""
    except Exception as e:
        return 0, {"ok": False, "message": str(e), "code": "HTTP_CLIENT_ERROR"}

    try:
        parsed = json.loads(raw) if raw else {}
    except Exception:
        parsed = {"raw": raw}
    if not isinstance(parsed, dict):
        parsed = {"raw": parsed}
    return status, parsed
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    ``--api-url`` defaults to the ``API_URL`` environment variable, falling
    back to ``http://127.0.0.1:8080``. ``--client-id`` is mandatory.
    Provisioning is enabled unless ``--no-provision-if-needed`` is passed.
    """
    default_api_url = os.environ.get("API_URL", "http://127.0.0.1:8080")
    cli = argparse.ArgumentParser(
        description="Recovery runbook for /api/v1/transport/clients/{id} lifecycle/health"
    )
    cli.add_argument("--api-url", default=default_api_url)
    cli.add_argument("--client-id", required=True)
    cli.add_argument("--max-restarts", type=int, default=2)
    cli.add_argument("--retry-delay-sec", type=float, default=1.0)

    # Paired on/off switches that write to the same destination.
    switches = (
        ("--provision-if-needed", "store_true"),
        ("--no-provision-if-needed", "store_false"),
    )
    for flag, verb in switches:
        cli.add_argument(flag, dest="provision_if_needed", action=verb)
    cli.set_defaults(provision_if_needed=True)

    cli.add_argument("--diagnostics-json", default="")
    return cli.parse_args()
|
||||
|
||||
|
||||
def summarize(resp: Dict) -> str:
    """Render a one-line ``status=... code=... last_error=...`` summary.

    ``status`` is lower-cased; all three fields are whitespace-trimmed.
    ``last_error`` is taken from the top level of ``resp`` and, when that is
    empty, from a nested ``health`` dict. Missing values are shown as
    ``unknown`` (status) or ``-`` (code, last_error).
    """

    def _text(value) -> str:
        # Treat None and other falsy values as an empty string.
        return str(value or "").strip()

    state = _text(resp.get("status")).lower()
    marker = _text(resp.get("code"))
    error_text = _text(resp.get("last_error"))

    if not error_text:
        nested = resp.get("health")
        if isinstance(nested, dict):
            error_text = _text(nested.get("last_error"))

    return f"status={state or 'unknown'} code={marker or '-'} last_error={error_text or '-'}"
|
||||
|
||||
|
||||
def is_healthy_up(health: Dict) -> bool:
    """Return True when a health payload describes a client that is up.

    All of the following must hold:
      * ``status`` (trimmed, case-insensitive) equals ``up``;
      * ``code`` is empty or exactly ``TRANSPORT_CLIENT_DEGRADED``;
      * no ``last_error`` is reported, either at the top level or inside a
        nested ``health`` dict.
    """
    state = str(health.get("status") or "").strip().lower()
    if state != "up":
        return False

    marker = str(health.get("code") or "").strip()
    if marker not in ("", "TRANSPORT_CLIENT_DEGRADED"):
        return False

    error_text = str(health.get("last_error") or "").strip()
    if error_text:
        return False

    # Fall back to the nested health section when the top level is clean.
    nested = health.get("health")
    if isinstance(nested, dict):
        error_text = str(nested.get("last_error") or "").strip()
    return not error_text
|
||||
|
||||
|
||||
def action(api_url: str, client_id: str, name: str) -> Tuple[int, Dict]:
    """Call ``/api/v1/transport/clients/{client_id}/{name}`` on the API.

    Args:
        api_url: Base URL of the API.
        client_id: Client identifier; percent-encoded as a single path
            segment.
        name: Sub-resource to call. ``health`` and ``metrics`` are requested
            with GET, every other name with POST.

    Returns:
        The ``(http_status, body)`` tuple produced by ``request_json``.
    """
    # safe="" also encodes "/", so an id such as "a/b" or "../x" cannot add or
    # remove path segments (quote() leaves "/" untouched by default).
    encoded_id = urllib.parse.quote(client_id, safe="")
    method = "GET" if name in ("health", "metrics") else "POST"
    path = f"/api/v1/transport/clients/{encoded_id}/{name}"
    return request_json(api_url, method, path)
|
||||
|
||||
|
||||
def client_card(api_url: str, client_id: str) -> Tuple[int, Dict]:
    """Fetch ``/api/v1/transport/clients/{client_id}`` with a GET request.

    Args:
        api_url: Base URL of the API.
        client_id: Client identifier; percent-encoded as a single path
            segment.

    Returns:
        The ``(http_status, body)`` tuple produced by ``request_json``.
    """
    # safe="" also encodes "/", so the id always stays one path segment
    # (quote() leaves "/" untouched by default).
    encoded_id = urllib.parse.quote(client_id, safe="")
    return request_json(api_url, "GET", f"/api/v1/transport/clients/{encoded_id}")
|
||||
|
||||
|
||||
def write_diagnostics(path: str, diag: Dict) -> None:
    """Dump ``diag`` as indented UTF-8 JSON to ``path``.

    Does nothing when ``path`` is empty or whitespace-only. Non-ASCII
    characters are written as-is rather than escaped.
    """
    if path.strip():
        with open(path, mode="w", encoding="utf-8") as sink:
            json.dump(diag, sink, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def main() -> int:
    """Run the recovery runbook for a single transport client.

    Steps, in order: validate the arguments, fetch the client card, read the
    client's health, and - if it is not healthy - restart it up to
    ``--max-restarts`` times, re-checking health after each restart. A
    provision + start fallback is attempted once when enabled. Every API
    response is recorded in a diagnostics dict that is written to
    ``--diagnostics-json`` (when set) before returning, except on the
    argument-validation failures.

    Returns:
        0 when the client is already healthy or was recovered;
        1 on invalid arguments, or when the initial client-card or health
        request fails;
        2 when the client is still unhealthy after all attempts.
    """
    args = parse_args()
    api_url = args.api_url.strip()
    client_id = args.client_id.strip()
    if not api_url:
        print("[transport_recovery] ERROR: empty --api-url")
        return 1
    if not client_id:
        print("[transport_recovery] ERROR: empty --client-id")
        return 1
    if args.max_restarts < 0:
        print("[transport_recovery] ERROR: --max-restarts must be >= 0")
        return 1

    print(
        f"[transport_recovery] API_URL={api_url} client_id={client_id} "
        f"max_restarts={args.max_restarts} provision_if_needed={args.provision_if_needed}"
    )

    # Accumulates every request/response pair; timestamps are UTC.
    diagnostics: Dict = {
        "client_id": client_id,
        "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "steps": [],
    }

    # The client card must be readable and report ok before anything is tried.
    c_status, c_data = client_card(api_url, client_id)
    diagnostics["client_card_before"] = {"http": c_status, "payload": c_data}
    if c_status != 200 or not bool(c_data.get("ok", False)):
        print(f"[transport_recovery] ERROR: client card unavailable http={c_status} payload={c_data}")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 1

    # Initial health probe; a healthy client needs no recovery at all.
    h_status, health = action(api_url, client_id, "health")
    diagnostics["steps"].append({"action": "health", "http": h_status, "payload": health})
    if h_status != 200:
        print(f"[transport_recovery] ERROR: health request failed http={h_status}")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 1
    print(f"[transport_recovery] initial {summarize(health)}")
    if is_healthy_up(health):
        print("[transport_recovery] already healthy")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 0

    recovered = False
    provision_tried = False

    # With --max-restarts 0 this loop body never runs, so no restart and no
    # provision/start fallback is attempted.
    for attempt in range(1, args.max_restarts + 1):
        r_status, restart = action(api_url, client_id, "restart")
        diagnostics["steps"].append({"action": "restart", "attempt": attempt, "http": r_status, "payload": restart})
        print(
            f"[transport_recovery] restart attempt={attempt} "
            f"http={r_status} ok={restart.get('ok')} code={restart.get('code')}"
        )

        # Wait before re-checking health after the restart.
        if args.retry_delay_sec > 0:
            time.sleep(args.retry_delay_sec)
        h_status, health = action(api_url, client_id, "health")
        diagnostics["steps"].append(
            {"action": "health_after_restart", "attempt": attempt, "http": h_status, "payload": health}
        )
        if h_status == 200 and is_healthy_up(health):
            recovered = True
            print(f"[transport_recovery] recovered after restart attempt={attempt}")
            break

        # One-time fallback after a failed restart: provision, start, then
        # re-check health.
        # NOTE(review): the start/health follow-up is treated here as part of
        # the one-time provision branch - confirm the nesting against the
        # original file.
        if args.provision_if_needed and not provision_tried:
            p_status, provision = action(api_url, client_id, "provision")
            diagnostics["steps"].append({"action": "provision", "http": p_status, "payload": provision})
            print(
                f"[transport_recovery] provision "
                f"http={p_status} ok={provision.get('ok')} code={provision.get('code')}"
            )
            provision_tried = True

            s_status, start = action(api_url, client_id, "start")
            diagnostics["steps"].append({"action": "start", "http": s_status, "payload": start})
            print(f"[transport_recovery] start http={s_status} ok={start.get('ok')} code={start.get('code')}")

            if args.retry_delay_sec > 0:
                time.sleep(args.retry_delay_sec)
            h_status, health = action(api_url, client_id, "health")
            diagnostics["steps"].append({"action": "health_after_start", "http": h_status, "payload": health})
            if h_status == 200 and is_healthy_up(health):
                recovered = True
                print("[transport_recovery] recovered after provision/start")
                break

    # Metrics are captured for the report regardless of the outcome.
    m_status, metrics = action(api_url, client_id, "metrics")
    diagnostics["metrics"] = {"http": m_status, "payload": metrics}
    diagnostics["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    if recovered:
        print("[transport_recovery] RESULT: recovered")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 0

    # Unrecovered: record the final card and health state for diagnosis.
    c2_status, c2_data = client_card(api_url, client_id)
    diagnostics["client_card_after"] = {"http": c2_status, "payload": c2_data}
    h2_status, h2 = action(api_url, client_id, "health")
    diagnostics["health_after"] = {"http": h2_status, "payload": h2}
    print(f"[transport_recovery] RESULT: unrecovered ({summarize(h2 if isinstance(h2, dict) else {})})")
    write_diagnostics(args.diagnostics_json, diagnostics)
    return 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user