#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import time
|
|
from typing import Dict, Optional, Tuple
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
|
|
|
|
def request_json(
    api_url: str,
    method: str,
    path: str,
    payload: Optional[Dict] = None,
) -> Tuple[int, Dict]:
    """Send one HTTP request and return ``(status, body)`` with *body* as a dict.

    The URL is ``api_url`` (trailing slashes removed) followed by ``path``.
    When ``payload`` is given it is sent as a UTF-8 JSON body with a matching
    ``Content-Type`` header.

    Returns:
        ``(http_status, body)``. For an HTTP error response the error status
        and its decoded body are returned rather than raised. If the request
        cannot be built or sent at all, the result is
        ``(0, {"ok": False, "message": <text>, "code": "HTTP_CLIENT_ERROR"})``.
        An empty response body becomes ``{}``; a body that is not valid JSON
        becomes ``{"raw": <text>}``; valid JSON that is not an object becomes
        ``{"raw": <value>}``.
    """
    data = None
    headers = {"Accept": "application/json"}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"

    try:
        # Request() validates the URL eagerly and raises ValueError for a
        # URL without a scheme, so it has to live inside the try block to be
        # reported as HTTP_CLIENT_ERROR like every other client-side failure.
        req = urllib.request.Request(
            f"{api_url.rstrip('/')}{path}",
            data=data,
            method=method.upper(),
            headers=headers,
        )
        with urllib.request.urlopen(req, timeout=30.0) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
            status = int(resp.getcode() or 200)
    except urllib.error.HTTPError as e:
        # Error responses still carry a body worth parsing.
        raw = e.read().decode("utf-8", errors="replace")
        status = int(e.code or 500)
    except Exception as e:
        # Deliberately broad: callers expect a result tuple, never an
        # exception, for any failure to reach the server.
        return 0, {"ok": False, "message": str(e), "code": "HTTP_CLIENT_ERROR"}

    try:
        parsed = json.loads(raw) if raw else {}
    except Exception:
        # Keep the undecodable text available to the caller.
        parsed = {"raw": raw}
    if not isinstance(parsed, dict):
        # Callers use .get() on the result, so wrap non-object JSON values.
        parsed = {"raw": parsed}
    return status, parsed
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line of the recovery runbook.

    ``--api-url`` defaults to the ``API_URL`` environment variable, or to
    ``http://127.0.0.1:8080`` when that is unset. ``--client-id`` is required.
    ``--provision-if-needed`` and ``--no-provision-if-needed`` share one
    destination, which defaults to ``True``.
    """
    cli = argparse.ArgumentParser(
        description="Recovery runbook for /api/v1/transport/clients/{id} lifecycle/health"
    )
    # Registered in order; each entry is (flag, add_argument keyword options).
    option_table = (
        ("--api-url", {"default": os.environ.get("API_URL", "http://127.0.0.1:8080")}),
        ("--client-id", {"required": True}),
        ("--max-restarts", {"type": int, "default": 2}),
        ("--retry-delay-sec", {"type": float, "default": 1.0}),
        ("--provision-if-needed", {"dest": "provision_if_needed", "action": "store_true"}),
        ("--no-provision-if-needed", {"dest": "provision_if_needed", "action": "store_false"}),
        ("--diagnostics-json", {"default": ""}),
    )
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    cli.set_defaults(provision_if_needed=True)
    return cli.parse_args()
|
|
|
|
|
|
def summarize(resp: Dict) -> str:
    """Format the status, code and last error of *resp* as one log-friendly line.

    The status is lower-cased. When the top-level ``last_error`` is empty the
    value nested under ``health`` is used instead. Empty fields are rendered
    as ``unknown`` (status) or ``-`` (code, last_error).
    """

    def _clean(value) -> str:
        # Missing/None/empty values all collapse to "".
        return str(value or "").strip()

    status = _clean(resp.get("status")).lower()
    code = _clean(resp.get("code"))
    last_err = _clean(resp.get("last_error"))
    if last_err == "":
        nested = resp.get("health")
        if nested and isinstance(nested, dict):
            last_err = _clean(nested.get("last_error"))
    return f"status={status or 'unknown'} code={code or '-'} last_error={last_err or '-'}"
|
|
|
|
|
|
def is_healthy_up(health: Dict) -> bool:
    """Return ``True`` when a health payload describes a client that is up and error-free.

    All of the following must hold:
      * ``status`` is ``up`` (case-insensitive, surrounding whitespace ignored);
      * ``code`` is empty or exactly ``TRANSPORT_CLIENT_DEGRADED``;
      * ``last_error`` is empty, both at the top level and, when the top-level
        value is empty, inside a nested ``health`` mapping.
    """

    def _field(mapping: Dict, key: str) -> str:
        # Missing/None/empty values all collapse to "".
        return str(mapping.get(key) or "").strip()

    if _field(health, "status").lower() != "up":
        return False
    if _field(health, "code") not in ("", "TRANSPORT_CLIENT_DEGRADED"):
        return False
    if _field(health, "last_error"):
        return False
    nested = health.get("health")
    if nested and isinstance(nested, dict):
        return _field(nested, "last_error") == ""
    return True
|
|
|
|
|
|
def action(api_url: str, client_id: str, name: str) -> Tuple[int, Dict]:
    """Call the ``name`` sub-resource of one transport client.

    ``health`` and ``metrics`` are requested with GET; any other ``name`` is
    sent as a POST without a body. The result is whatever ``request_json``
    returns for ``/api/v1/transport/clients/<client_id>/<name>``.
    """
    # safe="" also escapes "/", which quote() leaves untouched by default;
    # otherwise an id containing a slash would change the request path.
    quoted_id = urllib.parse.quote(client_id, safe="")
    method = "GET" if name in ("health", "metrics") else "POST"
    path = f"/api/v1/transport/clients/{quoted_id}/{name}"
    return request_json(api_url, method, path)
|
|
|
|
|
|
def client_card(api_url: str, client_id: str) -> Tuple[int, Dict]:
    """Fetch ``/api/v1/transport/clients/<client_id>`` with GET via ``request_json``."""
    # safe="" also escapes "/", which quote() leaves untouched by default;
    # otherwise an id containing a slash would change the request path.
    quoted_id = urllib.parse.quote(client_id, safe="")
    return request_json(api_url, "GET", f"/api/v1/transport/clients/{quoted_id}")
|
|
|
|
|
|
def write_diagnostics(path: str, diag: Dict) -> None:
    """Dump *diag* to *path* as indented UTF-8 JSON.

    Nothing is written when *path* is empty or contains only whitespace.
    Non-ASCII characters are written as-is rather than escaped.
    """
    if path.strip():
        with open(path, "w", encoding="utf-8") as out_file:
            json.dump(diag, out_file, ensure_ascii=False, indent=2)
|
|
|
|
|
|
def main() -> int:
    """Run the recovery runbook for one transport client.

    Steps, in order:
      1. Validate the arguments and fetch the client card.
      2. Read the client's health; stop if it is already healthy.
      3. Up to ``--max-restarts`` times: restart, wait, re-check health.
         After the first failed restart, if provisioning is enabled, run
         provision + start once and re-check health.
      4. Fetch metrics, then report the outcome.

    Every step is appended to a diagnostics dict that is passed to
    ``write_diagnostics`` before each return that follows the client-card
    request.

    Returns:
        0 if the client was already healthy or was recovered,
        1 for invalid arguments or when the client card / initial health
        request fails,
        2 if the client is still unhealthy after all attempts.
    """
    args = parse_args()
    api_url = args.api_url.strip()
    client_id = args.client_id.strip()
    if not api_url:
        print("[transport_recovery] ERROR: empty --api-url")
        return 1
    if not client_id:
        print("[transport_recovery] ERROR: empty --client-id")
        return 1
    if args.max_restarts < 0:
        print("[transport_recovery] ERROR: --max-restarts must be >= 0")
        return 1

    print(
        f"[transport_recovery] API_URL={api_url} client_id={client_id} "
        f"max_restarts={args.max_restarts} provision_if_needed={args.provision_if_needed}"
    )

    # Accumulates every request/response pair made during the run.
    diagnostics: Dict = {
        "client_id": client_id,
        "started_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "steps": [],
    }

    # The client card must be readable and report ok before anything else runs.
    c_status, c_data = client_card(api_url, client_id)
    diagnostics["client_card_before"] = {"http": c_status, "payload": c_data}
    if c_status != 200 or not bool(c_data.get("ok", False)):
        print(f"[transport_recovery] ERROR: client card unavailable http={c_status} payload={c_data}")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 1

    # Initial health probe: any non-200 answer aborts the run.
    h_status, health = action(api_url, client_id, "health")
    diagnostics["steps"].append({"action": "health", "http": h_status, "payload": health})
    if h_status != 200:
        print(f"[transport_recovery] ERROR: health request failed http={h_status}")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 1
    print(f"[transport_recovery] initial {summarize(health)}")
    if is_healthy_up(health):
        print("[transport_recovery] already healthy")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 0

    recovered = False
    provision_tried = False

    # With --max-restarts 0 this loop body never runs, so neither restart nor
    # provision/start is attempted.
    for attempt in range(1, args.max_restarts + 1):
        r_status, restart = action(api_url, client_id, "restart")
        diagnostics["steps"].append({"action": "restart", "attempt": attempt, "http": r_status, "payload": restart})
        print(
            f"[transport_recovery] restart attempt={attempt} "
            f"http={r_status} ok={restart.get('ok')} code={restart.get('code')}"
        )

        # Wait before probing health after the restart.
        if args.retry_delay_sec > 0:
            time.sleep(args.retry_delay_sec)
        h_status, health = action(api_url, client_id, "health")
        diagnostics["steps"].append(
            {"action": "health_after_restart", "attempt": attempt, "http": h_status, "payload": health}
        )
        if h_status == 200 and is_healthy_up(health):
            recovered = True
            print(f"[transport_recovery] recovered after restart attempt={attempt}")
            break

        # Provision + start is tried at most once per run (provision_tried).
        # NOTE(review): the start/health sequence below is assumed to be nested
        # under this guard (its steps carry no "attempt" key) — confirm against
        # the original file's indentation.
        if args.provision_if_needed and not provision_tried:
            p_status, provision = action(api_url, client_id, "provision")
            diagnostics["steps"].append({"action": "provision", "http": p_status, "payload": provision})
            print(
                f"[transport_recovery] provision "
                f"http={p_status} ok={provision.get('ok')} code={provision.get('code')}"
            )
            provision_tried = True

            s_status, start = action(api_url, client_id, "start")
            diagnostics["steps"].append({"action": "start", "http": s_status, "payload": start})
            print(f"[transport_recovery] start http={s_status} ok={start.get('ok')} code={start.get('code')}")

            if args.retry_delay_sec > 0:
                time.sleep(args.retry_delay_sec)
            h_status, health = action(api_url, client_id, "health")
            diagnostics["steps"].append({"action": "health_after_start", "http": h_status, "payload": health})
            if h_status == 200 and is_healthy_up(health):
                recovered = True
                print("[transport_recovery] recovered after provision/start")
                break

    # Metrics are captured whether or not recovery succeeded.
    m_status, metrics = action(api_url, client_id, "metrics")
    diagnostics["metrics"] = {"http": m_status, "payload": metrics}
    diagnostics["finished_at"] = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    if recovered:
        print("[transport_recovery] RESULT: recovered")
        write_diagnostics(args.diagnostics_json, diagnostics)
        return 0

    # Unrecovered: capture a final client card and health snapshot for triage.
    c2_status, c2_data = client_card(api_url, client_id)
    diagnostics["client_card_after"] = {"http": c2_status, "payload": c2_data}
    h2_status, h2 = action(api_url, client_id, "health")
    diagnostics["health_after"] = {"http": h2_status, "payload": h2}
    print(f"[transport_recovery] RESULT: unrecovered ({summarize(h2 if isinstance(h2, dict) else {})})")
    write_diagnostics(args.diagnostics_json, diagnostics)
    return 2
|
|
|
|
|
|
# Script entry point: main()'s integer return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|